Imported Upstream version 1.11.0 upstream upstream/1.11.0
author    tran-tung <tran.tung@samsung.com>
Mon, 24 Feb 2025 04:13:17 +0000 (11:13 +0700)
committer tran-tung <tran.tung@samsung.com>
Mon, 24 Feb 2025 04:13:17 +0000 (11:13 +0700)
416 files changed:
AUTHORS
COPYING
COPYING.LIB
ChangeLog
LICENSES
Makefile.am
Makefile.in
NEWS
README
VERSION
acinclude.m4
aclocal.m4
build-aux/config.guess
build-aux/config.rpath
build-aux/config.sub
build-aux/ltmain.sh
cipher/Makefile.am
cipher/Makefile.in
cipher/arcfour.c
cipher/aria-aesni-avx-amd64.S [new file with mode: 0644]
cipher/aria-aesni-avx2-amd64.S [new file with mode: 0644]
cipher/aria-gfni-avx512-amd64.S [new file with mode: 0644]
cipher/aria.c [new file with mode: 0644]
cipher/asm-common-aarch64.h
cipher/asm-common-amd64.h
cipher/asm-common-i386.h [new file with mode: 0644]
cipher/blake2.c
cipher/blake2b-amd64-avx2.S
cipher/blake2b-amd64-avx512.S [new file with mode: 0644]
cipher/blake2s-amd64-avx.S
cipher/blake2s-amd64-avx512.S [new file with mode: 0644]
cipher/blowfish-amd64.S
cipher/blowfish-arm.S
cipher/blowfish.c
cipher/bulkhelp.h [new file with mode: 0644]
cipher/camellia-aarch64-ce.c [new file with mode: 0644]
cipher/camellia-aarch64.S
cipher/camellia-aesni-avx-amd64.S
cipher/camellia-aesni-avx2-amd64.h
cipher/camellia-arm.S
cipher/camellia-gfni-avx2-amd64.S [new file with mode: 0644]
cipher/camellia-gfni-avx512-amd64.S [new file with mode: 0644]
cipher/camellia-glue.c
cipher/camellia-ppc8le.c [new file with mode: 0644]
cipher/camellia-ppc9le.c [new file with mode: 0644]
cipher/camellia-simd128.h [new file with mode: 0644]
cipher/camellia.h
cipher/cast5-amd64.S
cipher/cast5-arm.S
cipher/cast5.c
cipher/chacha20-aarch64.S
cipher/chacha20-amd64-avx2.S
cipher/chacha20-amd64-avx512.S [new file with mode: 0644]
cipher/chacha20-amd64-ssse3.S
cipher/chacha20-p10le-8x.s [new file with mode: 0644]
cipher/chacha20-ppc.c
cipher/chacha20-s390x.S
cipher/chacha20.c
cipher/cipher-aeswrap.c
cipher/cipher-cbc.c
cipher/cipher-ccm.c
cipher/cipher-cfb.c
cipher/cipher-ctr.c
cipher/cipher-eax.c
cipher/cipher-gcm-armv7-neon.S
cipher/cipher-gcm-armv8-aarch64-ce.S
cipher/cipher-gcm-intel-pclmul.c
cipher/cipher-gcm-ppc.c
cipher/cipher-gcm-siv.c
cipher/cipher-gcm.c
cipher/cipher-internal.h
cipher/cipher-ocb.c
cipher/cipher-ofb.c
cipher/cipher-poly1305.c
cipher/cipher-selftest.c [deleted file]
cipher/cipher-selftest.h [deleted file]
cipher/cipher-siv.c
cipher/cipher-xts.c
cipher/cipher.c
cipher/crc-armv8-aarch64-ce.S
cipher/crc-armv8-ce.c
cipher/crc-intel-pclmul.c
cipher/crc-ppc.c
cipher/crc.c
cipher/des-amd64.S
cipher/des.c
cipher/ecc-curves.c
cipher/ecc-ecdh.c
cipher/ecc-ecdsa.c
cipher/ecc.c
cipher/hash-common.c
cipher/kdf-internal.h
cipher/kdf.c
cipher/keccak-amd64-avx512.S [new file with mode: 0644]
cipher/keccak.c
cipher/keccak_permute_32.h
cipher/keccak_permute_64.h
cipher/kem-ecc.c [new file with mode: 0644]
cipher/kem-ecc.h [new file with mode: 0644]
cipher/kem.c [new file with mode: 0644]
cipher/kyber-common.c [new file with mode: 0644]
cipher/kyber-kdep.c [new file with mode: 0644]
cipher/kyber.c [new file with mode: 0644]
cipher/kyber.h [new file with mode: 0644]
cipher/mac-cmac.c
cipher/mac-gmac.c
cipher/mac-hmac.c
cipher/mac-internal.h
cipher/mac-poly1305.c
cipher/mac.c
cipher/mceliece6688128f.c [new file with mode: 0644]
cipher/mceliece6688128f.h [new file with mode: 0644]
cipher/md.c
cipher/md4.c
cipher/md5.c
cipher/poly1305-amd64-avx512.S [new file with mode: 0644]
cipher/poly1305-internal.h
cipher/poly1305-p10le.s [new file with mode: 0644]
cipher/poly1305-s390x.S
cipher/poly1305.c
cipher/primegen.c
cipher/pubkey-internal.h
cipher/pubkey-util.c
cipher/pubkey.c
cipher/rfc2268.c
cipher/rijndael-aarch64.S
cipher/rijndael-aesni.c
cipher/rijndael-amd64.S
cipher/rijndael-arm.S
cipher/rijndael-armv8-aarch32-ce.S
cipher/rijndael-armv8-aarch64-ce.S
cipher/rijndael-armv8-ce.c
cipher/rijndael-internal.h
cipher/rijndael-padlock.c
cipher/rijndael-ppc-common.h
cipher/rijndael-ppc-functions.h
cipher/rijndael-ppc.c
cipher/rijndael-ppc9le.c
cipher/rijndael-s390x.c
cipher/rijndael-ssse3-amd64-asm.S
cipher/rijndael-ssse3-amd64.c
cipher/rijndael-tables.h
cipher/rijndael-vaes-avx2-amd64.S
cipher/rijndael-vaes-avx2-i386.S [new file with mode: 0644]
cipher/rijndael-vaes-i386.c [new file with mode: 0644]
cipher/rijndael-vaes.c
cipher/rijndael.c
cipher/rmd160.c
cipher/rsa-common.c
cipher/rsa.c
cipher/salsa20-amd64.S
cipher/salsa20.c
cipher/scrypt.c
cipher/seed.c
cipher/serpent-armv7-neon.S
cipher/serpent-avx2-amd64.S
cipher/serpent-avx512-x86.c [new file with mode: 0644]
cipher/serpent-sse2-amd64.S
cipher/serpent.c
cipher/sha1-armv8-aarch64-ce.S
cipher/sha1-avx-amd64.S
cipher/sha1-avx-bmi2-amd64.S
cipher/sha1-avx2-bmi2-amd64.S
cipher/sha1-ssse3-amd64.S
cipher/sha256-armv8-aarch64-ce.S
cipher/sha256-avx-amd64.S
cipher/sha256-avx2-bmi2-amd64.S
cipher/sha256-ppc.c
cipher/sha256-ssse3-amd64.S
cipher/sha512-arm.S
cipher/sha512-armv7-neon.S
cipher/sha512-armv8-aarch64-ce.S [new file with mode: 0644]
cipher/sha512-avx-amd64.S
cipher/sha512-avx2-bmi2-amd64.S
cipher/sha512-avx512-amd64.S [new file with mode: 0644]
cipher/sha512-ppc.c
cipher/sha512-ssse3-amd64.S
cipher/sha512.c
cipher/sm3-aarch64.S
cipher/sm3-armv8-aarch64-ce.S [new file with mode: 0644]
cipher/sm3-avx-bmi2-amd64.S
cipher/sm3.c
cipher/sm4-aarch64.S [new file with mode: 0644]
cipher/sm4-aesni-avx-amd64.S
cipher/sm4-aesni-avx2-amd64.S
cipher/sm4-armv8-aarch64-ce.S [new file with mode: 0644]
cipher/sm4-armv9-aarch64-sve-ce.S [new file with mode: 0644]
cipher/sm4-gfni-avx2-amd64.S [new file with mode: 0644]
cipher/sm4-gfni-avx512-amd64.S [new file with mode: 0644]
cipher/sm4-ppc.c [new file with mode: 0644]
cipher/sm4.c
cipher/sntrup761.c [new file with mode: 0644]
cipher/sntrup761.h [new file with mode: 0644]
cipher/tiger.c
cipher/twofish-aarch64.S
cipher/twofish-amd64.S
cipher/twofish-arm.S
cipher/twofish-avx2-amd64.S
cipher/twofish.c
compat/Makefile.in
compat/compat.c
compat/libcompat.h
config.h.in
configure
configure.ac
doc/Makefile.am
doc/Makefile.in
doc/fips-fsm.pdf
doc/gcrypt.info
doc/gcrypt.info-1
doc/gcrypt.info-2
doc/gcrypt.texi
doc/gpl.texi
doc/lgpl.texi
doc/libgcrypt-modules.pdf
doc/stamp-vti
doc/version.texi
doc/yat2m.c
m4/Makefile.am
m4/Makefile.in
m4/gpg-error.m4
m4/noexecstack.m4
m4/socklen.m4 [deleted file]
mpi/Makefile.am
mpi/Makefile.in
mpi/aarch64/mpih-add1.S
mpi/aarch64/mpih-mul1.S
mpi/aarch64/mpih-mul2.S
mpi/aarch64/mpih-mul3.S
mpi/aarch64/mpih-sub1.S
mpi/alpha/mpih-add1.S
mpi/alpha/mpih-lshift.S
mpi/alpha/mpih-mul1.S
mpi/alpha/mpih-mul2.S
mpi/alpha/mpih-mul3.S
mpi/alpha/mpih-rshift.S
mpi/alpha/mpih-sub1.S
mpi/alpha/udiv-qrnnd.S
mpi/amd64/mpih-add1.S
mpi/amd64/mpih-lshift.S
mpi/amd64/mpih-mul1.S
mpi/amd64/mpih-mul2.S
mpi/amd64/mpih-mul3.S
mpi/amd64/mpih-rshift.S
mpi/amd64/mpih-sub1.S
mpi/arm/mpih-add1.S
mpi/arm/mpih-mul1.S
mpi/arm/mpih-mul2.S
mpi/arm/mpih-mul3.S
mpi/arm/mpih-sub1.S
mpi/asm-common-i386.h [new file with mode: 0644]
mpi/config.links
mpi/ec-ed25519.c
mpi/ec-inline.h
mpi/ec-nist.c
mpi/ec.c
mpi/generic/mpih-add1.c
mpi/generic/mpih-lshift.c
mpi/generic/mpih-mul1.c
mpi/generic/mpih-mul2.c
mpi/generic/mpih-mul3.c
mpi/generic/mpih-rshift.c
mpi/generic/mpih-sub1.c
mpi/generic/udiv-w-sdiv.c
mpi/hppa/mpih-add1.S
mpi/hppa/mpih-lshift.S
mpi/hppa/mpih-rshift.S
mpi/hppa/mpih-sub1.S
mpi/hppa/udiv-qrnnd.S
mpi/hppa1.1/mpih-mul1.S
mpi/hppa1.1/mpih-mul2.S
mpi/hppa1.1/mpih-mul3.S
mpi/hppa1.1/udiv-qrnnd.S
mpi/i386/mpih-add1.S
mpi/i386/mpih-lshift.S
mpi/i386/mpih-mul1.S
mpi/i386/mpih-mul2.S
mpi/i386/mpih-mul3.S
mpi/i386/mpih-rshift.S
mpi/i386/mpih-sub1.S
mpi/i386/syntax.h
mpi/longlong.h
mpi/m68k/mc68020/mpih-mul1.S
mpi/m68k/mc68020/mpih-mul2.S
mpi/m68k/mc68020/mpih-mul3.S
mpi/m68k/mpih-add1.S
mpi/m68k/mpih-lshift.S
mpi/m68k/mpih-rshift.S
mpi/m68k/mpih-sub1.S
mpi/m68k/syntax.h
mpi/mips3/mpih-add1.S
mpi/mips3/mpih-lshift.S
mpi/mips3/mpih-mul1.S
mpi/mips3/mpih-mul2.S
mpi/mips3/mpih-mul3.S
mpi/mips3/mpih-rshift.S
mpi/mips3/mpih-sub1.S
mpi/mpi-add.c
mpi/mpi-bit.c
mpi/mpi-cmp.c
mpi/mpi-div.c
mpi/mpi-gcd.c
mpi/mpi-inline.c
mpi/mpi-inline.h
mpi/mpi-internal.h
mpi/mpi-mod.c
mpi/mpi-mpow.c
mpi/mpi-mul.c
mpi/mpi-pow.c
mpi/mpi-scan.c
mpi/mpih-const-time.c
mpi/mpih-div.c
mpi/mpih-mul.c
mpi/mpiutil.c
mpi/pa7100/mpih-lshift.S
mpi/pa7100/mpih-rshift.S
mpi/power/mpih-add1.S
mpi/power/mpih-lshift.S
mpi/power/mpih-mul1.S
mpi/power/mpih-mul2.S
mpi/power/mpih-mul3.S
mpi/power/mpih-rshift.S
mpi/power/mpih-sub1.S
mpi/powerpc32/mpih-add1.S
mpi/powerpc32/mpih-lshift.S
mpi/powerpc32/mpih-mul1.S
mpi/powerpc32/mpih-mul2.S
mpi/powerpc32/mpih-mul3.S
mpi/powerpc32/mpih-rshift.S
mpi/powerpc32/mpih-sub1.S
mpi/powerpc32/syntax.h
mpi/sparc32/mpih-add1.S
mpi/sparc32/mpih-lshift.S
mpi/sparc32/mpih-rshift.S
mpi/sparc32/udiv.S
mpi/sparc32v8/mpih-mul1.S
mpi/sparc32v8/mpih-mul2.S
mpi/sparc32v8/mpih-mul3.S
mpi/supersparc/udiv.S
random/Makefile.am
random/Makefile.in
random/rand-internal.h
random/random-csprng.c
random/random-drbg.c
random/random.h
random/rndegd.c
random/rndw32.c
src/Makefile.am
src/Makefile.in
src/cipher-proto.h
src/cipher.h
src/context.c
src/context.h
src/dumpsexp.c
src/ec-context.h
src/fips.c
src/g10lib.h
src/gcrypt-int.h
src/gcrypt-testapi.h
src/gcrypt.h.in
src/gcryptrnd.c [deleted file]
src/getrandom.c [deleted file]
src/global.c
src/hmac256.c
src/hmac256.h
src/hwf-arm.c
src/hwf-common.h
src/hwf-x86.c
src/hwfeatures.c
src/libgcrypt-config.in
src/libgcrypt.def
src/libgcrypt.m4
src/libgcrypt.vers
src/misc.c
src/missing-string.c
src/mpi.h
src/mpicalc.c
src/secmem.c
src/secmem.h
src/sexp.c
src/stdmem.c
src/stdmem.h
src/types.h
src/versioninfo.rc.in
src/visibility.c
src/visibility.h
tests/Makefile.am
tests/Makefile.in
tests/aeswrap.c
tests/basic.c
tests/bench-slope.c
tests/benchmark.c
tests/curves.c
tests/hashtest-256g.in
tests/hashtest-6g.in [new file with mode: 0644]
tests/hashtest.c
tests/hmac.c
tests/keygen.c
tests/keygrip.c
tests/mpitests.c
tests/pkcs1v2.c
tests/prime.c
tests/pubkey.c
tests/random.c
tests/t-common.h
tests/t-ed25519.c
tests/t-ed448.c
tests/t-kdf.c
tests/t-kem.c [new file with mode: 0644]
tests/t-mlkem.c [new file with mode: 0644]
tests/t-mlkem.inp [new file with mode: 0644]
tests/t-mpi-bit.c
tests/t-x448.c
tests/testapi.c
tests/testdrv.c
tests/version.c

diff --git a/AUTHORS b/AUTHORS
index bc6182ec19ee02ab9ed57f86d9c000e7fcb8a402..f9161600b6c3672407d5db5dcae99669720d9be0 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,7 +1,6 @@
 Library: Libgcrypt
-Homepage: https://www.gnupg.org/related_software/libgcrypt/
-Download: https://ftp.gnupg.org/ftp/gcrypt/libgcrypt/
-          ftp://ftp.gnupg.org/gcrypt/libgcrypt/
+Homepage: https://gnupg.org/related_software/libgcrypt/
+Download: https://gnupg.org/ftp/gcrypt/libgcrypt/
 Repository: git://git.gnupg.org/libgcrypt.git
 Maintainer: Werner Koch <wk@gnupg.org>
 Bug reports: https://bugs.gnupg.org
@@ -32,17 +31,18 @@ List of Copyright holders
   Copyright (C) 2003 Nikos Mavroyanopoulos
   Copyright (c) 2006 CRYPTOGAMS
   Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation)
-  Copyright (C) 2012-2023 g10 Code GmbH
+  Copyright (C) 2012-2024 g10 Code GmbH
   Copyright (C) 2012 Simon Josefsson, Niels Möller
   Copyright (c) 2012 Intel Corporation
   Copyright (C) 2013 Christian Grothoff
-  Copyright (C) 2013-2022 Jussi Kivilinna
+  Copyright (C) 2013-2024 Jussi Kivilinna
   Copyright (C) 2013-2014 Dmitry Eremin-Solenikov
   Copyright (C) 2014 Stephan Mueller
   Copyright (C) 2017 Jia Zhang
   Copyright (C) 2018 Bundesamt für Sicherheit in der Informationstechnik
   Copyright (C) 2020 Alibaba Group.
   Copyright (C) 2020 Tianjia Zhang
+  Copyright (C) 2023 Simon Josefsson
 
 
 Authors with a FSF copyright assignment
@@ -169,6 +169,9 @@ Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 Dmitry Kasatkin <dmitry.kasatkin@intel.com>
 2012-12-14:50CAE2DB.80302@intel.com:
 
+Falko Strenzke <falko.strenzke@mtg.de>
+2023-09-27:51677567-0b78-4665-805d-fd0cdd50f7fa@mtg.de:
+
 H.J. Lu <hjl.tools@gmail.com>
 2020-01-19:20200119135241.GA4970@gmail.com:
 
@@ -211,6 +214,9 @@ Sergey V. <sftp.mtuci@gmail.com>
 Shawn Landden <shawn@git.icu>
 2019-07-09:2794651562684255@iva4-64850291ca1c.qloud-c.yandex.net:
 
+Simit Ghane <simit.ghane@lge.com>
+2024-05-06:OF22575887.761836D9-ON48258B15.0044A21E-48258B15.0044A222@lge.com:
+
 Stephan Mueller <smueller@chronox.de>
 2014-08-22:2008899.25OeoelVVA@myon.chronox.de:
 
@@ -254,6 +260,10 @@ security corporation.  See the file for details.
 The file salsa20.c is based on D.J. Bernstein's public domain code and
 taken from Nettle.  Copyright 2012 Simon Josefsson and Niels Möller.
 
+The sntrup761 code is based on public domain code written by Daniel
+J. Bernstein, Chitchanok Chuengsatiansup, Tanja Lange, and Christine
+van Vredendaal.  Copyright 2023 Simon Josefsson.
+
 
  This file is free software; as a special exception the author gives
  unlimited permission to copy and/or distribute it, with or without
diff --git a/COPYING b/COPYING
index d60c31a97a544b53039088d14fe9114583c0efc3..d159169d1050894d3ea3b98e1c965c4058208fe1 100644 (file)
--- a/COPYING
+++ b/COPYING
@@ -1,12 +1,12 @@
-                   GNU GENERAL PUBLIC LICENSE
-                      Version 2, June 1991
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
 
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
    59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  Everyone is permitted to copy and distribute verbatim copies
  of this license document, but changing it is not allowed.
 
-                           Preamble
+                            Preamble
 
   The licenses for most software are designed to take away your
 freedom to share and change it.  By contrast, the GNU General Public
@@ -15,7 +15,7 @@ software--to make sure the software is free for all its users.  This
 General Public License applies to most of the Free Software
 Foundation's software and to any other program whose authors commit to
 using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
+the GNU Lesser General Public License instead.)  You can apply it to
 your programs, too.
 
   When we speak of free software, we are referring to freedom, not
@@ -55,8 +55,8 @@ patent must be licensed for everyone's free use or not licensed at all.
 
   The precise terms and conditions for copying, distribution and
 modification follow.
-\f
-                   GNU GENERAL PUBLIC LICENSE
+
+                    GNU GENERAL PUBLIC LICENSE
    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 
   0. This License applies to any program or other work which contains
@@ -110,7 +110,7 @@ above, provided that you also meet all of these conditions:
     License.  (Exception: if the Program itself is interactive but
     does not normally print such an announcement, your work based on
     the Program is not required to print an announcement.)
-\f
+
 These requirements apply to the modified work as a whole.  If
 identifiable sections of that work are not derived from the Program,
 and can be reasonably considered independent and separate works in
@@ -168,7 +168,7 @@ access to copy from a designated place, then offering equivalent
 access to copy the source code from the same place counts as
 distribution of the source code, even though third parties are not
 compelled to copy the source along with the object code.
-\f
+
   4. You may not copy, modify, sublicense, or distribute the Program
 except as expressly provided under this License.  Any attempt
 otherwise to copy, modify, sublicense or distribute the Program is
@@ -225,7 +225,7 @@ impose that choice.
 
 This section is intended to make thoroughly clear what is believed to
 be a consequence of the rest of this License.
-\f
+
   8. If the distribution and/or use of the Program is restricted in
 certain countries either by patents or by copyrighted interfaces, the
 original copyright holder who places the Program under this License
@@ -255,7 +255,7 @@ make exceptions for this.  Our decision will be guided by the two goals
 of preserving the free status of all derivatives of our free software and
 of promoting the sharing and reuse of software generally.
 
-                           NO WARRANTY
+                            NO WARRANTY
 
   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
@@ -277,9 +277,9 @@ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
 PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGES.
 
-                    END OF TERMS AND CONDITIONS
-\f
-           How to Apply These Terms to Your New Programs
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
 
   If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
@@ -303,17 +303,16 @@ the "copyright" line and a pointer to where the full notice is found.
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 Also add information on how to contact you by electronic and paper mail.
 
 If the program is interactive, make it output a short notice like this
 when it starts in an interactive mode:
 
-    Gnomovision version 69, Copyright (C) year  name of author
+    Gnomovision version 69, Copyright (C) year name of author
     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
     This is free software, and you are welcome to redistribute it
     under certain conditions; type `show c' for details.
@@ -336,5 +335,5 @@ necessary.  Here is a sample; alter the names:
 This General Public License does not permit incorporating your program into
 proprietary programs.  If your program is a subroutine library, you may
 consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
+library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.
index cf9b6b997263b8a4e007aa1edd4b16437f4583ca..4362b49151d7b34ef83b3067a8f9c9f877d72a0e 100644 (file)
@@ -1,9 +1,8 @@
-
                   GNU LESSER GENERAL PUBLIC LICENSE
                        Version 2.1, February 1999
 
  Copyright (C) 1991, 1999 Free Software Foundation, Inc.
    59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  Everyone is permitted to copy and distribute verbatim copies
  of this license document, but changing it is not allowed.
 
@@ -23,8 +22,7 @@ specially designated software packages--typically libraries--of the
 Free Software Foundation and other authors who decide to use it.  You
 can use it too, but we suggest you first think carefully about whether
 this license or the ordinary General Public License is the better
-strategy to use in any particular case, based on the explanations
-below.
+strategy to use in any particular case, based on the explanations below.
 
   When we speak of free software, we are referring to freedom of use,
 not price.  Our General Public Licenses are designed to make sure that
@@ -57,7 +55,7 @@ modified by someone else and passed on, the recipients should know
 that what they have is not the original version, so that the original
 author's reputation will not be affected by problems that might be
 introduced by others.
-^L
+\f
   Finally, software patents pose a constant threat to the existence of
 any free program.  We wish to make sure that a company cannot
 effectively restrict the users of a free program by obtaining a
@@ -89,9 +87,9 @@ libraries.  However, the Lesser license provides advantages in certain
 special circumstances.
 
   For example, on rare occasions, there may be a special need to
-encourage the widest possible use of a certain library, so that it
-becomes a de-facto standard.  To achieve this, non-free programs must
-be allowed to use the library.  A more frequent case is that a free
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
 library does the same job as widely used non-free libraries.  In this
 case, there is little to gain by limiting the free library to free
 software only, so we use the Lesser General Public License.
@@ -113,7 +111,7 @@ modification follow.  Pay close attention to the difference between a
 "work based on the library" and a "work that uses the library".  The
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
-^L
+\f
                   GNU LESSER GENERAL PUBLIC LICENSE
    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 
@@ -138,8 +136,8 @@ included without limitation in the term "modification".)
   "Source code" for a work means the preferred form of the work for
 making modifications to it.  For a library, complete source code means
 all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control
-compilation and installation of the library.
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
 
   Activities other than copying, distribution and modification are not
 covered by this License; they are outside its scope.  The act of
@@ -218,7 +216,7 @@ instead of to this License.  (If a newer version than version 2 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
-^L
+\f
   Once this change is made in a given copy, it is irreversible for
 that copy, so the ordinary GNU General Public License applies to all
 subsequent copies and derivative works made from that copy.
@@ -269,7 +267,7 @@ Library will still fall under Section 6.)
 distribute the object code for the work under the terms of Section 6.
 Any executables containing that work also fall under Section 6,
 whether or not they are linked directly with the Library itself.
-^L
+\f
   6. As an exception to the Sections above, you may also combine or
 link a "work that uses the Library" with the Library to produce a
 work containing portions of the Library, and distribute that work
@@ -305,10 +303,10 @@ of these things:
     the user installs one, as long as the modified version is
     interface-compatible with the version that the work was made with.
 
-    c) Accompany the work with a written offer, valid for at least
-    three years, to give the same user the materials specified in
-    Subsection 6a, above, for a charge no more than the cost of
-    performing this distribution.
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
 
     d) If distribution of the work is made by offering access to copy
     from a designated place, offer equivalent access to copy the above
@@ -331,7 +329,7 @@ restrictions of other proprietary libraries that do not normally
 accompany the operating system.  Such a contradiction means you cannot
 use both them and the Library together in an executable that you
 distribute.
-^L
+\f
   7. You may place library facilities that are a work based on the
 Library side-by-side in a single library together with other library
 facilities not covered by this License, and distribute such a combined
@@ -372,7 +370,7 @@ subject to these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties with
 this License.
-^L
+\f
   11. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
@@ -386,10 +384,9 @@ all those who receive copies directly or indirectly through you, then
 the only way you could satisfy both it and this License would be to
 refrain entirely from distribution of the Library.
 
-If any portion of this section is held invalid or unenforceable under
-any particular circumstance, the balance of the section is intended to
-apply, and the section as a whole is intended to apply in other
-circumstances.
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
 
 It is not the purpose of this section to induce you to infringe any
 patents or other property right claims or to contest validity of any
@@ -407,11 +404,11 @@ be a consequence of the rest of this License.
 
   12. If the distribution and/or use of the Library is restricted in
 certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License
-may add an explicit geographical distribution limitation excluding those
-countries, so that distribution is permitted only in or among
-countries not thus excluded.  In such case, this License incorporates
-the limitation as if written in the body of this License.
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
 
   13. The Free Software Foundation may publish revised and/or new
 versions of the Lesser General Public License from time to time.
@@ -425,7 +422,7 @@ conditions either of that version or of any later version published by
 the Free Software Foundation.  If the Library does not specify a
 license version number, you may choose any version ever published by
 the Free Software Foundation.
-^L
+\f
   14. If you wish to incorporate parts of the Library into other free
 programs whose distribution conditions are incompatible with these,
 write to the author to ask for permission.  For software which is
@@ -459,21 +456,19 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.
 
                      END OF TERMS AND CONDITIONS
-^L
+\f
            How to Apply These Terms to Your New Libraries
 
   If you develop a new library, and you want it to be of the greatest
 possible use to the public, we recommend making it free software that
 everyone can redistribute and change.  You can do so by permitting
-redistribution under these terms (or, alternatively, under the terms
-of the ordinary General Public License).
-
-  To apply these terms, attach the following notices to the library.
-It is safest to attach them to the start of each source file to most
-effectively convey the exclusion of warranty; and each file should
-have at least the "copyright" line and a pointer to where the full
-notice is found.
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
 
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
 
     <one line to give the library's name and a brief idea of what it does.>
     Copyright (C) <year>  <name of author>
@@ -490,21 +485,18 @@ notice is found.
 
     You should have received a copy of the GNU Lesser General Public
     License along with this library; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 Also add information on how to contact you by electronic and paper mail.
 
-You should also get your employer (if you work as a programmer) or
-your school, if any, to sign a "copyright disclaimer" for the library,
-if necessary.  Here is a sample; alter the names:
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
 
   Yoyodyne, Inc., hereby disclaims all copyright interest in the
-  library `Frob' (a library for tweaking knobs) written by James
-  Random Hacker.
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
 
   <signature of Ty Coon>, 1 April 1990
   Ty Coon, President of Vice
 
 That's all there is to it!
-
-
index 53fec8d8bfc614820a1119610bc4c566775fb414..a124cc91a7277a1a819df94a05e7995b458c4701 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
-2023-11-14  Werner Koch  <wk@gnupg.org>
+2024-06-19  Werner Koch  <wk@gnupg.org>
 
-       Release 1.10.3.
-       + commit aa1610866f8e42bdc272584f0a717f32ee050a22
+       Release 1.11.0.
+       + commit 9d94d7846cde272b8b1519ba96e53967bf0b90d2
 
 
-2023-11-07  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+2024-06-14  NIIBE Yutaka  <gniibe@fsij.org>
+
+       m4: Update gpg-error.m4.
+       + commit 89adba4518d2c7f83a37361f66b3ac290c96525c
+       * m4/gpg-error.m4: Update from libgpg-error master.
+
+2024-06-13  NIIBE Yutaka  <gniibe@fsij.org>
+
+       libgcrypt.m4: Fix setting/using GPG_ERROR_CONFIG.
+       + commit 0fd9ec3403ea66d39f32602dce53bc5219e4c158
+       libgcrypt.m4 (_AM_PATH_GPGRT_CONFIG): Don't set GPG_ERROR_CONFIG and
+       gpg_error_config_version.
+
+2024-06-06  Jakub Jelen  <jjelen@redhat.com>
+
+       cipher:aeswrap: Fix padding length check.
+       + commit dc8d84383a6bfa44f397132eae203efa0bf20e82
+       * cipher/cipher-aeswrap.c (_gcry_cipher_keywrap_decrypt_auto): When
+       padding length is 8, correctly return GPG_ERR_CHECKSUM.
+
+2024-05-21  Jakub Jelen  <jjelen@redhat.com>
+
+       mpi: Fix loop condition in bad point check.
+       + commit 1875758440158fe1943d1b19b29d77421d269256
+       * mpi/ec.c (ec_p_init): Fix loop condition to avoid out-of-range read.
+
+2024-05-17  Werner Koch  <wk@gnupg.org>
+
+       build: Remove cruft from configure.ac.
+       + commit 0526d65afd34f9282e7fd125df6ee9a3c757ae79
+       * configure.ac: Remove unused cruft.
+
+2024-05-14  NIIBE Yutaka  <gniibe@fsij.org>
+
+       m4: Include _AM_PATH_GPGRT_CONFIG definition.
+       + commit ad3b599462bdbc459f6c7be867e9a12ab46481b3
+       * src/libgcrypt.m4: Find gpgrt-config.
+
+2024-05-09  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       random: fix o_flag_munging for -O1.
+       + commit 5afadba008918d651afefb842ae123cc18454c74
+       * random/Makefile.am (o_flag_munging): Also convert -O1 to -O0.
+
+2024-05-08  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       chacha20-aarch64: use local symbols for read-only data.
+       + commit 3f5989014a1b7c01aced88897d878ce5105d15df
+       * cipher/chacha20-aarch64.S: Remove '.globl' mark for RODATA section
+       objects.
+
+2024-05-07  simit.ghane  <simit.ghane@lge.com>
+
+       Fix building error with '-O2' in sysroot path.
+       + commit b99952adc6ee611641709610d2e4dc90ba9acf37
+       * cipher/Makefile.am (o_flag_munging): Tweak the sed script.
+       * random/Makefile.am (o_flag_munging): Ditto.
+
+2024-05-07  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:kem: Remove the experimental KEMs (PGP and CMS).
+       + commit c9affe97d20bc958d733206adf350214b32ae0b2
+       * src/gcrypt.h.in (enum gcry_kem_algos): Remove
+       GCRY_KEM_OPENPGP_X25519, GCRY_KEM_CMS_X25519_X963_SHA256, and
+       GCRY_KEM_CMS_X25519_HKDF_SHA256.
+       * cipher/kem.c (_gcry_kem_keypair, _gcry_kem_encap): Likewise.
+       (_gcry_kem_decap): Likewise.
+       * cipher/kem-ecc.c (algo_to_curve): Follow the removal.
+       (algo_to_seckey_len): Likewise.
+       (openpgp_kem_kdf, _gcry_openpgp_kem_encap): Remove.
+       (_gcry_openpgp_kem_decap, cms_kem_kdf): Remove.
+       (_gcry_cms_kem_encap, _gcry_cms_kem_decap): Remove.
+       * cipher/kem-ecc.h: Follow the removal.
+       * tests/t-kem.c (test_kem_openpgp_x25519): Remove.
+       (test_kem_cms_x25519): Remove.
+       (check_kem, main): Follow the change.
+
+2024-04-30  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       serpent-avx512-x86: fix CBC and CFB decryption with clang-18.
+       + commit 5a653a9129d7cc2f203ab9ad37ccdbcb832337d1
+       * cipher/serpent-avx512-x86.c (serpent_avx512_blk32): Avoid
+       '_mm512_castsi128_si512' usage to prevent non-initialized
+       vector register parts getting XOR into calculations for
+       CBC and CFB decryption.
+
+2024-04-26  Werner Koch  <wk@gnupg.org>
+
+       Let gcry_print_config show whether it is amd64 or i686.
+       + commit 0729fb84a6a1c038d4ed10f9c60be7df48558d9e
+       * src/global.c (print_config): Append a sub-cpu string.
+
+2024-04-25  Werner Koch  <wk@gnupg.org>
+
+       Require GpgRT 1.49.
+       + commit f895a69d3b4bd5b8f6fab11345c2663947b7e5e3
+       * configure.ac (LIBGCRYPT_LT_REVISION): Require 1.49
+       * src/global.c (print_config): Replace gpgrt backward compatibility.
+       * src/misc.c (_gcry_set_gpgrt_post_log_handler): Ditto.
+       (_gcry_logv): Use new gpgrt_logv_domain.
+
+2024-04-24  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:kem:ecc: Support NIST curves.
+       + commit 005292cf9f41179247918711b2968dd188aae122
+       * cipher/kem-ecc (ECC_SECKEY_LEN_MAX): Fix for P521R1.
+       (algo_to_curve): Using canonical name, add NIST curves,
+       (algo_to_seckey_len): Likewise.
+       * cipher/kem.c (_gcry_kem_keypair, _gcry_kem_encap): Likewise.
+       (_gcry_kem_decap): Likewise.
+       * src/gcrypt.h.in (enum gcry_kem_algos): Likewise.
+
+       cipher:kem:ecc: Fix DHKEM implementation.
+       + commit 118fa95d8d36efb07dbebdbcbc0e3408e69ac5d2
+       * cipher/kem-ecc.c (_gcry_ecc_dhkem_decap): Fix the size.
+
+2024-04-23  Werner Koch  <wk@gnupg.org>
+
+       cipher:kem:ecc: Support brainpoolP512r1.
+       + commit 4fb99ec266e5d38e0f6df5c2fd27da71f2fb441f
+       * cipher/kem-ecc.c (algo_to_curve): Add GCRY_KEM_RAW_BP512.
+       (algo_to_seckey_len): Ditto.
+       * cipher/kem.c (_gcry_kem_keypair): Ditto.
+       (_gcry_kem_encap): Ditto.
+       (_gcry_kem_decap): Ditto.
+
+       Use explicit values for the gcry_kem_algos enum.
+       + commit 66884c2d6b35bf418a267697a2726142a95188be
+       * src/gcrypt.h.in (enum gcry_kem_algos): Use explicit values for
+       improved ABI stability.
+       (GCRY_KEM_RAW_BP512): New.
+
+       cipher:kem:ecc: Fix for Weierstrass.
+       + commit 7ad308434f159952baf578eca60efc1f5bf93019
+       * cipher/ecc-ecdh.c (_gcry_ecc_curve_mul_point): Use POINT_LEN and not
+       NBYTES to create mpi_u.
+
+       Divert log functions to the gpgrt log functions.
+       + commit ab0bdc72c79d5655e63047f768a202e063d8c18b
+       * src/misc.c (my_gpgrt_post_fatal_handler): New.
+       (_gcry_set_gpgrt_post_log_handler): New.
+       (map_log_level): New.
+       (_gcry_logv): Use gcrypt log function unless a handler has been
+       installed.
+       * src/global.c (global_init): Install post log handler.
+       * src/gcrypt.h.in (gcry_set_log_handler): Deprecate.
+
+2024-04-22  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:kem:ecc: Support brainpoolP256r1 and brainpoolP384r1.
+       + commit 9c65260f915f3eeebf10b51ab7c3c9b5f0004d9b
+       * cipher/kem.c (_gcry_kem_keypair): Support classic curves.
+       (_gcry_kem_encap, _gcry_kem_decap): Likewise.
+       * cipher/kem-ecc.c (algo_to_curve, algo_to_seckey_len): New.
+       (_gcry_ecc_raw_keypair): Support classic curves.
+       (_gcry_ecc_raw_encap, _gcry_ecc_raw_decap): Likewise.
+       (ecc_tweak_bits): Move to ...
+       * cipher/ecc-ecdh.c (ecc_tweak_bits): here.
+       (_gcry_ecc_curve_keypair): New.
+       (_gcry_ecc_curve_mul_point): Add length arguments.
+       * src/gcrypt-int.h (_gcry_ecc_curve_keypair): New.
+       * src/gcrypt.h.in: Add constants for brainpoolP256r1 and
+       brainpoolP384r1.
+
+2024-04-19  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:ecc: Return the result in SEC1 point format for Weierstrass.
+       + commit 47f2cb8f6f35cf962158d9cffc79c23fab163585
+       * cipher/ecc-ecdh.c (_gcry_ecc_curve_mul_point): Fix for
+       MPI_EC_WEIERSTRASS curve.
+
+       cipher:ecc: Add _gcry_ecc_curve_mul_point function with curve name.
+       + commit b1da8247dcbc036a5dc55dc7121ccd537c9b7ff3
+       * cipher/ecc-ecdh.c (_gcry_ecc_curve_mul_point): New.  Also support
+       Weierstrass curve as well as Montgomery one.
+       (_gcry_ecc_mul_point): Use _gcry_ecc_curve_mul_point.
+
+       cipher:kem: Rename Classic McEliece API constants.
+       + commit 72b1b8d447fb8046e5546998619206646dde2ea7
+       * src/gcrypt.h.in (GCRY_KEM_CM6688128F): Rename.
+       * cipher/kem.c (_gcry_kem_keypair): Follow the change.
+       (_gcry_kem_encap, _gcry_kem_decap): Likewise.
+       * tests/t-kem.c (test_kem_mceliece6688128f): Likewise.
+       (main): Also change CLI option name.
+
+2024-04-18  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: More clean up for Classic McEliece implementation.
+       + commit 9a552b80d6c44a05ab733e441664d19de6f23da3
+       * cipher/mceliece6688128f.h: Provide GCC_ATTR_UNUSED.
+       * cipher/mceliece6688128f.c: Use GCC_ATTR_UNUSED.
+
+       cipher: Fix comments of Classic McEliece implementation.
+       + commit 6478203e77d173128c4c0810db6f41198c360ca2
+       * cipher/mceliece6688128f.c: Don't use C++-style comments for old
+       compilers (< C99).
+
+2024-04-17  Simon Josefsson  <simon@josefsson.org>
+
+       cipher: Add Classic McEliece mceliece6688128f.
+       + commit 003367b91272f499d6eecb32ab9a09f383bdc788
+       * cipher/Makefile.am (libcipher_la_SOURCES): Add mceliece6688128f.{c,h}.
+       * cipher/mceliece6688128f.c, cipher/mceliece6688128f.h: New.
+       * tests/t-kem.c: Support mceliece6688128f.
+
+2024-04-16  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Fix copyright notices for Kyber implementation.
+       + commit eec1a576dbd080eb54ad7c6027271fe3f71c8847
+       * cipher/kyber-common.c: Fix copyright notice sentences.
+       * cipher/kyber-kdep.c, cipher/kyber.c, cipher/kyber.h: Likewise.
+
+2024-04-09  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Add X448 based ECC KEM.
+       + commit 40b4781022c5f5ebf1ec6e6d814f9c2a346cb3ca
+       * src/gcrypt.h.in (GCRY_KEM_RAW_X448, GCRY_KEM_DHKEM448): New.
+       * cipher/kem-ecc.c (_gcry_ecc_raw_encap): Support X448.
+       (ecc_dhkem_kdf, _gcry_ecc_dhkem_encap): Likewise.
+       (_gcry_ecc_dhkem_decap): Likewise.
+       * cipher/kem.c (_gcry_kem_keypair): Add support for GCRY_KEM_RAW_X448
+       and GCRY_KEM_DHKEM448.
+       (_gcry_kem_encap, _gcry_kem_decap): Likewise.
+
+2024-04-05  Werner Koch  <wk@gnupg.org>
+
+       Trailing comma removal for better portability.
+       + commit 9e6db9d74631e623164319f1fc9713afdf9e3eb9
+       * cipher/kem.c (kem_names): Remove trailing comma.
+       * src/gcrypt.h.in (enum gcry_kem_algos): Ditto.
+
+       ecc: Add bp256, bp384, bp512 aliases for Brainpool curves.
+       + commit d211e7fe967381bfb19e4f18e0be067c68200c50
+       * cipher/ecc-curves.c (curve_aliases): Add new aliases for Brainpool.
+
+2024-04-03  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Fix t-kem for tests with X25519 for FIPS mode.
+       + commit a78dcad69872e1d6e6eb7ead174e398d9c7d4800
+       * tests/t-kem.c (test_kem_raw_x25519): It is expected to fail.
+       (test_kem_dhkem_x25519, test_kem_openpgp_x25519): Likewise.
+       (test_kem_cms_x25519): Likewise.
+
+2024-03-28  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:kem: Add ECC KEM for X25519.
+       + commit 5da6c63fed34f6027a9531780252f0f54087c379
+       * cipher/Makefile.am (libcipher_la_SOURCES): Add kem-ecc.{c,h}.
+       * cipher/kem-ecc.c: New.
+       * cipher/kem-ecc.h: New.
+       * cipher/kem.c (_gcry_kem_keypair): Dispatch to _gcry_ecc_raw_keypair.
+       (_gcry_kem_encap): Dispatch to _gcry_ecc_raw_encap,
+       _gcry_ecc_dhkem_encap, _gcry_openpgp_kem_encap, and
+       _gcry_cms_kem_encap.
+       (_gcry_kem_decap): Dispatch to _gcry_ecc_raw_decap,
+       _gcry_ecc_dhkem_decap, _gcry_openpgp_kem_decap, and
+       _gcry_cms_kem_decap.
+       * src/gcrypt.h.in: Add constants for ECC KEM.
+       * tests/t-kem.c (test_kem_raw_x25519, test_kem_dhkem_x25519)
+       (test_kem_openpgp_x25519, test_kem_cms_x25519): New.
+       (check_kem, main): Add tests for ECC KEM.
+
+2024-03-27  NIIBE Yutaka  <gniibe@fsij.org>
+           Falko Strenzke  <falko.strenzke@mtg.de>
+
+       tests:basic: Add cSHAKE test vectors.
+       + commit b81076beba280c24923b0259a8973c777204c5df
+       * tests/basic.c (check_one_md): Extend with customization N and S.
+       (check_one_md_multi): Likewise.
+       (check_digests): Add test vectors for GCRY_MD_CSHAKE128 and
+       GCRY_MD_CSHAKE256.
+
+2024-03-27  NIIBE Yutaka  <gniibe@fsij.org>
+
+       md: Add cSHAKE digest algorithm and the implementation.
+       + commit 065b3f4e0271cc410c002842ea640f93e56c6a20
+       * src/gcrypt.h.in (gcry_ctl_cmds): Add GCRYCTL_MD_CUSTOMIZE.
+       (gcry_md_algos): Add GCRY_MD_CSHAKE128 and GCRY_MD_CSHAKE256.
+       (struct gcry_cshake_customization): New.
+       * cipher/keccak.c (CSHAKE_DELIMITED_SUFFIX): New.
+       (keccak_init): Support GCRY_MD_CSHAKE128 and GCRY_MD_CSHAKE256.
+       (selftests_keccak): Likewise.
+       (cshake_input_n, cshake_input_s, _gcry_cshake_customize): New.
+       (cshake128_init, cshake256_init, cshake_hash_buffers): New.
+       (_gcry_cshake128_hash_buffers, _gcry_cshake256_hash_buffers): New.
+       (_gcry_digest_spec_cshake128, _gcry_digest_spec_cshake256): New.
+       * cipher/md.c (digest_list): Add cSHAKE md_specs.
+       (digest_list_algo301): Likewise.
+       (md_customize): New.
+       (_gcry_md_ctl): Support GCRYCTL_MD_CUSTOMIZE.
+       * src/cipher.h (_gcry_cshake_customize): New.
+       (_gcry_digest_spec_cshake128, _gcry_digest_spec_cshake256): New.
+       * src/fips.c (_gcry_fips_indicator_md): Support GCRY_MD_CSHAKE128 and
+       GCRY_MD_CSHAKE256.
+       * tests/basic.c (check_one_md): Support GCRY_MD_CSHAKE128 and
+       GCRY_MD_CSHAKE256 as xof.
+       (check_one_md_multi): Exclude GCRY_MD_CSHAKE128 and GCRY_MD_CSHAKE256
+       as xof.
+       * tests/bench-slope.c (hash_bench, kdf_bench): Exclude
+       GCRY_MD_CSHAKE128 and GCRY_MD_CSHAKE256.
+       * tests/benchmark.c (md_bench): Exclude GCRY_MD_CSHAKE128 and
+       GCRY_MD_CSHAKE256.
+
+2024-03-11  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Add gcry_md_hash_buffers_ext function.
+       + commit 3d48e6e37b040c955a84e58ce3147562fa74d1f3
+       * src/gcrypt.h.in (gcry_md_hash_buffers_ext): New.
+       * src/libgcrypt.def: Add gcry_md_hash_buffers_ext.
+       * src/libgcrypt.vers: Likewise.
+       * src/visibility.c: Add gcry_md_hash_buffers_ext.
+       * src/visibility.h: Add gcry_md_hash_buffers_ext.
+
+2024-03-05  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kem: Fix the previous commit.
+       + commit f4bcc69c3c9b0c2c948a9944cec7894cb590b211
+       * cipher/kem.c (kem_compute_keygrip): Revert the change.
+
+2024-03-04  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Fix Kyber key in SEXP, and its keygrip computation.
+       + commit 47c594386ebec3f9dea2d091c38cc7768a1840d9
+       * cipher/kem.c (kem_generate): Include public key part in SEXP of
+       private key.
+       (kem_compute_keygrip): Fix keygrip computation.
+
+       mpi: Fix ECC computation on hppa.
+       + commit b757f4130af987bdfc769b754b6e9e27882c349c
+       * mpi/ec-inline.h [__hppa] (ADD4_LIMB32, SUB4_LIMB32): New.
+       * mpi/longlong.h [__hppa] (add_ssaaaa, sub_ddmmss): Add __CLOBBER_CC.
+
+2024-03-03  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Fix the previous commit.
+       + commit 23ccf8eafbc452fa5cf466bee219846400bd2424
+       * tests/keygen.c (check_kem_keys): Those are not yet in the standard.
+
+2024-03-02  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Fix keygen for FIPS mode.
+       + commit 58f93ea56f7173a5fc3f85710dbf167cb0e5e203
+       * tests/keygen.c (check_kem_keys): Care about FIPS mode.
+
+2024-02-29  NIIBE Yutaka  <gniibe@fsij.org>
+
+       mpi: Silence warning for some architecture.
+       + commit 813a96de4ffec5d72dbb4473a42bd2b5b3f56b50
+       * mpi/mpih-const-time.c (mpih_ct_limb_greater_than): DIFF_LO is not
+       used.
+
+2024-02-22  Werner Koch  <wk@gnupg.org>
+
+       cipher: Add a way to get a keygrip for KEM algos.
+       + commit 4db7f3b07be5fc7b4c1c3b2e7548167cb1968533
+       * src/gcrypt.h.in (GCRY_PK_KEM): New.
+       * cipher/kem.c (kem_infos): New.
+       (kem_names): New.
+       (kem_generate): New.
+       (kem_compute_keygrip): New.
+       (kem_get_nbits): New.
+       (_gcry_pubkey_spec_kem): New.
+       * cipher/pubkey.c (pubkey_list): Add spec for KEM.
+       * src/cipher.h (_gcry_pubkey_spec_kem): Declare.
+       (_gcry_pubkey_spec_elg_e): Remove this unused declaration.
+
+       * tests/keygen.c (check_generated_kem_key): New.
+       (check_kem_keys): New.
+       (main): Add optional argument kem and all new function.
+
+       cipher: Slight refactoring of kem.c.
+       + commit b36aee33dd00906a98d5d1295bf471140844f737
+       * cipher/kem.c (_gcry_kem_keypair): Do not claim used variables.
+       Slight refactoring to allow for a enum-case check.
+       (_gcry_kem_encap): Ditto.
+       (_gcry_kem_decap): Ditto.
+
+       * cipher/kyber.h (crypto_kem_keypair_3) [KYBER_K]: Fix syntax error.
+
+2024-02-09  Clemens Lang  <cllang@redhat.com>
+
+       sha3: Fill OIDs and partial ASN.1 structs.
+       + commit b3750fb42a44760e65d8869bc9a64e79197ae96d
+       * cipher/keccak.c (sha3_224_asn): New.
+       (oid_spec_sha3_224, oid_spec_sha3_256): Update the OIDs.
+       (sha3_256_asn): Have a correct value.
+       (sha3_384_asn): Have a correct value.
+       (oid_spec_sha3_384, oid_spec_sha3_512): Update the OIDs.
+       (sha3_512_asn): Have a correct value.
+       (shake128_asn): Have a correct value.
+       (oid_spec_shake128, oid_spec_shake256): Update the OIDs.
+       (shake256_asn): Have a correct value.
+
+2024-02-06  Werner Koch  <wk@gnupg.org>
+
+       doc: Fix link to the s-expression description.
+       + commit 52f18b9ffe6ce38eba159c39550c10a95bce3d11
+       Note that there is also the description at
+
+        https://people.csail.mit.edu/rivest/pubs
+        /RL96.ver-1.1.html#secSexpressions
+
+2024-02-04  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       Fix Kyber segfaulting on Win64.
+       + commit 0929a9f1ede27dc6c629a92d92002da14eafa68a
+       * cipher/kyber.c (prg, pkprf): Cast variadic parameters to expected
+       types ('void *' and 'size_t').
+
+       rijndael-s390x: fix AES256-XTS feature mask.
+       + commit 679b07898897e16029dbf596dbcf0141ebb99792
+       * cipher/rijndael-s390x.c (_gcry_aes_s390x_setup_acceleration): Fix
+       AES256-XTS feature mask.
+
+2024-01-30  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:kdf: Add X963KDF for use in CMS.
+       + commit 3abac420b30ad4aeef803b23995303ac6bb563fa
+       * src/gcrypt.h.in (gcry_kdf_algos): Add GCRY_KDF_X963_KDF.
+       * cipher/kdf.c (x963_kdf_open, x963_kdf_compute): New.
+       (x963_kdf_final, x963_kdf_close): New.
+       (_gcry_kdf_open, _gcry_kdf_final, _gcry_kdf_close): Add
+       support for GCRY_KDF_X963_KDF.
+
+       Fix the version script for non-existing symbols.
+       + commit be328b4a3476d5cbe543a761c043c5923ea1e280
+       * src/libgcrypt.vers: Remove gcry_md_get and gcry_pk_register.
+
+2024-01-29  NIIBE Yutaka  <gniibe@fsij.org>
+
+       build: Use @FGREP@ by configure for libgcrypt-config.
+       + commit 128121e74b66793fabd24e478df6ea2ab568e24a
+       * configure.ac (AC_PROG_FGREP): Add.
+       * src/libgcrypt-config.in: Use @FGREP@.
+
+2024-01-16  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Fix tests/basic.
+       + commit 86812491c7541a6e76cc2f2c45538b4715121b88
+       * tests/basic.c (check_ecb_cipher): Use 'i' instead of '0'.
+
+       m4: Update acinclude.m4 to use $GREP.
+       + commit 656ca459e3d87f91dc20a2fb1001344f4c872ee3
+       * acinclude.m4: Use $GREP, instead of egrep.  Simplify the detection.
+
+2024-01-15  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Fix copyright notices for LGPL.
+       + commit 2eaaa8846f226ccd54f3165afe10583936d9d940
+       * cipher/kem.c: Fix LGPL name, with correct capitalization.
+       * cipher/kyber-common.c: Likewise.
+       * cipher/kyber-kdep.c: Likewise.
+       * cipher/kyber.c: Likewise.
+       * cipher/kyber.h: Likewise.
+       * cipher/sntrup761.c: Likewise.
+       * cipher/sntrup761.h: Likewise.
+
+       tests: Add known answers test for ML-KEM (Kyber).
+       + commit 6765916e3b55dc45d93123e789973c7528df5232
+       * tests/Makefile.am (tests_bin): Add t-mlkem.
+       (EXTRA_DIST): Add t-mlkem.inp.
+       * tests/t-mlkem.c: New.
+       * tests/t-mlkem.inp: New.
+
+       cipher:kyber: Fix comment style.
+       + commit 384638bc4a0dd4315ce2955d08f51f007ba88df9
+       * cipher/kyber-common.c: Fix comments.
+       * cipher/kyber-kdep.c: Likewise.
+       * cipher/kyber.c: Likewise.
+
+       cipher: Add use of the Kyber implementation.
+       + commit f2f9d56358acf3b38764bc4465db627162f42bbf
+       * cipher/Makefile.am (EXTRA_DIST): Add kyber-common.c kyber-kdep.c.
+       (libcipher_la_SOURCES): Add kyber.c and kyber.h.
+       * cipher/kem.c (_gcry_kem_keypair): Add call to kyber_keypair.
+       (_gcry_kem_encap): Add call to kyber_encap.
+       (_gcry_kem_decap): Add call to kyber_decap.
+       * tests/t-common.h (show_note): Factor from existing uses.
+       (show_sexp, prepend_srcdir, read_textline, copy_data): Likewise.
+       (hex2buffer, reverse_buffer): Likewise.
+       * tests/t-kem.c (show_note): Remove.
+       (test_kem_sntrup761): Rename from test_kem.
+       (test_kem_mlkem512, test_kem_mlkem768, test_kem_mlkem1024): New.
+       (check_kem): Add N_LOOPS argument.  Call relevant tests by the
+       variable SELECTED_ALGO.
+       (main): Add option handling to select testing specific algorithm.
+
+       cipher:kyber: Modify VERIFY function.
+       + commit 5c9c4dca39f2806443aaec60972e648715d3b048
+       * cipher/kyber-kdep.c (crypto_kem_dec): Use verify1.
+       * cipher/kyber.c (verify1): Return 1 when success.
+
+       cipher:kyber: Modification for xof_ routines.
+       + commit fe3ecf810370086df40f4cc980d0a290e5f3a9a5
+       * cipher/kyber-kdep.c (gen_matrix): Remove comment for
+       gen_matrix.  Add calls to xof_init and xof_close.
+
+       cipher:kyber: Integrate into libgcrypt, adding glue code.
+       + commit 4e9aa70078c5537f1c56f54e78d769c8b5dc9f0c
+       * cipher/kyber-common.c: Add copyright notice.
+       * cipher/kyber-kdep.c: Likewise.
+       * cipher/kyber.c: Likewise.
+       * cipher/kyber-kdep.c: Add #undef to clear the definitions.
+       * cipher/kyber.c: Integrate into libgcrypt, also allow standalone use.
+
+       cipher:kyber: Have cipher/kyber.h.
+       + commit 31743c17d7bd1f6dc653e6dfcf76fab5356649ae
+       * cipher/kyber.h: New.
+
+       cipher:kyber: Move declarations to kyber-kdep.c.
+       + commit b6e20ed580e08278eb963a87f6355c43ba0d34d2
+       * cipher/kyber.c: Move KYBER_K-dependent declarations into...
+       * cipher/kyber-kdep.c: ... here, or kyber.h.
+
+       cipher:kyber: Functions in poly for different KYBER_K.
+       + commit efd55d9ecae549318425dc7a867c9698cbc78410
+       * cipher/kyber-common.c (load24_littleendian, cbd3): Fix the
+       compile-time condition.
+       (poly_cbd_eta1, poly_cbd_eta2): Remove.
+       (zetas): Don't export.
+       (poly_compress): Remove.
+       (poly_compress_128, poly_compress_160): New.
+       (poly_decompress): Remove.
+       (poly_decompress_128, poly_decompress_160): New.
+       (poly_getnoise_eta1): Remove.
+       (poly_getnoise_eta1_2): New.  Directly call cbd3.
+       (poly_getnoise_eta1_3_4): New.  Directly call cbd2.
+       (poly_getnoise_eta2):  Directly call cbd2.
+       * cipher/kyber.c: Fix declarations for poly_compress,
+       poly_decompress and poly_getnoise_eta1.
+
+       cipher:kyber: Make the implementation into three files.
+       + commit 01a14c0df2957900903895a3c49de2bdb708816f
+       * cipher/kyber-common.c: New.  Common part.
+       * cipher/kyber-kdep.c: New. KYBER_K dependent part.
+       * cipher/kyber.c: Move functions and variables to two files.
+
+       cipher:kyber: Constants common and k-dependent.
+       + commit fc35a5372a74c696f45b10dccbb84331673412b4
+       * cipher/kyber.c (KYBER_ETA1): Remove.
+       (KYBER_ETA1_2, KYBER_ETA1_3_4): New.
+       (KYBER_POLYCOMPRESSEDBYTES): Remove.
+       (KYBER_POLYCOMPRESSEDBYTES_2_3): New.
+       (KYBER_POLYCOMPRESSEDBYTES_4): New.
+
+       cipher:kyber: Export the KEM API only.
+       + commit 34dd0a1a75b2ee37ccd1215235f8b9f00f38bff1
+       * cipher/kyber.c: Don't export other functions and data.
+
+       cipher: Editorial clean up cipher/kyber.c for headers.
+       + commit 4b601fe5b3cff021e265df6eb64911e6a8105bd3
+       * cipher/kyber.c: Clean up.
+
+       cipher: Add headers to Kyber implementation.
+       + commit 10e9bcd5c67ed40292b06901b38fa9b94ccc09ba
+       * cipher/kyber.c: Add headers from the reference implementation.
+
+       cipher: Put the original Kyber implementation.
+       + commit 18e5c0d268b1aeac59f526b9730b39520750ca14
+       * cipher/kyber.c: Kyber reference implementation.
+
+       cipher: Allow standalone use of SNTRUP761 implementation.
+       + commit bdadd65d440b7f689f49450530a548c32007f71b
+       * cipher/sntrup761.c [HAVE_CONFIG_H]: Conditionalize.
+
+2024-01-08  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Avoid use of C99 feature for enum.
+       + commit dbfb7cc76ef3f98fe62f0549b97d76f395864ae0
+       * src/gcrypt.h.in (enum gcry_kdf_algos): Remove last comma.
+       (enum gcry_kem_algos): Likewise.
+
+2023-12-21  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       bench-slope: restore compiler barriers for auto-GHZ check.
+       + commit c9cb10f3be226dfd65c670ff2565d81dfc668376
+       * tests/bench-slope.c (auto_ghz_bench): Add memory barriers to
+       AUTO_GHZ_OPERATION macro when barrier is supported by compiler.
+
+       mpi/ec-inline: refactor i386 assembly to reduce register usage.
+       + commit 956f1ed4ec6ead59dc56f574f943f1fe25dac723
+       * mpi/ec-inline.h [__i386__] (ADD2_LIMB32_CARRY_OUT)
+       (ADD2_LIMB32_CARRY_IN_OUT, ADD2_LIB32_CARRY_IN, SUB2_LIMB32_CARRY_OUT)
+       (SUB2_LIMB32_CARRY_IN_OUT, SUB2_LIB32_CARRY_IN, ADD8_LIMB32)
+       (ADD10_LIMB32, ADD14_LIMB32, SUB8_LIMB32, SUB10_LIMB32)
+       (SUB14_LIMB32): New.
+       [__i386__] (ADD4_LIMB32, ADD6_LIMB32, SUB4_LIMB32, SUB6_LIMB32): Rewrite
+       to use new *_CARRY_* macros.
+       [BYTES_PER_MPI_LIMB == 4] (ADD4_LIMB64): Use ADD8_LIMB32 if available.
+       [BYTES_PER_MPI_LIMB == 4] (ADD5_LIMB64): Use ADD10_LIMB32 if available.
+       [BYTES_PER_MPI_LIMB == 4] (ADD7_LIMB64): Use ADD14_LIMB32 if available.
+       [BYTES_PER_MPI_LIMB == 4] (SUB4_LIMB64): Use SUB8_LIMB32 if available.
+       [BYTES_PER_MPI_LIMB == 4] (SUB5_LIMB64): Use SUB10_LIMB32 if available.
+       [BYTES_PER_MPI_LIMB == 4] (SUB7_LIMB64): Use SUB14_LIMB32 if available.
+
+2023-12-16  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       mpi/ec-nist: fix for -Og build failure on i386.
+       + commit 90097bd2f41c217dc5c666570e5680f432cf92d3
+       * mpi/ec-nist.c (_gcry_mpi_ec_nist256_mod)
+       (_gcry_mpi_ec_nist384_mod): Load p_mult constant with carry offset
+       to stack.
+
+2023-12-12  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Update digest values following input changes.
+       + commit 4a104752d8a2f0ca15d7873011a14226925a773b
+       * tests/basic.c (check_digests): Update the values.
+
+2023-12-12  Andreas Metzler  <ametzler@bebt.de>
+
+       Point to gnu.org as canonical license location.
+       + commit cd056b4d1614e9e245b66782b54aad1697e22a01
+
+
+       Fix license header inconsistency.
+       + commit 25e93a02268b9f533b1ffa4f6a5fe47d686e2145
+
+
+       Unify capitalization of LGPL copyright statements.
+       + commit ca5689367a837b6541307f6ae19e0176b051d06f
+
+
+2023-12-08  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Fix sntrup761.h, so that it can be used outside.
+       + commit ebc9aaacd62bbc7f048020531398ccb9f97e4437
+       * cipher/sntrup761.h [_GCRYPT_IN_LIBGCRYPT]: Ifdef-out
+       libgcrypt specific glue code.  Recover the constants.
+
+       cipher: Minor fix for C90 compiler.
+       + commit 2149888d47afea9946c3a748310ad27dfcda48c3
+       * cipher/sntrup761.c (crypto_sort_int32): Declare A, before its use.
+
+2023-12-07  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Add sntrup761 to KEM API.
+       + commit 947ad42450eadec079a1c50deba90e6453f0113c
+       * cipher/kem.c (sntrup761_random): New glue code.
+       ( _gcry_kem_keypair, _gcry_kem_encap, _gcry_kem_decap): Call sntrup761
+       functions.
+       * cipher/sntrup761.h: Add glue code to libgcrypt.
+       * src/visibility.h: Update for KEM functions.
+       * tests/t-kem.c: Fix for the final KEM API.
+
+2023-12-07  Simon Josefsson  <simon@josefsson.org>
+
+       cipher: Add Streamlined NTRU Prime sntrup761.
+       + commit cf9923e1a59f2f535311f3676345d34e593ba108
+       * cipher/Makefile.am (libcipher_la_SOURCES): Add sntrup761.c and h.
+       * cipher/sntrup761.c: New.
+       * tests/Makefile.am (tests_bin): Add t-kem.
+       * tests/t-kem.c: New.
+
+2023-12-07  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Add an API for Key Encapsulation Mechanism.
+       + commit 7e503fa9170fd290105714d15a335f7748860e23
+       * cipher/Makefile.am (libcipher_la_SOURCES): Add kem.c.
+       * cipher/kem.c: New.
+       * src/gcrypt-int.h (_gcry_kem_keypair, _gcry_kem_encap)
+       (_gcry_kem_decap): New.
+       * src/gcrypt.h.in (gcry_kem_keypair, gcry_kem_encap)
+       (gcry_kem_decap): New.  Add constants.
+       * src/libgcrypt.def (gcry_kem_keypair, gcry_kem_encap)
+       (gcry_kem_decap): Add symbols.
+       * src/libgcrypt.vers: Likewise.
+
+2023-11-22  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Fix t-x448 for error handling.
+       + commit dc1c916da4ba3960495bca450d8f4bc6897d9167
+       * tests/t-x448.c (test_cv_x448): Take the error code from ERR.
+
+2023-11-16  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Fix return type of _gcry_ecc_mul_point.
+       + commit 354e53558e55fe5bc8fa4be32e1c1bace7623536
+       * cipher/ecc-ecdh.c (_gcry_ecc_mul_point): Return gpg_err_code_t.
+       * src/gcrypt-int.h (_gcry_ecc_mul_point): Return gpg_err_code_t.
+       * src/visibility.c (gcry_ecc_mul_point): Follow the change.
+
+2023-11-04  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
 
        mpih_mod: avoid unintentional conditional branch.
-       + commit 11973c2219da0f732338cf080a4edeb6a89c313e
+       + commit 39d5364a9557d6f423de117601cb1e6414814f47
        * mpi/mpih-const-time.c (_gcry_mpih_mod): Avoid conditional branch
        on the_bit extraction.
 
        mpih-const-time: use constant-time comparisons conditional add/sub/abs.
-       + commit 74588de441fd98f6dd99132e67e0c2632cb2a6bd
+       + commit c419a04d529af7b5fb43732ec2b4304166c2579a
        * mpi/mpih-const-time.c (mpih_ct_limb_greater_than)
        (mpih_ct_limb_less_than): New.
        (_gcry_mpih_add_n_cond, _gcry_mpih_sub_n_cond, _gcry_mpih_abs_cond): Use
        mpih_ct_limb_greater_than and mpih_ct_limb_less_than for comparisons.
 
        const-time: add functions for generating masks from 0/1 input.
-       + commit 01e7052cb245619280769f683d697d6b2f68e041
+       + commit cf757cf90e9ae966b95dcebfd2f31b9212697f0c
        * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod)
-       (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod): Use mask
-       generating functions.
+       (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod): Use mask generating
+       functions.
        * mpi/mpi-internal.h (ct_limb_gen_mask, ct_limb_gen_inv_mask): New.
        * mpi/mpih-const-time.c (_gcry_mpih_set_cond, _gcry_mpih_add_n_cond)
-       (_gcry_mpih_sub_n_cond, _gcry_mpih_sub_n_cond, _gcry_mpih_swap_cond):
-       Use mask generating functions.
+       (_gcry_mpih_sub_n_cond, _gcry_mpih_sub_n_cond, _gcry_mpih_swap_cond): Use
+       mask generating functions.
        * mpi/mpiutil.c (_gcry_mpi_set_cond, _gcry_mpi_swap_cond): Use mask
        generating functions.
        * src/const-time.h (DEFINE_CT_TYPE_GEN_MASK, ct_uintptr_gen_mask)
-       (ct_ulong_gen_mask, DEFINE_CT_TYPE_GEN_INV_MASK)
-       (ct_uintptr_gen_inv_mask, ct_ulong_gen_inv_mask): New.
+       (ct_ulong_gen_mask, DEFINE_CT_TYPE_GEN_INV_MASK, ct_uintptr_gen_inv_mask)
+       (ct_ulong_gen_inv_mask): New.
        (DEFINE_CT_TYPE_SELECT_FUNC): Use mask generating functions.
-       * src/const-time.c (_gcry_ct_memmov_cond): Use mask generating
-       functions.
+       * src/const-time.c (_gcry_ct_memmov_cond): Use mask generating functions.
 
        ec: avoid unintentional condition branches for 25519, 448 and 256k1.
-       + commit 237523b49f423be66261fa769e4558a36b84d15f
+       + commit 305a65c1ede8f78160100478d46efa199d334a64
        * mpi/ec.c (ec_addm_25519, ec_subm_25519, ec_mulm_25519, ec_addm_448)
        (ec_subm_448, ec_mulm_448, ec_secp256k1_mod): Use mpih_limb_is_zero
        and mpih_limb_is_not_zero instead of comparison to zero.
 
        ec-nist: avoid unintentional conditional branch by comparison.
-       + commit 2ed34074474650a50592fa86d9639614aa86476b
+       + commit a9e7aa647e4b84964c76230370d71235383e5c2d
        * mpi/ec-nist.c (_gcry_mpi_ec_nist521_mod): Use mpih_limb_is_not_zero.
        * mpi/mpi-internal.h (mpih_limb_is_not_zero): New.
 
        mpih_cmp_ui: avoid unintentional conditional branch.
-       + commit 9acddd8b95e14fb7c82c96881c1ac9000a46d703
+       + commit aab6a42d5f44724b73a02598546a5e7d8b33298e
        * mpi/mpi-internal.h (mpih_limb_is_zero): New.
        * mpi/mpih-const-time.c (_gcry_mpih_cmp_ui): Use mpih_limb_is_zero
        instead of comparison.
 
        ec-nist: use global vone and vzero.
-       + commit 610667fb2558ac5aae1393b99ceec3e50199e901
+       + commit 5c5ba1ec2b505726ee1311339ac9e8b5c62cac4a
        * mpi/ec-nist.c (vzero, vone): Remove.
        (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod)
        (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod): Use _gcry_ct_vzero
        and _gcry_ct_vone.
 
        mpiutil: use global vone and vzero.
-       + commit 6377725ed01e090bea1223256f58eae0bfcc2021
+       + commit d4aee9ace9a904446b987dddc2999119c4d62dae
        * mpi/mpiutil.c (_gcry_mpi_set_cond, _gcry_mpi_swap_cond): Use
        _gcry_ct_vzero and _gcry_ct_vone.
 
        mpih-const-time: use global vzero/vone variable.
-       + commit 15cd08ae4c1e7fccda89a42bb3e87f15dab4452f
+       + commit 179df341162c74da312f76363a0ff1f2f303aa78
        * mpi/mpih-const-time.c (vzero, vone): Remove.
        (_gcry_mpih_set_cond, _gcry_mpih_add_n_cond, _gcry_mpih_sub_n_cond)
        (_gcry_mpih_swap_cond, _gcry_mpih_abs_cond): Use _gcry_ct_vzero and
        _gcry_ct_vone.
 
-2023-11-06  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
-
        const-time: ct_memmov_cond: switch to use dual mask approach.
-       + commit 9c0984ed2c553289a744197791a6683f4bd2ce18
+       + commit 4d3e0e30b98b2acb90acb2792b8327c26824a66f
        * src/const-time.c (_gcry_ct_memmov_cond): Use dual mask + AND/OR
        instead of single mask + XOR.
 
        const-time: prefix global symbols with _gcry_
-       + commit 3fa1b81c92e5694ef0f2d01885eccbbc9944d267
+       + commit 22dde5150ee2be01651410ed9756601ba6a29c93
        * cipher/const-time.c (ct_not_memequal, ct_memequal)
        (ct_memmov_cond): Rename these to ...
        (_gcry_ct_not_memequal, _gcry_ct_memequal)
        (ct_not_memequal, ct_memequal, ct_memmov_cond): New macros.
 
        mpih_set_cond: restore EM leakage mitigation.
-       + commit 7f0eb519897b05e41fe43b0981453181266d457c
+       + commit 0c6ec6bbe788b8c4a6982b2128d442b51323c898
        * mpi/mpih-const-time.c (_gcry_mpih_set_cond): Replace single mask + XOR
        with dual mask + AND/OR; Add comment about reason for dual mask usage.
        (_gcry_mpih_add_n_cond, _gcry_mpih_sub_n_cond, _gcry_mpih_swap_cond)
        (_gcry_mpih_abs_cond): Add comment about reason for dual mask usage.
 
        rsa, elgamal: avoid logical not operator in constant-time code.
-       + commit 3583e2ebcad55bde178acc7a862cda30d1f2cd97
+       + commit 84f934c09afac18b3f4351646c0fe6f93aede277
        * cipher/elgamal.c (elg_decrypt): Replace ! operator with calls to
        ct_is_not_zero/ct_is_zero/ct_ulong_select.
        * cipher/rsa-common.c (_gcry_rsa_pkcs1_decode_for_enc): Replace !
        (sexp_null_cond): Use ct_uintptr_select.
 
        const-time: always avoid comparison operator for byte comparison.
-       + commit 5e9ba851948f97cd3fb70de474b87609b150d06a
+       + commit 137e35ad47ee8734d0f3ffb6af1d1669c4621e0b
        * configure.ac: Remove POSSIBLE_CONDITIONAL_BRANCH_IN_BYTE_COMPARISON
        macro.
        * src/const-time.h (ct_not_equal_byte): Remove
        POSSIBLE_CONDITIONAL_BRANCH_IN_BYTE_COMPARISON ifdef.
 
        Use single constant-time memory comparison implementation.
-       + commit 892bc25ff74b1fb84259babd4292da399dc3b185
+       + commit 1e9ddbd65c4627235611d75c3198c4ec197c9a05
        * src/const-time.c (ct_not_memequal): Use original 'buf_eq_const'
        implementation here.
        (ct_memequal): New.
        * cipher/bufhelp.h (buf_eq_const): Call to 'ct_memequal'.
 
-2023-11-06  NIIBE Yutaka  <gniibe@fsij.org>
+2023-11-01  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       bench-slope: update auto-GHZ for alderlake-P.
+       + commit a047a9c7d10824593d5c9ae43d8a3d6319ef3c0b
+       * tests/bench-slope.c (vone): New.
+       (auto_ghz_bench): Remove memory barrier usage; Generate constant
+       values from volatile variable.
+
+2023-10-31  NIIBE Yutaka  <gniibe@fsij.org>
 
        cipher: Fix ElGamal decryption.
-       + commit 2839aaee3e6eed6a96f0a798b02eef7e0302c7e7
+       + commit 6d1d50ba3aad1850975f717adbedb4cb8b236fa7
        * cipher/elgamal.c (elg_decrypt): Call sexp_build always.
        * cipher/rsa.c (rsa_decrypt): Return an error code of sexp_build
        when RC != 0.
 
        rsa: Use memmov_independently when unpadding.
-       + commit c98b5e4a147170d578504498ba355a77de00b0f1
+       + commit 58b62be844549ad3d57c507d834027f1e2756567
        * cipher/rsa-common.c (memmov_independently): New.
        (_gcry_rsa_pkcs1_decode_for_enc): Use memmov_independently.
        (_gcry_rsa_oaep_decode): Use memmov_independently.
 
        const-time: Add ct_memmov_cond, fix _gcry_mpih_set_cond.
-       + commit 45945be8f3c3a77ce3e86ebda7d814defcd3f76b
+       + commit bd08357436a9559766cd458d25781ee4f94012a2
        * src/const-time.c (ct_memmov_cond): New.
        * src/const-time.h (ct_memmov_cond): New.
        * mpi/mpih-const-time.c (_gcry_mpih_set_cond): Use XOR and a MASK.
 
+2023-10-30  NIIBE Yutaka  <gniibe@fsij.org>
+
        const-time: Use ct_not_memequal, instead.  Tested with AVR.
-       + commit e8072d8d32558c289ad4cf7b5be26a25eda03c20
+       + commit c31b70b2660c3d24bd54ee08c255c36d867fdea7
        * cipher/rsa-common.c (_gcry_rsa_oaep_decode): Use ct_not_memequal.
        * src/const-time.c (ct_not_memequal): Use ct_not_equal_byte.
        * src/const-time.h (ct_not_memequal): Rename from ct_memequal.
 
        build: Check if arch is VAX or compiler is MSVC.
-       + commit fee1e63c7286cb12ff973ea446bc019f575887d2
+       + commit c848459e512615c1865a23cf24debb3ad4a1e85b
        * configure.ac (AH_BOTTOM): Add check for VAX and MSVC.
        * src/const-time.h (POSSIBLE_CONDITIONAL_BRANCH_IN_BYTE_COMPARISON):
        Rename.
 
+2023-10-27  NIIBE Yutaka  <gniibe@fsij.org>
+
        rsa: Fix decoding of PKCS#1 v1.5 and OAEP padding.
-       + commit 45c9920201687754719ddc15a88a25018a552695
+       + commit 34c20427926010d6fa95b1666e4b1b60f60a8742
        * src/Makefile.am (libgcrypt_la_SOURCES): Add const-time.h and
        const-time.c.
        * src/const-time.h (ct_not_equal_byte, sexp_null_cond): New.
 
 2023-10-06  NIIBE Yutaka  <gniibe@fsij.org>
 
+       sexp: Minor clean-up of sexp output handling.
+       + commit 5e5dff0551fcd9a826db18188fa1e4a6ca45099a
+       * src/sexp.c (suitable_encoding): Remove check for starting zero.
+
+2023-10-05  NIIBE Yutaka  <gniibe@fsij.org>
+
+       doc: Minor style fixes.
+       + commit edddc5738e6b1652ccc5db6861888de66112427f
+
+
        sexp: String with \0 is considered "binary".
-       + commit 49e1e67f4e4e9f520586dc4ea8a8f2630bbf6e9c
+       + commit ddd41eb6ace02626b0bf7704fdec9b765fb717c4
        * src/sexp.c (suitable_encoding): It's "binary" when
        the buffer contains '\0'.
 
+2023-09-15  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       blake2-avx512: merge some of the gather loads.
+       + commit 325786acd445f9e74e4c44ba86c5b4e06788ea46
+       * cipher/blake2b-amd64-avx512.S (GATHER_MSG_2, GATHER_MSG_3)
+       (GATHER_MSG_5, GATHER_MSG_6, GATHER_MSG_8, GATHER_MSG_9): New.
+       (LOAD_MSG_2, LOAD_MSG_3, LOAD_MSG_5, LOAD_MSG_6, LOAD_MSG_8)
+       (LOAD_MSG_9): Use GATHER_MSG_<number>.
+       (_blake2b_avx512_data): Add merged load masks ".L[4-7]_mask".
+       (_gcry_blake2b_transform_amd64_avx512): Load merged load masks
+       to %k[4-7] and clear registers on exit.
+       * cipher/blake2s-amd64-avx512.S (VPINSRD_KMASK, GATHER_MSG_2)
+       (GATHER_MSG_3, GATHER_MSG_5, GATHER_MSG_6, GATHER_MSG_8)
+       (GATHER_MSG_9): New.
+       (LOAD_MSG_2, LOAD_MSG_3, LOAD_MSG_5, LOAD_MSG_6, LOAD_MSG_8)
+       (LOAD_MSG_9): Use GATHER_MSG_<number>.
+       (_blake2s_avx512_data): Add merged load masks ".L[4-7]_mask".
+       (_gcry_blake2s_transform_amd64_avx512): Load merged load masks
+       to %k[4-7] and clear registers on exit.
+
 2023-09-01  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: Change the default for --with-libtool-modification.
-       + commit 09ab619488455b0f9f27f2c8291ea646b089c13e
+       + commit 36d014f919d1c5f00dde4509da9b5e02895467c9
        * configure.ac (--with-libtool-modification): default=never.
 
-2023-08-22  NIIBE Yutaka  <gniibe@fsij.org>
+2023-08-20  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       blake2b-avx512: replace VPGATHER with manual gather.
+       + commit 59f14c1db37e16aba37af185dd0677f9783536ce
+       * cipher/blake2.c (blake2b_init_ctx): Remove HWF_INTEL_FAST_VPGATHER
+       check for AVX512 implementation.
+       * cipher/blake2b-amd64-avx512.S (R16, VPINSRQ_KMASK, .Lshuf_ror16)
+       (.Lk1_mask): New.
+       (GEN_GMASK, RESET_KMASKS, .Lgmask*): Remove.
+       (GATHER_MSG): Use manual gather instead of VPGATHER.
+       (ROR_16): Use vpshufb for small speed improvement on tigerlake.
+       (_gcry_blake2b_transform_amd64_avx512): New setup & clean-up for
+       kmask registers; Reduce excess loop aligned from 64B to 16B.
+
+       twofish-avx2-amd64: replace VPGATHER with manual gather.
+       + commit ded3a1ec2ec6980750e3e9eabde001cdbebece51
+       * cipher/twofish-avx2-amd64.S (do_gather): New.
+       (g16): Switch to use 'do_gather' instead of VPGATHER instruction.
+       (__twofish_enc_blk16, __twofish_dec_blk16): Prepare stack
+       for 'do_gather'.
+       * cipher/twofish.c (twofish) [USE_AVX2]: Remove now unneeded
+       HWF_INTEL_FAST_VPGATHER check.
+
+       Avoid VPGATHER usage for most of Intel CPUs.
+       + commit f2bf9997d46590e688bad213267b8fb466e95ecd
+       * cipher/blake2.c (blake2b_init_ctx): Check for fast VPGATHER
+       for AVX512 implementation.
+       * src/hwf-x86.c (detect_x86_gnuc): Do not enable
+       HWF_INTEL_FAST_VPGATHER for Intel CPUs suffering from
+       "Downfall" vulnerability.
+
+2023-08-16  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: New configure option --with-libtool-modification.
-       + commit 0ddc823e331c2a38b71f887abc917d8a5a9003af
+       + commit 2143503b8f56a4e6909dc8b4f86e20c8ad76aaed
        * Makefile.am (EXTRA_DIST): Add build-aux/libtool-patch.sed.
        * build-aux/libtool-patch.sed: New.
        * configure.ac (--with-libtool-modification): New.
+       * build-aux/ltmain.sh: Revert our own local modification.
+
+2023-07-26  NIIBE Yutaka  <gniibe@fsij.org>
+
+       doc: yat2m-stamp should depend on version.texi.
+       + commit f019c98fd418596074ffd2cc755be6c483aac932
+       * doc/Makefile.am (yat2m-stamp): Depend on version.texi.
+
+2023-07-17  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       hwf-x86: use CFI statements for 'is_cpuid_available'
+       + commit a5f88f30ce612b0239b946c4424d81cf0d552e96
+       * src/hwf-x86.c (FORCE_FUNC_FRAME_POINTER): Remove.
+       (CFI_ADJUST_CFA_OFFSET, CFI_PUSH4, CFI_POP4): New.
+       (is_cpuid_available): Use CFI statements instead of frame-pointer
+       attribute.
+
+       configure: fix HAVE_GCC_ASM_CFI_DIRECTIVES check.
+       + commit 35829d38d61fa3130d88eaeea1af2591a3f07208
+       * cipher/camellia-aesni-avx2-amd64.h (enc_blk1_32): Fix dual
+       CFI_ENDPROC() usage.
+       * configure.ac (gcry_cv_gcc_asm_cfi_directives): Add missing ';'
+       after 'void asmfunc(void)'.
+       * mpi/asm-common-i386.h: New.
+       * mpi/i386/syntax.h: Remove CFI macros and instead include
+       "asm-common-i386.h".
+
+       Add VAES/AVX2 accelerated i386 implementation for AES.
+       + commit 4a42a042bcf6b25f13957207c2d13d420f450bb8
+       * cipher/Makefile.am: Add 'rijndael-vaes-i386.c' and
+       'rijndael-vaes-avx2-i386.S'.
+       * cipher/asm-common-i386.h: New.
+       * cipher/rijndael-internal.h (USE_VAES_I386): New.
+       * cipher/rijndael-vaes-avx2-i386.S: New.
+       * cipher/rijndael-vaes-i386.c: New.
+       * cipher/rijndael-vaes.c: Update header description (add 'AMD64').
+       * cipher/rijndael.c [USE_VAES]: Add 'USE_VAES_I386' to ifdef around
+       '_gcry_aes_vaes_*' function prototypes.
+       (setkey) [USE_VAES_I386]: Add setup of VAES/AVX2/i386 bulk functions.
+       * configure.ac: Add 'rijndael-vaes-i386.lo' and
+       'rijndael-vaes-avx2-i386.lo'.
+       (gcry_cv_gcc_amd64_platform_as_ok): Rename this to ...
+       (gcry_cv_gcc_x86_platform_as_ok): ... this and change to check for
+       both AMD64 and i386 assembler compatibility.
+       (gcry_cv_gcc_win32_platform_as_ok): New.
+
+       rijndael-vaes-avx2-amd64: avoid extra load in CFB & CBC IV handling.
+       + commit 13f288edd5274880cf6833c80eba24183440a66d
+       * cipher/rijndael-vaes-avx2-amd64.S
+       (_gcry_vaes_avx2_cbc_dec_amd64, _gcry_vaes_avx2_cfb_dec_amd64): Avoid
+       duplicate memory load from source buffer.
+
+       rijndael-vaes-avx2-amd64: acceleration for OCB auth.
+       + commit 6b47e85d65158f3b6b1c3c24476249bfe3a7f943
+       * cipher/rijndael-vaes-avx2-amd64.S
+       (_gcry_vaes_avx2_ocb_crypt_amd64): Add authentication mode support.
+       * cipher/rijndael-vaes.c (_gcry_vaes_avx2_ocb_crypt_amd64): Change
+       to return 'size_t' value.
+       (_gcry_aes_vaes_ocb_auth): New.
+       * cipher/rijndael.c (_gcry_aes_vaes_ocb_auth): New.
+       (do_setkey) [USE_VAES]: Add setup for 'bulk_ops->ocb_auth'.
 
 2023-07-14  Bernhard Reiter  <bernhard@intevation.de>
 
        build: Fix the notice in configure.ac.
-       + commit c8ee15dfe8696c58ba493b118f6eff1c095e02ae
+       + commit 77ce5866e06cb5ebd82a0ce34206395aa03b9352
        * configure.ac: Fix typo.
 
-2023-06-19  NIIBE Yutaka  <gniibe@fsij.org>
+2023-07-12  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Use unsigned int for a bit field.
+       + commit e76e88eef7811ada4c6e1d57520ba8c439139782
+       * random/random-drbg.c (struct drbg_state_ops_s): Use unsigned.
+       (struct drbg_state_s): Likewise.
+       * src/hmac256.c (struct hmac256_context): Likewise.
+
+2023-06-28  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:ecc:fips: Only allow defined digest algo for EdDSA.
+       + commit d15fe6aac10b0ffc2eb8974c23095d8123025e5c
+       * cipher/ecc.c (ecc_sign): Add the check if it's proper digest
+       algo for EdDSA on FIPS mode.
+
+       cipher:ecc:fips: Reject use of SHAKE when it's ECDSA with RFC6979.
+       + commit f65c30d470f581e4df91a5aff8bb202ff0fd56ad
+       * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign): Check if it's SHAKE.
+
+2023-06-25  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       keccak: add md_read support for SHAKE algorithms.
+       + commit 794695ff45d64f6586d7f08e74c075c098ac3b5d
+       * cipher/hash-common.c (_gcry_hash_selftest_check_one): Adjust
+       for SHAKE algorithms now returning non-zero for digest length.
+       * cipher/keccak.c (KECCAK_CONTEXT_S): Add 'shake_in_extract_mode'
+       and 'shake_in_read_mode' flags.
+       (keccak_init): Initialize new context fields; set output length
+       for SHAKE algorithms.
+       (keccak_extract): Rename to ...
+       (do_keccak_extract): this and add return value.
+       (keccak_extract): New with 'shake_in_???_mode' checks & setup.
+       (keccak_shake_read): New.
+       (_gcry_sha3_hash_buffers): Adjust for 'spec->mdlen' not being
+       zero for SHAKE algorithms, instead check 'suffix' for type.
+       (_gcry_digest_spec_shake128): Set mdlen to 32 bytes; Set read
+       function.
+       (_gcry_digest_spec_shake256): Set mdlen to 64 bytes; Set read
+       function.
+       * cipher/md.c (md_extract): Pass return value from algo extract
+       function.
+       (_gcry_md_hash_buffers_extract): Adjust for 'spec->mdlen' not
+       being zero for SHAKE algorithms.
+       * src/cipher-proto.h (gcry_md_extract_t): Change return type
+       from 'void' to 'gpg_err_code_t'.
+       * tests/basic.c (check_one_md, check_one_md_multi): Adjust
+       for 'gcry_md_get_algo_dlen()' not being zero for SHAKE
+       algorithms.
+       (check_digests): Add md_read interface test-vectors for SHAKE128
+       and SHAKE256.
+
+2023-06-23  NIIBE Yutaka  <gniibe@fsij.org>
 
-       cipher:pubkey: Check digest size which should not be zero.
-       + commit 8cdd0d353e19a4514dfe3c99146d17f07bf0fb4d
-       * cipher/pubkey.c (gcry_pk_sign_md): Check the digest size
-       before calling _gcry_md_read.
+       cipher:rsa: Add support of SHAKE as MGF.
+       + commit 8802faadab79ec84714bbaf28eaa882860779cba
+       * cipher/rsa-common.c (_gcry_rsa_pss_encode): Support SHAKE.
+       (_gcry_rsa_pss_verify): Likewise.
 
 2023-06-16  NIIBE Yutaka  <gniibe@fsij.org>
 
        tests: Allow KDF measurement in FIPS mode.
-       + commit 5547e5255c465e99ccd3ffce2b7622e75f2d89a4
+       + commit 70b1b036f3ee06c5894e996cbd742cf3c174ca79
        * tests/bench-slope.c (bench_kdf_init): Tweak the iterations in FIPS
        mode.
        (bench_kdf_do_bench): Use larger values to avoid rejection in FIPS
        mode.
 
        cipher:kdf: Move FIPS mode check to _gcry_kdf_derive.
-       + commit 2c8562ca5a49edc0c82c8c4775edebabc31cf09e
+       + commit f4bff832c7f5a54879c1858d7b426cd12088c57c
        * cipher/kdf.c (_gcry_kdf_pkdf2): Move the checks to...
        (_gcry_kdf_derive): ... here.
 
        Remove out of core handler setting message in FIPS mode.
-       + commit d37ad2823f849ce71b509245ecc169835cf80d1e
+       + commit 6c79dcddd151b6b01a760f7aab54e6882ea5a475
        * src/global.c (_gcry_set_outofcore_handler): Don't call log_info.
 
-       cipher:ecc: Fix an error-path to release the KEY correctly.
-       + commit 31adc78fa503be388af430e9b218a83fb4b1ea7f
-       * cipher/ecc.c (ecc_generate): Set *R_SKEY to NULL.
+       cipher:ecc: Implement PCT for EdDSA.
+       + commit 3ac2bba4a4b11388949e235d0d9555478468a379
+       * cipher/ecc.c (test_keys_eddsa_fips): New.
+       (ecc_generate): Use test_keys_eddsa_fips for EdDSA.
+
+       build: Detect broken GCC for x86/AVX512 intrinsics.
+       + commit 97f4a94d5960bb53b690bbd5cdf87b64311e21cc
+       * configure.ac (HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS): Check
+       the GCC bug 90980.
+
+       cipher:ecc: Add selftests for EdDSA.
+       + commit 547dfb5aecc1ae057e0bc599e8565f8c3fb84894
+       * cipher/ecc.c (selftest_hash_sign_eddsa): New.
+       (selftests_ecc): Add IS_EDDSA argument.
+       (run_selftests): Add tests for Ed25519 and Ed448.
+       * src/g10lib.h (_gcry_hex2buffer): New.
+       * src/sexp.c (_gcry_hex2buffer): Add.
+
+2023-06-16  Jakub Jelen  <jjelen@redhat.com>
+
+       tests: EdDSA keys work in FIPS mode.
+       + commit 73d2f5d93541747befe9a791991553d691f3a1ae
+       * tests/bench-slope.c (ecc_algo_fips_allowed): Adjust list of FIPS
+         allowed algorithms.
+       * tests/benchmark.c (ecc_bench): Ditto.
+       * tests/curves.c (check_get_params): Ditto.
+       * tests/keygrip.c (global): Ditto.
+       * tests/pubkey.c (main): Run Ed25519 test in FIPS mode.
+       * tests/t-ed25519.c (one_test): Remove FIPS exception.
+         (main): Do not record FIPS status.
+       * tests/t-ed448.c (one_test): Remove FIPS exception.
+         (main): Do not record FIPS status.
+       * tests/keygen.c (check_ecc_keys): Remove FIPS exceptions.
+
+       ecc: Enable Ed25519 and Ed448 in FIPS mode.
+       + commit c08ea202d91651b7c8f7ed37f5554330deac52b8
+       * cipher/ecc-curves.c (domain_parms): Flip the FIPS byte for EdDSA
+         curves to enable them in FIPS mode
+       * src/fips.c (valid_string_in_sexp): Mark eddsa flag valid in FIPS mode
 
 2023-06-15  NIIBE Yutaka  <gniibe@fsij.org>
 
+       cipher:ecc: Fix EdDSA secret key check.
+       + commit ed879d832659f51c40bfadcd3b96fb8890b1eefb
+       * cipher/ecc.c (check_secret_key): No reason to exclude EdDSA key.
+
+       context: Make the context chain-able.
+       + commit f4019ed225bf4478b06fdb54e01bd7b9264694a9
+       * src/context.h (_gcry_ctx_alloc): Add NEXT argument.
+       * src/context.c (struct gcry_context): Add NEXT field.
+       (_gcry_ctx_alloc): Support the NEXT field.
+       (_gcry_ctx_get_pointer): Allow access to the NEXT field.
+       (_gcry_ctx_release): Loop following NEXT.
+       * cipher/pubkey.c (struct pk_single_data): Remove CTX_NEXT.
+       (release_single_data): Remove.
+       (_gcry_pk_single_data_push): Call _gcry_ctx_alloc with NEXT.
+       (_gcry_pk_get_single_data): Use _gcry_ctx_get_pointer to
+       access NEXT.
+
+2023-06-14  NIIBE Yutaka  <gniibe@fsij.org>
+
        cipher:pubkey: Fix non-use of flexible array member.
-       + commit 297c5a47837cfd94ed5b317b45f6e776502dd2cd
+       + commit c160e1a85f8295e388de7a0b09a351bc271f0d95
        * cipher/pubkey.c (struct pk_single_data): Use 1 as the size.
        (_gcry_pk_single_data_push): Use offsetof.
 
+       cipher:ecc: Support gcry_pk_hash_sign/verify for EdDSA.
+       + commit 86fcf8292208838b47c08e74cd0941694287e7dd
+       * cipher/pubkey.c (prepare_datasexp_to_be_signed): Allow HD=NULL,
+       and use of CTX for supplying input data and random_override.
+       (_gcry_pk_sign_md, _gcry_pk_verify_md): Support the case of HD==NULL.
+       (release_single_data): New.
+       (_gcry_pk_single_data_push, _gcry_pk_get_single_data): New.
+       (_gcry_pk_random_override_new, _gcry_pk_get_random_override): Remove.
+       * src/context.h (CONTEXT_TYPE_SINGLE_DATA): New.
+       (CONTEXT_TYPE_RANDOM_OVERRIDE): Remove.
+       * src/context.c (_gcry_ctx_alloc): Use CONTEXT_TYPE_SINGLE_DATA.
+       * src/gcrypt-int.h (_gcry_pk_single_data_push,
+       _gcry_pk_get_single_data): New.
+       (_gcry_pk_random_override_new, _gcry_pk_get_random_override): Remove.
+       * src/gcrypt.h.in (gcry_pk_input_data_push): New.
+       * src/visibility.c (gcry_pk_random_override_new): Follow the change.
+       * tests/t-ed25519.c (one_test): Use gcry_pk_hash_sign/verify API.
+       * tests/t-ed448.c (one_test): Use gcry_pk_hash_sign/verify API.
+
+2023-06-13  NIIBE Yutaka  <gniibe@fsij.org>
+
        cipher:ecc: Fix public key computation for EdDSA.
-       + commit b863ec507dae17fe2c38653e1ccf22de62b68ac4
+       + commit 469919751d6eb46ceff9df80676416ca8f474459
        * cipher/ecc-misc.c (_gcry_ecc_compute_public): Fix hard-coded length
        for digest lower bits.
 
-2023-06-02  Werner Koch  <wk@gnupg.org>
-
-       mpicalc: Allow for addm and subm.
-       + commit 2974a635e5d3ca477508527f5c401619879cb19f
-       * src/mpicalc.c (do_add, do_sub): Add arg usemod.
-       (main): Add oeprator 'M'.
+2023-06-12  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher:pubkey: Factor out data SEXP preparation.
+       + commit 263aa80b39dcf6d98c475784b319cb91b87cec3b
+       * cipher/pubkey.c (prepare_datasexp_to_be_signed): New.
+       (_gcry_pk_sign_md, _gcry_pk_verify_md): Use the function.
+
+2023-06-09  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Factor functions for ECC selftests.
+       + commit e0f7e927c594bec46afc23b782526546d0024ff5
+       * cipher/ecc.c: Factor out data for ECC selftests.
+       (selftest_hash_sign, selftest_sign): Have more arguments.
+       (selftests_ecc): Rename from selftests_ecdsa, having more args.
+       Follow the change of selftest_hash_sign, selftest_sign with data.
+       (run_selftests): Call selftests_ecc with constants of ECDSA.
+
+2023-06-04  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       addm/subm/mulm: fix case when destination is same MPI as divider.
+       + commit f490ffd739f713fcf0be35b7fbbb8502dea40a0c
+       * mpi/mpi-add.c (_gcry_mpi_addm, _gcry_mpi_subm): Take copy of M when
+       W and M are the same MPI.
+       * mpi/mpi-mul.c (_gcry_mpi_mulm): Likewise.
+       * tests/mpitests.c (test_addm_subm_mulm): New.
+       (main): Run addm/subm/mulm test.
+
+2023-06-01  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       twofish-avx2: de-unroll round function.
+       + commit beeff8eda4a94187ee2c72c4372ab3e634335e45
+       * cipher/twofish-avx2-amd64.S (__twofish_enc_blk16)
+       (__twofish_dec_blk16): Use loop structure instead of unrolling.
+
+2023-05-28  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       serpent: add x86/AVX512 implementation.
+       + commit ce95b6ec3554aec0e4a26f700889a19d85b5d9ac
+       * cipher/Makefile.am: Add `serpent-avx512-x86.c`; Add extra CFLAG
+       handling for `serpent-avx512-x86.o` and `serpent-avx512-x86.lo`.
+       * cipher/serpent-avx512-x86.c: New.
+       * cipher/serpent.c (USE_AVX512): New.
+       (serpent_context_t): Add `use_avx512`.
+       [USE_AVX512] (_gcry_serpent_avx512_cbc_dec)
+       (_gcry_serpent_avx512_cfb_dec, _gcry_serpent_avx512_ctr_enc)
+       (_gcry_serpent_avx512_ocb_crypt, _gcry_serpent_avx512_blk32): New.
+       (serpent_setkey_internal) [USE_AVX512]: Set `use_avx512` if
+       AVX512 HW available.
+       (_gcry_serpent_ctr_enc) [USE_AVX512]: New.
+       (_gcry_serpent_cbc_dec) [USE_AVX512]: New.
+       (_gcry_serpent_cfb_dec) [USE_AVX512]: New.
+       (_gcry_serpent_ocb_crypt) [USE_AVX512]: New.
+       (serpent_crypt_blk1_16): Rename to...
+       (serpent_crypt_blk1_32): ... this; Add AVX512 code-path; Adjust for
+       increase from max 16 blocks to max 32 blocks.
+       (serpent_encrypt_blk1_16): Rename to ...
+       (serpent_encrypt_blk1_32): ... this.
+       (serpent_decrypt_blk1_16): Rename to ...
+       (serpent_decrypt_blk1_32): ... this.
+       (_gcry_serpent_xts_crypt, _gcry_serpent_ecb_crypt): Increase bulk
+       block count from 16 to 32.
+       * configure.ac (gcry_cv_cc_x86_avx512_intrinsics)
+       (ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS): New.
+       (GCRYPT_ASM_CIPHERS): Add `serpent-avx512-x86.lo`.
+
+2023-05-16  NIIBE Yutaka  <gniibe@fsij.org>
+
+       build: Sync libtool from libgpg-error for 64-bit Windows.
+       + commit 01c0185e63605ec7e0b9e90ca99ef9f435b67261
+       * build-aux/ltmain.sh: Update from libgpg-error.
+
+2023-05-10  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Use -no-fast-install LDFLAGS for Windows.
+       + commit 33f9f0dec26df64e8184395eb6554ee379a87e3d
+       * tests/Makefile.am [HAVE_W32_SYSTEM] (AM_LDFLAGS): Conditionalize.
+
+       w32: Silence GCC warning for -Wcast-function-type.
+       + commit 501dee123efe3d7d9488bb806ac9af5f56053f06
+       * random/rndw32.c (init_system_rng): Add (void *).
+       (slow_gatherer): Likewise.
+
+2023-04-27  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Update copyright notices to use URL.
+       + commit f5284460ac4c8cc41ad9d39c5d15bd063710d956
+       * build-aux/db2any: Update copyright notice.
+       * cipher/arcfour.c, cipher/blowfish.c, cipher/cast5.c: Likewise.
+       * cipher/crc-armv8-ce.c, cipher/crc-intel-pclmul.c: Likewise.
+       * cipher/crc-ppc.c, cipher/crc.c, cipher/des.c: Likewise.
+       * cipher/md2.c, cipher/md4.c, cipher/md5.c: Likewise.
+       * cipher/primegen.c, cipher/rfc2268.c, cipher/rmd160.c: Likewise.
+       * cipher/seed.c, cipher/serpent.c, cipher/tiger.c: Likewise.
+       * cipher/twofish.c: Likewise.
+       * mpi/alpha/mpih-add1.S, mpi/alpha/mpih-lshift.S: Likewise.
+       * mpi/alpha/mpih-mul1.S, mpi/alpha/mpih-mul2.S: Likewise.
+       * mpi/alpha/mpih-mul3.S, mpi/alpha/mpih-rshift.S: Likewise.
+       * mpi/alpha/mpih-sub1.S, mpi/alpha/udiv-qrnnd.S: Likewise.
+       * mpi/amd64/mpih-add1.S, mpi/amd64/mpih-lshift.S: Likewise.
+       * mpi/amd64/mpih-mul1.S, mpi/amd64/mpih-mul2.S: Likewise.
+       * mpi/amd64/mpih-mul3.S, mpi/amd64/mpih-rshift.S: Likewise.
+       * mpi/amd64/mpih-sub1.S, mpi/config.links: Likewise.
+       * mpi/generic/mpih-add1.c, mpi/generic/mpih-lshift.c: Likewise.
+       * mpi/generic/mpih-mul1.c, mpi/generic/mpih-mul2.c: Likewise.
+       * mpi/generic/mpih-mul3.c, mpi/generic/mpih-rshift.c: Likewise.
+       * mpi/generic/mpih-sub1.c, mpi/generic/udiv-w-sdiv.c: Likewise.
+       * mpi/hppa/mpih-add1.S, mpi/hppa/mpih-lshift.S: Likewise.
+       * mpi/hppa/mpih-rshift.S, mpi/hppa/mpih-sub1.S: Likewise.
+       * mpi/hppa/udiv-qrnnd.S, mpi/hppa1.1/mpih-mul1.S: Likewise.
+       * mpi/hppa1.1/mpih-mul2.S, mpi/hppa1.1/mpih-mul3.S: Likewise.
+       * mpi/hppa1.1/udiv-qrnnd.S, mpi/i386/mpih-add1.S: Likewise.
+       * mpi/i386/mpih-lshift.S, mpi/i386/mpih-mul1.S: Likewise.
+       * mpi/i386/mpih-mul2.S, mpi/i386/mpih-mul3.S: Likewise.
+       * mpi/i386/mpih-rshift.S, mpi/i386/mpih-sub1.S: Likewise.
+       * mpi/i386/syntax.h, mpi/longlong.h: Likewise.
+       * mpi/m68k/mc68020/mpih-mul1.S, mpi/m68k/mc68020/mpih-mul2.S: Likewise.
+       * mpi/m68k/mc68020/mpih-mul3.S, mpi/m68k/mpih-add1.S: Likewise.
+       * mpi/m68k/mpih-lshift.S, mpi/m68k/mpih-rshift.S: Likewise.
+       * mpi/m68k/mpih-sub1.S, mpi/m68k/syntax.h: Likewise.
+       * mpi/mips3/mpih-add1.S, mpi/mips3/mpih-lshift.S: Likewise.
+       * mpi/mips3/mpih-mul1.S, mpi/mips3/mpih-mul2.S: Likewise.
+       * mpi/mips3/mpih-mul3.S, mpi/mips3/mpih-rshift.S: Likewise.
+       * mpi/mips3/mpih-sub1.S, mpi/mpi-add.c: Likewise.
+       * mpi/mpi-bit.c, mpi/mpi-cmp.c, mpi/mpi-div.c: Likewise.
+       * mpi/mpi-gcd.c, mpi/mpi-inline.c, mpi/mpi-inline.h: Likewise.
+       * mpi/mpi-internal.h, mpi/mpi-mpow.c, mpi/mpi-mul.c: Likewise.
+       * mpi/mpi-scan.c, mpi/mpih-div.c, mpi/mpih-mul.c: Likewise.
+       * mpi/pa7100/mpih-lshift.S, mpi/pa7100/mpih-rshift.S: Likewise.
+       * mpi/power/mpih-add1.S, mpi/power/mpih-lshift.S: Likewise.
+       * mpi/power/mpih-mul1.S, mpi/power/mpih-mul2.S: Likewise.
+       * mpi/power/mpih-mul3.S, mpi/power/mpih-rshift.S: Likewise.
+       * mpi/power/mpih-sub1.S, mpi/powerpc32/mpih-add1.S: Likewise.
+       * mpi/powerpc32/mpih-lshift.S, mpi/powerpc32/mpih-mul1.S: Likewise.
+       * mpi/powerpc32/mpih-mul2.S, mpi/powerpc32/mpih-mul3.S: Likewise.
+       * mpi/powerpc32/mpih-rshift.S, mpi/powerpc32/mpih-sub1.S: Likewise.
+       * mpi/powerpc32/syntax.h, mpi/sparc32/mpih-add1.S: Likewise.
+       * mpi/sparc32/mpih-lshift.S, mpi/sparc32/mpih-rshift.S: Likewise.
+       * mpi/sparc32/udiv.S, mpi/sparc32v8/mpih-mul1.S: Likewise.
+       * mpi/sparc32v8/mpih-mul2.S, mpi/sparc32v8/mpih-mul3.S: Likewise.
+       * mpi/supersparc/udiv.S: Likewise.
+       * random/random.h, random/rndegd.c: Likewise.
+       * src/cipher.h, src/libgcrypt.def, src/libgcrypt.vers: Likewise.
+       * src/missing-string.c, src/mpi.h, src/secmem.h: Likewise.
+       * src/stdmem.h, src/types.h: Likewise.
+       * tests/aeswrap.c, tests/curves.c, tests/hmac.c: Likewise.
+       * tests/keygrip.c, tests/prime.c, tests/random.c: Likewise.
+       * tests/t-kdf.c, tests/testapi.c: Likewise.
+
+       Update m4 files and Makefiles.
+       + commit 17a3394b47cb82535ef7187e8819931b00cf4fa6
+       * acinclude.m4: Use URL and add SPDX identifier.
+       * m4/noexecstack.m4: Likewise.
+       * Makefile.am: Likewise.
+       * doc/Makefile.am: Likewise.
+       * mpi/Makefile.am: Likewise.
+       * tests/Makefile.am: Likewise.
+
+2023-04-23  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       mpi: optimize mpi_rshift and mpi_lshift to avoid extra MPI copying.
+       + commit fdf2e8ba654a4dcfee25586dd7e0749f2b7a92c0
+       * mpi/mpi-bit.c (_gcry_mpi_rshift): Refactor so that _gcry_mpih_rshift
+       is used to do the copying along with shifting when copying is needed
+       and refactor so that same code-path is used for both in-place and
+       copying operation.
+       (_gcry_mpi_lshift): Refactor so that _gcry_mpih_lshift is used to do
+       the copying along with shifting when copying is needed and refactor
+       so that same code-path is used for both in-place and copying operation.
+
+       mpi/amd64: optimize add_n and sub_n.
+       + commit ad4ee8d52f7199ba8bdee767044337060529069f
+       * mpi/amd64/mpih-add1.S (_gcry_mpih_add_n): New implementation
+       with 4x unrolled fast-path loop.
+       * mpi/amd64/mpih-sub1.S (_gcry_mpih_sub_n): Likewise.
+
+       mpi/amd64: fix use of 'movd' for 64-bit register move in lshift&rshift.
+       + commit 3e17e819a6a4d505828cf93fc2c258a753f1d38c
+       * mpi/amd64/mpih-lshift.S: Use 'movq' instead of 'movd' for moving
+       value to %rax.
+       * mpi/amd64/mpih-rshift.S: Likewise.
 
-2023-04-21  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+       mpi: avoid MPI copy at gcry_mpi_sub.
+       + commit 8b09db4cce1040f061034ab899cd1369a51dceeb
+       * mpi/mpi-add.c (_gcry_mpi_add): Rename function...
+       (_gcry_mpi_add_inv_sign): ... to this and add parameter for inverting
+       sign of second operand.
+       (_gcry_mpi_add): New.
+       (_gcry_mpi_sub): Remove mpi_copy and instead use new
+       '_gcry_mpi_add_inv_sign' function with inverted sign for second
+       operand.
+
+       bench-slope: add MPI benchmarking.
+       + commit e90fbf6f8dacf280d03e557a65528fc2df24f1d7
+       * tests/bench-slope.c (MPI_START_SIZE, MPI_END_SIZE, MPI_STEP_SIZE)
+       (MPI_NUM_STEPS, bench_mpi_test, mpi_test_names, bench_mpi_mode)
+       (bench_mpi_hd, bench_mpi_init, bench_mpi_free, bench_mpi_do_bench)
+       (mpi_ops, mpi_modes, mpi_bench_one, _mpi_bench, mpi_match_test)
+       (mpi_bench): New.
+       (print_help): Add mention of 'mpi'.
+       (main): Add "mpi" tests.
+
+       cipher: restore weak-key error-code after mode specific setkey.
+       + commit 5d18b401f8a780e2465662e88cbac6974033bf3f
+       * cipher/cipher.c (cipher_setkey): Restore weak-key error-code
+       in case mode specific setkey returned success for the return code.
+
+2023-04-20  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
 
        doc: add documentation for GCRYCTL_SET_ALLOW_WEAK_KEY.
-       + commit 7cdfc869b7afa057839fd361bfff9140cfbe63ed
+       + commit f3ca9fa4f7e0aee1bb5012c419deb7b927da844d
        * doc/gcrypt.texi: Add 'GCRYCTL_SET_ALLOW_WEAK_KEY' under
        'gcry_cipher_ctl'.
 
        Revert "cipher: Fix edge case for SET_ALLOW_WEAK_KEY."
-       + commit 9b1ee0574ed96800429aa9488b6dcb11a5407542
+       + commit 7146b69b490595d654228bae8c3fb5d1525e3b60
        * cipher/cipher.c (cipher_setkey): Do not reset RC.
 
 2023-04-16  Werner Koch  <wk@gnupg.org>
 
        cipher: Fix edge case for SET_ALLOW_WEAK_KEY.
-       + commit b75a58df84a5137954cb678adf8c202b39ee1def
+       + commit 30840c2c45d718e0fd93cfd40771fbefa50e31f5
        * cipher/cipher.c (cipher_setkey): Reset RC.
 
-2023-04-11  NIIBE Yutaka  <gniibe@fsij.org>
+2023-04-10  NIIBE Yutaka  <gniibe@fsij.org>
 
        random: Use getrandom only when it's appropriate.
-       + commit d41177937cea4aa1e9042ebcd195a349c40e8071
+       + commit fa21ddc158b5d7b5900856e5b131071302217a51
        * random/rndgetentropy.c (_gcry_rndgetentropy_gather_random)
        [GRND_RANDOM]: Conditionalize the use of getrandom, as it's
        not a portable function.
 
-2023-04-06  Werner Koch  <wk@gnupg.org>
-
-       Release 1.10.2.
-       + commit 1c5cbacf3d88dded5063e959ee68678ff7d0fa56
-
-
 2023-04-04  NIIBE Yutaka  <gniibe@fsij.org>
 
        cipher: Enable the fast path to ChaCha20 only when supported.
-       + commit 4128f73d3a83c7f901924488c3bbf047b75db20f
+       + commit 137f1fd82bc9136d434ca41f58d62091b64df6db
        cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt)
        (_gcry_cipher_poly1305_decrypt) [USE_CHACHA20]: Conditionalize.
 
+2023-04-03  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       cipher-gcm-ppc: tweak loop structure a bit.
+       + commit 278ba98497e126358a6b0ee8b566cc62ebb96ab9
+       * cipher/cipher-gcm-ppc.c (_gcry_ghash_ppc_vpmsum): Increment
+       'buf' pointer right after use; Use 'for' loop for inner 4-blocks
+       loop to allow compiler to better optimize loop.
+
+2023-04-03  NIIBE Yutaka  <gniibe@fsij.org>
+
        build: Allow build with -Oz.
-       + commit f2ab06bc3cccd1f6c9a012275d90df9f3956572b
+       + commit 7edf1abb9a0d892a80cbf7ab42f64b2720671ee9
        * cipher/Makefile.am [ENABLE_O_FLAG_MUNGING]: Support -Oz.
        * random/Makefile.am [ENABLE_O_FLAG_MUNGING]: Support -Oz.
 
-2023-04-03  NIIBE Yutaka  <gniibe@fsij.org>
+2023-04-01  NIIBE Yutaka  <gniibe@fsij.org>
 
        m4: Update gpg-error.m4.
-       + commit c118a8ddd0224f951f26ae78d58d0eed5ee35779
+       + commit bcf5922eaac274f5ace991ecace01e718a9fe964
        * m4/gpg-error.m4: Update from libgpg-error master.
 
 2023-03-24  NIIBE Yutaka  <gniibe@fsij.org>
 
        fips: More elaborate way of getting FIPS pk flags indicators.
-       + commit f6f345fe89b0a61408bbc72058ab42ac6e6a7577
+       + commit 1c916b8c99ea0e30f1d81d606fd63b0c45657186
+       * src/fips.c (_gcry_fips_indicator_pk_flags): List more allowed strings
        in the S-expression.
        * doc/gcrypt.texi: Add document for the FIPS service indicator
 2023-03-23  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: Update gpg-error.m4.
-       + commit f815ae113a2a914b0d20f0eb36d91c0351d5a797
+       + commit 0af15f1fa0ca277fba17b365519f710b41a5b78f
        * m4/gpg-error.m4: Update from libgpg-error master.
 
+2023-03-19  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       camellia-simd128: use 8-bit right shift for rotate function.
+       + commit 79a79d3dc6268f4d72597f7db6c83511dbbef645
+       * cipher/camellia-simd128.h (rol32_1_16): Use vpsrlb128 for uint8
+       right shift by 7 if available.
+
+       camellia-gfni: use GFNI for uint8 right shift in FLS.
+       + commit 3c98ae9cb60a8a72d3fa6641e59775f98ec78786
+       * cipher/camellia-aesni-avx2-amd64.h (IF_GFNI, IF_NOT_GFNI): New.
+       [CAMELLIA_GFNI_BUILD] (rol32_1_32): Add GFNI variant which uses
+       vgf2p8affineqb for uint8 right shift by 7.
+       (fls32): Load 'right shift by 7' bit-matrix on GFNI build.
+       [CAMELLIA_GFNI_BUILD] (.Lright_shift_by_7): New.
+       * cipher/camellia-gfni-avx512-amd64.S (clear_regs): Don't clear %k1.
+       (rol32_1_64): Use vgf2p8affineqb for uint8 right shift by 7.
+       (fls64): Adjust for rol32_1_64 changes.
+       (.Lbyte_ones): Remove.
+       (.Lright_shift_by_7): New.
+       (_gcry_camellia_gfni_avx512_ctr_enc): Clear %k1 after use.
+
 2023-03-14  Jakub Jelen  <jjelen@redhat.com>
 
        tests: Improve test coverage for FIPS service indicators.
-       + commit 22cb410dd4456a84b5bde8be6a907afdc38a792b
+       + commit fae63f517906ba8f46d255f1b5770665f2197ad9
        * tests/basic.c (check_digests): Check the FIPS indicators.
        (check_mac): Ditto.
 
        fips: Explicitly disable overriding random in FIPS mode.
-       + commit 251f1749900e355ee6b027ee6a5c070eba16c85f
+       + commit e0a5a9eb8301991c28fae8632add8dacce81aeb4
        * src/fips.c: (_gcry_fips_indicator_function): Mark using random
        override non-approved in FIPS mode.
 
-       fips: Mark gcry_pk_encrypt/decrypt function non-approved.
-       + commit 051bbe84d889b413f158c665e5cc25b26c820388
-       * src/fips.c (_gcry_fips_indicator_function): Add
-       gcry_pk_encrypt/decrypt as non-approved.
-
-       fips: Fix fips indicator function.
-       + commit 6805d76b7ed4886f00bf704c77b0549408097219
-       * src/fips.c (_gcry_fips_indicator_function): Fix typo in sign/verify
-       function names.
-
        fips: Explicitly allow only some PK flags.
-       + commit 0b2b30c0c42fa2fea646a83a1f21a99f7a902853
+       + commit 4c1c8a707f9652dbfad8f8b531d8b84556f655f1
        * src/fips.c (_gcry_fips_indicator_pk_flags): New function for explicit
          FIPS indicator for public key algorithm flags.
        * src/g10lib.h (_gcry_fips_indicator_pk_flags): New.
 2023-03-14  Tobias Heider  <tobias.heider@canonical.com>
 
        doc: Document the new FIPS indicators.
-       + commit fcb9ec67a11763ca10fa1b64166c206da95eb006
+       + commit 0b7ad923978f708b41933d6b91d3159ffc7a84a1
        * doc/gcrypt.texi: Document the new options for FIPS indicators.
 
 2023-03-08  Tobias Heider  <tobias.heider@canonical.com>
 
        fips: Unblock MD5 in fips mode but mark non-approved in indicator.
-       + commit 0024db5afee825185ddf26cd9a91f563b8c11b1a
+       + commit dc4a60e2d70bc52ba2955f8e676341d675ab89a0
        * cipher/mac-hmac.c (_gcry_mac_type_spec_hmac_md5): Allow in fips mode.
        * cipher/md5.c (_gcry_digest_spec_md5): Allow in fips mode.
 
        fips: Add explicit indicators for md and mac algorithms.
-       + commit a51f0e66842ae989cd3966e5ef5eb1f62a3576b1
+       + commit c88672a327f6774a66d75a35f25266eec99b16f4
        * src/fips.c (_gcry_fips_indicator_mac): New function indicating
        non-approved mac algorithms.
        (_gcry_fips_indicator_md): new functions indicating non-approved
 2023-03-07  Jakub Jelen  <jjelen@redhat.com>
 
        kdf: Update tests in regards to the allowed parameters in FIPS mode.
-       + commit 397ff085749e5b47095827d19561332a67007c02
+       + commit f5fe94810f3099c9ccc2ca3a5891502922ab0576
        * cipher/kdf.c (check_one): run selftests for more approved parameters
        and check that wrong parameters correctly fail in FIPS mode.
 
        fips: Check return value from ftell.
-       + commit 076dd2ffcd953d80172770d8bb98c2c945dad7c9
+       + commit 3fd3bb31597f80c76a94ea62e42d58d796beabf1
        * src/fips.c (get_file_offset): Check return value of ftell to be able
          to detect errors.
 
        random: Remove unused SHA384 DRBGs.
-       + commit 4cff7e739829294d5f4a364d35584b42ef81af9f
+       + commit 45b80678109e5817b7cd15566a9d6c96b064b95f
        * random/random-drbg.c (global): Remove unused SHA384-based defines.
        (drbg_cores): Remove SHA384 configurations.
        (drbg_sec_strength): Remove unused SHA384.
 
        visibility: Check FIPS operational status for MD+Sign operation.
-       + commit fc19b27b543910833096a738dae0703b3dc57d51
+       + commit 654d0dfa04993ebe28c0536d42f4bc6d87c28369
        * src/visibility.c (gcry_pk_hash_sign): Check fips status before
        calling the operation itself.
        (gcry_pk_hash_verify): Ditto.
 
        ecc: Make the PCT recoverable in FIPS mode and consistent with RSA.
-       + commit c41d4f502f1b8aa08b2a79cbee3c8cd73e73adbe
+       + commit 23a2d1285e35b2eb91bb422609eb1c965c8a9bf6
        * cipher/ecc.c (test_keys_fips): Replace calls to log_fatal with
        return code on error.
        (ecc_generate): Signal error when PCT fails in FIPS mode.
 
        ecc: Do not allow skipping tests in FIPS Mode.
-       + commit 44a3f26539f7e88a77bbf4fe5d35ddd24f87ffcd
+       + commit 2ddeec574bc1ae90bb4242c4ce9ad9e7975a27bd
        * cipher/ecc.c (ecc_generate): Do not allow skipping tests PCT tests
        in FIPS mode.
 
-2023-01-23  Jakub Jelen  <jjelen@redhat.com>
+2023-03-06  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       rijndael-ppc: use vector registers for key schedule calculations.
+       + commit 855f8c513d6db58cbc9d45f3e0bb2cee3ab5a189
+       * cipher/rijndael-ppc.c (_gcry_aes_sbox4_ppc8): Remove.
+       (bcast_u32_to_vec, u32_from_vec): New.
+       (_gcry_aes_ppc8_setkey): Use vectors for round key calculation
+       variables.
+
+       Add PowerPC vector implementation of SM4.
+       + commit 0b2da804ee813eee22c386ba7f253415103b34ea
+       * cipher/Makefile.am: Add 'sm4-ppc.c'.
+       * cipher/sm4-ppc.c: New.
+       * cipher/sm4.c (USE_PPC_CRYPTO): New.
+       (SM4_context): Add 'use_ppc8le' and 'use_ppc9le'.
+       [USE_PPC_CRYPTO] (_gcry_sm4_ppc8le_crypt_blk1_16)
+       (_gcry_sm4_ppc9le_crypt_blk1_16, sm4_ppc8le_crypt_blk1_16)
+       (sm4_ppc9le_crypt_blk1_16): New.
+       (sm4_setkey) [USE_PPC_CRYPTO]: Set use_ppc8le and use_ppc9le
+       based on HW features.
+       (sm4_get_crypt_blk1_16_fn) [USE_PPC_CRYPTO]: Add PowerPC
+       implementation selection.
+
+2023-03-02  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       camellia-simd128: faster sbox filtering with uint8 right shift.
+       + commit 6c05c808e4e848964f67157e80f1835c5146e2bc
+       * cipher/camellia-simd128.h (if_vpsrlb128)
+       (if_not_vpsrlb128): New.
+       (filter_8bit): Use 'vpsrlb128' when available on target
+       architecture (PowerPC and AArch64).
+
+       chacha20-ppc: do not generate p9 code when target attr unavailable.
+       + commit 652598096325c2478d7d033585dadc13bec6fb1d
+       * cipher/chacha20-ppc.c (HAVE_FUNC_ATTR_TARGET): New.
+       (_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
+       (_gcry_chacha20_poly1305_ppc8_blocks4): Use inline functions
+       only if HAVE_FUNC_ATTR_TARGET is defined.
+
+2023-03-01  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       Fix "'inline' is not at beginning of declaration" warnings.
+       + commit 0a5f7e9543a823034f6dd6dae2f647c4e213f213
+       * cipher/chacha20-ppc.c (chacha20_ppc_blocks1)
+       (chacha20_ppc_blocks4, chacha20_poly1305_ppc_blocks4): Move
+       'ASM_FUNC_ATTR_INLINE' right after 'static'.
+       * cipher/sha256-ppc.c (sha256_transform_ppc): Likewise.
+       * cipher/sha512-ppc.c (sha512_transform_ppc): Likewise.
+
+       Improve PPC target function attribute checks.
+       + commit 66bae697a853b21e0a2ff2a5031d23e3691a9084
+       * configure.ac (gcry_cv_gcc_attribute_ppc_target)
+       (gcry_cv_clang_attribute_ppc_target): Add 'always_inline'
+       function to test.
+
+2023-02-28  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       camellia: add AArch64 crypto-extension implementation.
+       + commit 898c857206ada06d70c5f46ac5adaa9d7058e672
+       * cipher/Makefile.am: Add 'camellia-aarch64-ce.(c|o|lo)'.
+       (aarch64_neon_cflags): New.
+       * cipher/camellia-aarch64-ce.c: New.
+       * cipher/camellia-glue.c (USE_AARCH64_CE): New.
+       (CAMELLIA_context): Add 'use_aarch64ce'.
+       (_gcry_camellia_aarch64ce_encrypt_blk16)
+       (_gcry_camellia_aarch64ce_decrypt_blk16)
+       (_gcry_camellia_aarch64ce_keygen, camellia_aarch64ce_enc_blk16)
+       (camellia_aarch64ce_dec_blk16, aarch64ce_burn_stack_depth): New.
+       (camellia_setkey) [USE_AARCH64_CE]: Set use_aarch64ce if HW has
+       HWF_ARM_AES; Use AArch64/CE key generation if supported by HW.
+       (camellia_encrypt_blk1_32, camellia_decrypt_blk1_32)
+       [USE_AARCH64_CE]: Add AArch64/CE code path.
+
+       camellia: add POWER8/POWER9 vcrypto implementation.
+       + commit 6fa11d8b7070eb7c4c296c879213c9596bd00b1c
+       * cipher/Makefile.am: Add 'camellia-simd128.h',
+       'camellia-ppc8le.c' and 'camellia-ppc9le.c'.
+       * cipher/camellia-glue.c (USE_PPC_CRYPTO): New.
+       (CAMELLIA_context) [USE_PPC_CRYPTO]: Add 'use_ppc', 'use_ppc8'
+       and 'use_ppc9'.
+       [USE_PPC_CRYPTO] (_gcry_camellia_ppc8_encrypt_blk16)
+       (_gcry_camellia_ppc8_decrypt_blk16, _gcry_camellia_ppc8_keygen)
+       (_gcry_camellia_ppc9_encrypt_blk16)
+       (_gcry_camellia_ppc9_decrypt_blk16, _gcry_camellia_ppc9_keygen)
+       (camellia_ppc_enc_blk16, camellia_ppc_dec_blk16)
+       (ppc_burn_stack_depth): New.
+       (camellia_setkey) [USE_PPC_CRYPTO]: Setup 'use_ppc', 'use_ppc8'
+       and 'use_ppc9' and use PPC key-generation if HWF is available.
+       (camellia_encrypt_blk1_32)
+       (camellia_decrypt_blk1_32) [USE_PPC_CRYPTO]: Add 'use_ppc' paths.
+       (_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Enable
+       generic bulk path when USE_PPC_CRYPTO is defined.
+       * cipher/camellia-ppc8le.c: New.
+       * cipher/camellia-ppc9le.c: New.
+       * cipher/camellia-simd128.h: New.
+       * configure.ac: Add 'camellia-ppc8le.lo' and 'camellia-ppc9le.lo'.
+
+2023-02-26  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       aes-amd64-vaes: fix fast exit path in XTS function.
+       + commit 0d42d9674458a22cfccec419f4c358d743c5effb
+       * cipher/rijndael-vaes-avx2-amd64.S
+       (_gcry_vaes_avx2_xts_crypt_amd64): On fast exit path, compare
+       number of blocks left against '1' instead of '0' as following
+       branch is 'less than'.
+
+       ppc: add support for clang target attribute.
+       + commit 937a76a34540bd2558f5b34a98fbe53227000646
+       * configure.ac (gcry_cv_clang_attribute_ppc_target): New.
+       * cipher/chacha20-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET]
+       (FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
+       * cipher/rijndael-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET]
+       (FPC_OPT_ATTR): New.
+       * cipher/rijndael-ppc9le.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET]
+       (FPC_OPT_ATTR): New.
+       * cipher/sha256-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET]
+       (FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
+       * cipher/sha512-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET]
+       (FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
+       (ror64): Remove unused function.
+
+       chacha20-ppc: use target and optimize attributes for P8 and P9.
+       + commit f3d1d4a8c9f0df107a57e2cd3699253766d6e45a
+       * cipher/chacha20-ppc.c (_gcry_chacha20_ppc8_blocks1): Rename to...
+       (chacha20_ppc_blocks1): ...this; Add 'always inline' attribute.
+       (_gcry_chacha20_ppc8_blocks4): Rename to...
+       (chacha20_ppc_blocks4): ...this; Add 'always inline' attribute.
+       (_gcry_chacha20_poly1305_ppc8_blocks4): Rename to...
+       (chacha20_poly1305_ppc_blocks4): ...this; Add 'always inline'
+       attribute.
+       (FUNC_ATTR_OPT_O2, FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
+       (_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4)
+       (_gcry_chacha20_poly1305_ppc8_blocks4): New.
+       (_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
+       (_gcry_chacha20_poly1305_ppc9_blocks4): New.
+       * cipher/chacha20.c (CHACHA20_context_t): Add 'use_p9'.
+       (_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
+       (_gcry_chacha20_poly1305_ppc9_blocks4): New.
+       (chacha20_do_setkey): Set 'use_p9' if HW has HWF_PPC_ARCH_3_00.
+       (chacha20_blocks, do_chacha20_encrypt_stream_tail)
+       (_gcry_chacha20_poly1305_encrypt)
+       (_gcry_chacha20_poly1305_decrypt) [USE_PPC_VEC]: Add 'use_p9' paths.
+
+       aes-ppc: use target and optimize attributes for P8 and P9.
+       + commit 100063cf4e1ca3350f05a343d8fa0ccf305debb1
+       * cipher/rijndael-ppc-functions.h: Add PPC_OPT_ATTR attribute
+       macro for all functions.
+       * cipher/rijndael-ppc.c (FUNC_ATTR_OPT, PPC_OPT_ATTR): New.
+       (_gcry_aes_ppc8_setkey, _gcry_aes_ppc8_prepare_decryption): Add
+       PPC_OPT_ATTR attribute macro.
+       * cipher/rijndael-ppc9le.c (FUNC_ATTR_OPT, PPC_OPT_ATTR): New.
+
+       aes-ppc: add CTR32LE bulk acceleration.
+       + commit 84f2e2d0b51b7b3e75d96d8188ae6a8d8174542b
+       * cipher/rijndael-ppc-functions.h (CTR32LE_ENC_FUNC): New.
+       * cipher/rijndael-ppc.c (_gcry_aes_ppc8_ctr32le_enc): New.
+       * cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ctr32le_enc): New.
+       * cipher/rijndael.c (_gcry_aes_ppc8_ctr32le_enc)
+       (_gcry_aes_ppc9le_ctr32le_enc): New.
+       (do_setkey): Setup _gcry_aes_ppc8_ctr32le_enc for POWER8 and
+       _gcry_aes_ppc9le_ctr32le_enc for POWER9.
+
+       aes-ppc: add ECB bulk acceleration for benchmarking purposes.
+       + commit a7ebf9d52e67015c0ae175c0a9bbff9da9fe6f32
+       * cipher/rijndael-ppc-functions.h (ECB_CRYPT_FUNC): New.
+       * cipher/rijndael-ppc.c (_gcry_aes_ppc8_ecb_crypt): New.
+       * cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ecb_crypt): New.
+       * cipher/rijndael.c (_gcry_aes_ppc8_ecb_crypt)
+       (_gcry_aes_ppc9le_ecb_crypt): New.
+       (do_setkey): Set up _gcry_aes_ppc8_ecb_crypt for POWER8 and
+       _gcry_aes_ppc9le_ecb_crypt for POWER9.
+
+       sha2-ppc: better optimization for POWER9.
+       + commit 161614ecc1c20d94a8599888524b1d1fd0eb2754
+       * cipher/sha256-ppc.c: Change to use vector registers, generate
+       POWER8 and POWER9 from same code with help of 'target' and
+       'optimize' attribute.
+       * cipher/sha512-ppc.c: Likewise.
+       * configure.ac (gcry_cv_gcc_attribute_optimize)
+       (gcry_cv_gcc_attribute_ppc_target): New.
+
+2023-02-22  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       camellia-aesni-avx: speed up for round key broadcasting.
+       + commit dd4cb5d75c8e4e666db4352d999b2111b9ddb80d
+       * cipher/camellia-aesni-avx2-amd64.h (roundsm16, fls16): Broadcast
+       round key bytes directly with 'vpshufb'.
+
+       camellia-avx2: speed up for round key broadcasting.
+       + commit b9a9755742c7bf7ca8c007d33f98aaa076a382c7
+       * cipher/camellia-aesni-avx2-amd64.h (roundsm32, fls32): Use
+       'vpbroadcastb' for loading round key.
+       * cipher/camellia-glue.c (camellia_encrypt_blk1_32)
+       (camellia_decrypt_blk1_32): Adjust num_blks thresholds for AVX2
+       implementations, 2 blks for GFNI, 4 blks for VAES and 5 blks for AESNI.
+
+       camellia-gfni-avx512: speed up for round key broadcasting.
+       + commit a4c22331f57d23832ddd019ac3108b5fa3dd942b
+       * cipher/camellia-gfni-avx512-amd64.S (roundsm64, fls64): Use
+       'vpbroadcastb' for loading round key.
+
+       camellia-avx2: add fast path for full 32 block ECB input.
+       + commit 5f25ad09fdb5eb5f83f7cc4cefe79bbeab29fec8
+       * cipher/camellia-aesni-avx2-amd64.h (enc_blk1_32, dec_blk1_32): Add
+       fast path for 32 block input.
+
+       camellia: add CTR-mode byte addition for AVX/AVX2/AVX512 impl.
+       + commit 87ae2a660d59751ddd7da40da05cfaee73f35ea7
+       * cipher/camellia-aesni-avx-amd64.S
+       (_gcry_camellia_aesni_avx_ctr_enc): Add byte addition fast-path.
+       * cipher/camellia-aesni-avx2-amd64.h (ctr_enc): Likewise.
+       * cipher/camellia-gfni-avx512-amd64.S
+       (_gcry_camellia_gfni_avx512_ctr_enc): Likewise.
+       * cipher/camellia-glue.c (CAMELLIA_context): Add 'use_avx2'.
+       (camellia_setkey, _gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec)
+       (_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt)
+       (_gcry_camellia_ocb_auth) [USE_AESNI_AVX2]: Use 'use_avx2' to check
+       if any of the AVX2 implementations is enabled.
+
+       camellia-aesni-avx: add acceleration for ECB/XTS/CTR32LE modes.
+       + commit 926cc22058a39c7a931e14590eab6fd7a78ba455
+       * cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_ecb_enc)
+       (_gcry_camellia_aesni_avx_ecb_dec): New.
+       * cipher/camellia-glue.c (_gcry_camellia_aesni_avx_ecb_enc)
+       (_gcry_camellia_aesni_avx_ecb_dec): New.
+       (camellia_setkey): Always enable XTS/ECB/CTR32LE bulk functions.
+       (camellia_encrypt_blk1_32, camellia_decrypt_blk1_32)
+       [USE_AESNI_AVX]: Add AESNI/AVX code-path.
+
+       sm4: add CTR-mode byte addition for AVX/AVX2/AVX512 implementations.
+       + commit 978b02fca682c9ecb71e30cdeeb6922fc8331f6e
+       * cipher/sm4-aesni-avx-amd64.S
+       (_gcry_sm4_aesni_avx_ctr_enc): Add byte addition fast-path.
+       * cipher/sm4-aesni-avx2-amd64.S
+       (_gcry_sm4_aesni_avx2_ctr_enc): Likewise.
+       * cipher/sm4-gfni-avx2-amd64.S
+       (_gcry_sm4_gfni_avx2_ctr_enc): Likewise.
+       * cipher/sm4-gfni-avx512-amd64.S
+       (_gcry_sm4_gfni_avx512_ctr_enc)
+       (_gcry_sm4_gfni_avx512_ctr_enc_blk32): Likewise.
+
+       aes-vaes-avx2: improve case when only CTR needs carry handling.
+       + commit 8f7f5a9fc63968304bacedbc2f22b9f7188bbd53
+       * cipher/rijndael-vaes-avx2-amd64.S
+       (_gcry_vaes_avx2_ctr_enc_amd64): Add handling for the case when
+       only main counter needs carry handling but generated vector counters
+       do not.
+
+       aria-avx2: add VAES accelerated implementation.
+       + commit caf402e9b41fff6cf39b914b088ea1e5f8fd3bd1
+       * cipher/aria-aesni-avx2-amd64.S (CONFIG_AS_VAES): New.
+       [CONFIG_AS_VAES]: Add VAES accelerated assembly macros and functions.
+       * cipher/aria.c (USE_VAES_AVX2): New.
+       (ARIA_context): Add 'use_vaes_avx2'.
+       (_gcry_aria_vaes_avx2_ecb_crypt_blk32)
+       (_gcry_aria_vaes_avx2_ctr_crypt_blk32)
+       (aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): Add VAES/AVX2
+       code paths.
+       (aria_setkey): Enable VAES/AVX2 implementation based on HW features.
+
+       aria-avx512: small optimization for aria_diff_m.
+       + commit f359a3ec7e845aa446836bd47994fe18d6d41e08
+       * cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for
+       3-way XOR operation.
+       ---
+
+       Using vpternlogq gives small performance improvement on AMD Zen4. With
+       Intel tiger-lake speed is the same as before.
+
+       Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):
+
+       Before:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.203 ns/B      4703 MiB/s     0.953 c/B      4700
+               ECB dec |     0.204 ns/B      4675 MiB/s     0.959 c/B      4700
+               CTR enc |     0.207 ns/B      4609 MiB/s     0.973 c/B      4700
+               CTR dec |     0.207 ns/B      4608 MiB/s     0.973 c/B      4700
+
+       After (~3% faster):
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.197 ns/B      4847 MiB/s     0.925 c/B      4700
+               ECB dec |     0.197 ns/B      4852 MiB/s     0.924 c/B      4700
+               CTR enc |     0.200 ns/B      4759 MiB/s     0.942 c/B      4700
+               CTR dec |     0.200 ns/B      4772 MiB/s     0.939 c/B      4700
+
+       Cc: Taehee Yoo <ap420073@gmail.com>
+
+       aria-avx: small optimization for aria_ark_8way.
+       + commit 855f1551fd921ced652dc0c3c03601dfcd063f1c
+       * cipher/aria-aesni-avx-amd64.S (aria_ark_8way): Use 'vmovd' for
+       loading key material and 'vpshufb' for broadcasting from byte
+       locations 3, 2, 1 and 0.
+
+       aria: add x86_64 GFNI/AVX512 accelerated implementation.
+       + commit 45351e6474cbbe5baaa4c488222610edc417176e
+       * cipher/Makefile.am: Add 'aria-gfni-avx512-amd64.S'.
+       * cipher/aria-gfni-avx512-amd64.S: New.
+       * cipher/aria.c (USE_GFNI_AVX512): New.
+       [USE_GFNI_AVX512] (MAX_PARALLEL_BLKS): New.
+       (ARIA_context): Add 'use_gfni_avx512'.
+       (_gcry_aria_gfni_avx512_ecb_crypt_blk64)
+       (_gcry_aria_gfni_avx512_ctr_crypt_blk64)
+       (aria_gfni_avx512_ecb_crypt_blk64)
+       (aria_gfni_avx512_ctr_crypt_blk64): New.
+       (aria_crypt_blocks) [USE_GFNI_AVX512]: Add 64 parallel block
+       AVX512/GFNI processing.
+       (_gcry_aria_ctr_enc) [USE_GFNI_AVX512]: Add 64 parallel block
+       AVX512/GFNI processing.
+       (aria_setkey): Enable GFNI/AVX512 based on HW features.
+       * configure.ac: Add 'aria-gfni-avx512-amd64.lo'.
+
+       aria: add x86_64 AESNI/GFNI/AVX/AVX2 accelerated implementations.
+       + commit f4268a8f51a89a7c0374a23f669d7a19cad304ae
+       * cipher/Makefile.am: Add 'aria-aesni-avx-amd64.S' and
+       'aria-aesni-avx2-amd64.S'.
+       * cipher/aria-aesni-avx-amd64.S: New.
+       * cipher/aria-aesni-avx2-amd64.S: New.
+       * cipher/aria.c (USE_AESNI_AVX, USE_GFNI_AVX, USE_AESNI_AVX2)
+       (USE_GFNI_AVX2, MAX_PARALLEL_BLKS, ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
+       (ARIA_context): Add 'use_aesni_avx', 'use_gfni_avx',
+       'use_aesni_avx2' and 'use_gfni_avx2'.
+       (_gcry_aria_aesni_avx_ecb_crypt_blk1_16)
+       (_gcry_aria_aesni_avx_ctr_crypt_blk16)
+       (_gcry_aria_gfni_avx_ecb_crypt_blk1_16)
+       (_gcry_aria_gfni_avx_ctr_crypt_blk16)
+       (aria_avx_ecb_crypt_blk1_16, aria_avx_ctr_crypt_blk16)
+       (_gcry_aria_aesni_avx2_ecb_crypt_blk32)
+       (_gcry_aria_aesni_avx2_ctr_crypt_blk32)
+       (_gcry_aria_gfni_avx2_ecb_crypt_blk32)
+       (_gcry_aria_gfni_avx2_ctr_crypt_blk32)
+       (aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): New.
+       (aria_crypt_blocks) [USE_AESNI_AVX2]: Add 32 parallel block
+       AVX2/AESNI/GFNI processing.
+       (aria_crypt_blocks) [USE_AESNI_AVX]: Add 3 to 16 parallel block
+       AVX/AESNI/GFNI processing.
+       (_gcry_aria_ctr_enc) [USE_AESNI_AVX2]: Add 32 parallel block
+       AVX2/AESNI/GFNI processing.
+       (_gcry_aria_ctr_enc) [USE_AESNI_AVX]: Add 16 parallel block
+       AVX/AESNI/GFNI processing.
+       (_gcry_aria_ctr_enc, _gcry_aria_cbc_dec, _gcry_aria_cfb_enc)
+       (_gcry_aria_ecb_crypt, _gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc)
+       (_gcry_aria_ocb_crypt, _gcry_aria_ocb_auth): Use MAX_PARALLEL_BLKS
+       for parallel processing width.
+       (aria_setkey): Enable AESNI/AVX, GFNI/AVX, AESNI/AVX2, GFNI/AVX2 based
+       on HW features.
+       * configure.ac: Add 'aria-aesni-avx-amd64.lo' and
+       'aria-aesni-avx2-amd64.lo'.
+       ---
+
+       This patch adds AVX/AVX2/AESNI/GFNI accelerated ARIA block cipher
+       implementations for libgcrypt. This implementation is based on work
+       by Taehee Yoo, with following notable changes:
+        - Integration to libgcrypt, use of 'aes-common-amd64.h'.
+        - Use 'vmovddup' for loading GFNI constants.
+        - Use round loop instead of unrolling for smaller code size and
+          increased performance.
+        - Use stack for temporary storage instead of external buffers.
+        - Merge ECB encryption/decryption into a single function.
+        - Add 1 to 15 blocks support for AVX ECB functions.
+        - Add byte-addition fast path for CTR.
+
+       ===
+
+       Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):
+
+        AESNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.715 ns/B      1333 MiB/s      3.36 c/B      4700
+               ECB dec |     0.712 ns/B      1339 MiB/s      3.35 c/B      4700
+               CTR enc |     0.714 ns/B      1336 MiB/s      3.36 c/B      4700
+               CTR dec |     0.714 ns/B      1335 MiB/s      3.36 c/B      4700
+
+        GFNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.516 ns/B      1847 MiB/s      2.43 c/B      4700
+               ECB dec |     0.519 ns/B      1839 MiB/s      2.44 c/B      4700
+               CTR enc |     0.517 ns/B      1846 MiB/s      2.43 c/B      4700
+               CTR dec |     0.518 ns/B      1843 MiB/s      2.43 c/B      4700
+
+        AESNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.416 ns/B      2292 MiB/s      1.96 c/B      4700
+               ECB dec |     0.421 ns/B      2266 MiB/s      1.98 c/B      4700
+               CTR enc |     0.415 ns/B      2298 MiB/s      1.95 c/B      4700
+               CTR dec |     0.415 ns/B      2300 MiB/s      1.95 c/B      4700
+
+        GFNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.235 ns/B      4056 MiB/s      1.11 c/B      4700
+               ECB dec |     0.234 ns/B      4079 MiB/s      1.10 c/B      4700
+               CTR enc |     0.232 ns/B      4104 MiB/s      1.09 c/B      4700
+               CTR dec |     0.233 ns/B      4094 MiB/s      1.10 c/B      4700
+
+       ===
+
+       Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off):
+
+        AESNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |      1.26 ns/B     757.6 MiB/s      3.77 c/B      2993
+               ECB dec |      1.27 ns/B     753.1 MiB/s      3.79 c/B      2992
+               CTR enc |      1.25 ns/B     760.3 MiB/s      3.75 c/B      2992
+               CTR dec |      1.26 ns/B     759.1 MiB/s      3.76 c/B      2992
+
+        GFNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.967 ns/B     986.6 MiB/s      2.89 c/B      2992
+               ECB dec |     0.966 ns/B     987.1 MiB/s      2.89 c/B      2992
+               CTR enc |     0.972 ns/B     980.8 MiB/s      2.91 c/B      2993
+               CTR dec |     0.971 ns/B     982.5 MiB/s      2.90 c/B      2993
+
+        AESNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.817 ns/B      1167 MiB/s      2.44 c/B      2992
+               ECB dec |     0.819 ns/B      1164 MiB/s      2.45 c/B      2992
+               CTR enc |     0.819 ns/B      1164 MiB/s      2.45 c/B      2992
+               CTR dec |     0.819 ns/B      1164 MiB/s      2.45 c/B      2992
+
+        GFNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.506 ns/B      1886 MiB/s      1.51 c/B      2992
+               ECB dec |     0.505 ns/B      1887 MiB/s      1.51 c/B      2992
+               CTR enc |     0.564 ns/B      1691 MiB/s      1.69 c/B      2992
+               CTR dec |     0.565 ns/B      1689 MiB/s      1.69 c/B      2992
+
+       ===
+
+       Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off):
+
+        AESNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.921 ns/B      1035 MiB/s      3.50 c/B      3800
+               ECB dec |     0.922 ns/B      1034 MiB/s      3.50 c/B      3800
+               CTR enc |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800
+               CTR dec |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800
+
+        AESNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.559 ns/B      1707 MiB/s      2.12 c/B      3800
+               ECB dec |     0.560 ns/B      1703 MiB/s      2.13 c/B      3800
+               CTR enc |     0.570 ns/B      1672 MiB/s      2.17 c/B      3800
+               CTR dec |     0.568 ns/B      1679 MiB/s      2.16 c/B      3800
+
+       ===
+
+       Benchmark on AMD EPYC 7642 (zen2):
+
+        AESNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |      1.22 ns/B     784.5 MiB/s      4.01 c/B      3298
+               ECB dec |      1.22 ns/B     784.8 MiB/s      4.00 c/B      3292
+               CTR enc |      1.22 ns/B     780.1 MiB/s      4.03 c/B      3299
+               CTR dec |      1.22 ns/B     779.1 MiB/s      4.04 c/B      3299
+
+        AESNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.735 ns/B      1298 MiB/s      2.42 c/B      3299
+               ECB dec |     0.738 ns/B      1292 MiB/s      2.44 c/B      3299
+               CTR enc |     0.732 ns/B      1303 MiB/s      2.41 c/B      3299
+               CTR dec |     0.732 ns/B      1303 MiB/s      2.41 c/B      3299
+
+       ===
+
+       Benchmark on Intel Core i5-6500 (skylake):
+
+        AESNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |      1.24 ns/B     766.6 MiB/s      4.48 c/B      3598
+               ECB dec |      1.25 ns/B     764.9 MiB/s      4.49 c/B      3598
+               CTR enc |      1.25 ns/B     761.7 MiB/s      4.50 c/B      3598
+               CTR dec |      1.25 ns/B     761.6 MiB/s      4.51 c/B      3598
+
+        AESNI/AVX2:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |     0.829 ns/B      1150 MiB/s      2.98 c/B      3599
+               ECB dec |     0.831 ns/B      1147 MiB/s      2.99 c/B      3598
+               CTR enc |     0.829 ns/B      1150 MiB/s      2.98 c/B      3598
+               CTR dec |     0.828 ns/B      1152 MiB/s      2.98 c/B      3598
+
+       ===
+
+       Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off):
+
+        AESNI/AVX:
+        ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               ECB enc |      2.11 ns/B     452.7 MiB/s      5.25 c/B      2494
+               ECB dec |      2.10 ns/B     454.5 MiB/s      5.23 c/B      2494
+               CTR enc |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494
+               CTR dec |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494
+
+       [v2]
+        - Optimization for CTR mode: Use CTR byte-addition path when
+          counter carry-overflow happens only on ctr-variable but not in
+          generated counter vector registers.
+
+       Cc: Taehee Yoo <ap420073@gmail.com>
+
+2023-01-21  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       asm-common-aarch64: fix read-only section for Windows target.
+       + commit 833a904faf2b90a1b1d1b58e1e9a12f2e8e2378c
+       * cipher/asm-common-aarch64.h (SECTION_RODATA): Use .rdata for
+       _WIN32.
+
+2023-01-19  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       aarch64-asm: align functions to 16 bytes.
+       + commit 6f8e750c42ffd34900fad30540e92e382111d567
+       * cipher/camellia-aarch64.S: Align functions to 16 bytes.
+       * cipher/chacha20-aarch64.S: Likewise.
+       * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise.
+       * cipher/crc-armv8-aarch64-ce.S: Likewise.
+       * cipher/rijndael-aarch64.S: Likewise.
+       * cipher/rijndael-armv8-aarch64-ce.S: Likewise.
+       * cipher/sha1-armv8-aarch64-ce.S: Likewise.
+       * cipher/sha256-armv8-aarch64-ce.S: Likewise.
+       * cipher/sha512-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm3-aarch64.S: Likewise.
+       * cipher/sm3-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm4-aarch64.S: Likewise.
+       * cipher/sm4-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm4-armv9-aarch64-sve-ce.S: Likewise.
+       * cipher/twofish-aarch64.S: Likewise.
+       * mpi/aarch64/mpih-add1.S: Likewise.
+       * mpi/aarch64/mpih-mul1.S: Likewise.
+       * mpi/aarch64/mpih-mul2.S: Likewise.
+       * mpi/aarch64/mpih-mul3.S: Likewise.
+       * mpi/aarch64/mpih-sub1.S: Likewise.
+
+       aarch64-asm: move constant data to read-only section.
+       + commit b3d7d520a0aab8b3356749fa4765a7f1f5c676d6
+       * cipher/asm-common-aarch64.h (SECTION_RODATA)
+       (GET_DATA_POINTER): New.
+       (GET_LOCAL_POINTER): Remove.
+       * cipher/camellia-aarch64.S: Move constant data to read-only data
+       section; Remove unneeded '.ltorg'.
+       * cipher/chacha20-aarch64.S: Likewise.
+       * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise.
+       * cipher/crc-armv8-aarch64-ce.S: Likewise.
+       * cipher/rijndael-aarch64.S: Likewise.
+       * cipher/sha1-armv8-aarch64-ce.S: Likewise.
+       * cipher/sha256-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm3-aarch64.S: Likewise.
+       * cipher/sm3-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm4-aarch64.S: Likewise.
+       * cipher/sm4-armv9-aarch64-sve-ce.S: Likewise.
+       * cipher/twofish-aarch64.S: Likewise.
 
-       fips: Remove GCM mode from the allowed FIPS indicators.
-       + commit 1540698389ba2091baab6e020e5ec7b0d0eead59
-       * src/fips.c (_gcry_fips_indicator_cipher): Do not mark GCM mode as
-       FIPS approved.
+       s390x-asm: move constant data to read-only section.
+       + commit 65f52773d4bb59a4ea4b7e373fe79b3559ddf312
+       * cipher/chacha20-s390x.S: Move constant data to read-only
+       section; Align functions to 16 bytes.
+       * cipher/poly1305-s390x.S: Likewise.
 
-2022-12-16  NIIBE Yutaka  <gniibe@fsij.org>
+       powerpc-asm: move constant data to read-only section.
+       + commit d0a109148b799e6ba24bb0e030009e5ac3eaf5dc
+       * cipher/chacha20-p10le-8x.s: Move constant data to read-only
+       section.
+
+       mpi/amd64: align functions and inner loops to 16 bytes.
+       + commit 12ad3ea6b49b12ba7ca44bc77e4e87025a6fd095
+       * mpi/amd64/mpih-add1.S: Align function and inner loop to 16 bytes.
+       * mpi/amd64/mpih-lshift.S: Likewise.
+       * mpi/amd64/mpih-mul1.S: Likewise.
+       * mpi/amd64/mpih-mul2.S: Likewise.
+       * mpi/amd64/mpih-mul3.S: Likewise.
+       * mpi/amd64/mpih-rshift.S: Likewise.
+       * mpi/amd64/mpih-sub1.S: Likewise.
+
+       amd64-asm: move constant data to read-only section for cipher algos.
+       + commit 208b1f3a7bd8709889aa566ff030bcff57ce1cfd
+       * cipher/camellia-aesni-avx-amd64.S: Move constant data to
+       read-only section.
+       * cipher/camellia-aesni-avx2-amd64.h: Likewise.
+       * cipher/camellia-gfni-avx512-amd64.S: Likewise.
+       * cipher/chacha20-amd64-avx2.S: Likewise.
+       * cipher/chacha20-amd64-avx512.S: Likewise.
+       * cipher/chacha20-amd64-ssse3.S: Likewise.
+       * cipher/des-amd64.s: Likewise.
+       * cipher/rijndael-ssse3-amd64-asm.S: Likewise.
+       * cipher/rijndael-vaes-avx2-amd64.S: Likewise.
+       * cipher/serpent-avx2-amd64.S: Likewise.
+       * cipher/sm4-aesni-avx-amd64.S: Likewise.
+       * cipher/sm4-aesni-avx2-amd64.S: Likewise.
+       * cipher/sm4-gfni-avx2-amd64.S: Likewise.
+       * cipher/sm4-gfni-avx512-amd64.S: Likewise.
+       * cipher/twofish-avx2-amd64.S: Likewise.
+
+       amd64-asm: align functions to 16 bytes for cipher algos.
+       + commit 9d62c54de2b0cd3b1849a27f8998e1f0d43f1583
+       * cipher/blowfish-amd64.S: Align functions to 16 bytes.
+       * cipher/camellia-aesni-avx-amd64.S: Likewise.
+       * cipher/camellia-aesni-avx2-amd64.h: Likewise.
+       * cipher/camellia-gfni-avx512-amd64.S: Likewise.
+       * cipher/cast5-amd64.S: Likewise.
+       * cipher/chacha20-amd64-avx2.S: Likewise.
+       * cipher/chacha20-amd64-ssse3.S: Likewise.
+       * cipher/des-amd64.s: Likewise.
+       * cipher/rijndael-amd64.S: Likewise.
+       * cipher/rijndael-ssse3-amd64-asm.S: Likewise.
+       * cipher/salsa20-amd64.S: Likewise.
+       * cipher/serpent-avx2-amd64.S: Likewise.
+       * cipher/serpent-sse2-amd64.S: Likewise.
+       * cipher/sm4-aesni-avx-amd64.S: Likewise.
+       * cipher/sm4-aesni-avx2-amd64.S: Likewise.
+       * cipher/sm4-gfni-avx2-amd64.S: Likewise.
+       * cipher/twofish-amd64.S: Likewise.
+       * cipher/twofish-avx2-amd64.S: Likewise.
+
+       amd64-asm: move constant data to read-only section for hash/mac algos.
+       + commit 14137d685cf8d779ac4656b64b7d5adcdcf90e3a
+       * cipher/asm-common-amd64.h (SECTION_RODATA): New.
+       * cipher/blake2b-amd64-avx2.S: Use read-only section for constant
+       data.
+       * cipher/blake2b-amd64-avx512.S: Likewise.
+       * cipher/blake2s-amd64-avx.S: Likewise.
+       * cipher/blake2s-amd64-avx512.S: Likewise.
+       * cipher/poly1305-amd64-avx512.S: Likewise.
+       * cipher/sha1-avx-amd64.S: Likewise.
+       * cipher/sha1-avx-bmi2-amd64.S: Likewise.
+       * cipher/sha1-avx2-bmi2-amd64.S: Likewise.
+       * cipher/sha1-ssse3-amd64.S: Likewise.
+       * cipher/sha256-avx-amd64.S: Likewise.
+       * cipher/sha256-avx2-bmi2-amd64.S: Likewise.
+       * cipher/sha256-ssse3-amd64.S: Likewise.
+       * cipher/sha512-avx-amd64.S: Likewise.
+       * cipher/sha512-avx2-bmi2-amd64.S: Likewise.
+       * cipher/sha512-avx512-amd64.S: Likewise.
+       * cipher/sha512-ssse3-amd64.S: Likewise.
+       * cipher/sha3-avx-bmi2-amd64.S: Likewise.
+
+2023-01-17  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       tests/bench-slope: skip CPU warm-up in regression tests.
+       + commit ec57b50bc60a6a5cc98249e2607b4e31bdcba553
+       * tests/bench-slope.c (warm_up_cpu): Skip in regression tests.
+
+       tests/basic: perform x86 vector cluttering only when __SSE2__ is set.
+       + commit 308a70ac9907de136fae86db40ebe25c904167a2
+       * tests/basic.c (CLUTTER_VECTOR_REGISTER_AMD64)
+       (CLUTTER_VECTOR_REGISTER_I386): Set only if __SSE2__ defined.
+       (clutter_vector_registers) [CLUTTER_VECTOR_REGISTER_AMD64]: Remove
+       __SSE2__ check for "xmm" clobbers.
+       (clutter_vector_registers) [CLUTTER_VECTOR_REGISTER_I386]: Likewise.
+
+       tests/basic: fix clutter vector register asm for amd64 and i386.
+       + commit 1a758b08bba09a7bba72b764d4f7c6c7c8f08cf1
+       * tests/basic.c (clutter_vector_registers): Pass data pointers through
+       single register for CLUTTER_VECTOR_REGISTER_AMD64 and
+       CLUTTER_VECTOR_REGISTER_I386 as compiler might attempt to allocate
+       separate pointer register for each "m" operator.
+
+       avx512: tweak zmm16-zmm31 register clearing.
+       + commit 7de2fb66e065a97f121bd16ab37efba32983a6bd
+       * cipher/asm-common-amd64.h (spec_stop_avx512): Clear ymm16
+       before and after vpopcntb.
+       * cipher/camellia-gfni-avx512-amd64.S (clear_zmm16_zmm31): Clear
+       YMM16-YMM31 registers instead of XMM16-XMM31.
+       * cipher/chacha20-amd64-avx512.S (clear_zmm16_zmm31): Likewise.
+       * cipher/keccak-amd64-avx512.S (clear_regs): Likewise.
+       (clear_avx512_4regs): Clear all 4 registers with XOR.
+       * cipher/cipher-gcm-intel-pclmul.c (_gcry_ghash_intel_pclmul)
+       (_gcry_polyval_intel_pclmul): Clear YMM16-YMM19 registers instead of
+       ZMM16-ZMM19.
+       * cipher/poly1305-amd64-avx512.S (POLY1305_BLOCKS): Clear YMM16-YMM31
+       registers after vector processing instead of XMM16-XMM31.
+       * cipher/sha512-avx512-amd64.S
+       (_gcry_sha512_transform_amd64_avx512): Likewise.
+
+2023-01-06  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       aria: add generic 2-way bulk processing.
+       + commit 5e1a04f77933a8295df69d818e9effc076dc68cd
+       * cipher/aria.c (ARIA_context): Add 'bulk_prefetch_ready'.
+       (aria_crypt_2blks, aria_crypt_blocks, aria_enc_blocks, aria_dec_blocks)
+       (_gcry_aria_ctr_enc, _gcry_aria_cbc_enc, _gcry_aria_cbc_dec)
+       (_gcry_aria_cfb_enc, _gcry_aria_cfb_dec, _gcry_aria_ecb_crypt)
+       (_gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc, _gcry_aria_ocb_crypt)
+       (_gcry_aria_ocb_auth): New.
+       (aria_setkey): Setup 'bulk_ops' function pointers.
+
+       Add ARIA block cipher.
+       + commit 316c6d7715d931a0fbe3ebc4e20e6f4d77b716cb
+       * cipher/Makefile.am: Add 'aria.c'.
+       * cipher/aria.c: New.
+       * cipher/cipher.c (cipher_list, cipher_list_algo301): Add ARIA cipher
+       specs.
+       * cipher/mac-cmac.c (map_mac_algo_to_cipher): Add GCRY_MAC_CMAC_ARIA.
+       (_gcry_mac_type_spec_cmac_aria): New.
+       * cipher/mac-gmac.c (map_mac_algo_to_cipher): Add GCRY_MAC_GMAC_ARIA.
+       (_gcry_mac_type_spec_gmac_aria): New.
+       * cipher/mac-internal.h (_gcry_mac_type_spec_cmac_aria)
+       (_gcry_mac_type_spec_gmac_aria)
+       (_gcry_mac_type_spec_poly1305mac_aria): New.
+       * cipher/mac-poly1305.c (poly1305mac_open): Add GCRY_MAC_POLY1305_ARIA.
+       (_gcry_mac_type_spec_poly1305mac_aria): New.
+       * cipher/mac.c (mac_list, mac_list_algo201, mac_list_algo401)
+       (mac_list_algo501): Add ARIA MAC specs.
+       * configure.ac (available_ciphers): Add 'aria'.
+       (GCRYPT_CIPHERS): Add 'aria.lo'.
+       (USE_ARIA): New.
+       * doc/gcrypt.texi: Add GCRY_CIPHER_ARIA128, GCRY_CIPHER_ARIA192,
+       GCRY_CIPHER_ARIA256, GCRY_MAC_CMAC_ARIA, GCRY_MAC_GMAC_ARIA and
+       GCRY_MAC_POLY1305_ARIA.
+       * src/cipher.h (_gcry_cipher_spec_aria128, _gcry_cipher_spec_aria192)
+       (_gcry_cipher_spec_aria256): New.
+       * src/gcrypt.h.in (gcry_cipher_algos): Add GCRY_CIPHER_ARIA128,
+       GCRY_CIPHER_ARIA192 and GCRY_CIPHER_ARIA256.
+       (gcry_mac_algos): GCRY_MAC_CMAC_ARIA, GCRY_MAC_GMAC_ARIA and
+       GCRY_MAC_POLY1305_ARIA.
+       * tests/basic.c (check_ecb_cipher, check_ctr_cipher)
+       (check_cfb_cipher, check_ocb_cipher) [USE_ARIA]: Add ARIA test-vectors.
+       (check_ciphers) [USE_ARIA]: Add GCRY_CIPHER_ARIA128, GCRY_CIPHER_ARIA192
+       and GCRY_CIPHER_ARIA256.
+       (main): Also run 'check_bulk_cipher_modes' for 'cipher_modes_only'-mode.
+       * tests/bench-slope.c (bench_mac_init): Add GCRY_MAC_POLY1305_ARIA
+       setiv-handling.
+       * tests/benchmark.c (mac_bench): Likewise.
+
+2023-01-04  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       sm4: add missing OCB 16-way GFNI-AVX512 path.
+       + commit 30c9a1d101b45908a17c68fa50f4cdf9bd113792
+       * cipher/sm4.c (_gcry_sm4_ocb_crypt) [USE_GFNI_AVX512]: Add 16-way
+       GFNI-AVX512 handling.
+
+       bulkhelp: change bulk function definition to allow modifying context.
+       + commit 86db4b02c6e5fd41ae958fb5e1fcf3e296a820ad
+       * cipher/bulkhelp.h (bulk_crypt_fn_t): Make 'ctx' non-constant and
+       change 'num_blks' from 'unsigned int' to 'size_t'.
+       * cipher/camellia-glue.c (camellia_encrypt_blk1_32)
+       (camellia_encrypt_blk1_64, camellia_decrypt_blk1_32)
+       (camellia_decrypt_blk1_64): Adjust to match 'bulk_crypt_fn_t'.
+       * cipher/serpent.c (serpent_crypt_blk1_16, serpent_encrypt_blk1_16)
+       (serpent_decrypt_blk1_16): Likewise.
+       * cipher/sm4.c (crypt_blk1_16_fn_t, _gcry_sm4_aesni_avx_crypt_blk1_8)
+       (sm4_aesni_avx_crypt_blk1_16, _gcry_sm4_aesni_avx2_crypt_blk1_16)
+       (sm4_aesni_avx2_crypt_blk1_16, _gcry_sm4_gfni_avx2_crypt_blk1_16)
+       (sm4_gfni_avx2_crypt_blk1_16, _gcry_sm4_gfni_avx512_crypt_blk1_16)
+       (_gcry_sm4_gfni_avx512_crypt_blk32, sm4_gfni_avx512_crypt_blk1_16)
+       (_gcry_sm4_aarch64_crypt_blk1_8, sm4_aarch64_crypt_blk1_16)
+       (_gcry_sm4_armv8_ce_crypt_blk1_8, sm4_armv8_ce_crypt_blk1_16)
+       (_gcry_sm4_armv9_sve_ce_crypt, sm4_armv9_sve_ce_crypt_blk1_16)
+       (sm4_crypt_blocks, sm4_crypt_blk1_32, sm4_encrypt_blk1_32)
+       (sm4_decrypt_blk1_32): Likewise.
+       * cipher/twofish.c (twofish_crypt_blk1_16, twofish_encrypt_blk1_16)
+       (twofish_decrypt_blk1_16): Likewise.
+
+       Add GMAC-SM4 and Poly1305-SM4.
+       + commit d1ccc409d4c655f695c7dc042a629a8898bd14eb
+       * cipher/cipher.c (cipher_list_algo301): Remove comma at the end
+       of last entry.
+       * cipher/mac-gmac.c (map_mac_algo_to_cipher): Add SM4.
+       (_gcry_mac_type_spec_gmac_sm4): New.
+       * cipher/mac-internal.h (_gcry_mac_type_spec_gmac_sm4)
+       (_gcry_mac_type_spec_poly1305mac_sm4): New.
+       * cipher/mac-poly1305.c (poly1305mac_open): Add SM4.
+       (_gcry_mac_type_spec_poly1305mac_sm4): New.
+       * cipher/mac.c (mac_list, mac_list_algo401, mac_list_algo501): Add
+       GMAC-SM4 and Poly1305-SM4.
+       (mac_list_algo101): Remove comma at the end of last entry.
+       * cipher/md.c (digest_list_algo301): Remove comma at the end of
+       last entry.
+       * doc/gcrypt.texi: Add GCRY_MAC_GMAC_SM4 and GCRY_MAC_POLY1305_SM4.
+       * src/gcrypt.h.in (GCRY_MAC_GMAC_SM4, GCRY_MAC_POLY1305_SM4): New.
+       * tests/bench-slope.c (bench_mac_init): Setup IV for
+       GCRY_MAC_POLY1305_SM4.
+       * tests/benchmark.c (mac_bench): Likewise.
+
+       Fix compiler warnings seen with clang-powerpc64le target.
+       + commit f2153d797f43a5fc8b80c2d908bf7dd6ad1fcc2a
+       * cipher/rijndael-ppc-common.h (asm_sbox_be): New.
+       * cipher/rijndael-ppc.c (_gcry_aes_sbox4_ppc8): Use 'asm_sbox_be'
+       instead of 'vec_sbox_be' since this instrinsics has different
+       prototype definition on GCC and Clang ('vector uchar' vs 'vector
+       ulong long').
+       * cipher/sha256-ppc.c (vec_ror_u32): Remove unused function.
+
+2022-12-14  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       Add clang support for ARM 32-bit assembly.
+       + commit 02d5d1d97b3f281cf9c854d7143e346ab76fa384
+       * configure.ac (gcry_cv_gcc_arm_platform_as_ok)
+       (gcry_cv_gcc_inline_asm_neon): Remove % prefix from register names.
+       * cipher/cipher-gcm-armv7-neon.S (vmull_p64): Prefix constant values
+       with # character instead of $.
+       * cipher/blowfish-arm.S: Remove % prefix from all register names.
+       * cipher/camellia-arm.S: Likewise.
+       * cipher/cast5-arm.S: Likewise.
+       * cipher/rijndael-arm.S: Likewise.
+       * cipher/rijndael-armv8-aarch32-ce.S: Likewise.
+       * cipher/sha512-arm.S: Likewise.
+       * cipher/sha512-armv7-neon.S: Likewise.
+       * cipher/twofish-arm.S: Likewise.
+       * mpi/arm/mpih-add1.S: Likewise.
+       * mpi/arm/mpih-mul1.S: Likewise.
+       * mpi/arm/mpih-mul2.S: Likewise.
+       * mpi/arm/mpih-mul3.S: Likewise.
+       * mpi/arm/mpih-sub1.S: Likewise.
+
+       rijndael-ppc: fix wrong inline assembly constraint.
+       + commit 3d20308cc529b53d49954e9f0b8d10fa14422303
+       * cipher/rijndael-ppc-function.h (CBC_ENC_FUNC): Fix outiv constraint.
+
+       Fix building AVX512 Intel-syntax assembly with x86-64 clang.
+       + commit 31837163fbc36680140a17892374380a214c5d16
+       * cipher/asm-common-amd64.h (spec_stop_avx512_intel_syntax): New.
+       * cipher/poly1305-amd64-avx512.S: Use spec_stop_avx512_intel_syntax
+       instead of spec_stop_avx512.
+       * cipher/sha512-avx512-amd64.S: Likewise.
+
+2022-12-14  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: Fix m4 macros for strict C compiler.
-       + commit b1a3424e7f8030361c2e4806920e60ec06c1d9e3
+       + commit e3b441214f93d8f61875b8223480e57afa2a3f10
        * m4/ax_cc_for_build.m4: Fix for no arg.
        * m4/noexecstack.m4: Likewise.
 
        build: Fix configure.ac for strict C99.
-       + commit 83ea195b61d571e48f53803d2d297ec02035ae36
+       + commit f62d5cf9f2683efa867fb04332c84ec899818d22
+       * configure.ac: More fixes for other architecture.
+
+2022-12-13  NIIBE Yutaka  <gniibe@fsij.org>
+
+       build: Fix configure.ac for strict C99.
+       + commit 693ffa145378682229473b0e811a9cea7c4d307a
        * configure.ac: Add function declarations for asm functions.
 
-2022-12-07  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+2022-12-12  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       avx512: tweak AVX512 spec stop, use common macro in assembly.
+       + commit 8b4e0e9e9a3696be1404938226f0ec3582f12258
+       * cipher/cipher-gcm-intel-pclmul.c: Use xmm registers for AVX512
+       spec stop.
+       * cipher/asm-common-amd64.h (spec_stop_avx512): New.
+       * cipher/blake2b-amd64-avx512.S: Use spec_stop_avx512.
+       * cipher/blake2s-amd64-avx512.S: Likewise.
+       * cipher/camellia-gfni-avx512-amd64.S: Likewise.
+       * cipher/chacha20-amd64-avx512.S: Likewise.
+       * cipher/keccak-amd64-avx512.S: Likewise.
+       * cipher/poly1305-amd64-avx512.S: Likewise.
+       * cipher/sha512-avx512-amd64.S: Likewise.
+       * cipher/sm4-gfni-avx512-amd64.S: Likewise.
+       ---
 
-       t-rsa-testparm: fix 'function declaration isn’t a prototype' warning.
-       + commit bdeea2a53e9ef869caa2f8ae2dd876ad397aed12
-       * cipher/t-rsa-testparm.c (check_rsa_testparm): Define parameters as
-       void.
+       chacha20-avx512: add handling for any input block count and tweak 16 block code a bit
+       + commit 996f81dd86ab299f92df581edad49e69ee932139
+       * cipher/chacha20-amd64-avx512.S: Add tail handling for 8/4/2/1
+       blocks; Rename `_gcry_chacha20_amd64_avx512_blocks16` to
+       `_gcry_chacha20_amd64_avx512_blocks`; Tweak 16 parallel block processing
+       for small speed improvement.
+       * cipher/chacha20.c (_gcry_chacha20_amd64_avx512_blocks16): Rename to ...
+       (_gcry_chacha20_amd64_avx512_blocks): ... this.
+       (chacha20_blocks) [USE_AVX512]: Add AVX512 code-path.
+       (do_chacha20_encrypt_stream_tail) [USE_AVX512]: Change to handle any
+       number of full input blocks instead of multiples of 16.
 
-2022-12-07  Jakub Jelen  <jjelen@redhat.com>
+2022-12-06  Jakub Jelen  <jjelen@redhat.com>
 
        fips,rsa: Prevent usage of X9.31 keygen in FIPS mode.
-       + commit 392e0ccd25f397d789a1cb59fae2f3faae46e78b
+       + commit 06ea5b5332ffdb44a0a394d766be8989bcb6a95c
        * cipher/rsa.c (rsa_generate): Do not accept use-x931 or derive-parms
        in FIPS mode.
        * tests/pubkey.c (get_keys_x931_new): Expect failure in FIPS mode.
        * doc/gcrypt.texi: Document "test-parms" and clarify some cases around
        the X9.31 keygen.
 
-2022-12-05  Jakub Jelen  <jjelen@redhat.com>
+2022-11-30  Jakub Jelen  <jjelen@redhat.com>
 
        rsa: Prevent usage of long salt in FIPS mode.
-       + commit fdd2a8b3329eb892f90d2cd803762ef06222c226
+       + commit bf1e62e59200b2046680d1d3d1599facc88cfe63
        * cipher/rsa-common.c (_gcry_rsa_pss_encode): Prevent usage of large
          salt lengths
          (_gcry_rsa_pss_verify): Ditto.
          FIPS mode
        * tests/t-rsa-pss.c (one_test_sexp): Fix function name in error message
 
+2022-11-21  NIIBE Yutaka  <gniibe@fsij.org>
+
+       random:w32: Don't emit message for diskperf when it's not useful.
+       + commit 35abf4d2eb582b78873aa324f6d02976788ffbbc
+       * random/rndw32.c (slow_gatherer): Suppress emitting by log_info.
+
 2022-11-18  Jakub Jelen  <jjelen@redhat.com>
 
        fips: Mark AES key wrapping as approved.
-       + commit 20ad5df60b035e721de7cfc40cd76a1a13051072
+       + commit c34c9e70055ee43e5ef257384fa15941f064e5a4
        * src/fips.c (_gcry_fips_indicator_cipher): Add key wrapping mode as
        approved.
 
        pkdf2: Add checks for FIPS.
-       + commit 057844700ec2e652249b0b80136229c049d41975
+       + commit f4a861f3e5ae82f278284061e4829c03edf9c3a7
        * cipher/kdf.c (_gcry_kdf_pkdf2): Require 8 chars passphrase for FIPS.
        Set bounds for salt length and iteration count in FIPS mode.
 
 2022-11-01  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: Prefer gpgrt-config when available.
-       + commit 5191379da3ad653fcf08ab1babb486ca952d1643
+       + commit 0dcb7e05c9e1c9c2a23abe0a0390680741b61414
        * src/libgcrypt.m4: Overriding the decision by
        --with-libgcrypt-prefix, use gpgrt-config libgcrypt when gpgrt-config
        is available.
 
-2022-10-27  Jakub Jelen  <jjelen@redhat.com>
+2022-10-26  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       sha3-avx512: fix for "x32" target.
+       + commit eab1caae7bd529c09d809d4d7c64c97ab7abeab8
+       * cipher/keccak.c (_gcry_keccak_absorb_blocks_avx512): Change size_t
+       to u64; change 'const byte **new_lanes' to 'u64 *new_lanes'.
+       (keccak_absorb_lanes64_avx512): Get new lanes pointer from assembly
+       through 'u64' type.
+
+       serpent: accelerate XTS and ECB modes.
+       + commit 8a1fe5f78f9fed32cd641b3d9d02197f7ba394d8
+       * cipher/serpent-armv7-neon.S (_gcry_serpent_neon_blk8): New.
+       * cipher/serpent-avx2-amd64.S (_gcry_serpent_avx2_blk16): New.
+       * cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_blk8): New.
+       * cipher/serpent.c (_gcry_serpent_sse2_blk8)
+       (_gcry_serpent_avx2_blk16, _gcry_serpent_neon_blk8)
+       (_gcry_serpent_xts_crypt, _gcry_serpent_ecb_crypt)
+       (serpent_crypt_blk1_16, serpent_encrypt_blk1_16)
+       (serpent_decrypt_blk1_16): New.
+       (serpent_setkey): Setup XTS and ECB bulk functions.
+
+       serpent: fix compiler warning on 32-bit ARM.
+       + commit b50b7ea5cabaf3729afe370b285d78fff9e27d31
+       * cipher/serpent.c (_gcry_serpent_ocb_crypt)
+       (_gcry_serpent_ocb_auth) [USE_NEON]: Cast "Ls" to 'const void **'.
+
+       twofish: accelerate XTS and ECB modes.
+       + commit d078a928f5c6024fde89388582b83742d2b8638a
+       * cipher/twofish-amd64.S (_gcry_twofish_amd64_blk3): New.
+       * cipher/twofish-avx2-amd64.S (_gcry_twofish_avx2_blk16): New.
+       (_gcry_twofish_xts_crypt, _gcry_twofish_ecb_crypt)
+       (_gcry_twofish_avx2_blk16, _gcry_twofish_amd64_blk3)
+       (twofish_crypt_blk1_16, twofish_encrypt_blk1_16)
+       (twofish_decrypt_blk1_16): New.
+       (twofish_setkey): Setup XTS and ECB bulk functions.
+
+       sm4: accelerate ECB (for benchmarking)
+       + commit 14f39993d632815db68a5dca90e021891c9547ab
+       * cipher/sm4.c (_gcry_sm4_ecb_crypt): New.
+       (sm4_setkey): Setup ECB bulk function.
+
+       sm4: fix lookup-table prefetching.
+       + commit a43e03ef842b2bb93b10cd2b85230af1f0269ca0
+       * cipher/sm4.c (sm4_expand_key): Prefetch sbox table.
+       (sm4_get_crypt_blk1_16_fn): Do not prefetch sbox table.
+       (sm4_expand_key, _gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
+       (_gcry_sm4_cfb_dec): Prefetch sbox table if table look-up
+       implementation is used.
+
+       camellia: accelerate ECB (for benchmarking)
+       + commit 6475d0915ffecc54a8d1105262edb28ff4026803
+       * cipher/bulkhelp.h (bulk_ecb_crypt_128): New.
+       * cipher/camellia-glue.c (_gcry_camellia_ecb_crypt): New.
+       (camellia_setkey): Select ECB bulk function with AESNI/AVX2, VAES/AVX2
+       and GFNI/AVX2.
+
+       rijndael-vaes: align asm functions.
+       + commit 7c1aa4c9452aa1259039680bc1d5ba15124c5f76
+       * cipher/rijndael-vaes-avx2-amd64.S: Align functions to 16 bytes.
+
+       rijndael: add ECB acceleration (for benchmarking purposes)
+       + commit 84f3d41acb2377d1ed0c2b9e8268de9d35e90af0
+       * cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'.
+       * cipher/cipher.c (do_ecb_crypt): Use bulk function if available.
+       * cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label
+       '.Ldeclast' to '.Lenclast'.
+       (_gcry_aes_aesni_ecb_crypt): New.
+       * cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce)
+       (_gcry_aes_ecb_dec_armv8_ce): New.
+       * cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce)
+       (_gcry_aes_ecb_dec_armv8_ce): New.
+       * cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
+       (_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change
+       return value from void to size_t.
+       (ocb_crypt_fn_t, xts_crypt_fn_t): Remove.
+       (_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove
+       indirect function call; Return value from called function (allows tail
+       call optimization).
+       (_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows
+       tail call optimization).
+       (_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce)
+       (_gcry_aes_armv8_ce_ecb_crypt): New.
+       * cipher/rijndael-vaes-avx2-amd64.S
+       (_gcry_vaes_avx2_ecb_crypt_amd64): New.
+       * cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64)
+       (_gcry_aes_vaes_ecb_crypt): New.
+       * cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt)
+       (_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New.
+       (do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE.
+
+       mpi/longlong: update powerpc macros from GCC.
+       + commit bf5ec001dfcbd4a293d0bd577fd70a0f8286c4e6
+       * mpi/longlong.h [__powerpc__, __powerpc64__]: Update macros.
+
+       hwf-x86: enable VPGATHER usage for AMD CPUs with AVX512.
+       + commit 4b1cb76e3587a8fdf59673a8368d47e4cd2fe151
+       * src/hwf-x86.c (detect_x86_gnuc): Move model based checks and
+       forced soft hwfeatures enablement at end; Enable VPGATHER for
+       AMD CPUs with AVX512.
+
+       sha512-avx512: enable only on Intel CPUs for now.
+       + commit c0f85e0c8657030eb979a465199a07e2819f81e4
+       * cipher/sha512.c (sha512_init_common): Enable AVX512 implementation
+       only for Intel CPUs.
+
+2022-10-26  Jakub Jelen  <jjelen@redhat.com>
 
        hmac,hkdf: Check the HMAC key length in FIPS mode.
-       + commit e7b1fbda6a9e0b6bf99062fc86139445a4e0766e
+       + commit b095ea7559734f519fbe92d570afe567330eb474
        * src/visibility.c (gcry_md_setkey): Add the check here, too.
 
        Revert "kdf:pkdf2: Require longer input when FIPS mode."
-       + commit 7f4fafb5564dec6fe65f0e93a1125cb6ddb4d1ed
+       + commit 47db7fe3a0c36523d2ccec31705cffff9a2337bc
        * cipher/kdf.c (_gcry_kdf_pkdf2): Remove the length limitation of
          passphrase input length.
 
 2022-10-24  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: Update gpg-error.m4.
-       + commit e2c0920fd7dea6e254badd12958982c66080b7db
+       + commit 12b3bc5a0d9c453d02ae229aadfee82808c10220
        * m4/gpg-error.m4: Update from libgpg-error 1.46.
 
+2022-10-19  Jakub Jelen  <jjelen@redhat.com>
+
+       tests: Use proper format string for size_t.
+       + commit b77e7a225bc40ea09fa66969625b97b09a1cbf23
+
+
+       cipher: Do not run RSA encryption selftest by default.
+       + commit 4e7941587c95fc3ae5fb5686346855395ef6754b
+       * cipher/rsa.c (selftests_rsa): Skip encryption selftest as this
+         operation is not claimed as part of the certification.
+
+       Revert "tests: Expect the RSA PKCS #1.5 encryption to fail in FIPS mode"
+       + commit 7468cdfc8b6aa0c6e17c41218d5c5f2b575b16e4
+       This reverts commit f736f3c70182d9c948f9105eb769c47c5578df35. The pubkey
+       encryption has already separate explicit FIPS service indicator.
+
+       Revert "Do not allow PKCS #1.5 padding for encryption in FIPS"
+       + commit e83280b36be3be3775427c5842f4274d01992763
+       This reverts commit c7709f7b23848abf4ba65cb99cb2a9e9c7ebdefc. The pubkey
+       encryption has already separate explicit FIPS service indicator.
+
+       Revert "tests: Expect the OEAP tests to fail in FIPS mode."
+       + commit 9d56af04dce0795d30374fd575a8500fcf0ae158
+       This reverts commit 249ca431ef881d510b90a5d3db9cd8507c4d697b. The pubkey
+       encryption has already separate explicit FIPS service indicator.
+
+       Revert "fips: Disable RSA-OAEP padding in FIPS mode."
+       + commit a7b5cab05f6a7de23c565b1303eb3f198d0b7e2b
+       This reverts commit e552e37983da0c54840786eeff34481685fde1e9. The pubkey
+       encryption has already separate explicit FIPS service indicator.
+
+       fips: Mark gcry_pk_encrypt/decrypt function non-approved.
+       + commit 05cb8355d3e66f15425ad85ae2203882e80f4792
+       * src/fips.c (_gcry_fips_indicator_function): Add
+       gcry_pk_encrypt/decrypt as non-approved.
+
+       fips: Fix fips indicator function.
+       + commit c5de9e77fb332939695918710b0842030515cce0
+       * src/fips.c (_gcry_fips_indicator_function): Fix typo in sign/verify
+       function names.
+
+2022-10-08  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       doc: fix RFC reference for GCM-SIV.
+       + commit 94dcd2cf5cd17244eaa96920218f32b64c0cec44
+       * doc/gcrypt.texi: Fix GCM-SIV RFC reference to RFC-8452.
+
+       mpi/longlong.h: i386: use tzcnt instruction for trailing zeros.
+       + commit f01d4b7a601f68e52c7c7b4647bfe97cb60be26e
+       * mpi/longlong.h [__i386__] (count_trailing_zeros): Add 'rep' prefix
+       for 'bsfl'.
+
+       mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros.
+       + commit 38bd31bc46b992f68c9455ed50a6280943fe6a75
+       * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix
+       for 'bsfq'.
+
+       mpi/longlong: fix generic smul_ppmm ifdef.
+       + commit d67f453d1b9de0ddb579de3a14ca8a3801bfead6
+       * mpi/longlong.h [!umul_ppmm] (smul_ppmm): Change ifdef
+       from !defined(umul_ppmm) to !defined(smul_ppmm).
+
+       mpi/longlong: provide generic implementation using double word type.
+       + commit aef0ddf4d37ae7667cdf49d406e740e8d8d311cd
+       * configure.ac: Add check for 'unsigned __int128'.
+       * mpi/longlong.h (UDWtype): Define for 32-bit or 64-bit when
+       'unsigned long long' or 'unsigned __int128' is available.
+       (add_ssaaaa, sub_ddmmss, umul_ppmm, udiv_qrnnd) [UDWtype]: New.
+
 2022-10-06  Jakub Jelen  <jjelen@redhat.com>
 
        tests: Reproducer for short dklen in FIPS mode.
-       + commit e235f38f9b9fc3cd4464bbf9081da765d46ce87d
+       + commit efdc87b305ff326f37acd3a9c2606de24a706cce
        * tests/t-kdf.c (check_pbkdf2): Add test vector with short dklen and
          verify it fails in FIPS mode
 
        random: Extend the comment about FIPS specifics.
-       + commit 96615490c7b1d5f77de7f7f2b77e775540f7f6bf
+       + commit 6e832840a8b7cdd30f77e66685ad0de863d7e84d
        * random/rndgetentropy.c (_gcry_rndgetentropy_gather_random): Clarify
         description of the changing DRBG in FIPS mode.
 
 2022-10-04  Jakub Jelen  <jjelen@redhat.com>
 
        random: Get maximum 32B of entropy at once in FIPS Mode.
-       + commit ce0df08bbab741cd2ad19a5a0e8b65fb62774f87
+       + commit a6a6e94027abf18a51f5f93bf9fb2cfe5496bdf8
        * random/rndgetentropy.c (_gcry_rndgetentropy_gather_random): In fips
        mode, gather max 32 B of strong entropy for initialization.
 
+2022-10-02  Jakub Jelen  <jjelen@redhat.com>
+
+       tests: Avoid memory leak.
+       + commit 567bc62e1c3046594088de7209fee7c545ece1e3
+       * tests/hashtest.c (run_longtest): Avoid memory leak on error.
+
+2022-10-02  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       t-rsa-testparm: fix 'function declaration isn’t a prototype' warning.
+       + commit 0909186b9e66aa3a8fac7b2571915c45a7bfaeb3
+       * tests/t-rsa-testparm.c (check_rsa_testparm): Define parameters as
+       void.
+
+       tests/benchmark: remove VLA usage.
+       + commit 6419fbb1d3dd365a89623a94448a0335ae4a8554
+       * tests/benchmark.c (ccm_aead_init): Avoid VLA in stack array.
+
+       tests/bench-slope: remove VLA usage.
+       + commit 335b8eb1211b3b67541c689da949101db3b669fd
+       * tests/bench-slope.c (bench_set_cipher_key): New.
+       (bench_encrypt_init, bench_xts_encrypt_init): Use
+       'bench_set_cipher_key' to remove VLA usage.
+
+       cipher-ccm: remove VLA usage.
+       + commit ce60a68a1172ea20c1ff72e27e4b0115d805bf48
+       * cipher/cipher-ccm.c (do_cbc_mac): Avoid VLA for stack array.
+
+       mpi/ec: remove VLA usage.
+       + commit 9978fc22045ca7623a6e0cbf704fb48ab1550419
+       * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod)
+       (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod)
+       (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays.
+       * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays.
+
+2022-09-27  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       tests/hashtest: add hugeblock & disable-hwf options and 6 gig test vectors
+       + commit 0cb29a5736cfcd6bce4ce2495cd0481f0bdb34a4
+       * .gitignore: Add 'tests/hashtest-6g'.
+       * configure.ac: Add 'tests/hashtest-6g'.
+       * tests/Makefile: Add 'hashtest-6g'.
+       * tests/hashtest-6g.in: New.
+       * tests/hashtest-256g.in: Add SHA3-512 to algos.
+       * tests/hashtest.c (use_hugeblock): New.
+       (testvectors): Add 256 GiB test vectors for BLAKE2S, BLAKE2B and
+       whirlpool; Add 6 GiB test vectors for SHA1, SHA256, SHA512, SHA3, SM3,
+       BLAKE2S, BLAKE2B, WHIRLPOOL, CRC32 and CRC24.
+       (run_longtest): Use huge 5 GiB pattern block when requested.
+       (main): Add '--hugeblock' and '--disable-hwf' options.
+       * tests/testdrv.c: Add 'hashtest-6g'; Add SHA3 to 'hashtest-256g'.
+
+2022-09-27  Jakub Jelen  <jjelen@redhat.com>
+
        keccak: Use size_t to avoid integer overflow.
-       + commit 9ee2d56e806b8018fa3ae354a65f1e70bf73dede
+       + commit 9c828129b2058c3f36e07634637929a54e8377ee
        * cipher/keccak-armv7-neon.S: Fix function name in comment and change
          parameter type to size_t.
       * cipher/keccak.c (keccak_ops_t): Change absorb function signature to
         use size_t.
        * cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): Change
          nlanes argument to use size_t.
 
-2022-10-04  Tobias Heider  <tobias.heider@canonical.com>
+2022-09-27  Tobias Heider  <tobias.heider@canonical.com>
 
        kdf:pkdf2: Check minimum allowed key size when running in FIPS mode.
-       + commit 52d48b710470dd48dd2a32a439898ece10ef05fd
+       + commit 3c04b692de1e7b45b764ff8d66bf84609b012e3a
        * cipher/kdf.c (_gcry_kdf_pkdf2): Add output length check.
 
-2022-10-04  NIIBE Yutaka  <gniibe@fsij.org>
+2022-09-27  NIIBE Yutaka  <gniibe@fsij.org>
 
        kdf:pkdf2: Require longer input when FIPS mode.
-       + commit d09d3d33c79daa2f8d385dfedf3f20ad205b0fba
+       + commit 857e6f467d0fc9fd858a73d84122695425970075
        * cipher/kdf.c (_gcry_kdf_pkdf2): Add length check.
 
-2022-09-22  NIIBE Yutaka  <gniibe@fsij.org>
-
-       build: Fix configure script.
-       + commit 44812a1d96fc003e6e0d01270c514b91e295d300
-       * configure.ac (AC_USE_SYSTEM_EXTENSIONS): Use it earlier.
-
 2022-09-22  Clemens Lang  <cllang@redhat.com>
 
        fips: Skip PCT if RSA keygen test-parms specified.
-       + commit 4963c127ae698d98f30483ba9d15d093aae4e51d
+       + commit c20022ffd4ad2cea51928a109dfa102d711d30ac
        * cipher/rsa.c (rsa_generate): Skip PCT is test-parms were specified.
        * tests/t-rsa-testparm.c: Add test for this functionality
        * tests/Makefile.am: Add test to build system
 
-       build: Skip PK-specific tests if algo is disabled.
-       + commit 1524b60a7ccc17fb82e91b90236a88a27b113175
-       * configure.ac: Define AM_CONDITIONALs for USE_DSA, USE_RSA,
-         USE_ELGAMAL, USE_ECC so Makefiles can depend on them.
-       * tests/Makefile.am: Skip tests that test only one public key algorithm
-         if that algorithm is disabled.
+2022-09-16  NIIBE Yutaka  <gniibe@fsij.org>
 
-2022-09-20  NIIBE Yutaka  <gniibe@fsij.org>
+       More clean up.
+       + commit 82226dad7ae0effa0d96645476f0401f94361141
+       * cipher/cipher-ccm.c (_gcry_cipher_ccm_tag): Add static qualifier.
+       * mpi/ec-ed25519.c: Include ec-internal.h.
+       * src/secmem.c (MB_WIPE_OUT): Remove extra semicolon.
+
+       Move function prototype to cipher.h.
+       + commit f97b2f706dd0a8820828e9015340a895539ed216
+       * cipher/kdf-internal.h: Move from here.
+       * src/cipher.h (blake2b_vl_hash): To here.
+
+       Minor clean up.
+       + commit 984d94fa9ffff69bd1bdb5d418889d2e6b2745e2
+       * mpi/mpi-internal.h: Remove extra semicolon from the macro.
+       * mpi/mpih-mul.c: Likewise.
+       * src/cipher-proto.h: Remove duplication for enum pk_encoding.
+       * mpi/mpi-pow.c (_gcry_mpi_powm): Initialize XSIZE.
 
        Fix _gcry_err_code_to_errno.
-       + commit 16ac1850b854abe9b6f693a489ceeb0048777bfa
+       + commit 3962623fe6de5c6d6604db90c8c0869fc3d3b7cf
        * src/gcrypt-int.h: Use gpg_err_code_to_errno.
 
-2022-08-30  Jakub Jelen  <jjelen@redhat.com>
+       Fix use of noreturn.
+       + commit 000c50e0781920d691cc60c345a7bd4d770e92d5
+       * doc/yat2m.c: Use __noreturn__.
+       * src/g10lib.h: Likewise.
+
+2022-09-08  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Silence compiler warnings for unused internal value.
+       + commit 37dc9f8fd00a3aff175f34f094557899207d7ef8
+       * cipher/primegen.c (gen_prime): Fix write only variable.
+       * src/dumpsexp.c (parse_and_print): Likewise.
+
+       Fix function prototypes.
+       + commit 10d9878dd84fe89a598b6d5ac313869e5f8ffdae
+       * random/random-csprng.c (random_poll): It's no args.
+       * src/secmem.c (_gcry_secmem_module_init): Likewise.
+       (_gcry_secmem_term): Likewise.
+
+2022-08-26  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Check arguments for setup_geniv.
+       + commit c9008345c114ddf2f2ecbfe8dbab03c6d0649408
+       * cipher/cipher.c (_gcry_cipher_setup_geniv): Validate the lengths.
+
+2022-08-25  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Support internal generation of IV for AEAD cipher mode.
+       + commit 373b1f6c17948fa7d31880c3705391bef08a0471
+       * cipher/cipher-gcm.c (_gcry_cipher_gcm_setiv_zero): New.
+       (_gcry_cipher_gcm_encrypt, _gcry_cipher_gcm_decrypt)
+       (_gcry_cipher_gcm_authenticate): Use _gcry_cipher_gcm_setiv_zero.
+       * cipher/cipher-internal.h (struct gcry_cipher_handle): Add aead field.
+       * cipher/cipher.c (_gcry_cipher_setiv): Check calling setiv to reject
+       direct invocation in FIPS mode.
+       (_gcry_cipher_setup_geniv, _gcry_cipher_geniv): New.
+       * doc/gcrypt.texi: Add explanation for two new functions.
+       * src/gcrypt-int.h (_gcry_cipher_setup_geniv, _gcry_cipher_geniv): New.
+       * src/gcrypt.h.in (enum gcry_cipher_geniv_methods): New.
+       (gcry_cipher_setup_geniv, gcry_cipher_geniv): New.
+       * src/libgcrypt.def (gcry_cipher_setup_geniv, gcry_cipher_geniv): Add.
+       * src/libgcrypt.vers: Likewise.
+       * src/visibility.c (gcry_cipher_setup_geniv, gcry_cipher_geniv): Add.
+       * src/visibility.h: Likewise.
+
+2022-08-23  Jakub Jelen  <jjelen@redhat.com>
 
        tests: Expect the OEAP tests to fail in FIPS mode.
-       + commit 658679e0ec8be9693a3deb6b85c2b39cb112218c
+       + commit 249ca431ef881d510b90a5d3db9cd8507c4d697b
        * tests/basic.c (check_pubkey_crypt): Expect the OAEP padding encryption
          to fail in FIPS mode
        * tests/pkcs1v2.c (check_oaep): Expect the OAEP tests to fail in FIPS
          mode
 
        fips: Disable RSA-OAEP padding in FIPS mode.
-       + commit e5bfda492ab9496ed3d856a9f36250a2cc07ce70
+       + commit e552e37983da0c54840786eeff34481685fde1e9
        * cipher/pubkey-util.c (_gcry_pk_util_data_to_mpi): Block OAEP padding
          in FIPS mode for encryption
        * cipher/rsa.c (rsa_decrypt): Block OAEP padding in FIPS mode for
          decryption
 
+       gcrypt.h: Fix function name in comment.
+       + commit 0d69847e41e1803654180544fffd4cba3f49cb12
+
+
        random: Use getrandom (GRND_RANDOM) in FIPS mode.
-       + commit cf10c74bd9d5aa80798f1c0e23a9126f381b26b3
+       + commit aab1d63e4def41593312f76de016c885ffafecde
        * random/rndgetentropy.c (_gcry_rndgetentropy_gather_random): Use
          GRND_RANDOM in FIPS Mode
 
        Simplify the PCT for RSA and ECDSA.
-       + commit 285bf54b1ac7b5609a675655fe2cb9117ab78d3d
+       + commit a527d252b89958864153da9ad149e97bb96e1692
        Could be squashed.
 
       * cipher/ecc.c (test_keys_fips): Simplify to accept key in SEXP format.
 
        ecc: Run PCT also with the digest step.
-       + commit 076a8adaf314d593ca25c245d2a74207710a4fe7
+       + commit d259993b9456c7abe465f234c4a6f9688a16db40
        * cipher/ecc.c (test_keys_fips): New function
          (nist_generate_key): In FIPS mode, execute new PCT test
        ---
 
-       Cherry-picked from master commit:
-               505f048cac8e5af92d3431bd97ade492d1a30bc2
-
        rsa: Run PCT in FIPS mode also with digest step.
-       + commit 78151e6d6bbbbf1248b7c32cbab0b9b638ad6c11
+       + commit 505f048cac8e5af92d3431bd97ade492d1a30bc2
        * cipher/rsa.c (test_keys_fips): New.
        (generate_fips): Call test_keys_fips.
 
        fips: Add function-name based FIPS indicator.
-       + commit 822ee57f07cad6b32fac265a1a9e195d7cf99fa9
+       + commit 05a9c9d1ba1db6c1cd160fba979e9ddf4700a0c0
        * doc/gcrypt.texi: Document the new function-based fips indicator
          GCRYCTL_FIPS_SERVICE_INDICATOR_FUNCTION
       * src/fips.c (_gcry_fips_indicator_function): New function indicating
         non-approved functions.
        * src/global.c (_gcry_vcontrol): Handle new FIPS indicator.
 
        fips: Run digest&sign self tests for RSA and ECC in FIPS mode.
-       + commit 06c9350165d7284cd9fe569fd23e6c6cf371dba2
+       + commit 1fc7bfc351ba1d7fa31c0c62a24ad78e9e1cfd5b
        * cipher/ecc.c (selftest_hash_sign): Implement digest & sign KAT
         (selftests_ecdsa): Run the original basic test only with extended tests
         (run_selftests): Pass-through the extended argument
         (selftests_rsa): Run the original basic test only with extended tests
         (run_selftests): Pass-through the extended argument
 
-2022-08-15  NIIBE Yutaka  <gniibe@fsij.org>
+2022-08-18  Milan Broz  <gmazyland@gmail.com>
+
+       kdf: Allow empty password for Argon2.
+       + commit a20700c55f0eb8dbb8368b756a571c116163a0bc
+       * cipher/kdf.c (_gcry_kdf_open): Allow empty password for Argon2.
+
+       kdf: Restructure KDF test vectors.
+       + commit 8a1f50e66364389b8a867801ead8327a663b0c03
+       * tests/t-kdf.c: Restructure KDF test vectors to allow easy addition
+         new vectors. Also remove some ugly C code like goto again.
+
+2022-08-02  Clemens Lang  <cllang@redhat.com>
+
+       tests/basic: Add ifdefs for SM4 and CAMELLIA tests.
+       + commit 97e2c237f15dcdb44152887f4773f1f212b529bd
+       * tests/basic.c (check_gcm_siv_cipher): Do not run test vectors that
+         are known to fail when compiled without support for SM4 or CAMELLIA.
+
+2022-08-01  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       basic: gcm-siv: add fips checks for SM4 and CAMELLIA128.
+       + commit a8f66410ca0ded040abea4790efaeb36bb4ccc91
+       * tests/basic.c (check_gcm_siv_cipher): Add 'flags' for test vectors
+       and set FLAG_NOFIPS for SM4 and CAMELLIA128 test vectors; Add FIPS check
+       for test vectors; Change printing "aes-gcm-siv" on fail messages to
+       "algo %d GCM-SIV".
+
+2022-07-31  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       sm4: fix unused parameter compiler warning.
+       + commit a0c933f6fed0612558ff75fb23cbf78e2dea1f16
+       * cipher/sm4.c (sm4_get_crypt_blk1_16_fn): '(void)'-access ctx
+       parameter to avoid compiler warning on powerpc.
+
+2022-07-31  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       sm4: add ARMv8 CE accelerated implementation for XTS mode.
+       + commit 8287dea8379fa9e43dc331c6bd444dd25a962e4b
+       * cipher/sm4-armv8-aarch64-ce.S (_gcry_sm4_armv8_ce_xts_crypt): New.
+       * cipher/sm4.c (_gcry_sm4_armv8_ce_xts_crypt): New.
+       (_gcry_sm4_xts_crypt) [USE_ARM_CE]: Add ARMv8 CE implementation for XTS.
+
+2022-07-31  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       Simplify AES key schedule implementation.
+       + commit db5136c4d9331c001bbe91cfe6625ae7738f8575
+       * cipher/rijndael-armv8-ce.c (_gcry_aes_armv8_ce_setkey): New key
+       schedule with simplified structure and less stack usage.
+       * cipher/rijndael-internal.h (RIJNDAEL_context_s): Add
+       'keyschedule32b'.
+       (keyschenc32b): New.
+       * cipher/rijndael-ppc-common.h (vec_u32): New.
+       * cipher/rijndael-ppc.c (vec_bswap32_const): Remove.
+       (_gcry_aes_sbox4_ppc8): Optimize for less instructions emitted.
+       (keysched_idx): New.
+       (_gcry_aes_ppc8_setkey): New key schedule with simplified structure.
+       * cipher/rijndael-tables.h (rcon): Remove.
+       * cipher/rijndael.c (sbox4): New.
+       (do_setkey): New key schedule with simplified structure and less
+       stack usage.
+
+       rijndael-ppc: small speed-up for CBC and CFB encryption.
+       + commit 2ac6c24aa53024eb415d49f52229e868f72f47f8
+       * cipher/rijndael-ppc-common.h (AES_ENCRYPT_ALL): Remove
+       * cipher/rijndael-ppc-functions.h (CFB_ENC_FUNC)
+       (CBC_ENC_FUNC): Removed two block unrolled loop; Optimized single
+       block loop for shorter critical-path.
+
+2022-07-25  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       sha512: add AArch64 crypto/SHA512 extension implementation.
+       + commit e51d3b8330a1d4b15e3484df90646e075c02f54b
+       * cipher/Makefile.am: Add 'sha512-armv8-aarch64-ce.S'.
+       * cipher/sha512-armv8-aarch64-ce.S: New.
+       * cipher/sha512.c (ATTR_ALIGNED_64, USE_ARM64_SHA512): New.
+       (k): Make array aligned to 64 bytes.
+       [USE_ARM64_SHA512] (_gcry_sha512_transform_armv8_ce): New.
+       [USE_ARM64_SHA512] (do_sha512_transform_armv8_ce): New.
+       (sha512_init_common) [USE_ARM64_SHA512]: Use ARMv8-SHA512 accelerated
+       implementation if HW feature available.
+       * configure.ac: Add 'sha512-armv8-aarch64-ce.lo'.
+       (gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4)
+       (HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4): New.
+
+       blake2: add AVX512 accelerated implementations.
+       + commit 909daa700e4b45d75469df298ee564b8fc2f4b72
+       * cipher/Makefile.am: Add 'blake2b-amd64-avx512.S' and
+       'blake2s-amd64-avx512.S'.
+       * cipher/blake2.c (USE_AVX512): New.
+       (ASM_FUNC_ABI): Setup attribute if USE_AVX2 or USE_AVX512 enabled in
+       addition to USE_AVX.
+       (BLAKE2B_CONTEXT_S, BLAKE2S_CONTEXT_S): Add 'use_avx512'.
+       (_gcry_blake2b_transform_amd64_avx512)
+       (_gcry_blake2s_transform_amd64_avx512): New.
+       (blake2b_transform, blake2s_transform) [USE_AVX512]: Add AVX512 path.
+       (blake2b_init_ctx, blake2s_init_ctx) [USE_AVX512]: Use AVX512 if HW
+       feature available.
+       * cipher/blake2b-amd64-avx512.S: New.
+       * cipher/blake2s-amd64-avx512.S: New.
+       * configure.ac: Add 'blake2b-amd64-avx512.lo' and
+       'blake2s-amd64-avx512.lo'.
+
+       sha3: Add x86-64 AVX512 accelerated implementation.
+       + commit beaad75f4655e5316ce24f75ef172c231fd47fc1
+       * LICENSES: Add 'cipher/keccak-amd64-avx512.S'.
+       * configure.ac: Add 'keccak-amd64-avx512.lo'.
+       * cipher/Makefile.am: Add 'keccak-amd64-avx512.S'.
+       * cipher/keccak-amd64-avx512.S: New.
+       * cipher/keccak.c (USE_64BIT_AVX512, ASM_FUNC_ABI): New.
+       [USE_64BIT_AVX512] (_gcry_keccak_f1600_state_permute64_avx512)
+       (_gcry_keccak_absorb_blocks_avx512, keccak_f1600_state_permute64_avx512)
+       (keccak_absorb_lanes64_avx512, keccak_avx512_64_ops): New.
+       (keccak_init) [USE_64BIT_AVX512]: Enable x86-64 AVX512 implementation
+       if supported by HW features.
+
+2022-07-21  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       sm4-arm-sve-ce: use 32 parallel blocks for XTS and CTR32LE.
+       + commit dca0bd133dd08ec88e0b4c454cfc26c9093572a9
+       * cipher/sm4.c (sm4_crypt_blk1_32) [USE_ARM_SVE_CE]: Add SVE-SM4 code
+       path.
 
-       hmac: Allow use of shorter salt.
-       + commit ab5aef9b7b6ef757eff7bea4a17ade0ce3d3191b
-       * cipher/md.c (prepare_macpads): Move the check to...
-       * src/visibility.c (gcry_mac_setkey): ... here.
+       sm4 & camellia: add generic bulk acceleration for CTR32LE mode (GCM-SIV)
+       + commit cf956793afc2cdbd3b20caa3d186ccb8023b804c
+       * cipher/bulkhelp.h (bulk_ctr32le_enc_128): New.
+       * cipher/camellia-glue.c (_gcry_camellia_ctr32le_enc): New.
+       (camellia_setkey): Setup `bulk_ops->ctr32le_enc` if any AVX2
+       implementation is available.
+       * cipher/sm4.c (_gcry_sm4_ctr32le_enc): New.
+       (sm4_setkey): Setup `bulk_ops->ctr32le_enc`.
+       * tests/basic.c (check_gcm_siv_cipher): Add large bulk encryption
+       test vectors for SM4 and CAMELLIA128.
+
+       sm4: add amd64 GFNI/AVX512 implementation.
+       + commit eaed633c1662d8a98042ac146c981113f2807b22
+       * cipher/Makefile.am: Add 'sm4-gfni-avx512-amd64.S'.
+       * cipher/sm4-gfni-avx512-amd64.S: New.
+       * cipher/sm4-gfni.c (USE_GFNI_AVX512): New.
+       (SM4_context): Add 'use_gfni_avx512' and 'crypt_blk1_16'.
+       (_gcry_sm4_gfni_avx512_expand_key, _gcry_sm4_gfni_avx512_ctr_enc)
+       (_gcry_sm4_gfni_avx512_cbc_dec, _gcry_sm4_gfni_avx512_cfb_dec)
+       (_gcry_sm4_gfni_avx512_ocb_enc, _gcry_sm4_gfni_avx512_ocb_dec)
+       (_gcry_sm4_gfni_avx512_ocb_auth, _gcry_sm4_gfni_avx512_ctr_enc_blk32)
+       (_gcry_sm4_gfni_avx512_cbc_dec_blk32)
+       (_gcry_sm4_gfni_avx512_cfb_dec_blk32)
+       (_gcry_sm4_gfni_avx512_ocb_enc_blk32)
+       (_gcry_sm4_gfni_avx512_ocb_dec_blk32)
+       (_gcry_sm4_gfni_avx512_crypt_blk1_16)
+       (_gcry_sm4_gfni_avx512_crypt_blk32, sm4_gfni_avx512_crypt_blk1_16)
+       (sm4_crypt_blk1_32, sm4_encrypt_blk1_32, sm4_decrypt_blk1_32): New.
+       (sm4_expand_key): Add GFNI/AVX512 code-path
+       (sm4_setkey): Use GFNI/AVX512 if supported by CPU; Setup
+       `ctx->crypt_blk1_16`.
+       (sm4_encrypt, sm4_decrypt, sm4_get_crypt_blk1_16_fn, _gcry_sm4_ctr_enc)
+       (_gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec, _gcry_sm4_ocb_crypt)
+       (_gcry_sm4_ocb_auth) [USE_GFNI_AVX512]: Add GFNI/AVX512 code path.
+       (_gcry_sm4_xts_crypt): Change parallel block size from 16 to 32.
+       * configure.ac: Add 'sm4-gfni-avx512-amd64.lo'.
+
+2022-07-21  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       Add SM4 ARMv9 SVE CE assembly implementation.
+       + commit 2dc2654006746a25f9cb6b24786867f1725ac244
+       * cipher/Makefile.am: Add 'sm4-armv9-aarch64-sve-ce.S'.
+       * cipher/sm4-armv9-aarch64-sve-ce.S: New.
+       * cipher/sm4.c (USE_ARM_SVE_CE): New.
+       (SM4_context) [USE_ARM_SVE_CE]: Add 'use_arm_sve_ce'.
+       (_gcry_sm4_armv9_sve_ce_crypt, _gcry_sm4_armv9_sve_ce_ctr_enc)
+       (_gcry_sm4_armv9_sve_ce_cbc_dec, _gcry_sm4_armv9_sve_ce_cfb_dec)
+       (sm4_armv9_sve_ce_crypt_blk1_16): New.
+       (sm4_setkey): Enable ARMv9 SVE CE if supported by HW.
+       (sm4_get_crypt_blk1_16_fn) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE
+       bulk functions.
+       (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
+       [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions.
+       * configure.ac: Add 'sm4-armv9-aarch64-sve-ce.lo'.
+
+       Add ARMv9 SVE2 and optional Crypto Extension HW features.
+       + commit ea5e71f9c8eb32d8b820be85eb34a8926cfa834d
+       * configure.ac (sve2support, gcry_cv_gcc_inline_asm_aarch64_sve2)
+       (ENABLE_SVE2_SUPPORT): New.
+       * doc/gcrypt.texi: Add "sve2, sveaes, svepmull, svesha3, svesm4" to
+       ARM hardware features list.
+       * src/g10lib.h (HWF_ARM_SVE2, HWF_ARM_SVEAES, HWF_ARM_SVEPMULL)
+       (HWF_ARM_SVESHA3, HWF_ARM_SVESM4): New.
+       * src/hwf-arm.c (arm_features): Add
+       "sve2, sveaes, svepmull, svesha3, svesm4".
+       * src/hwfeatures.c (hwflist): Add
+       "arm-sve2, arm-sveaes, arm-svepmull, arm-svesha3, arm-svesm4".
+
+       Add detection for HW feature "ARMv8 SVE"
+       + commit 8921b5221e333626884ad291881f79e0583d574a
+       * configure.ac (svesupport, gcry_cv_gcc_inline_asm_aarch64_sve)
+       (ENABLE_SVE_SUPPORT): New.
+       * doc/gcrypt.texi: Add "arm-sve" to HW features list.
+       * src/g10lib.h (HWF_ARM_SVE): New.
+       * src/hwf-arm.c (arm_features): Add "sve".
+       * src/hwfeatures.c (hwflist): Add "arm-sve".
+
+2022-07-21  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Cleanup for type definitions of byte, ushort, u32, and u64.
+       + commit 3494140847cb8056d017418fefa25e7bbcfaa32c
+       * src/types.h: Use macros defined by configure script.
+       * src/hmac256.c: Fix for HAVE_U32.
+       * cipher/poly1305.c: Fix for HAVE_U64.
+
+2022-07-20  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       visibility: add missing fips_is_operational check for gcry_md_extract.
+       + commit 434a58d0e784958d56ad4eceebec10ee97933108
+       * src/visibility.c (gcry_md_extract): Add 'fips_is_operational' check.
+
+       hwf-x86: fix UBSAN warning.
+       + commit 9636c88262fc5704cb4136cae975932acee6d08f
+       * src/hwf-x86.c (detect_x86_gnuc): Change `(1 << 31)` to `(1U << 31)`
+       to fix undefined behaviour.
+
+       hwf-arm: add ARM HW feature detection support for MacOS.
+       + commit 4abcedcea82036b18b3906e99f8aaf0a1f6c8b9a
+       * configure.ac: Add detection for header 'sys/sysctl.h' and system
+       function 'sysctlbyname'.
+       * src/hwf-arm.c (HAS_APPLE_SYSCTLBYNAME)
+       (detect_arm_apple_sysctlbyname): New.
+       (detect_arm_hwf_by_toolchain) [__ARM_FEATURE_CRYPTO]: Also check for
+       ENABLE_ARM_CRYPTO_SUPPORT.
+       (_gcry_hwf_detect_arm) [HAS_APPLE_SYSCTLBYNAME]: Check HWFs with
+       'detect_arm_apple_sysctlbyname' function.
+
+2022-07-19  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher: Add buildhelp.h to source to be distributed.
+       + commit 9a134fb7f286b94904585f3c3958c944999f20cc
+       * cipher/Makefile.am (libcipher_la_SOURCES): Add bulkhelp.h.
 
-2022-07-13  NIIBE Yutaka  <gniibe@fsij.org>
+       build: Fix configure script.
+       + commit 2efb90104591eda490d9f7ba281aa29bceb92487
+       * configure.ac (AC_USE_SYSTEM_EXTENSIONS): Use it earlier.
+
+       build: Update config.guess, config.sub, and config.rpath.
+       + commit e633cc3315aa350d9d28cc27a6deb6304c68ef77
+       * build-aux/config.guess: Update from upstream.
+       * build-aux/config.sub: Ditto.
+       * build-aux/config.rpath: Update from gettext 0.21.
+
+2022-07-07  NIIBE Yutaka  <gniibe@fsij.org>
 
        cipher: Fix gcry_pk_hash_verify for explicit hash.
-       + commit 1d3a90a4d118eafa6b2f777c998e85327a77fb3c
+       + commit b2a64ed4f34abbd3871336503bec5ffeb3ad547b
        * cipher/pubkey.c (_gcry_pk_verify_md): Implement support of explicit
        hash.
        * tests/t-ecdsa.c (one_test_sexp): Use explicit hash.
 
-2022-07-13  Clemens Lang  <cllang@redhat.com>
+2022-07-07  Clemens Lang via Gcrypt-devel  <gcrypt-devel@lists.gnupg.org>
 
        tests/t-kdf: Test KDF FIPS indicator.
-       + commit 3bbcf16e0b8b63d70893f6d9cc0fe77f7d8bc17b
+       + commit 37b812f5e2a3c80d4bc104512248a07268f3c98b
         * tests/t-kdf.c (check_fips_indicators): Add test for gcry_control
           (GCRYCTL_FIPS_SERVICE_INDICATOR_KDF).
 
        tests: Test gcry_pk_hash_sign w/explicit hash algo.
-       + commit 04960f5179cd9732931b9f245a902a8a34bde964
+       + commit 45a139b166a3fa18eb1eddf7e02b5cdd890a6c37
         * tests/t-ecdsa.c (one_test_sexp): Re-run signature operation with hash
           algorithm explicitly specified in data_tmpl as documented in the
           manpage.
 
-2022-07-13  NIIBE Yutaka  <gniibe@fsij.org>
+2022-07-06  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       sm4: fix wrong macro used for GFNI/AVX2 code-path.
+       + commit 66ef99bb1804d754edaa5d6b37651e92e918540e
+       * cipher/sm4.c (sm4_get_crypt_blk1_16_fn): Use USE_GFNI_AVX2 for
+       GFNI/AVX2 block.
+
+       sm4: fix use of GFNI/AVX2 accelerated key expansion.
+       + commit 935e211af145c559c9147026339ceb947edb6d6a
+       * cipher/sm4.c [USE_GFNI_AVX2] (_gcry_sm4_gfni_avx_expand_key): Remove.
+       [USE_GFNI_AVX2] (_gcry_sm4_gfni_avx2_expand_key): New.
+       (sm4_expand_key): Change '_gcry_sm4_gfni_avx_expand_key' to
+       '_gcry_sm4_gfni_avx2_expand_key'.
+
+       camellia-gfni-avx512: remove copy-paste / leftover extra instructions.
+       + commit 99b7375bd6162c7c3f481ab6d0d106bfcb5b2b07
+       * cipher/camellia-gfni-avx512-amd64.S: Remove extranous copy-pasted
+       instructions after `.Lbswap128_mask` constant; Remove left-over plaintext
+       xorring in OCB encryption input loading macro.
+
+       camellia-gfni-avx512: add missing register clearing on function exits.
+       + commit ac14d9ee7a094a2b9a26b1e3f4d36f59dbf68b40
+       * cipher/camellia-gfni-avx512-amd64.S: Add clear_regs() at externally
+       visible function epilogues.
+
+       tests/basic: enable IV checks for CBC/CFB/CTR bulk tests.
+       + commit fd3ed68754eb1741cef22bce8bc2957f3853a292
+       * cipher/cipher.c (_gcry_cipher_ctl): Add handling for
+       'PRIV_CIPHERCTL_GET_COUNTER'.
+       * src/gcrypt-testapi.h (PRIV_CIPHERCTL_GET_COUNTER): New.
+       * tests/basic.c (cipher_cbc_bulk_test, cipher_cfb_bulk_test): Restore
+       IV checks by reading current IV from CBC/CFB cipher handle using
+       PRIV_CIPHERCTL_GET_INPUT_VECTOR.
+       (cipher_ctr_bulk_test): Restore counter checks by reading current
+       counter from CTR cipher handle using PRIV_CIPHERCTL_GET_COUNTER.
+
+2022-07-06  NIIBE Yutaka  <gniibe@fsij.org>
+
+       cipher,chacha20: Conditionalize a variable.
+       + commit 8d5053fb08cf2a38360be2d2f5534b137c299e74
+       * cipher/chacha20.c (_gcry_chacha20_poly1305_decrypt): Only
+       use skip_stitched when one of macros is defined.
+
+2022-07-06  Clemens Lang  <cllang@redhat.com>
+
+       tests/basic: Skip non-FIPS tests in FIPS mode.
+       + commit 9d6203532d9012ff82aa218bd1a17797cf8017e5
+       * tests/basic.c (check_pubkey): Skip non-FIPS tests in FIPS mode, fixes
+         a logic error previously introduced in e9698002.
+
+2022-07-05  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kdf: Fix memory leak on error.
+       + commit e2a7a0c9f5d258051b26babeac20e4a7cfb5a6ac
+       * cipher/kdf.c (hkdf_open): Free the memory on the error path.
+
+2022-07-01  Clemens Lang  <cllang@redhat.com>
+
+       fips: Skip selftests of unsupported PK algos.
+       + commit c5480b4934bb0adecdfc29a47b5e123f995507e1
+       * src/fips.c (run_pubkey_selftests): Do not run selftests for disabled
+         public key algorithms.
 
-       random: Fix rndjent for Windows.
-       + commit bc01c770c75703992fc0585d76d84107bdcd9fea
-       * random/jitterentropy-base-user.h [HAVE_W32_SYSTEM] (jent_ncpu):
-       Implement.
-       * random/rndjent.c (_WIN32_WINNT): Define for GetNativeSystemInfo.
-       (EOPNOTSUPP): Define when not available.
+       build: Skip PK-specific tests if algo is disabled.
+       + commit 56000fb5c42f01f1ced4e3dd0bb30662c0ba87c3
+       * configure.ac: Define AM_CONDITIONALs for USE_DSA, USE_RSA,
+         USE_ELGAMAL, USE_ECC so Makefiles can depend on them.
+       * tests/Makefile.am: Skip tests that test only one public key algorithm
+         if that algorithm is disabled.
+
+       tests/keygen.c: Skip unavailable PK algorithms.
+       + commit 572b0bf9668d6a01cd7ce1c2227d5a4899ad3502
+       * tests/keygen.c (show_mpi, check_rsa_keys, check_elg_keys,
+         check_dsa_keys, check_generated_ecc_key, check_ecc_keys): Skip tests
+         if the required public key algorithm is not available.
+
+       tests/benchmark: Skip unavailable PK algorithms.
+       + commit 78c0d76f809262184ef0198be087fc2b133c880b
+       * tests/benchmark.c (cipher_bench, rsa_bench, elg_bench, dsa_bench,
+         ecc_bench): Do not run benchmarks for PK algorithms that are not
+         supported in the current build.
+
+       tests/basic: Skip tests if PK algo is unavailable.
+       + commit e78cf3df23a2bf33dc7fdc99e55949732521668d
+       * tests/basic.c (check_pubkey): Skip tests if the required public key
+         algorithm is not available.
+
+       tests/pubkey: Skip tests if PK algo is unavailable.
+       + commit 4f4da6cbf065b3ae675e2b0d3ff56765025c2852
+       * tests/pubkey.c (check_run, main): Skip tests for unavailable
+         algorithms.
+
+       kdf: Skip tests if hash algo is not available.
+       + commit 96fafffeeba5899d3d1b4d68ce99faed23cef641
+       * tests/t-kdf.c (check_openpgp, check_pbkdf2): Test digest availability.
+
+2022-06-24  NIIBE Yutaka  <gniibe@fsij.org>
+
+       hmac,hkdf: Allow use of shorter salt for HKDF.
+       + commit 58c92098d053aae7c78cc42bdd7c80c13efc89bb
+       * cipher/md.c (prepare_macpads): Move the check to...
+       * src/visibility.c (gcry_mac_setkey): ... here.
+       * tests/t-kdf.c (check_hkdf): No failure is expected.
+
+2022-06-21  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kdf,fips: Modify HKDF test for FIPS mode.
+       + commit 07722d89bac1a739b084b4412c3ff42e215d5968
+       * tests/t-kdf.c (check_hkdf): Check if shorter salts are rejected
+       correctly when FIPS mode.
+
+       kdf: Add input check for hkdf.
+       + commit e0f0c788dc0f268965c0f63eb33d9f98c0575d58
+       * cipher/kdf.c (hkdf_open): Validate the output size.
 
 2022-06-16  NIIBE Yutaka  <gniibe@fsij.org>
 
-       mpi: Allow building with --disable-asm for HPPA.
-       + commit d1cb2599e9d746bb3a088c63b24f8191072e11ef
-       * mpi/longlong.h [__hppa] (udiv_qrnnd): Only define
-       when assembler is enabled.
+       kdf: Add HKDF of RFC5869.
+       + commit fbddfb964f0b1c1ec131194b2273c3f834041c84
+       * src/gcrypt.h.in (GCRY_KDF_HKDF): New.
+       * cipher/kdf.c (hkdf_open, hkdf_compute, hkdf_final, hkdf_close): New.
+       (_gcry_kdf_open, _gcry_kdf_compute, _gcry_kdf_final, _gcry_kdf_close):
+       Handle GCRY_KDF_HKDF.
+       * tests/t-kdf.c (check_hkdf): New.  Test vectors from RFC5869.
+       (main): Call check_hkdf.
+
+2022-06-12  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       ppc: enable P10 assembly with ENABLE_FORCE_SOFT_HWFEATURES on arch-3.00.
+       + commit 2c5e5ab6843d747c4b877d2c6f47226f61e9ff14
+       * cipher/chacha20.c (chacha20_do_setkey) [USE_PPC_VEC]: Enable
+       P10 assembly for HWF_PPC_ARCH_3_00 if ENABLE_FORCE_SOFT_HWFEATURES is
+       defined.
+       * cipher/poly1305.c (poly1305_init) [POLY1305_USE_PPC_VEC]: Likewise.
+       * cipher/rijndael.c (do_setkey) [USE_PPC_CRYPTO_WITH_PPC9LE]: Likewise.
+       ---
 
-2022-05-31  Jakub Jelen  <jjelen@redhat.com>
+       This change allows testing P10 implementations with P9 and with QEMU-PPC.
+
+2022-06-12  Danny Tsen  <dtsen@us.ibm.com>
+
+       Chacha20/poly1305 - Optimized chacha20/poly1305 for P10 operation.
+       + commit 88fe7ac33eb4cb4dff76a5cc7fca50da5fb0ee3a
+       * configure.ac: Added chacha20 and poly1305 assembly implementations.
+       * cipher/chacha20-p10le-8x.s: (New) - support 8 blocks (512 bytes)
+       unrolling.
+       * cipher/poly1305-p10le.s: (New) - support 4 blocks (128 bytes)
+       unrolling.
+       * cipher/Makefile.am: Added new chacha20 and poly1305 files.
+       * cipher/chacha20.c: Added PPC p10 le support for 8x chacha20.
+       * cipher/poly1305.c: Added PPC p10 le support for 4x poly1305.
+       * cipher/poly1305-internal.h: Added PPC p10 le support for poly1305.
+       ---
+
+       [jk: cosmetic changes to C code]
+       [jk: fix building on ppc64be]
+
+2022-06-08  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kdf: Add support for One-Step KDF with MAC.
+       + commit 6d32bf80846a22568575a101a3fe6769ab058bb9
+       * src/gcrypt.h.in (GCRY_KDF_ONESTEP_KDF_MAC): New.
+       * cipher/kdf.c (onestep_kdf_mac_open, onestep_kdf_mac_compute): New.
+       (onestep_kdf_mac_final, onestep_kdf_mac_close): New.
+       (_gcry_kdf_open, _gcry_kdf_compute, _gcry_kdf_final, _gcry_kdf_close):
+       Add support for GCRY_KDF_ONESTEP_KDF_MAC.
 
-       tests: Fix copy paste error.
-       + commit 4b85bf33cce7ee331d4da1b99620aed6f9fbf846
-       * tests/basic.c (check_ocb_cipher_checksum): Check the right value for
-         errors
+2022-06-07  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kdf: Add One-Step KDF with hash.
+       + commit f8c983cb14f8ba0921ce8fa52ea3519feae07861
+       * src/gcrypt.h.in (GCRY_KDF_ONESTEP_KDF): New.
+       * cipher/kdf.c (onestep_kdf_open, onestep_kdf_compute): New.
+       (onestep_kdf_final): New.
+       (_gcry_kdf_open, _gcry_kdf_compute, _gcry_kdf_final): Add
+       GCRY_KDF_ONESTEP_KDF support.
+       * tests/t-kdf.c (check_onestep_kdf): Add the test.
+       (main): Call check_onestep_kdf.
+
+       Fix for struct gcry_thread_cbs.
+       + commit 8d8e80ad753645b5fcbe28bb3e768079415ef848
+       * src/gcrypt.h.in (struct gcry_thread_cbs): Since it's no use any
+       more, even internally, use _GCRY_GCC_ATTR_DEPRECATED instead.
+
+2022-06-01  NIIBE Yutaka  <gniibe@fsij.org>
+
+       secmem: Remove RISC OS support.
+       + commit fd9aa21983a114e1aab605536bc2371235e9a060
+       * src/secmem.c [__riscos__]: Remove.
+
+       secmem: Clean up ERRNO handling.
+       + commit d7c900a97b62a44d6667e065c6d2384136aefa63
+       * src/secmem.c (lock_pool_pages): Use ERR only for the return value
+       from mlock.
+
+       secmem: Remove getting cap_ipc_lock by capabilities support.
+       + commit 43f51d0ec6b50a6317a6e67642bc87b9ddf45927
+       * src/secmem.c (lock_pool_pages): Remove escalation of the capability.
+
+2022-05-31  Jakub Jelen  <jjelen@redhat.com>
 
        Fix memory leaks in tests.
-       + commit 735601494adb22a6ec8b1a4eacf1f75480a7c203
+       + commit ef2e1523c33c3143b4fee0c00f88a5a0842b337f
        * tests/aeswrap.c (check_one_with_padding): Free hd on error paths
        * tests/basic.c (check_ccm_cipher): Free context on error paths
          (check_ocb_cipher_checksum): Ditto.
 2022-05-19  Jakub Jelen  <jjelen@redhat.com>
 
        cipher: Allow verification of small RSA signatures in FIPS mode.
-       + commit 468ffa8f9c471c910280e0d0ade521d0184ed533
+       + commit ca2afc9fb64d9a9b2f8930ba505d9ab6c8a57667
        * cipher/rsa.c (rsa_check_keysize): Formatting.
          (rsa_check_verify_keysize): New function.
          (rsa_verify): Allow using smaller keys for verification.
 2022-05-17  NIIBE Yutaka  <gniibe@fsij.org>
 
        Fix internal declaration of _gcry_kdf_compute.
-       + commit 6d3708942f846e389bd87fe3d7c6e7a1b3615bca
+       + commit 4019f1a66b15d2ef82eb059c432e6b09b2c69b21
        * src/gcrypt-int.h (_gcry_kdf_compute): Return gcry_err_code_t.
 
+       mpi: Allow building with --disable-asm for HPPA.
+       + commit c0692324fe8b3806eefc5017767917dca9cd94d0
+       * mpi/longlong.h [__hppa] (udiv_qrnnd): Only define
+       when assembler is enabled.
+
+2022-05-15  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       aarch64-asm: use ADR for getting pointers for local labels.
+       + commit fd02e8e78470deb661269c429f3348f811c054c6
+       * cipher/asm-common-aarch64.h (GET_DATA_POINTER): Remove.
+       (GET_LOCAL_POINTER): New.
+       * cipher/camellia-aarch64.S: Use GET_LOCAL_POINTER instead of ADR
+       instruction directly.
+       * cipher/chacha20-aarch64.S: Use GET_LOCAL_POINTER instead of
+       GET_DATA_POINTER.
+       * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise.
+       * cipher/crc-armv8-aarch64-ce.S: Likewise.
+       * cipher/sha1-armv8-aarch64-ce.S: Likewise.
+       * cipher/sha256-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm3-aarch64.S: Likewise.
+       * cipher/sm3-armv8-aarch64-ce.S: Likewise.
+       * cipher/sm4-aarch64.S: Likewise.
+       ---
+
+       Switch to use ADR instead of ADRP/LDR or ADRP/ADD for getting
+       data pointers within assembly files. ADR is more portable across
+       targets and does not require labels to be declared in GOT tables.
+
+2022-05-11  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       cipher: move CBC/CFB/CTR self-tests to tests/basic.
+       + commit a9700956361d280746f2bffe94cbdb72c95eb3ed
+       * cipher/Makefile.am: Remove 'cipher-selftest.c' and 'cipher-selftest.h'.
+       * cipher/cipher-selftest.c: Remove (refactor these tests to
+       tests/basic.c).
+       * cipher/cipher-selftest.h: Remove.
+       * cipher/blowfish.c (selftest_ctr, selftest_cbc, selftest_cfb): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/camellia-glue.c (selftest_ctr_128, selftest_cbc_128)
+       (selftest_cfb_128): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/cast5.c (selftest_ctr, selftest_cbc, selftest_cfb): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/des.c (bulk_selftest_setkey, selftest_ctr, selftest_cbc)
+       (selftest_cfb): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/rijndael.c (selftest_basic_128, selftest_basic_192)
+       (selftest_basic_256): Allocate context from stack instead of heap and
+       handle alignment manually.
+       (selftest_ctr_128, selftest_cbc_128, selftest_cfb_128): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/serpent.c (selftest_ctr_128, selftest_cbc_128)
+       (selftest_cfb_128): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/sm4.c (selftest_ctr_128, selftest_cbc_128)
+       (selftest_cfb_128): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * cipher/twofish.c (selftest_ctr, selftest_cbc, selftest_cfb): Remove.
+       (selftest): Remove CTR/CBC/CFB bulk self-tests.
+       * tests/basic.c (buf_xor, cipher_cbc_bulk_test, buf_xor_2dst)
+       (cipher_cfb_bulk_test, cipher_ctr_bulk_test): New.
+       (check_ciphers): Run cipher_cbc_bulk_test(), cipher_cfb_bulk_test() and
+       cipher_ctr_bulk_test() for block ciphers.
+       ---
+
+       CBC/CFB/CTR bulk self-tests are quite computationally heavy and
+       slow down use cases where application opens cipher context once,
+       does processing and exits. Better place for these tests is in
+       `tests/basic`.
+
+       camellia: add amd64 GFNI/AVX512 implementation.
+       + commit 9ab61ba24b72bc109b7578a7868716910d2ea9d1
+       * cipher/Makefile.am: Add 'camellia-gfni-avx512-amd64.S'.
+       * cipher/bulkhelp.h (bulk_ocb_prepare_L_pointers_array_blk64): New.
+       * cipher/camellia-aesni-avx2-amd64.h: Rename internal functions from
+       "__camellia_???" to "FUNC_NAME(???)"; Minor changes to comments.
+       * cipher/camellia-gfni-avx512-amd64.S: New.
+       * cipher/camellia-gfni.c (USE_GFNI_AVX512): New.
+       (CAMELLIA_context): Add 'use_gfni_avx512'.
+       (_gcry_camellia_gfni_avx512_ctr_enc, _gcry_camellia_gfni_avx512_cbc_dec)
+       (_gcry_camellia_gfni_avx512_cfb_dec, _gcry_camellia_gfni_avx512_ocb_enc)
+       (_gcry_camellia_gfni_avx512_ocb_dec)
+       (_gcry_camellia_gfni_avx512_enc_blk64)
+       (_gcry_camellia_gfni_avx512_dec_blk64, avx512_burn_stack_depth): New.
+       (camellia_setkey): Use GFNI/AVX512 if supported by CPU.
+       (camellia_encrypt_blk1_64, camellia_decrypt_blk1_64): New.
+       (_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec, _gcry_camellia_cfb_dec)
+       (_gcry_camellia_ocb_crypt) [USE_GFNI_AVX512]: Add GFNI/AVX512 code path.
+       (_gcry_camellia_xts_crypt): Change parallel block size from 32 to 64.
+       (selftest_ctr_128, selftest_cbc_128, selftest_cfb_128): Increase test
+       block size.
+       * cipher/chacha20-amd64-avx512.S: Clear k-mask registers with xor.
+       * cipher/poly1305-amd64-avx512.S: Likewise.
+       * cipher/sha512-avx512-amd64.S: Likewise.
+       ---
+
+       Benchmark on Intel i3-1115G4 (tigerlake):
+
+       Before (GFNI/AVX2):
+        CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               CBC dec |     0.356 ns/B      2679 MiB/s      1.46 c/B      4089
+               CFB dec |     0.374 ns/B      2547 MiB/s      1.53 c/B      4089
+               CTR enc |     0.409 ns/B      2332 MiB/s      1.67 c/B      4089
+               CTR dec |     0.406 ns/B      2347 MiB/s      1.66 c/B      4089
+               XTS enc |     0.430 ns/B      2216 MiB/s      1.76 c/B      4090
+               XTS dec |     0.433 ns/B      2201 MiB/s      1.77 c/B      4090
+               OCB enc |     0.460 ns/B      2071 MiB/s      1.88 c/B      4089
+               OCB dec |     0.492 ns/B      1939 MiB/s      2.01 c/B      4089
+
+       After (GFNI/AVX512):
+        CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+               CBC dec |     0.207 ns/B      4600 MiB/s     0.827 c/B      3989
+               CFB dec |     0.207 ns/B      4610 MiB/s     0.825 c/B      3989
+               CTR enc |     0.218 ns/B      4382 MiB/s     0.868 c/B      3990
+               CTR dec |     0.217 ns/B      4389 MiB/s     0.867 c/B      3990
+               XTS enc |     0.330 ns/B      2886 MiB/s      1.35 c/B      4097±4
+               XTS dec |     0.328 ns/B      2904 MiB/s      1.35 c/B      4097±3
+               OCB enc |     0.246 ns/B      3879 MiB/s     0.981 c/B      3990
+               OCB dec |     0.247 ns/B      3855 MiB/s     0.987 c/B      3990
+
+         CBC dec: 70% faster
+         CFB dec: 80% faster
+         CTR: 87% faster
+         XTS: 31% faster
+         OCB: 92% faster
+
 2022-05-10  NIIBE Yutaka  <gniibe@fsij.org>
 
        mpi: Fix for 64-bit for _gcry_mpih_cmp_ui.
-       + commit 03af3d5cc5d54b6f810264568d6de22cd9e7d34f
+       + commit a611e3a25d61505698e2bb38ec2db38bc6a74820
        * mpi/mpih-const-time.c (_gcry_mpih_cmp_ui): Compare 64-bit
        value correctly.
 
-2022-05-06  NIIBE Yutaka  <gniibe@fsij.org>
+       random: Fix rndjent for Windows.
+       + commit 5dc97e855bb27705a548a297b666b7be7b1c59a3
+       * random/jitterentropy-base-user.h [HAVE_W32_SYSTEM] (jent_ncpu):
+       Implement.
+       * random/rndjent.c (_WIN32_WINNT): Define for GetNativeSystemInfo.
+       (EOPNOTSUPP): Define when not available.
 
-       random:drbg: Fix the behavior for child process.
-       + commit 019a40c99011390f12168e79e3bebd0ff52cc003
-       * random/random-drbg.c (_gcry_rngdrbg_randomize): Update change of PID
-       detection.
+2022-04-30  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       tests/basic: add testing for partial bulk processing code paths.
+       + commit 9ba1f0091ff408d6140ee75a56fd67f02d0d3f30
+       * tests/basic.c (check_one_cipher_core): Add 'split_mode' parameter and
+       handling for split_mode==1.
+       (check_one_cipher): Use split_mode==0 for existing check_one_cipher_core
+       calls; Add new large buffer check with split_mode==1.
+
+       sm4-aesni-avx2: add generic 1 to 16 block bulk processing function.
+       + commit e239738b4af28b64ab617900fced8a216552e9f1
+       * cipher/sm4-aesni-avx2-amd64.S: Remove unnecessary vzeroupper at
+       function entries.
+       (_gcry_sm4_aesni_avx2_crypt_blk1_16): New.
+       * cipher/sm4.c (_gcry_sm4_aesni_avx2_crypt_blk1_16)
+       (sm4_aesni_avx2_crypt_blk1_16): New.
+       (sm4_get_crypt_blk1_16_fn) [USE_AESNI_AVX2]: Add
+       'sm4_aesni_avx2_crypt_blk1_16'.
+
+       Add SM4 x86-64/GFNI/AVX2 implementation.
+       + commit 5095d60af42d898311d66b10f5204a3418a4a8af
+       * cipher/Makefile.am: Add 'sm4-gfni-avx2-amd64.S'.
+       * cipher/sm4-gfni-avx2-amd64.S: New.
+       * cipher/sm4.c (USE_GFNI_AVX2): New.
+       (SM4_context): Add 'use_gfni_avx2'.
+       (crypt_blk1_8_fn_t): Rename to...
+       (crypt_blk1_16_fn_t): ...this.
+       (sm4_aesni_avx_crypt_blk1_8): Rename to...
+       (sm4_aesni_avx_crypt_blk1_16): ...this and add handling for 9 to 16
+       input blocks.
+       (_gcry_sm4_gfni_avx_expand_key, _gcry_sm4_gfni_avx2_ctr_enc)
+       (_gcry_sm4_gfni_avx2_cbc_dec, _gcry_sm4_gfni_avx2_cfb_dec)
+       (_gcry_sm4_gfni_avx2_ocb_enc, _gcry_sm4_gfni_avx2_ocb_dec)
+       (_gcry_sm4_gfni_avx2_ocb_auth, _gcry_sm4_gfni_avx2_crypt_blk1_16)
+       (sm4_gfni_avx2_crypt_blk1_16): New.
+       (sm4_aarch64_crypt_blk1_8): Rename to...
+       (sm4_aarch64_crypt_blk1_16): ...this and add handling for 9 to 16
+       input blocks.
+       (sm4_armv8_ce_crypt_blk1_8): Rename to...
+       (sm4_armv8_ce_crypt_blk1_16): ...this and add handling for 9 to 16
+       input blocks.
+       (sm4_expand_key): Add GFNI/AVX2 path.
+       (sm4_setkey): Enable GFNI/AVX2 implementation if HW features
+       available; Disable AESNI implementations when GFNI implementation is
+       enabled.
+       (sm4_encrypt) [USE_GFNI_AVX2]: New.
+       (sm4_decrypt) [USE_GFNI_AVX2]: New.
+       (sm4_get_crypt_blk1_8_fn): Rename to...
+       (sm4_get_crypt_blk1_16_fn): ...this; Update to use *_blk1_16 functions;
+       Add GFNI/AVX2 selection.
+       (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
+       (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): Add GFNI/AVX2 path; Widen
+       generic bulk processing from 8 blocks to 16 blocks.
+       (_gcry_sm4_xts_crypt): Widen generic bulk processing from 8 blocks to
+       16 blocks.
+
+       sm4: add XTS bulk processing.
+       + commit aad3381e93846212c2022dba50e621e4b48f3295
+       * cipher/sm4.c (_gcry_sm4_xts_crypt): New.
+       (sm4_setkey): Set XTS bulk function.
+
+2022-04-29  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       camellia-avx2: add bulk processing for XTS mode.
+       + commit 32b18cdb87b771f5c1ec87ef5e0f115f3f2d362f
+       * cipher/bulkhelp.h (bulk_xts_crypt_128): New.
+       * cipher/camellia-glue.c (_gcry_camellia_xts_crypt): New.
+       (camellia_set_key) [USE_AESNI_AVX2]: Set XTS bulk function if AVX2
+       implementation is available.
+
+       camellia-avx2: add partial parallel block processing.
+       + commit bacdc1de3f4fe063054af4e36e7fdfa5b00ccb64
+       * cipher/camellia-aesni-avx2-amd64.h: Remove unnecessary vzeroupper
+       from function entry.
+       (enc_blk1_32, dec_blk1_32): New.
+       * cipher/camellia-glue.c (avx_burn_stack_depth)
+       (avx2_burn_stack_depth): Move outside of bulk functions to deduplicate.
+       (camellia_setkey): Disable AESNI & VAES implementation when GFNI
+       implementation is enabled.
+       (_gcry_camellia_aesni_avx2_enc_blk1_32)
+       (_gcry_camellia_aesni_avx2_dec_blk1_32)
+       (_gcry_camellia_vaes_avx2_enc_blk1_32)
+       (_gcry_camellia_vaes_avx2_dec_blk1_32)
+       (_gcry_camellia_gfni_avx2_enc_blk1_32)
+       (_gcry_camellia_gfni_avx2_dec_blk1_32, camellia_encrypt_blk1_32)
+       (camellia_decrypt_blk1_32): New.
+       (_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec, _gcry_camellia_cfb_dec)
+       (_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Use new bulk
+       processing helpers from 'bulkhelp.h' and 'camellia_encrypt_blk1_32'
+       and 'camellia_decrypt_blk1_32' for partial parallel processing.
+
+2022-04-24  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       cipher/bulkhelp: add functions for CTR/CBC/CFB/OCB bulk processing.
+       + commit 754055ccd0438b96961601438fafb7799eae612f
+       * cipher/bulkhelp.h (bulk_crypt_fn_t, bulk_ctr_enc_128)
+       (bulk_cbc_dec_128, bulk_cfb_dec_128, bulk_ocb_crypt_128)
+       (bulk_ocb_auth_128): New.
+       * cipher/sm4.c (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
+       (_gcry_sm4_cfb_dec, _gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): Switch
+       to use helper functions from 'bulkhelp.h'.
+
+       Move bulk OCB L pointer array setup code to common header.
+       + commit 9388279803ff82ea0ccd12a83157b94c807e7a8f
+       * cipher/bulkhelp.h: New.
+       * cipher/camellia-glue.c (_gcry_camellia_ocb_crypt)
+       (_gcry_camellia_ocb_crypt): Use new
+       `bulk_ocb_prepare_L_pointers_array_blkXX` function for OCB L pointer
+       array setup.
+       * cipher/serpent.c (_gcry_serpent_ocb_crypt)
+       (_gcry_serpent_ocb_auth): Likewise.
+       * cipher/sm4.c (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): Likewise.
+       * cipher/twofish.c (_gcry_twofish_ocb_crypt)
+       (_gcry_twofish_ocb_auth): Likewise.
+
+       sm4: deduplicate bulk processing function selection.
+       + commit e1c5f950838b2fa086a798f7194b618f581dca96
+       * cipher/sm4.c (crypt_blk1_8_fn_t): New.
+       (sm4_aesni_avx_crypt_blk1_8, sm4_aarch64_crypt_blk1_8)
+       (sm4_armv8_ce_crypt_blk1_8, sm4_crypt_blocks): Change first parameter
+       to void pointer type.
+       (sm4_get_crypt_blk1_8_fn): New.
+       (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
+       (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): Use sm4_get_crypt_blk1_8_fn
+       for selecting crypt_blk1_8.
 
-2022-05-06  Jakub Jelen  <jjelen@redhat.com>
+       Add GFNI/AVX2 implementation of Camellia.
+       + commit 4e6896eb9fce74908e15e085da00edfed0fa1923
+       * cipher/Makefile.am: Add "camellia-gfni-avx2-amd64.S".
+       * cipher/camellia-aesni-avx2-amd64.h [CAMELLIA_GFNI_BUILD]: Add GFNI
+       support.
+       * cipher/camellia-gfni-avx2-amd64.S: New.
+       * cipher/camellia-glue.c (USE_GFNI_AVX2): New.
+       (CAMELLIA_context) [USE_AESNI_AVX2]: New member "use_gfni_avx2".
+       [USE_GFNI_AVX2] (_gcry_camellia_gfni_avx2_ctr_enc)
+       (_gcry_camellia_gfni_avx2_cbc_dec, _gcry_camellia_gfni_avx2_cfb_dec)
+       (_gcry_camellia_gfni_avx2_ocb_enc, _gcry_camellia_gfni_avx2_ocb_dec)
+       (_gcry_camellia_gfni_avx2_ocb_auth): New.
+       (camellia_setkey) [USE_GFNI_AVX2]: Enable GFNI if supported by HW.
+       (_gcry_camellia_ctr_enc) [USE_GFNI_AVX2]: Add GFNI support.
+       (_gcry_camellia_cbc_dec) [USE_GFNI_AVX2]: Add GFNI support.
+       (_gcry_camellia_cfb_dec) [USE_GFNI_AVX2]: Add GFNI support.
+       (_gcry_camellia_ocb_crypt) [USE_GFNI_AVX2]: Add GFNI support.
+       (_gcry_camellia_ocb_auth) [USE_GFNI_AVX2]: Add GFNI support.
+       * configure.ac: Add "camellia-gfni-avx2-amd64.lo".
+
+       Add detection for HW feature "intel-gfni"
+       + commit 3410d40996d8f7377935192ebecf4cad66688b25
+       * configure.ac (gfnisupport, gcry_cv_gcc_inline_asm_gfni)
+       (ENABLE_GFNI_SUPPORT): New.
+       * src/g10lib.h (HWF_INTEL_GFNI): New.
+       * src/hwf-x86.c (detect_x86_gnuc): Add GFNI detection.
+       * src/hwfeatures.c (hwflist): Add "intel-gfni".
+       * doc/gcrypt.texi: Add "intel-gfni" to HW features list.
+
+2022-04-21  Jakub Jelen  <jjelen@redhat.com>
 
        tests: Expect the RSA PKCS #1.5 encryption to fail in FIPS mode.
-       + commit 1a270cda2ee5fe345f480b4eda13b92a7b7f556e
+       + commit f736f3c70182d9c948f9105eb769c47c5578df35
        * tests/basic.c (check_pubkey_crypt): Expect RSA PKCS #1.5 encryption to
          fail in FIPS mode. Expect failure when wrong padding is selected
        * tests/pkcs1v2.c (check_v15crypt): Expect RSA PKCS #1.5 encryption to
          fail in FIPS mode
 
        tests: Replace custom bit with more generic flags.
-       + commit 9c55ba3bc1ce72307886f6d88f37d908e3fad39a
+       + commit 299e2f93415984919181e0ee651719bbf83bdd2f
        * tests/basic.c (global): New flag FLAG_SPECIAL
          (check_pubkey_crypt): Change to use bitfield flags
 
        Do not allow PKCS #1.5 padding for encryption in FIPS.
-       + commit d8a13d97ccb62c8f7564192f0c8dcdc0b4d9745a
+       + commit c7709f7b23848abf4ba65cb99cb2a9e9c7ebdefc
        * cipher/pubkey-util.c (_gcry_pk_util_data_to_mpi): Block PKCS #1.5
          padding for encryption in FIPS mode
        * cipher/rsa.c (rsa_decrypt): Block PKCS #1.5 decryption in FIPS mode
 
-2022-05-06  NIIBE Yutaka  <gniibe@fsij.org>
+2022-04-21  NIIBE Yutaka  <gniibe@fsij.org>
 
        random: Not use secure memory for DRBG instance.
-       + commit 9452640125d239937dfb9cde49be7c0dde2f65ee
+       + commit f436bf4451cb2ad0a1f56aa843bf79a58878022f
        * random/random-drbg.c (drbg_instance): New at BSS.
        (_drbg_init_internal): Don't allocate at secure memory.
        (_gcry_rngdrbg_close_fds): Follow the change.
 
+2022-04-20  NIIBE Yutaka  <gniibe@fsij.org>
+
        cipher: Change the bounds for RSA key generation round.
-       + commit f6a67c2215310e9463267e15569697d3103b27f7
+       + commit cd30ed3c0d715aa0c58a32a29cfb1476163a5b94
        * cipher/rsa.c (generate_fips): Use 10 for p, 20 for q.
 
 2022-04-19  NIIBE Yutaka  <gniibe@fsij.org>
 
+       Use offsetof instead of null ptr calculation.
+       + commit 9e9f3073369905707b718aa1628d0fe5b1affed9
+       * src/secmem.c (_gcry_secmem_realloc_internal): Use offsetof.
+
+2022-04-18  NIIBE Yutaka  <gniibe@fsij.org>
+
        cipher: Fix rsa key generation.
-       + commit 26df4b8d8c938fd837b99355de5163b9364d49a5
+       + commit 51754fa2ed06cc41487324432dbea654642ef244
        * cipher/rsa.c (generate_fips): Set the least significant bit.
 
-2022-04-02  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+2022-04-12  Clemens Lang  <cllang@redhat.com>
+
+       build: Fix make dist after socklen.m4 removal.
+       + commit 922f9957f94a24c58812db9bbabfb55a8ce211a8
+       * m4/Makefile.am: Remove socklen.m4 from EXTRA_DIST
+
+2022-04-08  NIIBE Yutaka  <gniibe@fsij.org>
+
+       build: Remove configure checking for socklen_t.
+       + commit e5260b6b9f38a91797b2cb5d789e9cbbc97ec485
+       * configure.ac (gl_TYPE_SOCKLEN_T): Remove.
+       * m4/socklen.m4: Remove.
+
+2022-04-06  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       doc: Fix missing ARM hardware features.
+       + commit a7c3e0b9b0ff636d498a9d82f4ced8b5fac50a8b
+       * doc/gcrypt.texi: Add sha3/sm3/sm4/sha512 to ARM hardware features.
+
+       build: Fix for arm crypto support.
+       + commit 972aae9fc337ecf25139737cc7083a1bb56457f8
+       * configure.ac: Correct wrong variable names.
+
+2022-04-06  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       chacha20: add AVX512 implementation.
+       + commit 9a63cfd61753b2c7ef7a872a01565154f10a72c0
+       * cipher/Makefile.am: Add 'chacha20-amd64-avx512.S'.
+       * cipher/chacha20-amd64-avx512.S: New.
+       * cipher/chacha20.c (USE_AVX512): New.
+       (CHACHA20_context_s): Add 'use_avx512'.
+       [USE_AVX512] (_gcry_chacha20_amd64_avx512_blocks16): New.
+       (chacha20_do_setkey) [USE_AVX512]: Setup 'use_avx512' based on
+       HW features.
+       (do_chacha20_encrypt_stream_tail) [USE_AVX512]: Use AVX512
+       implementation if supported.
+       (_gcry_chacha20_poly1305_encrypt) [USE_AVX512]: Disable stitched
+       chacha20-poly1305 implementations if AVX512 implementation is used.
+       (_gcry_chacha20_poly1305_decrypt) [USE_AVX512]: Disable stitched
+       chacha20-poly1305 implementations if AVX512 implementation is used.
+
+       poly1305: add AVX512 implementation.
+       + commit cd3ed4977076343bb6092001cafe55673dc30e34
+       * LICENSES: Add 3-clause BSD license for poly1305-amd64-avx512.S.
+       * cipher/Makefile.am: Add 'poly1305-amd64-avx512.S'.
+       * cipher/poly1305-amd64-avx512.S: New.
+       * cipher/poly1305-internal.h (POLY1305_USE_AVX512): New.
+       (poly1305_context_s): Add 'use_avx512'.
+       * cipher/poly1305.c (ASM_FUNC_ABI, ASM_FUNC_WRAPPER_ATTR): New.
+       [POLY1305_USE_AVX512] (_gcry_poly1305_amd64_avx512_blocks)
+       (poly1305_amd64_avx512_blocks): New.
+       (poly1305_init): Use AVX512 if HW feature available (set use_avx512).
+       [USE_MPI_64BIT] (poly1305_blocks): Rename to ...
+       [USE_MPI_64BIT] (poly1305_blocks_generic): ... this.
+       [USE_MPI_64BIT] (poly1305_blocks): New.
+
+2022-04-05  NIIBE Yutaka  <gniibe@fsij.org>
+
+       doc: Update yat2m from libgpg-error.
+       + commit 5f357784662a7d3a3d3498d6ca5d5781e9e60ac0
+       * doc/yat2m.c: Update.
+
+2022-04-04  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       Add SM3 ARMv8/AArch64/CE assembly implementation.
+       + commit fe891ff4a3cdc74957b215db4a9a9e01fefe0cd4
+       * cipher/Makefile.am: Add 'sm3-armv8-aarch64-ce.S'.
+       * cipher/sm3-armv8-aarch64-ce.S: New.
+       * cipher/sm3.c (USE_ARM_CE): New.
+       [USE_ARM_CE] (_gcry_sm3_transform_armv8_ce)
+       (do_sm3_transform_armv8_ce): New.
+       (sm3_init) [USE_ARM_CE]: New.
+       * configure.ac: Add 'sm3-armv8-aarch64-ce.lo'.
+
+2022-04-01  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
 
        hwf-ppc: fix missing HWF_PPC_ARCH_3_10 in HW feature.
-       + commit e073f0ed446601bffe787912462119a98e797fa3
+       + commit 29bfb3ebbc63d7ed18b916c5c6946790fb3d15df
        * src/hwf-ppc.c (ppc_features): Add HWF_PPC_ARCH_3_10.
 
+2022-03-31  NIIBE Yutaka  <gniibe@fsij.org>
+
+       random:drbg: Fix the behavior for child process.
+       + commit df7879a86b1de8eaf2d784687155c4274574b120
+       * random/random-drbg.c (_gcry_rngdrbg_randomize): Update change of PID
+       detection.
+
+       build: When no gpg-error-config, not install libgcrypt-config.
+       + commit 2db5b5e995c21c5bd9cd193c2ed1109ba9b1a440
+       * configure.ac (USE_GPGRT_CONFIG): New.
+       * src/Makefile.am [USE_GPGRT_CONFIG]: Conditionalize the install
+       of libgcrypt-config.
+
+2022-03-30  Werner Koch  <wk@gnupg.org>
+
+       tests: Add brainpoolP256r1 to bench-slope.
+       + commit 67b36154f88ebe271a40c3f3f7b963943c656b71
+       * tests/bench-slope.c (ECC_ALGO_BRAINP256R1): New.
+       (ecc_algo_fips_allowed): Support this curve.
+       (ecc_algo_name): Ditto.
+       (ecc_algo_curve): Ditto.
+       (ecc_nbits): Ditto.
+       (bench_ecc_init): Ditto.
+
+2022-03-29  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       configure: fix avx512 check for i386.
+       + commit a5d126c61cc00aa2d63c389a956330067c90cbfd
+       * configure.ac (gcry_cv_gcc_inline_asm_avx512): Do not use ZMM22
+       register; Check for broadcast memory source.
+
+2022-03-29  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       Fix configure.ac error of intel-avx512.
+       + commit 4dc707e336a91ca9ca5a59787b9245a1ca3042c6
+       * configure.ac: Correctly set value for avx512support.
+
 2022-03-29  NIIBE Yutaka  <gniibe@fsij.org>
 
        kdf:argon2: Fix for the case output > 64.
-       + commit 13b5454d2620701863f6e89221f5f4c98d2aba8e
+       + commit 564739a58426d89db2f0c9334659949e503d2c59
        * cipher/blake2.c (blake2b_vl_hash): Fix the last step.
        * cipher/kdf.c (argon2_open): Check the value.
 
-2022-03-28  Werner Koch  <wk@gnupg.org>
+2022-03-28  NIIBE Yutaka  <gniibe@fsij.org>
 
-       Release 1.10.1.
-       + commit ae0e567820c37f9640440b3cff77d7c185aa6742
+       build: Fix for build for Windows.
+       + commit 5d6a1c396396ba7b44f11af10a9c51b1b573e03c
+       * cipher/Makefile.am: Use EXEEXT_FOR_BUILD.
+       * doc/Makefile.am: Likewise.
 
+       test: Fix cast for Windows 64-bit.
+       + commit e24fe678656156502bd3ef1b7805cddede1dd2de
+       * tests/bench-slope.c (slope_benchmark): Use uintptr_t.
+
+2022-03-28  Werner Koch  <wk@gnupg.org>
 
        hash: Add more OIDs.
-       + commit 52fd2305ba8a0c53214016c11fdf03d47761ee8e
+       + commit 26ac5e30018f2bf0a973b3a45410e6a0c8067f0e
        * cipher/sha256.c: Add X9.62 OID.
        * cipher/sha512.c: Ditto.
 
-       tests: Add brainpoolP256r1 to bench-slope.
-       + commit eeddd578120c6c28cf600016aae124223ef99e8b
-       * tests/bench-slope.c (ECC_ALGO_BRAINP256R1): New.
-       (ecc_algo_fips_allowed): Support this curve.
-       (ecc_algo_name): Ditto.
-       (ecc_algo_curve): Ditto.
-       (ecc_nbits): Ditto.
-       (bench_ecc_init): Ditto.
+       build: Improve sign-release target.
+       + commit ec656616bbbb5db87d3cdf4f4ea695217935d843
+       * Makefile.am (sign-release): Allow running in the dist dir.
+
+2022-03-28  NIIBE Yutaka  <gniibe@fsij.org>
+
+       tests: Fix null pointer arithmetic.
+       + commit 1517a31ea476b76f67289d07e2c2821aa4ef83ef
+       * tests/bench-slope.c (slope_benchmark): Don't use null pointer.
+
+2022-03-12  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       Fix building sha512-avx512 with clang.
+       + commit a0db0a121571129b7149e51e98e5fbc6e35413e8
+       * cipher/sha512-avx512-amd64.S
+       (_gcry_sha512_transform_amd64_avx512): Change "%xmm??" registers to
+       "xmm??" for clear_reg parameter.
+       * configure.ac (gcry_cv_gcc_inline_asm_avx512): Check support for
+       registers in range "zmm16-zmm31".
+
+2022-03-10  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       SHA512: Add AVX512 implementation.
+       + commit 089223aa3b554e5c9a07d9176470308dab10ac8a
+       * LICENSES: Add 'cipher/sha512-avx512-amd64.S'.
+       * cipher/Makefile.am: Add 'sha512-avx512-amd64.S'.
+       * cipher/sha512-avx512-amd64.S: New.
+       * cipher/sha512.c (USE_AVX512): New.
+       (do_sha512_transform_amd64_ssse3, do_sha512_transform_amd64_avx)
+       (do_sha512_transform_amd64_avx2): Add ASM_EXTRA_STACK to return value
+       only if assembly routine returned non-zero value.
+       [USE_AVX512] (_gcry_sha512_transform_amd64_avx512)
+       (do_sha512_transform_amd64_avx512): New.
+       (sha512_init_common) [USE_AVX512]: Use AVX512 implementation if HW
+       feature supported.
+       ---
+
+       Benchmark on Intel Core i3-1115G4 (tigerlake):
+
+        Before:
+                       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+        SHA512         |      1.51 ns/B     631.6 MiB/s      6.17 c/B      4089
+
+        After (~29% faster):
+                       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
+        SHA512         |      1.16 ns/B     819.0 MiB/s      4.76 c/B      4090
+
+2022-03-09  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       rijndael-vaes-avx2: perform checksumming inline.
+       + commit d820d27a3bce6365523fbcb6ec607b23dd4ca4e2
+       * cipher/rijndael-vaes-avx2-amd64.S
+       (_gcry_vaes_avx2_ocb_checksum): Remove.
+       (_gcry_vaes_avx2_ocb_crypt_amd64): Add inline checksumming.
+
+2022-03-07  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       ghash|polyval: add x86_64 VPCLMUL/AVX512 accelerated implementation.
+       + commit e6f360019369fff42411b4cca976cc8ebe09281d
+       * cipher/cipher-gcm-intel-pclmul.c (GCM_INTEL_USE_VPCLMUL_AVX512)
+       (GCM_INTEL_AGGR32_TABLE_INITIALIZED): New.
+       (ghash_setup_aggr16_avx2): Store H16 for aggr32 setup.
+       [GCM_USE_INTEL_VPCLMUL_AVX512] (GFMUL_AGGR32_ASM_VPCMUL_AVX512)
+       (gfmul_vpclmul_avx512_aggr32, gfmul_vpclmul_avx512_aggr32_le)
+       (gfmul_pclmul_avx512, gcm_lsh_avx512, load_h1h4_to_zmm1)
+       (ghash_setup_aggr8_avx512, ghash_setup_aggr16_avx512)
+       (ghash_setup_aggr32_avx512, swap128b_perm): New.
+       (_gcry_ghash_setup_intel_pclmul) [GCM_USE_INTEL_VPCLMUL_AVX512]: Enable
+       AVX512 implementation based on HW features.
+       (_gcry_ghash_intel_pclmul, _gcry_polyval_intel_pclmul): Add
+       VPCLMUL/AVX512 code path; Small tweaks to VPCLMUL/AVX2 code path; Tweaks
+       on register clearing.
+
+       Add detection for HW feature "intel-avx512"
+       + commit 8cf06145263eb23b5411fae03e1ea13e146d605e
+       * configure.ac (avx512support, gcry_cv_gcc_inline_asm_avx512)
+       (ENABLE_AVX512_SUPPORT): New.
+       * src/g10lib.h (HWF_INTEL_AVX512): New.
+       * src/hwf-x86.c (detect_x86_gnuc): Add AVX512 detection.
+       * src/hwfeatures.c (hwflist): Add "intel-avx512".
+       * doc/gcrypt.texi: Add "intel-avx512" to HW features list.
+
+2022-03-06  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       ghash|polyval: add x86_64 VPCLMUL/AVX2 accelerated implementation.
+       + commit d857e85cb4d4cb9702a59364ce9a4b9d81328cb5
+       * cipher/cipher-gcm-intel-pclmul.c (GCM_INTEL_USE_VPCLMUL_AVX2)
+       (GCM_INTEL_AGGR8_TABLE_INITIALIZED)
+       (GCM_INTEL_AGGR16_TABLE_INITIALIZED): New.
+       (gfmul_pclmul): Fixes to comments.
+       [GCM_USE_INTEL_VPCLMUL_AVX2] (GFMUL_AGGR16_ASM_VPCMUL_AVX2)
+       (gfmul_vpclmul_avx2_aggr16, gfmul_vpclmul_avx2_aggr16_le)
+       (gfmul_pclmul_avx2, gcm_lsh_avx2, load_h1h2_to_ymm1)
+       (ghash_setup_aggr8_avx2, ghash_setup_aggr16_avx2): New.
+       (_gcry_ghash_setup_intel_pclmul): Add 'hw_features' parameter; Setup
+       ghash and polyval function pointers for context; Add VPCLMUL/AVX2 code
+       path; Defer aggr8 and aggr16 table initialization to until first use in
+       '_gcry_ghash_intel_pclmul' or '_gcry_polyval_intel_pclmul'.
+       [__x86_64__] (ghash_setup_aggr8): New.
+       (_gcry_ghash_intel_pclmul): Add VPCLMUL/AVX2 code path; Add call for
+       aggr8 table initialization.
+       (_gcry_polyval_intel_pclmul): Add VPCLMUL/AVX2 code path; Add call for
+       aggr8 table initialization.
+       * cipher/cipher-gcm.c [GCM_USE_INTEL_PCLMUL] (_gcry_ghash_intel_pclmul)
+       (_gcry_polyval_intel_pclmul): Remove.
+       [GCM_USE_INTEL_PCLMUL] (_gcry_ghash_setup_intel_pclmul): Add
+       'hw_features' parameter.
+       (setupM) [GCM_USE_INTEL_PCLMUL]: Pass HW features to
+       '_gcry_ghash_setup_intel_pclmul'; Let '_gcry_ghash_setup_intel_pclmul'
+       setup function pointers.
+       * cipher/cipher-internal.h (GCM_USE_INTEL_VPCLMUL_AVX2): New.
+       (gcry_cipher_handle): Add member 'gcm.hw_impl_flags'.
+
+2022-03-02  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       Add SM4 ARMv8/AArch64/CE assembly implementation.
+       + commit 47cafffb09d8a224f07e0750f4ba882bb86cb15a
+       * cipher/Makefile.am: Add 'sm4-armv8-aarch64-ce.S'.
+       * cipher/sm4-armv8-aarch64-ce.S: New.
+       * cipher/sm4.c (USE_ARM_CE): New.
+       (SM4_context) [USE_ARM_CE]: Add 'use_arm_ce'.
+       [USE_ARM_CE] (_gcry_sm4_armv8_ce_expand_key)
+       (_gcry_sm4_armv8_ce_crypt, _gcry_sm4_armv8_ce_ctr_enc)
+       (_gcry_sm4_armv8_ce_cbc_dec, _gcry_sm4_armv8_ce_cfb_dec)
+       (_gcry_sm4_armv8_ce_crypt_blk1_8, sm4_armv8_ce_crypt_blk1_8): New.
+       (sm4_expand_key) [USE_ARM_CE]: Use ARMv8/AArch64/CE key setup.
+       (sm4_setkey): Enable ARMv8/AArch64/CE if supported by HW.
+       (sm4_encrypt) [USE_ARM_CE]: Use SM4 CE encryption.
+       (sm4_decrypt) [USE_ARM_CE]: Use SM4 CE decryption.
+       (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
+       (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_ARM_CE]: Add
+       ARMv8/AArch64/CE bulk functions.
+       * configure.ac: Add 'sm4-armv8-aarch64-ce.lo'.
+
+       hwf-arm: add ARMv8.2 optional crypto extension HW features.
+       + commit 7d2983979866223d96aad4806af0311671585f64
+       * src/g10lib.h (HWF_ARM_SHA3, HWF_ARM_SM3, HWF_ARM_SM4)
+       (HWF_ARM_SHA512): New.
+       * src/hwf-arm.c (arm_features): Add sha3, sm3, sm4, sha512 HW features.
+       * src/hwfeatures.c (hwflist): Add sha3, sm3, sm4, sha512 HW features.
+
+2022-02-24  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       powerpc: check for missing optimization level for vector register usage.
+       + commit 6951e0f591ccff24b9ce2e43c2dcab955e3302c4
+       * cipher/Makefile.am [ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS]
+       (ppc_vcrypto_cflags): Add '-O2'.
+       * configure.ac (gcry_cv_cc_ppc_altivec): Check for missing compiler
+       optimization with vec_sld_u32 inline function.
+       * configure.ac (gcry_cv_cc_ppc_altivec_cflags): Check for missing
+       compiler optimization with vec_sld_u32 inline function; Add '-O2' to
+       CFLAGS.
+
+2022-02-23  Tianjia Zhang  <tianjia.zhang@linux.alibaba.com>
+
+       Add SM4 ARMv8/AArch64 assembly implementation.
+       + commit d8825601f10aec20db118496bb68a5cd1372b7da
+       * cipher/Makefile.am: Add 'sm4-aarch64.S'.
+       * cipher/sm4-aarch64.S: New.
+       * cipher/sm4.c (USE_AARCH64_SIMD): New.
+       (SM4_context) [USE_AARCH64_SIMD]: Add 'use_aarch64_simd'.
+       [USE_AARCH64_SIMD] (_gcry_sm4_aarch64_crypt)
+       (_gcry_sm4_aarch64_ctr_enc, _gcry_sm4_aarch64_cbc_dec)
+       (_gcry_sm4_aarch64_cfb_dec, _gcry_sm4_aarch64_crypt_blk1_8)
+       (sm4_aarch64_crypt_blk1_8): New.
+       (sm4_setkey): Enable ARMv8/AArch64 if supported by HW.
+       (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
+       (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_AARCH64_SIMD]:
+       Add ARMv8/AArch64 bulk functions.
+       * configure.ac: Add 'sm4-aarch64.lo'.
+
+       Move VPUSH_API/VPOP_API macros to common header.
+       + commit 83e1649edd5eedd8faf24e5c10cb643218ce3c6f
+       * cipher/asm-common-aarch64.h: Add VPUSH_API/VPOP_API/CLEAR_REG macros.
+       * cipher/cipher-gcm-armv8-aarch64-ce.S: Remove common macros.
+
+2022-02-22  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       Perform AEAD input 24KiB splitting only when input larger than 32KiB.
+       + commit 2508b755608ce182a7e570dc2717a6a70346b927
+       * cipher/chacha20.c (_gcry_chacha20_poly1305_encrypt)
+       (_gcry_chacha20_poly1305_decrypt): Process in 24KiB chunks if input
+       larger than 32KiB.
+       * cipher/cipher-ccm.c (_gcry_cipher_ccm_encrypt)
+       (_gcry_cipher_ccm_decrypt): Likewise.
+       * cipher/cipher-eax.c (_gcry_cipher_eax_encrypt)
+       (_gcry_cipher_eax_decrypt): Likewise.
+       * cipher/cipher-gcm.c (gcm_cipher_inner): Likewise.
+       * cipher/cipher-ocb.c (ocb_crypt): Likewise.
+       * cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt)
+       (_gcry_cipher_poly1305_decrypt): Likewise.
 
-2022-02-22  NIIBE Yutaka  <gniibe@fsij.org>
+2022-02-17  NIIBE Yutaka  <gniibe@fsij.org>
 
        fips: Clarify what to be hashed for the integrity check.
-       + commit 9fa4c8946ac5e79c37941c6264ac3d6314d9a3c0
+       + commit 052c5ef4cea56772b7015e36f231fa0bcbf91410
        * src/fips.c (get_file_offset): Compute the maximum offset
        of segments.
        * src/gen-note-integrity.sh: Likewise.
 
        fips: Fix gen-note-integrity.sh script not to use cmp utility.
-       + commit ad8b67f9e21982c841f31e92d2639f726f7ea4be
+       + commit 3c8b6c4a9cad59c5e1db5706f6774a3141b60210
        * src/gen-note-integrity.sh: Simplify detecting 32-bit machine
        or 64-bit machine.
 
+       Silence compiler warnings for possible alignment problem.
+       + commit 5420cbbd3ec7ebf081224796e8d1f8299f7ad985
+       * cipher/kdf.c (balloon_final): Fix the cast.
+       (_gcry_kdf_compute, _gcry_kdf_final, _gcry_kdf_close): Likewise.
+
+2022-02-16  NIIBE Yutaka  <gniibe@fsij.org>
+
        fips: More portable integrity check.
-       + commit dcc6979fd2ed32bb5a5e448e2c9da1158c1d93c2
+       + commit a340e980388243ceae6df57d101036f3f2a955be
        * src/Makefile.am (EXTRA_DIST): Change the name of the script.
        (libgcrypt.la.done): Invoke OBJCOPY with --add-section.
        (libgcrypt.so.hmac): Specify ECHO_N.
        Generate ElfN_Nhdr, and then the hmac.
 
        fips: Integrity check improvement, with only loadable segments.
-       + commit 974f4c7e698b0c1ffe3de82bad9b3f8813d1f42b
+       + commit 9dcf9305962b90febdf2d7cc73b49feadbf6a01f
        * configure.ac (READELF): Check the tool.
        * src/Makefile.am (libgcrypt.so.hmac): Use genhmac.sh with hmac256.
        * src/fips.c (get_file_offsets): Rename from get_file_offset.
        (hmac256_check): Finish scanning at the end of loadable segments.
        * src/genhmac.sh: New.
 
-2022-02-22  Clemens Lang  <cllang@redhat.com>
+       build: Clean up acinclude.m4.
+       + commit b2f110f99626afce84c23c76db0ebaaadac4ee48
+       * acinclude.m4 (GNUPG_CHECK_TYPEDEF): Remove.  We can just use
+       AC_CHECK_TYPES.
+       (GNUPG_CHECK_GNUMAKE): Remove.  It may build with other Make.
+       (GNUPG_SYS_LIBTOOL_CYGWIN32): Remove.  Now, it's done by LT_INIT.
+       (TYPE_SOCKLEN_T): Remove.  Now, we use gl_TYPE_SOCKLEN_T.
+
+       build: Remove checking Pth library.
+       + commit d98a78f0eb7711d32f473a04ea417e0f0bde6a00
+       * acinclude.m4 (GNUPG_PTH_VERSION_CHECK): Remove.
+       * configure.ac (PTH_CFLAGS, PTH_LIBS): Remove.
+
+2022-02-15  Clemens Lang via Gcrypt-devel  <gcrypt-devel@lists.gnupg.org>
 
        fips: Use ELF header to find hmac file offset.
-       + commit 4ed49a917212507de8679aaf08504922a95cf6ef
+       + commit beb5d6df5c5785db7c32a24a5d2a351cb964bfbc
        * src/fips.c [ENABLE_HMAC_BINARY_CHECK] (hmac256_check): Use ELF headers
          to locate the file offset for the HMAC in addition to information from
          the loader
 
-2022-02-21  NIIBE Yutaka  <gniibe@fsij.org>
-
-       Silence compiler warnings for possible alignment problem.
-       + commit 64fef214025949a1b0a76355b99c85594caea4ca
-       * cipher/kdf.c (_gcry_kdf_compute, _gcry_kdf_final, _gcry_kdf_close):
-       Fix the cast.
-
-2022-02-18  NIIBE Yutaka  <gniibe@fsij.org>
+2022-02-15  NIIBE Yutaka  <gniibe@fsij.org>
 
        build: Fix m4/gpg-error.m4.
-       + commit b5b7b8c5c76838350f1857a40c428c9092f9da8e
+       + commit bff9f1b024647e18b2c87dcd769c0e449f7752e6
        * m4/gpg-error.m4: Unset GPGRT_CONFIG when it doesn't work well.
 
 2022-02-14  Clemens Lang via Gcrypt-devel  <gcrypt-devel@lists.gnupg.org>
 
        hmac: Fix memory leak.
-       + commit 2bdc6614c866b0197f534e5cf3ec35d9f024facd
+       + commit 6994d874af865cc6ba95a8a6eb5a8fe048e88e07
        * src/hmac.c: Release HMAC256 context
 
        fips: Fix memory leaks in FIPS mode.
-       + commit a60f8e43dd1b02adf7d1fd54c2e1d27564dd12c1
+       + commit 7fc5d33e74164519edcd8127a35cc21228d2727f
        * cipher/pubkey.c (_gcry_pk_sign_md): Fix memory leak in FIPS mode when
          used with SHA1
        * tests/basic.c (check_one_cipher_core): Add missing free in error code
        * tests/dsa-rfc6979.c (check_dsa_rfc6979): Likewise
        * tests/pubkey.c (check_x931_derived_key): Likewise
 
-2022-02-14  NIIBE Yutaka  <gniibe@fsij.org>
+2022-02-10  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Clean up for removal of memory guard support.
+       + commit 738723a1cd20b24fc667a457163a4aa6617dc182
+       * mpi/mpiutil.c (_gcry_mpi_m_check): Remove.
+       * src/g10lib.h (_gcry_check_heap): Remove.
+       * src/global.c (_gcry_check_heap): Remove.
+       * src/mpi.h (mpi_m_check): Remove.
+
+       Remove the built-in memory guard support.
+       + commit f98ca6aa34ccdbbaf94f93ae30beafe400303c97
+       * configure.ac (--enable-m-guard): Remove.
+       * src/global.c (_gcry_vcontrol): Return GPG_ERR_NOT_SUPPORTED for
+       GCRYCTL_ENABLE_M_GUARD.
+       * src/stdmem.c (use_m_guard, _gcry_private_enable_m_guard): Remove.
+       (_gcry_private_malloc): Remove the code path with use_m_guard==1.
+       (_gcry_private_malloc_secure): Likewise.
+       (_gcry_private_realloc, _gcry_private_free): Likewise.
+       (_gcry_private_check_heap): Remove.
+       * src/stdmem.h: Remove declarations for memory guard functions.
+
+2022-02-09  NIIBE Yutaka  <gniibe@fsij.org>
+
+       Fix memory allocation when GCRYCTL_ENABLE_M_GUARD.
+       + commit 90f41a1898e421c04080d35d7fea98ee18e74865
+       * configure.ac: Add check for ALIGNOF_LONG_DOUBLE.
+       * src/stdmem.c: Adjust EXTRA_ALIGN.
+
+       kdf: Prepare aligned memory in balloon_final.
+       + commit 6936f234220d12a87fe17f7fbdbb29ba9787dd95
+       * cipher/kdf.c (BALLOON_BLOCK_LEN_MAX): Rename from
+       BALLOON_SALT_LEN_MAX.
+       (balloon_xor_block): Revert the previous change.
+       (balloon_final): Prepare memory for u64.
+
+2022-02-08  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       kdf: fix unaligned memory access in balloon_xor_block.
+       + commit 59b2504df8f02f82adf0dd83db219bc451b0d9cc
+       * cipher/kdf.c (balloon_xor_block): Use 'buf_xor' helper function; Change
+       pointer parameters to 'void *' type.
+       (balloon_final): Don't cast last_block to 'u64 *' for balloon_xor_block
+       call.
+
+2022-02-08  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kdf: Extend original Balloon to other digest algos.
+       + commit 26bfa41e2413621c8e73dacba9b7c5ed69a1da35
+       * cipher/kdf.c (struct balloon_thread_data): Use pointer to md_spec.
+       (prng_aes_ctr_init): Select relevant cipher to match BLKLEN.
+       Use other half of digest for IV if not original Balloon.
+       (balloon_open): Support other digest algos.
+       (balloon_final): Check the error code in compute.
+
+2022-02-07  NIIBE Yutaka  <gniibe@fsij.org>
+
+       kdf: Add experimental Balloon KDF.
+       + commit 08ab32228ad20fd730979d700bf46b18e469703c
+       * cipher/kdf.c (prng_aes_ctr_init, prng_aes_ctr_get_rand64): New.
+       (prng_aes_ctr_fini, balloon_context_size): New.
+       (balloon_open): Implement with SHA-256.
+       (balloon_xor_block, balloon_compress, balloon_expand): New.
+       (balloon_compute_fill, balloon_compute_mix, balloon_compute): New.
+       (balloon_compute_all, balloon_final, balloon_close): New.
+       (_gcry_kdf_open): Check argument for GCRY_KDF_BALLOON.
+       (_gcry_kdf_compute): Dispatch for GCRY_KDF_BALLOON.
+       (_gcry_kdf_final, _gcry_kdf_close): Likewise.
+       * tests/t-kdf.c (check_balloon): New.
+       (main): Add check_balloon.
 
        kdf: Use u64.
-       + commit 6683007d696dfe64640dc741c4332784ec246388
+       + commit e257fe39b8ffafa3b1fc72b00db1ea43d29c9983
        * cipher/kdf.c (rotr64): We use u64 in libgcrypt.
 
 2022-02-04  Heiko Becker  <heirecka@exherbo.org>
 
        jitterentropy: Include <fcntl.h> and <limits.h>
-       + commit ffaef0be613121d3ee37867d82932a7a30c2bc6d
+       + commit a484bd665f4afaf39eaa0fdf3aa1c7dae75078b4
        * random/jitterentropy-base-user.h: Include <fcntl.h> for O_RDONLY
        * random/jitterentropy-base-user.h: Include <limits.h> for LONG_MAX
 
-2022-02-01  Werner Koch  <wk@gnupg.org>
+2022-02-02  Jussi Kivilinna  <jussi.kivilinna@iki.fi>
+
+       hwf-arm: add detection of ARMv8 crypto extension by toolchain config.
+       + commit d480db6e6c806835405c6795024a8fd91448c16b
+       * src/hwf-arm.c (detect_arm_hwf_by_toolchain): New.
+       (_gcry_hwf_detect_arm): Move __ARM_NEON check to
+       'detect_arm_hwf_by_toolchain' and add call to the new function.
 
-       Release 1.10.0.
-       + commit e4ab2147f3e236f7be95f9709ce09193b2ca5c1a
+2022-02-02  NIIBE Yutaka  <gniibe@fsij.org>
 
+       Remove random-daemon server and util.
+       + commit d918d8aee27987c3bb99f44a7ed3508d81009f51
+       * configure.ac (--enable-random-daemon): Remove.
+       * src/Makefile.am: Remove ENABLE_RANDOM_DAEMON things.
+       * src/gcryptrnd.c, src/getrandom.c: Remove.
 
 2022-01-31  Werner Koch  <wk@gnupg.org>
 
 
 2022-01-26  NIIBE Yutaka  <gniibe@fsij.org>
 
-       kdf: Improve new KDF API.
-       + commit 5d1da2c61981243729365724f14e3d4abacb0e6a
-       * cipher/kdf.c (struct argon2_thread_data): Change layout.
-       (argon2_iterator): Use struct gcry_kdf_pt_head.
-       (argon2_compute_segment): Rename from argon2_compute_row.
-       (argon2_open): Handle N_THREAD maximum.
-       (_gcry_kdf_iterator): Use struct gcry_kdf_pt_head.
-       (_gcry_kdf_compute_segment): Rename from _gcry_kdf_compute_row.
-       * src/gcrypt-int.h: Update declarations.
-       * src/gcrypt.h.in (struct gcry_kdf_pt_head): Expose the data type.
-       * src/libgcrypt.def, src/libgcrypt.vers: Update.
-       * src/visibility.c, src/visibility.h: Update.
-       * tests/t-kdf.c (start_thread, my_kdf_derive): Follow the change.
-
        kdf: Improve new KDF API.
        + commit f21871e241e96148cef3ad4314ad596178cf8967
        * cipher/kdf.c (struct argon2_thread_data): Change layout.
index 8be7fb244640465e860566258734ccf5f0f1ad3b..c2fea82dc58c77805b5bf0e61de901114cf2a810 100644 (file)
--- a/LICENSES
+++ b/LICENSES
@@ -19,6 +19,7 @@ with any binary distributions derived from the GNU C Library.
   - cipher/sha512-avx2-bmi2-amd64.S
   - cipher/sha512-ssse3-amd64.S
   - cipher/sha512-ssse3-i386.c
+  - cipher/sha512-avx512-amd64.S
 
 #+begin_quote
   Copyright (c) 2012, Intel Corporation
@@ -55,6 +56,36 @@ with any binary distributions derived from the GNU C Library.
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #+end_quote
 
+  For files:
+  - cipher/poly1305-amd64-avx512.S
+
+#+begin_quote
+   Copyright (c) 2021-2022, Intel Corporation
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+       * Redistributions of source code must retain the above copyright notice,
+         this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of Intel Corporation nor the names of its contributors
+         may be used to endorse or promote products derived from this software
+         without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#+end_quote
+
   For files:
   - random/jitterentropy-base.c
   - random/jitterentropy-gcd.c
@@ -108,6 +139,7 @@ with any binary distributions derived from the GNU C Library.
 
   For files:
   - cipher/cipher-gcm-ppc.c
+  - cipher/keccak-amd64-avx512.S
 
 #+begin_quote
  Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
index 87e8de6c3a4314c9ffdda2362795caaa6a900774..d60804ee85bac5285d50b368a20ca318c782273c 100644 (file)
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
 # SPDX-License-Identifier: LGPL-2.1-or-later
 
 # Location of the released tarball archives.  This is prefixed by
 # the variable RELEASE_ARCHIVE in ~/.gnupg-autogen.rc.  For example:
 # RELEASE_ARCHIVE=wk@somehost:archive/tarballs
-RELEASE_ARCHIVE_SUFFIX  = libgcrypt/v1.10
+RELEASE_ARCHIVE_SUFFIX  = libgcrypt/v1.11
 # The variable RELEASE_SIGNING_KEY in ~/.gnupg-autogen.rc is used
 # to specify the key for signing.  For example:
 # RELEASE_SIGNKEY=D8692123C4065DEA5E0F3AB5249B39D24F25E3B6
index 43393fcdd98bd939b9f5fc1a98b8143109fc5694..09270516aeab30f137ce4dc7ceb489a569c93b36 100644 (file)
@@ -29,7 +29,7 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
 # SPDX-License-Identifier: LGPL-2.1-or-later
 VPATH = @srcdir@
 am__is_gnu_make = { \
@@ -111,8 +111,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
@@ -329,9 +329,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -403,7 +400,7 @@ top_srcdir = @top_srcdir@
 # Location of the released tarball archives.  This is prefixed by
 # the variable RELEASE_ARCHIVE in ~/.gnupg-autogen.rc.  For example:
 # RELEASE_ARCHIVE=wk@somehost:archive/tarballs
-RELEASE_ARCHIVE_SUFFIX = libgcrypt/v1.10
+RELEASE_ARCHIVE_SUFFIX = libgcrypt/v1.11
 # The variable RELEASE_SIGNING_KEY in ~/.gnupg-autogen.rc is used
 # to specify the key for signing.  For example:
 # RELEASE_SIGNKEY=D8692123C4065DEA5E0F3AB5249B39D24F25E3B6
diff --git a/NEWS b/NEWS
index b767dc1170eb479b9a311cca4074c58e4eedaf0b..a06199e78ae36407409395fad56f2f4ca7543010 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,123 @@
-Noteworthy changes in version 1.10.3 (2023-11-14)  [C24/A4/R3]
+Noteworthy changes in version 1.11.0 (2024-06-19)  [C25/A5/R0]
 -------------------------------------------------
 
+ * New and extended interfaces:
+
+   - Add an API for Key Encapsulation Mechanism (KEM).  [T6755]
+
+   - Add Streamlined NTRU Prime sntrup761 algorithm.  [rCcf9923e1a5]
+
+   - Add Kyber algorithm according to FIPS 203 ipd 2023-08-24.
+     [rC18e5c0d268]
+
+   - Add Classic McEliece algorithm.  [rC003367b912]
+
+   - Add One-Step KDF with hash and MAC.  [T5964]
+
+   - Add KDF algorithm HKDF of RFC-5869.  [T5964]
+
+   - Add KDF algorithm X963KDF for use in CMS.  [rC3abac420b3]
+
+   - Add GMAC-SM4 and Poly1305-SM4.  [rCd1ccc409d4]
+
+   - Add ARIA block cipher algorithm.  [rC316c6d7715]
+
+   - Add explicit FIPS indicators for MD and MAC algorithms.  [T6376]
+
+   - Add support for SHAKE as MGF in RSA.  [T6557]
+
+   - Add gcry_md_read support for SHAKE algorithms.  [T6539]
+
+   - Add gcry_md_hash_buffers_ext function.  [T7035]
+
+   - Add cSHAKE hash algorithm.  [rC065b3f4e02]
+
+   - Support internal generation of IV for AEAD cipher mode.  [T4873]
+
+ * Performance:
+
+   - Add SM3 ARMv8/AArch64/CE assembly implementation.  [rCfe891ff4a3]
+
+   - Add SM4 ARMv8/AArch64 assembly implementation.  [rCd8825601f1]
+
+   - Add SM4 GFNI/AVX2 and GFNI/AVX512 implementation.
+     [rC5095d60af4,rCeaed633c16]
+
+   - Add SM4 ARMv9 SVE CE assembly implementation.  [rC2dc2654006]
+
+   - Add PowerPC vector implementation of SM4.  [rC0b2da804ee]
+
+   - Optimize ChaCha20 and Poly1305 for PPC P10 LE.  [T6006]
+
+   - Add CTR32LE bulk acceleration for AES on PPC.  [rC84f2e2d0b5]
+
+   - Add generic bulk acceleration for CTR32LE mode (GCM-SIV) for SM4
+     and Camellia.  [rCcf956793af]
+
+   - Add GFNI/AVX2 implementation of Camellia.  [rC4e6896eb9f]
+
+   - Add AVX2 and AVX512 accelerated implementations for GHASH (GCM)
+     and POLYVAL (GCM-SIV).  [rCd857e85cb4, rCe6f3600193]
+
+   - Add AVX512 implementation for SHA512.  [rC089223aa3b]
+
+   - Add AVX512 implementation for Serpent.  [rCce95b6ec35]
+
+   - Add AVX512 implementation for Poly1305 and ChaCha20
+     [rCcd3ed49770, rC9a63cfd617]
+
+   - Add AVX512 accelerated implementation for SHA3 and Blake2
+     [rCbeaad75f46,rC909daa700e]
+
+   - Add VAES/AVX2 accelerated i386 implementation for AES.
+     [rC4a42a042bc]
+
+   - Add bulk processing for XTS mode of Camellia and SM4.
+     [rC32b18cdb87, rCaad3381e93]
+
+   - Accelerate XTS and ECB modes for Twofish and Serpent.
+     [rCd078a928f5,rC8a1fe5f78f]
+
+   - Add AArch64 crypto/SHA512 extension implementation for
+     SHA512. [rCe51d3b8330]
+
+   - Add AArch64 crypto-extension implementation for Camellia.
+     [rC898c857206]
+
+   - Accelerate OCB authentication on AMD with AVX2.  [rC6b47e85d65]
+
+ * Bug fixes:
+
+   - For PowerPC check for missing optimization level for vector
+     register usage.  [T5785]
+
+   - Fix EdDSA secret key check.  [T6511]
+
+   - Fix decoding of PKCS#1-v1.5 and OAEP padding.  [rC34c2042792]
+
+   - Allow use of PKCS#1-v1.5 with SHA3 algorithms.  [T6976]
+
+   - Fix AESWRAP padding length check.  [T7130]
+
+ * Other:
+
+   - Allow empty password for Argon2 KDF.  [rCa20700c55f]
+
+   - Various constant time operation improvements.
+
+   - Add "bp256", "bp384", "bp512" aliases for Brainpool curves.
+
+   - Support for the random server has been removed.  [T5811]
+
+   - The control code GCRYCTL_ENABLE_M_GUARD is deprecated and not
+     supported any more.  Please use valgrind or other tools.  [T5822]
+
+   - Logging is now done via the libgpg-error logging functions.
+     [rCab0bdc72c7]
+
+
+ Changes also found in 1.10.3:
+
  * Bug fixes:
 
    - Fix public key computation for other EdDSA curves.
@@ -23,11 +140,7 @@ Noteworthy changes in version 1.10.3 (2023-11-14)  [C24/A4/R3]
      [T6619]
 
 
- Release-info: https://dev.gnupg.org/T6817
-
-
-Noteworthy changes in version 1.10.2 (2023-04-06)  [C24/A4/R2]
--------------------------------------------------
+ Changes also found in 1.10.2:
 
  * Bug fixes:
 
@@ -98,11 +211,7 @@ Noteworthy changes in version 1.10.2 (2023-04-06)  [C24/A4/R2]
    - Add explicit FIPS indicators for hash and MAC algorithms. [T6376]
 
 
- Release-info: https://dev.gnupg.org/T5905
-
-
-Noteworthy changes in version 1.10.1 (2022-03-28)  [C24/A4/R1]
--------------------------------------------------
+ Changes also found in 1.10.1:
 
  * Bug fixes:
 
@@ -116,7 +225,68 @@ Noteworthy changes in version 1.10.1 (2022-03-28)  [C24/A4/R1]
 
    - Add X9.62 OIDs to sha256 and sha512 modules.  [rC52fd2305ba]
 
- Release-info: https://dev.gnupg.org/T5810
+
+ * Interface changes relative to the 1.10.0 release:
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+   GCRY_CIPHER_ARIA128                   NEW cipher algo.
+   GCRY_CIPHER_ARIA192                   NEW cipher algo.
+   GCRY_CIPHER_ARIA256                   NEW cipher algo.
+   gcry_cipher_geniv_methods             NEW type.
+   gcry_cipher_setup_geniv               NEW function.
+   gcry_cipher_geniv                     NEW function.
+   GCRY_PK_KEM                           NEW constant.
+   GCRY_MD_CSHAKE128                     NEW hash algo.
+   GCRY_MD_CSHAKE256                     NEW hash algo.
+   GCRYCTL_MD_CUSTOMIZE                  NEW control code.
+   gcry_cshake_customization             NEW type.
+   GCRY_MAC_CMAC_ARIA                    NEW mac algo.
+   GCRY_MAC_GMAC_SM4                     NEW mac algo.
+   GCRY_MAC_GMAC_ARIA                    NEW mac algo.
+   GCRY_MAC_POLY1305_SM4                 NEW mac algo.
+   GCRY_MAC_POLY1305_ARIA                NEW mac algo.
+   GCRY_KDF_ONESTEP_KDF                  NEW kdf algo.
+   GCRY_KDF_ONESTEP_KDF_MAC              NEW kdf algo.
+   GCRY_KDF_X963_KDF                     NEW kdf algo.
+   gcry_kem_algos                        NEW type.
+   gcry_kem_keypair                      NEW function.
+   gcry_kem_encap                        NEW function.
+   gcry_kem_decap                        NEW function.
+   GCRY_KEM_SNTRUP761                    NEW kem algo.
+   GCRY_KEM_CM6688128F                   NEW kem algo.
+   GCRY_KEM_MLKEM512                     NEW kem algo.
+   GCRY_KEM_MLKEM768                     NEW kem algo.
+   GCRY_KEM_MLKEM1024                    NEW kem algo.
+   GCRY_KEM_RAW_X25519                   NEW kem algo.
+   GCRY_KEM_RAW_X448                     NEW kem algo.
+   GCRY_KEM_RAW_BP256                    NEW kem algo.
+   GCRY_KEM_RAW_BP384                    NEW kem algo.
+   GCRY_KEM_RAW_BP512                    NEW kem algo.
+   GCRY_KEM_RAW_P256R1                   NEW kem algo.
+   GCRY_KEM_RAW_P384R1                   NEW kem algo.
+   GCRY_KEM_RAW_P521R1                   NEW kem algo.
+   GCRY_KEM_DHKEM25519                   NEW kem algo.
+   GCRY_KEM_DHKEM448                     NEW kem algo.
+   GCRY_KEM_DHKEMP256R1                  NEW kem algo.
+   GCRY_KEM_DHKEMP384R1                  NEW kem algo.
+   GCRY_KEM_DHKEMP521R1                  NEW kem algo.
+   GCRY_KEM_*_SECKEY_LEN                 NEW constants.
+   GCRY_KEM_*_PUBKEY_LEN                 NEW constants.
+   GCRY_KEM_*_ENCAPS_LEN                 NEW constants.
+   GCRY_KEM_*_CIPHER_LEN                 NEW constants.
+   GCRY_KEM_*_SHARED_LEN                 NEW constants.
+   gcry_md_hash_buffers_ext              NEW function.
+   gcry_pk_input_data_push               NEW macro.
+   GCRYCTL_ENABLE_M_GUARD                DEPRECATED feature.
+   gcry_handler_log_t                    DEPRECATED type.
+   gcry_set_log_handler                  DEPRECATED function.
+
+
+Release dates of 1.10 versions
+------------------------------
+
+ Version 1.10.3 (2023-11-14) https://dev.gnupg.org/T6817
+ Version 1.10.2 (2023-04-06) https://dev.gnupg.org/T5905
+ Version 1.10.1 (2022-03-28) https://dev.gnupg.org/T5810
 
 
 Noteworthy changes in version 1.10.0 (2022-02-01)  [C24/A4/R0]
@@ -170,7 +340,7 @@ Noteworthy changes in version 1.10.0 (2022-02-01)  [C24/A4/R0]
    - Simplification of the entropy gatherer when using the getentropy
      system call.
 
- * Interface changes relative to the 1.10.0 release:
+ * Interface changes relative to the 1.9.0 release:
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    GCRYCTL_SET_DECRYPTION_TAG            NEW control code.
    GCRYCTL_FIPS_SERVICE_INDICATOR_CIPHER NEW control code.
@@ -201,9 +371,11 @@ Noteworthy changes in version 1.10.0 (2022-02-01)  [C24/A4/R0]
 
  Release-info: https://dev.gnupg.org/T5691
 
- Release dates of 1.9.x versions:
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-   Version 1.9.4 (2021-08-22)
+
+Release dates of 1.9 versions
+-----------------------------
+
+ Version 1.9.4 (2021-08-22) https://dev.gnupg.org/T5402
 
 
 Noteworthy changes in version 1.9.3 (2021-04-19)  [C23/A3/R3]
@@ -225,7 +397,7 @@ Noteworthy changes in version 1.9.3 (2021-04-19)  [C23/A3/R3]
 
    - Make keygrip computation work for compressed points.  [#4961]
 
- * Performance:
+* Performance:
 
    - Add x86_64 VAES/AVX2 accelerated implementation of Camellia.
      [0e7e60241a]
@@ -504,14 +676,16 @@ Noteworthy changes in version 1.9.0 (2021-01-19)  [C23/A3/R0]
 
  Release-info: https://dev.gnupg.org/T4294
 
- Release dates of 1.8.x versions:
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-   Version 1.8.2 (2017-12-13)
-   Version 1.8.3 (2018-06-13)
-   Version 1.8.4 (2018-10-26)
-   Version 1.8.5 (2019-08-29)
-   Version 1.8.6 (2020-07-06)
-   Version 1.8.7 (2020-10-23)
+
+Release dates of 1.8 versions
+-----------------------------
+
+ Version 1.8.7 (2020-10-23)
+ Version 1.8.6 (2020-07-06)
+ Version 1.8.5 (2019-08-29)
+ Version 1.8.4 (2018-10-26)
+ Version 1.8.3 (2018-06-13)
+ Version 1.8.2 (2017-12-13)
 
 
 Noteworthy changes in version 1.8.1 (2017-08-27)  [C22/A2/R1]
@@ -633,15 +807,16 @@ Noteworthy changes in version 1.8.0 (2017-07-18)  [C22/A2/R0]
    GCRY_CIPHER_MODE_XTS            NEW constant.
    gcry_md_info                    DEPRECATED.
 
- * Release dates of 1.7.x versions:
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-   Version 1.7.10 (2018-06-13) [C21/A1/R10]
-   Version 1.7.9  (2017-08-27) [C21/A1/R9]
-   Version 1.7.8  (2017-06-29) [C21/A1/R8]
-   Version 1.7.7  (2017-06-02) [C21/A1/R7]
-   Version 1.7.6  (2017-01-18) [C21/A1/R6]
-   Version 1.7.5  (2016-12-15) [C21/A1/R5]
-   Version 1.7.4  (2016-12-09) [C21/A1/R4]
+Release dates of 1.7 versions
+-----------------------------
+
+ Version 1.7.10 (2018-06-13) [C21/A1/R10]
+ Version 1.7.9  (2017-08-27) [C21/A1/R9]
+ Version 1.7.8  (2017-06-29) [C21/A1/R8]
+ Version 1.7.7  (2017-06-02) [C21/A1/R7]
+ Version 1.7.6  (2017-01-18) [C21/A1/R6]
+ Version 1.7.5  (2016-12-15) [C21/A1/R5]
+ Version 1.7.4  (2016-12-09) [C21/A1/R4]
 
 
 Noteworthy changes in version 1.7.3 (2016-08-17)  [C21/A1/R3]
diff --git a/README b/README
index 3c174a36f6664eadc758e0d00d45ddd1c47b5e8e..7733dbdf9b4ac592693dfbc949a5eb70178418d7 100644 (file)
--- a/README
+++ b/README
@@ -1,10 +1,10 @@
                    Libgcrypt - The GNU Crypto Library
                   ------------------------------------
-                             Version 1.10
+                             Version 1.11
 
        Copyright (C) 1989,1991-2018 Free Software Foundation, Inc.
-       Copyright (C) 2012-2023 g10 Code GmbH
-       Copyright (C) 2013-2023 Jussi Kivilinna
+       Copyright (C) 2012-2024 g10 Code GmbH
+       Copyright (C) 2013-2024 Jussi Kivilinna
 
     Libgcrypt is free software.  See the file AUTHORS for full copying
     notices, and LICENSES for notices about contributions that require
                      With this option a "make check" will take really
                      long due to extra checks for the hash algorithms.
 
-     --enable-m-guard
-                     Enable the integrated malloc checking code. Please
-                     note that this feature does not work on all CPUs
-                     (e.g. SunOS 5.7 on UltraSparc-2) and might give
-                     you a bus error.
-
      --disable-asm
                      Do not use assembler modules.  It is not possible
                      to use this on some CPU types.
     Commercial grade support for Libgcrypt is available; for a listing
     of offers see https://www.gnupg.org/service.html .
 
-    Since 2001 maintenance and development of Libgcrypt is done by
-    g10 Code GmbH and until 2021 mostly financed by donations.
+    Since 2001 maintenance and development of Libgcrypt is done by g10
+    Code GmbH and was mostly financed by donations; since 2022 a rise
+    in revenues from support contracts allows us to fully finance the
+    development without resorting to donations.  Many thanks to our
+    paid developers for their work and also a big thank you to Jussi
+    Kivilinna for all of his performance work.
 
   This file is Free Software; as a special exception the authors gives
   unlimited permission to copy and/or distribute it, with or without
diff --git a/VERSION b/VERSION
index 587c5f0c73096f6b7f367f349f207d6226691778..1cac385c6cb864bab53f6846e112f5a93fd17401 100644 (file)
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.10.3
+1.11.0
index 05bf88a7965255708eb6ea81df6edd6f65144815..782b7f6acf3718d2ed8b58d4ee9aa2ff1dd3b348 100644 (file)
@@ -16,8 +16,8 @@ dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 dnl GNU Lesser General Public License for more details.
 dnl
 dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with this program; if not, write to the Free Software
-dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+dnl License along with this program; if not, see <https://www.gnu.org/licenses/>.
+dnl SPDX-License-Identifier: LGPL-2.1-or-later
 
 dnl GCRY_MSG_SHOW(PREFIX,STRING)
 dnl Print a message with a prefix.
@@ -56,43 +56,6 @@ define([GCRY_MSG_WRAP],
   ])
 
 
-dnl GNUPG_CHECK_TYPEDEF(TYPE, HAVE_NAME)
-dnl Check whether a typedef exists and create a #define $2 if it exists
-dnl
-AC_DEFUN([GNUPG_CHECK_TYPEDEF],
-  [ AC_MSG_CHECKING(for $1 typedef)
-    AC_CACHE_VAL(gnupg_cv_typedef_$1,
-    [AC_TRY_COMPILE([#define _GNU_SOURCE 1
-    #include <stdlib.h>
-    #include <sys/types.h>], [
-    #undef $1
-    int a = sizeof($1);
-    ], gnupg_cv_typedef_$1=yes, gnupg_cv_typedef_$1=no )])
-    AC_MSG_RESULT($gnupg_cv_typedef_$1)
-    if test "$gnupg_cv_typedef_$1" = yes; then
-        AC_DEFINE($2,1,[Defined if a `]$1[' is typedef'd])
-    fi
-  ])
-
-
-dnl GNUPG_CHECK_GNUMAKE
-dnl
-AC_DEFUN([GNUPG_CHECK_GNUMAKE],
-  [
-    if ${MAKE-make} --version 2>/dev/null | grep '^GNU ' >/dev/null 2>&1; then
-        :
-    else
-        AC_MSG_WARN([[
-***
-*** It seems that you are not using GNU make.  Some make tools have serious
-*** flaws and you may not be able to build this software at all. Before you
-*** complain, please try GNU make:  GNU make is easy to build and available
-*** at all GNU archives.  It is always available from ftp.gnu.org:/gnu/make.
-***]])
-    fi
-  ])
-
-
 #
 # GNUPG_SYS_SYMBOL_UNDERSCORE
 # Does the compiler prefix global symbols with an underscore?
@@ -122,21 +85,21 @@ if test "$tmp_do_check" = "yes"; then
   AC_CACHE_VAL(ac_cv_sys_symbol_underscore,
   [ac_cv_sys_symbol_underscore=no
    cat > conftest.$ac_ext <<EOF
-      void nm_test_func(){}
-      int main(){nm_test_func;return 0;}
+      void nm_test_func(void){}
+      int main(void){nm_test_func();return 0;}
 EOF
   if AC_TRY_EVAL(ac_compile); then
     # Now try to grab the symbols.
-    ac_nlist=conftest.nm
-    if AC_TRY_EVAL(NM conftest.$ac_objext \| $lt_cv_sys_global_symbol_pipe \| cut -d \' \' -f 2 \> $ac_nlist) && test -s "$ac_nlist"; then
+    nlist=conftest.nm
+    if AC_TRY_EVAL(NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist) && test -s "$nlist"; then
       # See whether the symbols have a leading underscore.
-      if egrep '^_nm_test_func' "$ac_nlist" >/dev/null; then
+      if $GREP ' _nm_test_func$' "$nlist" >/dev/null; then
         ac_cv_sys_symbol_underscore=yes
       else
-        if egrep '^nm_test_func ' "$ac_nlist" >/dev/null; then
+        if $GREP ' nm_test_func$' "$nlist" >/dev/null; then
           :
         else
-          echo "configure: cannot find nm_test_func in $ac_nlist" >&AS_MESSAGE_LOG_FD
+          echo "configure: cannot find nm_test_func in $nlist" >&AS_MESSAGE_LOG_FD
         fi
       fi
     else
@@ -216,7 +179,7 @@ mlock(&i, 4);
 #include <sys/types.h>
 #include <fcntl.h>
 
-int main()
+int main(void)
 {
     char *pool;
     int err;
@@ -264,12 +227,6 @@ int main()
     fi
   ])
 
-# GNUPG_SYS_LIBTOOL_CYGWIN32 - find tools needed on cygwin32
-AC_DEFUN([GNUPG_SYS_LIBTOOL_CYGWIN32],
-[AC_CHECK_TOOL(DLLTOOL, dlltool, false)
-AC_CHECK_TOOL(AS, as, false)
-])
-
 dnl LIST_MEMBER()
 dnl Check whether an element ist contained in a list.  Set `found' to
 dnl `1' if the element is found in the list, to `0' otherwise.
@@ -285,108 +242,3 @@ for n in $list; do
   fi
 done
 ])
-
-
-dnl Check for socklen_t: historically on BSD it is an int, and in
-dnl POSIX 1g it is a type of its own, but some platforms use different
-dnl types for the argument to getsockopt, getpeername, etc.  So we
-dnl have to test to find something that will work.
-AC_DEFUN([TYPE_SOCKLEN_T],
-[
-   AC_CHECK_TYPE([socklen_t], ,[
-      AC_MSG_CHECKING([for socklen_t equivalent])
-      AC_CACHE_VAL([socklen_t_equiv],
-      [
-         # Systems have either "struct sockaddr *" or
-         # "void *" as the second argument to getpeername
-         socklen_t_equiv=
-         for arg2 in "struct sockaddr" void; do
-            for t in int size_t unsigned long "unsigned long"; do
-               AC_TRY_COMPILE([
-#include <sys/types.h>
-#include <sys/socket.h>
-
-int getpeername (int, $arg2 *, $t *);
-               ],[
-                  $t len;
-                  getpeername(0,0,&len);
-               ],[
-                  socklen_t_equiv="$t"
-                  break
-               ])
-            done
-         done
-
-         if test "x$socklen_t_equiv" = x; then
-            AC_MSG_ERROR([Cannot find a type to use in place of socklen_t])
-         fi
-      ])
-      AC_MSG_RESULT($socklen_t_equiv)
-      AC_DEFINE_UNQUOTED(socklen_t, $socklen_t_equiv,
-                       [type to use in place of socklen_t if not defined])],
-      [#include <sys/types.h>
-#include <sys/socket.h>])
-])
-
-
-# GNUPG_PTH_VERSION_CHECK(REQUIRED)
-#
-# If the version is sufficient, HAVE_PTH will be set to yes.
-#
-# Taken form the m4 macros which come with Pth
-AC_DEFUN([GNUPG_PTH_VERSION_CHECK],
-  [
-    _pth_version=`$PTH_CONFIG --version | awk 'NR==1 {print [$]3}'`
-    _req_version="ifelse([$1],,1.2.0,$1)"
-
-    AC_MSG_CHECKING(for PTH - version >= $_req_version)
-    for _var in _pth_version _req_version; do
-        eval "_val=\"\$${_var}\""
-        _major=`echo $_val | sed 's/\([[0-9]]*\)\.\([[0-9]]*\)\([[ab.]]\)\([[0-9]]*\)/\1/'`
-        _minor=`echo $_val | sed 's/\([[0-9]]*\)\.\([[0-9]]*\)\([[ab.]]\)\([[0-9]]*\)/\2/'`
-        _rtype=`echo $_val | sed 's/\([[0-9]]*\)\.\([[0-9]]*\)\([[ab.]]\)\([[0-9]]*\)/\3/'`
-        _micro=`echo $_val | sed 's/\([[0-9]]*\)\.\([[0-9]]*\)\([[ab.]]\)\([[0-9]]*\)/\4/'`
-        case $_rtype in
-            "a" ) _rtype=0 ;;
-            "b" ) _rtype=1 ;;
-            "." ) _rtype=2 ;;
-        esac
-        _hex=`echo dummy | awk '{ printf("%d%02d%1d%02d", major, minor, rtype, micro); }' \
-              "major=$_major" "minor=$_minor" "rtype=$_rtype" "micro=$_micro"`
-        eval "${_var}_hex=\"\$_hex\""
-    done
-    have_pth=no
-    if test ".$_pth_version_hex" != .; then
-        if test ".$_req_version_hex" != .; then
-            if test $_pth_version_hex -ge $_req_version_hex; then
-                have_pth=yes
-            fi
-        fi
-    fi
-    if test $have_pth = yes; then
-       AC_MSG_RESULT(yes)
-       AC_MSG_CHECKING([whether PTH installation is sane])
-       AC_CACHE_VAL(gnupg_cv_pth_is_sane,[
-         _gnupg_pth_save_cflags=$CFLAGS
-         _gnupg_pth_save_ldflags=$LDFLAGS
-         _gnupg_pth_save_libs=$LIBS
-         CFLAGS="$CFLAGS `$PTH_CONFIG --cflags`"
-         LDFLAGS="$LDFLAGS `$PTH_CONFIG --ldflags`"
-         LIBS="$LIBS `$PTH_CONFIG --libs`"
-         AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pth.h>
-                                         ],
-                                         [[ pth_init ();]])],
-                        gnupg_cv_pth_is_sane=yes,
-                        gnupg_cv_pth_is_sane=no)
-         CFLAGS=$_gnupg_pth_save_cflags
-         LDFLAGS=$_gnupg_pth_save_ldflags
-         LIBS=$_gnupg_pth_save_libs
-       ])
-       if test $gnupg_cv_pth_is_sane != yes; then
-          have_pth=no
-       fi
-       AC_MSG_RESULT($gnupg_cv_pth_is_sane)
-    else
-       AC_MSG_RESULT(no)
-    fi
-  ])
index 67d20cec88a85e5a23cdb73f97cca9807b66d4cd..8a532282bb952a8435d651101c008c5d6bf2a941 100644 (file)
@@ -1212,5 +1212,4 @@ m4_include([m4/ltsugar.m4])
 m4_include([m4/ltversion.m4])
 m4_include([m4/lt~obsolete.m4])
 m4_include([m4/noexecstack.m4])
-m4_include([m4/socklen.m4])
 m4_include([acinclude.m4])
index c4bd827a7bedcf6f78866a27bf01d896c047b516..7f76b6228f73d674f58cfcc3523f99e253ee5515 100755 (executable)
@@ -1,12 +1,14 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright 1992-2016 Free Software Foundation, Inc.
+#   Copyright 1992-2022 Free Software Foundation, Inc.
 
-timestamp='2016-05-15'
+# shellcheck disable=SC2006,SC2268 # see below for rationale
+
+timestamp='2022-01-09'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
@@ -15,7 +17,7 @@ timestamp='2016-05-15'
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
@@ -27,11 +29,19 @@ timestamp='2016-05-15'
 # Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
 #
 # You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
 #
 # Please send patches to <config-patches@gnu.org>.
 
 
+# The "shellcheck disable" line above the timestamp inhibits complaints
+# about features and limitations of the classic Bourne shell that were
+# superseded or lifted in POSIX.  However, this script identifies a wide
+# variety of pre-POSIX systems that do not have POSIX shells at all, and
+# even some reasonably current systems (Solaris 10 as case-in-point) still
+# have a pre-POSIX /bin/sh.
+
+
 me=`echo "$0" | sed -e 's,.*/,,'`
 
 usage="\
@@ -39,7 +49,7 @@ Usage: $0 [OPTION]
 
 Output the configuration name of the system \`$me' is run on.
 
-Operation modes:
+Options:
   -h, --help         print this help, then exit
   -t, --time-stamp   print date of last modification, then exit
   -v, --version      print version number, then exit
@@ -50,7 +60,7 @@ version="\
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright 1992-2022 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -84,7 +94,8 @@ if test $# != 0; then
   exit 1
 fi
 
-trap 'exit 1' 1 2 15
+# Just in case it came from the environment.
+GUESS=
 
 # CC_FOR_BUILD -- compiler used by this script. Note that the use of a
 # compiler to aid in system detection is discouraged as it requires
@@ -96,66 +107,90 @@ trap 'exit 1' 1 2 15
 
 # Portable tmp directory creation inspired by the Autoconf team.
 
-set_cc_for_build='
-trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
-trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
-: ${TMPDIR=/tmp} ;
- { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
- { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
- { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
- { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
-dummy=$tmp/dummy ;
-tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
-case $CC_FOR_BUILD,$HOST_CC,$CC in
- ,,)    echo "int x;" > $dummy.c ;
-       for c in cc gcc c89 c99 ; do
-         if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
-            CC_FOR_BUILD="$c"; break ;
-         fi ;
-       done ;
-       if test x"$CC_FOR_BUILD" = x ; then
-         CC_FOR_BUILD=no_compiler_found ;
-       fi
-       ;;
- ,,*)   CC_FOR_BUILD=$CC ;;
- ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
-esac ; set_cc_for_build= ;'
+tmp=
+# shellcheck disable=SC2172
+trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15
+
+set_cc_for_build() {
+    # prevent multiple calls if $tmp is already set
+    test "$tmp" && return 0
+    : "${TMPDIR=/tmp}"
+    # shellcheck disable=SC2039,SC3028
+    { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+       { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } ||
+       { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+       { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; }
+    dummy=$tmp/dummy
+    case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in
+       ,,)    echo "int x;" > "$dummy.c"
+              for driver in cc gcc c89 c99 ; do
+                  if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then
+                      CC_FOR_BUILD=$driver
+                      break
+                  fi
+              done
+              if test x"$CC_FOR_BUILD" = x ; then
+                  CC_FOR_BUILD=no_compiler_found
+              fi
+              ;;
+       ,,*)   CC_FOR_BUILD=$CC ;;
+       ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+    esac
+}
 
 # This is needed to find uname on a Pyramid OSx when run in the BSD universe.
 # (ghazi@noc.rutgers.edu 1994-08-24)
-if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+if test -f /.attbin/uname ; then
        PATH=$PATH:/.attbin ; export PATH
 fi
 
 UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
 UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
-UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
 UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 
-case "${UNAME_SYSTEM}" in
+case $UNAME_SYSTEM in
 Linux|GNU|GNU/*)
-       # If the system lacks a compiler, then just pick glibc.
-       # We could probably try harder.
-       LIBC=gnu
+       LIBC=unknown
 
-       eval $set_cc_for_build
-       cat <<-EOF > $dummy.c
+       set_cc_for_build
+       cat <<-EOF > "$dummy.c"
        #include <features.h>
        #if defined(__UCLIBC__)
        LIBC=uclibc
        #elif defined(__dietlibc__)
        LIBC=dietlibc
-       #else
+       #elif defined(__GLIBC__)
        LIBC=gnu
+       #else
+       #include <stdarg.h>
+       /* First heuristic to detect musl libc.  */
+       #ifdef __DEFINED_va_list
+       LIBC=musl
+       #endif
        #endif
        EOF
-       eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
+       cc_set_libc=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
+       eval "$cc_set_libc"
+
+       # Second heuristic to detect musl libc.
+       if [ "$LIBC" = unknown ] &&
+          command -v ldd >/dev/null &&
+          ldd --version 2>&1 | grep -q ^musl; then
+               LIBC=musl
+       fi
+
+       # If the system lacks a compiler, then just pick glibc.
+       # We could probably try harder.
+       if [ "$LIBC" = unknown ]; then
+               LIBC=gnu
+       fi
        ;;
 esac
 
 # Note: order is significant - the case branches are not exclusive.
 
-case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+case $UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION in
     *:NetBSD:*:*)
        # NetBSD (nbsd) targets should (where applicable) match one or
        # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
@@ -167,32 +202,32 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
        #
        # Note: NetBSD doesn't particularly care about the vendor
        # portion of the name.  We always set it to "unknown".
-       sysctl="sysctl -n hw.machine_arch"
        UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
-           /sbin/$sysctl 2>/dev/null || \
-           /usr/sbin/$sysctl 2>/dev/null || \
+           /sbin/sysctl -n hw.machine_arch 2>/dev/null || \
+           /usr/sbin/sysctl -n hw.machine_arch 2>/dev/null || \
            echo unknown)`
-       case "${UNAME_MACHINE_ARCH}" in
+       case $UNAME_MACHINE_ARCH in
+           aarch64eb) machine=aarch64_be-unknown ;;
            armeb) machine=armeb-unknown ;;
            arm*) machine=arm-unknown ;;
            sh3el) machine=shl-unknown ;;
            sh3eb) machine=sh-unknown ;;
            sh5el) machine=sh5le-unknown ;;
            earmv*)
-               arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
-               endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
+               arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
+               endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'`
                machine=${arch}${endian}-unknown
                ;;
-           *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+           *) machine=$UNAME_MACHINE_ARCH-unknown ;;
        esac
        # The Operating System including object format, if it has switched
        # to ELF recently (or will in the future) and ABI.
-       case "${UNAME_MACHINE_ARCH}" in
+       case $UNAME_MACHINE_ARCH in
            earm*)
                os=netbsdelf
                ;;
            arm*|i386|m68k|ns32k|sh3*|sparc|vax)
-               eval $set_cc_for_build
+               set_cc_for_build
                if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
                        | grep -q __ELF__
                then
@@ -208,10 +243,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
                ;;
        esac
        # Determine ABI tags.
-       case "${UNAME_MACHINE_ARCH}" in
+       case $UNAME_MACHINE_ARCH in
            earm*)
                expr='s/^earmv[0-9]/-eabi/;s/eb$//'
-               abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
+               abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"`
                ;;
        esac
        # The OS release
@@ -219,47 +254,68 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
        # thus, need a distinct triplet. However, they do not need
        # kernel version information, so it can be replaced with a
        # suitable tag, in the style of linux-gnu.
-       case "${UNAME_VERSION}" in
+       case $UNAME_VERSION in
            Debian*)
                release='-gnu'
                ;;
            *)
-               release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
+               release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2`
                ;;
        esac
        # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
        # contains redundant information, the shorter form:
        # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-       echo "${machine}-${os}${release}${abi}"
-       exit ;;
+       GUESS=$machine-${os}${release}${abi-}
+       ;;
     *:Bitrig:*:*)
        UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
-       echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE_ARCH-unknown-bitrig$UNAME_RELEASE
+       ;;
     *:OpenBSD:*:*)
        UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
-       echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE_ARCH-unknown-openbsd$UNAME_RELEASE
+       ;;
+    *:SecBSD:*:*)
+       UNAME_MACHINE_ARCH=`arch | sed 's/SecBSD.//'`
+       GUESS=$UNAME_MACHINE_ARCH-unknown-secbsd$UNAME_RELEASE
+       ;;
     *:LibertyBSD:*:*)
        UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
-       echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE_ARCH-unknown-libertybsd$UNAME_RELEASE
+       ;;
+    *:MidnightBSD:*:*)
+       GUESS=$UNAME_MACHINE-unknown-midnightbsd$UNAME_RELEASE
+       ;;
     *:ekkoBSD:*:*)
-       echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-ekkobsd$UNAME_RELEASE
+       ;;
     *:SolidBSD:*:*)
-       echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-solidbsd$UNAME_RELEASE
+       ;;
+    *:OS108:*:*)
+       GUESS=$UNAME_MACHINE-unknown-os108_$UNAME_RELEASE
+       ;;
     macppc:MirBSD:*:*)
-       echo powerpc-unknown-mirbsd${UNAME_RELEASE}
-       exit ;;
+       GUESS=powerpc-unknown-mirbsd$UNAME_RELEASE
+       ;;
     *:MirBSD:*:*)
-       echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-mirbsd$UNAME_RELEASE
+       ;;
     *:Sortix:*:*)
-       echo ${UNAME_MACHINE}-unknown-sortix
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-sortix
+       ;;
+    *:Twizzler:*:*)
+       GUESS=$UNAME_MACHINE-unknown-twizzler
+       ;;
+    *:Redox:*:*)
+       GUESS=$UNAME_MACHINE-unknown-redox
+       ;;
+    mips:OSF1:*.*)
+       GUESS=mips-dec-osf1
+       ;;
     alpha:OSF1:*:*)
+       # Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+       trap '' 0
        case $UNAME_RELEASE in
        *4.0)
                UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
@@ -273,7 +329,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
        # covers most systems running today.  This code pipes the CPU
        # types through head -n 1, so we only detect the type of CPU 0.
        ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
-       case "$ALPHA_CPU_TYPE" in
+       case $ALPHA_CPU_TYPE in
            "EV4 (21064)")
                UNAME_MACHINE=alpha ;;
            "EV4.5 (21064)")
@@ -310,126 +366,121 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
        # A Tn.n version is a released field test version.
        # A Xn.n version is an unreleased experimental baselevel.
        # 1.2 uses "1.2" for uname -r.
-       echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
-       # Reset EXIT trap before exiting to avoid spurious non-zero exit code.
-       exitcode=$?
-       trap '' 0
-       exit $exitcode ;;
-    Alpha\ *:Windows_NT*:*)
-       # How do we know it's Interix rather than the generic POSIX subsystem?
-       # Should we change UNAME_MACHINE based on the output of uname instead
-       # of the specific Alpha model?
-       echo alpha-pc-interix
-       exit ;;
-    21064:Windows_NT:50:3)
-       echo alpha-dec-winnt3.5
-       exit ;;
+       OSF_REL=`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+       GUESS=$UNAME_MACHINE-dec-osf$OSF_REL
+       ;;
     Amiga*:UNIX_System_V:4.0:*)
-       echo m68k-unknown-sysv4
-       exit ;;
+       GUESS=m68k-unknown-sysv4
+       ;;
     *:[Aa]miga[Oo][Ss]:*:*)
-       echo ${UNAME_MACHINE}-unknown-amigaos
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-amigaos
+       ;;
     *:[Mm]orph[Oo][Ss]:*:*)
-       echo ${UNAME_MACHINE}-unknown-morphos
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-morphos
+       ;;
     *:OS/390:*:*)
-       echo i370-ibm-openedition
-       exit ;;
+       GUESS=i370-ibm-openedition
+       ;;
     *:z/VM:*:*)
-       echo s390-ibm-zvmoe
-       exit ;;
+       GUESS=s390-ibm-zvmoe
+       ;;
     *:OS400:*:*)
-       echo powerpc-ibm-os400
-       exit ;;
+       GUESS=powerpc-ibm-os400
+       ;;
     arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
-       echo arm-acorn-riscix${UNAME_RELEASE}
-       exit ;;
+       GUESS=arm-acorn-riscix$UNAME_RELEASE
+       ;;
     arm*:riscos:*:*|arm*:RISCOS:*:*)
-       echo arm-unknown-riscos
-       exit ;;
+       GUESS=arm-unknown-riscos
+       ;;
     SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
-       echo hppa1.1-hitachi-hiuxmpp
-       exit ;;
+       GUESS=hppa1.1-hitachi-hiuxmpp
+       ;;
     Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
        # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
-       if test "`(/bin/universe) 2>/dev/null`" = att ; then
-               echo pyramid-pyramid-sysv3
-       else
-               echo pyramid-pyramid-bsd
-       fi
-       exit ;;
+       case `(/bin/universe) 2>/dev/null` in
+           att) GUESS=pyramid-pyramid-sysv3 ;;
+           *)   GUESS=pyramid-pyramid-bsd   ;;
+       esac
+       ;;
     NILE*:*:*:dcosx)
-       echo pyramid-pyramid-svr4
-       exit ;;
+       GUESS=pyramid-pyramid-svr4
+       ;;
     DRS?6000:unix:4.0:6*)
-       echo sparc-icl-nx6
-       exit ;;
+       GUESS=sparc-icl-nx6
+       ;;
     DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
        case `/usr/bin/uname -p` in
-           sparc) echo sparc-icl-nx7; exit ;;
-       esac ;;
+           sparc) GUESS=sparc-icl-nx7 ;;
+       esac
+       ;;
     s390x:SunOS:*:*)
-       echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+       GUESS=$UNAME_MACHINE-ibm-solaris2$SUN_REL
+       ;;
     sun4H:SunOS:5.*:*)
-       echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+       GUESS=sparc-hal-solaris2$SUN_REL
+       ;;
     sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
-       echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+       GUESS=sparc-sun-solaris2$SUN_REL
+       ;;
     i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
-       echo i386-pc-auroraux${UNAME_RELEASE}
-       exit ;;
+       GUESS=i386-pc-auroraux$UNAME_RELEASE
+       ;;
     i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
-       eval $set_cc_for_build
+       set_cc_for_build
        SUN_ARCH=i386
        # If there is a compiler, see if it is configured for 64-bit objects.
        # Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
        # This test works for both compilers.
-       if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
+       if test "$CC_FOR_BUILD" != no_compiler_found; then
            if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-               (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+               (CCOPTS="" $CC_FOR_BUILD -m64 -E - 2>/dev/null) | \
                grep IS_64BIT_ARCH >/dev/null
            then
                SUN_ARCH=x86_64
            fi
        fi
-       echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+       GUESS=$SUN_ARCH-pc-solaris2$SUN_REL
+       ;;
     sun4*:SunOS:6*:*)
        # According to config.sub, this is the proper way to canonicalize
        # SunOS6.  Hard to guess exactly what SunOS6 will be like, but
        # it's likely to be more like Solaris than SunOS4.
-       echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+       GUESS=sparc-sun-solaris3$SUN_REL
+       ;;
     sun4*:SunOS:*:*)
-       case "`/usr/bin/arch -k`" in
+       case `/usr/bin/arch -k` in
            Series*|S4*)
                UNAME_RELEASE=`uname -v`
                ;;
        esac
        # Japanese Language versions have a version number like `4.1.3-JL'.
-       echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/'`
+       GUESS=sparc-sun-sunos$SUN_REL
+       ;;
     sun3*:SunOS:*:*)
-       echo m68k-sun-sunos${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-sun-sunos$UNAME_RELEASE
+       ;;
     sun*:*:4.2BSD:*)
        UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
-       test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3
-       case "`/bin/arch`" in
+       test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3
+       case `/bin/arch` in
            sun3)
-               echo m68k-sun-sunos${UNAME_RELEASE}
+               GUESS=m68k-sun-sunos$UNAME_RELEASE
                ;;
            sun4)
-               echo sparc-sun-sunos${UNAME_RELEASE}
+               GUESS=sparc-sun-sunos$UNAME_RELEASE
                ;;
        esac
-       exit ;;
+       ;;
     aushp:SunOS:*:*)
-       echo sparc-auspex-sunos${UNAME_RELEASE}
-       exit ;;
+       GUESS=sparc-auspex-sunos$UNAME_RELEASE
+       ;;
     # The situation for MiNT is a little confusing.  The machine name
     # can be virtually everything (everything which is not
     # "atarist" or "atariste" at least should have a processor
@@ -439,44 +490,44 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     # MiNT.  But MiNT is downward compatible to TOS, so this should
     # be no problem.
     atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
-       echo m68k-atari-mint${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-atari-mint$UNAME_RELEASE
+       ;;
     atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
-       echo m68k-atari-mint${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-atari-mint$UNAME_RELEASE
+       ;;
     *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
-       echo m68k-atari-mint${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-atari-mint$UNAME_RELEASE
+       ;;
     milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
-       echo m68k-milan-mint${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-milan-mint$UNAME_RELEASE
+       ;;
     hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
-       echo m68k-hades-mint${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-hades-mint$UNAME_RELEASE
+       ;;
     *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
-       echo m68k-unknown-mint${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-unknown-mint$UNAME_RELEASE
+       ;;
     m68k:machten:*:*)
-       echo m68k-apple-machten${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-apple-machten$UNAME_RELEASE
+       ;;
     powerpc:machten:*:*)
-       echo powerpc-apple-machten${UNAME_RELEASE}
-       exit ;;
+       GUESS=powerpc-apple-machten$UNAME_RELEASE
+       ;;
     RISC*:Mach:*:*)
-       echo mips-dec-mach_bsd4.3
-       exit ;;
+       GUESS=mips-dec-mach_bsd4.3
+       ;;
     RISC*:ULTRIX:*:*)
-       echo mips-dec-ultrix${UNAME_RELEASE}
-       exit ;;
+       GUESS=mips-dec-ultrix$UNAME_RELEASE
+       ;;
     VAX*:ULTRIX*:*:*)
-       echo vax-dec-ultrix${UNAME_RELEASE}
-       exit ;;
+       GUESS=vax-dec-ultrix$UNAME_RELEASE
+       ;;
     2020:CLIX:*:* | 2430:CLIX:*:*)
-       echo clipper-intergraph-clix${UNAME_RELEASE}
-       exit ;;
+       GUESS=clipper-intergraph-clix$UNAME_RELEASE
+       ;;
     mips:*:*:UMIPS | mips:*:*:RISCos)
-       eval $set_cc_for_build
-       sed 's/^        //' << EOF >$dummy.c
+       set_cc_for_build
+       sed 's/^        //' << EOF > "$dummy.c"
 #ifdef __cplusplus
 #include <stdio.h>  /* for printf() prototype */
        int main (int argc, char *argv[]) {
@@ -485,95 +536,96 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 #endif
        #if defined (host_mips) && defined (MIPSEB)
        #if defined (SYSTYPE_SYSV)
-         printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+         printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0);
        #endif
        #if defined (SYSTYPE_SVR4)
-         printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+         printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0);
        #endif
        #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
-         printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+         printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0);
        #endif
        #endif
          exit (-1);
        }
 EOF
-       $CC_FOR_BUILD -o $dummy $dummy.c &&
-         dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
-         SYSTEM_NAME=`$dummy $dummyarg` &&
+       $CC_FOR_BUILD -o "$dummy" "$dummy.c" &&
+         dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+         SYSTEM_NAME=`"$dummy" "$dummyarg"` &&
            { echo "$SYSTEM_NAME"; exit; }
-       echo mips-mips-riscos${UNAME_RELEASE}
-       exit ;;
+       GUESS=mips-mips-riscos$UNAME_RELEASE
+       ;;
     Motorola:PowerMAX_OS:*:*)
-       echo powerpc-motorola-powermax
-       exit ;;
+       GUESS=powerpc-motorola-powermax
+       ;;
     Motorola:*:4.3:PL8-*)
-       echo powerpc-harris-powermax
-       exit ;;
+       GUESS=powerpc-harris-powermax
+       ;;
     Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
-       echo powerpc-harris-powermax
-       exit ;;
+       GUESS=powerpc-harris-powermax
+       ;;
     Night_Hawk:Power_UNIX:*:*)
-       echo powerpc-harris-powerunix
-       exit ;;
+       GUESS=powerpc-harris-powerunix
+       ;;
     m88k:CX/UX:7*:*)
-       echo m88k-harris-cxux7
-       exit ;;
+       GUESS=m88k-harris-cxux7
+       ;;
     m88k:*:4*:R4*)
-       echo m88k-motorola-sysv4
-       exit ;;
+       GUESS=m88k-motorola-sysv4
+       ;;
     m88k:*:3*:R3*)
-       echo m88k-motorola-sysv3
-       exit ;;
+       GUESS=m88k-motorola-sysv3
+       ;;
     AViiON:dgux:*:*)
        # DG/UX returns AViiON for all architectures
        UNAME_PROCESSOR=`/usr/bin/uname -p`
-       if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+       if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110
        then
-           if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
-              [ ${TARGET_BINARY_INTERFACE}x = x ]
+           if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \
+              test "$TARGET_BINARY_INTERFACE"x = x
            then
-               echo m88k-dg-dgux${UNAME_RELEASE}
+               GUESS=m88k-dg-dgux$UNAME_RELEASE
            else
-               echo m88k-dg-dguxbcs${UNAME_RELEASE}
+               GUESS=m88k-dg-dguxbcs$UNAME_RELEASE
            fi
        else
-           echo i586-dg-dgux${UNAME_RELEASE}
+           GUESS=i586-dg-dgux$UNAME_RELEASE
        fi
-       exit ;;
+       ;;
     M88*:DolphinOS:*:*)        # DolphinOS (SVR3)
-       echo m88k-dolphin-sysv3
-       exit ;;
+       GUESS=m88k-dolphin-sysv3
+       ;;
     M88*:*:R3*:*)
        # Delta 88k system running SVR3
-       echo m88k-motorola-sysv3
-       exit ;;
+       GUESS=m88k-motorola-sysv3
+       ;;
     XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
-       echo m88k-tektronix-sysv3
-       exit ;;
+       GUESS=m88k-tektronix-sysv3
+       ;;
     Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
-       echo m68k-tektronix-bsd
-       exit ;;
+       GUESS=m68k-tektronix-bsd
+       ;;
     *:IRIX*:*:*)
-       echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
-       exit ;;
+       IRIX_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/g'`
+       GUESS=mips-sgi-irix$IRIX_REL
+       ;;
     ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-       echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
-       exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+       GUESS=romp-ibm-aix    # uname -m gives an 8 hex-code CPU id
+       ;;                    # Note that: echo "'`uname -s`'" gives 'AIX '
     i*86:AIX:*:*)
-       echo i386-ibm-aix
-       exit ;;
+       GUESS=i386-ibm-aix
+       ;;
     ia64:AIX:*:*)
-       if [ -x /usr/bin/oslevel ] ; then
+       if test -x /usr/bin/oslevel ; then
                IBM_REV=`/usr/bin/oslevel`
        else
-               IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+               IBM_REV=$UNAME_VERSION.$UNAME_RELEASE
        fi
-       echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
-       exit ;;
+       GUESS=$UNAME_MACHINE-ibm-aix$IBM_REV
+       ;;
     *:AIX:2:3)
        if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-               eval $set_cc_for_build
-               sed 's/^                //' << EOF >$dummy.c
+               set_cc_for_build
+               sed 's/^                //' << EOF > "$dummy.c"
                #include <sys/systemcfg.h>
 
                main()
@@ -584,77 +636,77 @@ EOF
                        exit(0);
                        }
 EOF
-               if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+               if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"`
                then
-                       echo "$SYSTEM_NAME"
+                       GUESS=$SYSTEM_NAME
                else
-                       echo rs6000-ibm-aix3.2.5
+                       GUESS=rs6000-ibm-aix3.2.5
                fi
        elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
-               echo rs6000-ibm-aix3.2.4
+               GUESS=rs6000-ibm-aix3.2.4
        else
-               echo rs6000-ibm-aix3.2
+               GUESS=rs6000-ibm-aix3.2
        fi
-       exit ;;
+       ;;
     *:AIX:*:[4567])
        IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
-       if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+       if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then
                IBM_ARCH=rs6000
        else
                IBM_ARCH=powerpc
        fi
-       if [ -x /usr/bin/lslpp ] ; then
-               IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
+       if test -x /usr/bin/lslpp ; then
+               IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | \
                           awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
        else
-               IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+               IBM_REV=$UNAME_VERSION.$UNAME_RELEASE
        fi
-       echo ${IBM_ARCH}-ibm-aix${IBM_REV}
-       exit ;;
+       GUESS=$IBM_ARCH-ibm-aix$IBM_REV
+       ;;
     *:AIX:*:*)
-       echo rs6000-ibm-aix
-       exit ;;
-    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
-       echo romp-ibm-bsd4.4
-       exit ;;
+       GUESS=rs6000-ibm-aix
+       ;;
+    ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*)
+       GUESS=romp-ibm-bsd4.4
+       ;;
     ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
-       echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
-       exit ;;                             # report: romp-ibm BSD 4.3
+       GUESS=romp-ibm-bsd$UNAME_RELEASE    # 4.3 with uname added to
+       ;;                                  # report: romp-ibm BSD 4.3
     *:BOSX:*:*)
-       echo rs6000-bull-bosx
-       exit ;;
+       GUESS=rs6000-bull-bosx
+       ;;
     DPX/2?00:B.O.S.:*:*)
-       echo m68k-bull-sysv3
-       exit ;;
+       GUESS=m68k-bull-sysv3
+       ;;
     9000/[34]??:4.3bsd:1.*:*)
-       echo m68k-hp-bsd
-       exit ;;
+       GUESS=m68k-hp-bsd
+       ;;
     hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
-       echo m68k-hp-bsd4.4
-       exit ;;
+       GUESS=m68k-hp-bsd4.4
+       ;;
     9000/[34678]??:HP-UX:*:*)
-       HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-       case "${UNAME_MACHINE}" in
-           9000/31? )            HP_ARCH=m68000 ;;
-           9000/[34]?? )         HP_ARCH=m68k ;;
+       HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'`
+       case $UNAME_MACHINE in
+           9000/31?)            HP_ARCH=m68000 ;;
+           9000/[34]??)         HP_ARCH=m68k ;;
            9000/[678][0-9][0-9])
-               if [ -x /usr/bin/getconf ]; then
+               if test -x /usr/bin/getconf; then
                    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
-                   case "${sc_cpu_version}" in
+                   case $sc_cpu_version in
                      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
                      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
                      532)                      # CPU_PA_RISC2_0
-                       case "${sc_kernel_bits}" in
+                       case $sc_kernel_bits in
                          32) HP_ARCH=hppa2.0n ;;
                          64) HP_ARCH=hppa2.0w ;;
                          '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
                        esac ;;
                    esac
                fi
-               if [ "${HP_ARCH}" = "" ]; then
-                   eval $set_cc_for_build
-                   sed 's/^            //' << EOF >$dummy.c
+               if test "$HP_ARCH" = ""; then
+                   set_cc_for_build
+                   sed 's/^            //' << EOF > "$dummy.c"
 
                #define _HPUX_SOURCE
                #include <stdlib.h>
@@ -687,13 +739,13 @@ EOF
                    exit (0);
                }
 EOF
-                   (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+                   (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"`
                    test -z "$HP_ARCH" && HP_ARCH=hppa
                fi ;;
        esac
-       if [ ${HP_ARCH} = hppa2.0w ]
+       if test "$HP_ARCH" = hppa2.0w
        then
-           eval $set_cc_for_build
+           set_cc_for_build
 
            # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
            # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
@@ -712,15 +764,15 @@ EOF
                HP_ARCH=hppa64
            fi
        fi
-       echo ${HP_ARCH}-hp-hpux${HPUX_REV}
-       exit ;;
+       GUESS=$HP_ARCH-hp-hpux$HPUX_REV
+       ;;
     ia64:HP-UX:*:*)
-       HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
-       echo ia64-hp-hpux${HPUX_REV}
-       exit ;;
+       HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'`
+       GUESS=ia64-hp-hpux$HPUX_REV
+       ;;
     3050*:HI-UX:*:*)
-       eval $set_cc_for_build
-       sed 's/^        //' << EOF >$dummy.c
+       set_cc_for_build
+       sed 's/^        //' << EOF > "$dummy.c"
        #include <unistd.h>
        int
        main ()
@@ -745,38 +797,38 @@ EOF
          exit (0);
        }
 EOF
-       $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+       $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` &&
                { echo "$SYSTEM_NAME"; exit; }
-       echo unknown-hitachi-hiuxwe2
-       exit ;;
-    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
-       echo hppa1.1-hp-bsd
-       exit ;;
+       GUESS=unknown-hitachi-hiuxwe2
+       ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*)
+       GUESS=hppa1.1-hp-bsd
+       ;;
     9000/8??:4.3bsd:*:*)
-       echo hppa1.0-hp-bsd
-       exit ;;
+       GUESS=hppa1.0-hp-bsd
+       ;;
     *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
-       echo hppa1.0-hp-mpeix
-       exit ;;
-    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
-       echo hppa1.1-hp-osf
-       exit ;;
+       GUESS=hppa1.0-hp-mpeix
+       ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*)
+       GUESS=hppa1.1-hp-osf
+       ;;
     hp8??:OSF1:*:*)
-       echo hppa1.0-hp-osf
-       exit ;;
+       GUESS=hppa1.0-hp-osf
+       ;;
     i*86:OSF1:*:*)
-       if [ -x /usr/sbin/sysversion ] ; then
-           echo ${UNAME_MACHINE}-unknown-osf1mk
+       if test -x /usr/sbin/sysversion ; then
+           GUESS=$UNAME_MACHINE-unknown-osf1mk
        else
-           echo ${UNAME_MACHINE}-unknown-osf1
+           GUESS=$UNAME_MACHINE-unknown-osf1
        fi
-       exit ;;
+       ;;
     parisc*:Lites*:*:*)
-       echo hppa1.1-hp-lites
-       exit ;;
+       GUESS=hppa1.1-hp-lites
+       ;;
     C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
-       echo c1-convex-bsd
-       exit ;;
+       GUESS=c1-convex-bsd
+       ;;
     C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
        if getsysinfo -f scalar_acc
        then echo c32-convex-bsd
@@ -784,139 +836,148 @@ EOF
        fi
        exit ;;
     C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
-       echo c34-convex-bsd
-       exit ;;
+       GUESS=c34-convex-bsd
+       ;;
     C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
-       echo c38-convex-bsd
-       exit ;;
+       GUESS=c38-convex-bsd
+       ;;
     C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
-       echo c4-convex-bsd
-       exit ;;
+       GUESS=c4-convex-bsd
+       ;;
     CRAY*Y-MP:*:*:*)
-       echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-       exit ;;
+       CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+       GUESS=ymp-cray-unicos$CRAY_REL
+       ;;
     CRAY*[A-Z]90:*:*:*)
-       echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+       echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \
        | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
              -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
              -e 's/\.[^.]*$/.X/'
        exit ;;
     CRAY*TS:*:*:*)
-       echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-       exit ;;
+       CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+       GUESS=t90-cray-unicos$CRAY_REL
+       ;;
     CRAY*T3E:*:*:*)
-       echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-       exit ;;
+       CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+       GUESS=alphaev5-cray-unicosmk$CRAY_REL
+       ;;
     CRAY*SV1:*:*:*)
-       echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-       exit ;;
+       CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+       GUESS=sv1-cray-unicos$CRAY_REL
+       ;;
     *:UNICOS/mp:*:*)
-       echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
-       exit ;;
+       CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+       GUESS=craynv-cray-unicosmp$CRAY_REL
+       ;;
     F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
        FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
        FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
-       FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
-       echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-       exit ;;
+       FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'`
+       GUESS=${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}
+       ;;
     5000:UNIX_System_V:4.*:*)
        FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
-       FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
-       echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-       exit ;;
+       FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
+       GUESS=sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}
+       ;;
     i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
-       echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-bsdi$UNAME_RELEASE
+       ;;
     sparc*:BSD/OS:*:*)
-       echo sparc-unknown-bsdi${UNAME_RELEASE}
-       exit ;;
+       GUESS=sparc-unknown-bsdi$UNAME_RELEASE
+       ;;
     *:BSD/OS:*:*)
-       echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-bsdi$UNAME_RELEASE
+       ;;
+    arm:FreeBSD:*:*)
+       UNAME_PROCESSOR=`uname -p`
+       set_cc_for_build
+       if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+           | grep -q __ARM_PCS_VFP
+       then
+           FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+           GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabi
+       else
+           FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+           GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabihf
+       fi
+       ;;
     *:FreeBSD:*:*)
        UNAME_PROCESSOR=`/usr/bin/uname -p`
-       case ${UNAME_PROCESSOR} in
+       case $UNAME_PROCESSOR in
            amd64)
-               echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
-           *)
-               echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+               UNAME_PROCESSOR=x86_64 ;;
+           i386)
+               UNAME_PROCESSOR=i586 ;;
        esac
-       exit ;;
+       FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+       GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL
+       ;;
     i*:CYGWIN*:*)
-       echo ${UNAME_MACHINE}-pc-cygwin
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-cygwin
+       ;;
     *:MINGW64*:*)
-       echo ${UNAME_MACHINE}-pc-mingw64
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-mingw64
+       ;;
     *:MINGW*:*)
-       echo ${UNAME_MACHINE}-pc-mingw32
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-mingw32
+       ;;
     *:MSYS*:*)
-       echo ${UNAME_MACHINE}-pc-msys
-       exit ;;
-    i*:windows32*:*)
-       # uname -m includes "-pc" on this system.
-       echo ${UNAME_MACHINE}-mingw32
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-msys
+       ;;
     i*:PW*:*)
-       echo ${UNAME_MACHINE}-pc-pw32
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-pw32
+       ;;
+    *:SerenityOS:*:*)
+        GUESS=$UNAME_MACHINE-pc-serenity
+        ;;
     *:Interix*:*)
-       case ${UNAME_MACHINE} in
+       case $UNAME_MACHINE in
            x86)
-               echo i586-pc-interix${UNAME_RELEASE}
-               exit ;;
+               GUESS=i586-pc-interix$UNAME_RELEASE
+               ;;
            authenticamd | genuineintel | EM64T)
-               echo x86_64-unknown-interix${UNAME_RELEASE}
-               exit ;;
+               GUESS=x86_64-unknown-interix$UNAME_RELEASE
+               ;;
            IA64)
-               echo ia64-unknown-interix${UNAME_RELEASE}
-               exit ;;
+               GUESS=ia64-unknown-interix$UNAME_RELEASE
+               ;;
        esac ;;
-    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
-       echo i${UNAME_MACHINE}-pc-mks
-       exit ;;
-    8664:Windows_NT:*)
-       echo x86_64-pc-mks
-       exit ;;
-    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
-       # How do we know it's Interix rather than the generic POSIX subsystem?
-       # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
-       # UNAME_MACHINE based on the output of uname instead of i386?
-       echo i586-pc-interix
-       exit ;;
     i*:UWIN*:*)
-       echo ${UNAME_MACHINE}-pc-uwin
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-uwin
+       ;;
     amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
-       echo x86_64-unknown-cygwin
-       exit ;;
-    p*:CYGWIN*:*)
-       echo powerpcle-unknown-cygwin
-       exit ;;
+       GUESS=x86_64-pc-cygwin
+       ;;
     prep*:SunOS:5.*:*)
-       echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-       exit ;;
+       SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+       GUESS=powerpcle-unknown-solaris2$SUN_REL
+       ;;
     *:GNU:*:*)
        # the GNU system
-       echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
-       exit ;;
+       GNU_ARCH=`echo "$UNAME_MACHINE" | sed -e 's,[-/].*$,,'`
+       GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's,/.*$,,'`
+       GUESS=$GNU_ARCH-unknown-$LIBC$GNU_REL
+       ;;
     *:GNU/*:*:*)
        # other systems with GNU libc and userland
-       echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
-       exit ;;
-    i*86:Minix:*:*)
-       echo ${UNAME_MACHINE}-pc-minix
-       exit ;;
+       GNU_SYS=`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"`
+       GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+       GUESS=$UNAME_MACHINE-unknown-$GNU_SYS$GNU_REL-$LIBC
+       ;;
+    *:Minix:*:*)
+       GUESS=$UNAME_MACHINE-unknown-minix
+       ;;
     aarch64:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     aarch64_be:Linux:*:*)
        UNAME_MACHINE=aarch64_be
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     alpha:Linux:*:*)
-       case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+       case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in
          EV5)   UNAME_MACHINE=alphaev5 ;;
          EV56)  UNAME_MACHINE=alphaev56 ;;
          PCA56) UNAME_MACHINE=alphapca56 ;;
@@ -927,177 +988,225 @@ EOF
        esac
        objdump --private-headers /bin/sh | grep -q ld.so.1
        if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
-    arc:Linux:*:* | arceb:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
+    arc:Linux:*:* | arceb:Linux:*:* | arc32:Linux:*:* | arc64:Linux:*:*)
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     arm*:Linux:*:*)
-       eval $set_cc_for_build
+       set_cc_for_build
        if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
            | grep -q __ARM_EABI__
        then
-           echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+           GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
        else
            if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
                | grep -q __ARM_PCS_VFP
            then
-               echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
+               GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabi
            else
-               echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
+               GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabihf
            fi
        fi
-       exit ;;
+       ;;
     avr32*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     cris:Linux:*:*)
-       echo ${UNAME_MACHINE}-axis-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-axis-linux-$LIBC
+       ;;
     crisv32:Linux:*:*)
-       echo ${UNAME_MACHINE}-axis-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-axis-linux-$LIBC
+       ;;
     e2k:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     frv:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     hexagon:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     i*86:Linux:*:*)
-       echo ${UNAME_MACHINE}-pc-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-linux-$LIBC
+       ;;
     ia64:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     k1om:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
+    loongarch32:Linux:*:* | loongarch64:Linux:*:* | loongarchx32:Linux:*:*)
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     m32r*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     m68*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     mips:Linux:*:* | mips64:Linux:*:*)
-       eval $set_cc_for_build
-       sed 's/^        //' << EOF >$dummy.c
+       set_cc_for_build
+       IS_GLIBC=0
+       test x"${LIBC}" = xgnu && IS_GLIBC=1
+       sed 's/^        //' << EOF > "$dummy.c"
        #undef CPU
-       #undef ${UNAME_MACHINE}
-       #undef ${UNAME_MACHINE}el
+       #undef mips
+       #undef mipsel
+       #undef mips64
+       #undef mips64el
+       #if ${IS_GLIBC} && defined(_ABI64)
+       LIBCABI=gnuabi64
+       #else
+       #if ${IS_GLIBC} && defined(_ABIN32)
+       LIBCABI=gnuabin32
+       #else
+       LIBCABI=${LIBC}
+       #endif
+       #endif
+
+       #if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
+       CPU=mipsisa64r6
+       #else
+       #if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
+       CPU=mipsisa32r6
+       #else
+       #if defined(__mips64)
+       CPU=mips64
+       #else
+       CPU=mips
+       #endif
+       #endif
+       #endif
+
        #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-       CPU=${UNAME_MACHINE}el
+       MIPS_ENDIAN=el
        #else
        #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-       CPU=${UNAME_MACHINE}
+       MIPS_ENDIAN=
        #else
-       CPU=
+       MIPS_ENDIAN=
        #endif
        #endif
 EOF
-       eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-       test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
+       cc_set_vars=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'`
+       eval "$cc_set_vars"
+       test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; }
+       ;;
+    mips64el:Linux:*:*)
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
        ;;
     openrisc*:Linux:*:*)
-       echo or1k-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=or1k-unknown-linux-$LIBC
+       ;;
     or32:Linux:*:* | or1k*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     padre:Linux:*:*)
-       echo sparc-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=sparc-unknown-linux-$LIBC
+       ;;
     parisc64:Linux:*:* | hppa64:Linux:*:*)
-       echo hppa64-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=hppa64-unknown-linux-$LIBC
+       ;;
     parisc:Linux:*:* | hppa:Linux:*:*)
        # Look for CPU level
        case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-         PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
-         PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
-         *)    echo hppa-unknown-linux-${LIBC} ;;
+         PA7*) GUESS=hppa1.1-unknown-linux-$LIBC ;;
+         PA8*) GUESS=hppa2.0-unknown-linux-$LIBC ;;
+         *)    GUESS=hppa-unknown-linux-$LIBC ;;
        esac
-       exit ;;
+       ;;
     ppc64:Linux:*:*)
-       echo powerpc64-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=powerpc64-unknown-linux-$LIBC
+       ;;
     ppc:Linux:*:*)
-       echo powerpc-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=powerpc-unknown-linux-$LIBC
+       ;;
     ppc64le:Linux:*:*)
-       echo powerpc64le-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=powerpc64le-unknown-linux-$LIBC
+       ;;
     ppcle:Linux:*:*)
-       echo powerpcle-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=powerpcle-unknown-linux-$LIBC
+       ;;
+    riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*)
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     s390:Linux:*:* | s390x:Linux:*:*)
-       echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-ibm-linux-$LIBC
+       ;;
     sh64*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     sh*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     sparc:Linux:*:* | sparc64:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     tile*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     vax:Linux:*:*)
-       echo ${UNAME_MACHINE}-dec-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-dec-linux-$LIBC
+       ;;
     x86_64:Linux:*:*)
-       echo ${UNAME_MACHINE}-pc-linux-${LIBC}
-       exit ;;
+       set_cc_for_build
+       LIBCABI=$LIBC
+       if test "$CC_FOR_BUILD" != no_compiler_found; then
+           if (echo '#ifdef __ILP32__'; echo IS_X32; echo '#endif') | \
+               (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+               grep IS_X32 >/dev/null
+           then
+               LIBCABI=${LIBC}x32
+           fi
+       fi
+       GUESS=$UNAME_MACHINE-pc-linux-$LIBCABI
+       ;;
     xtensa*:Linux:*:*)
-       echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+       ;;
     i*86:DYNIX/ptx:4*:*)
        # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
        # earlier versions are messed up and put the nodename in both
        # sysname and nodename.
-       echo i386-sequent-sysv4
-       exit ;;
+       GUESS=i386-sequent-sysv4
+       ;;
     i*86:UNIX_SV:4.2MP:2.*)
        # Unixware is an offshoot of SVR4, but it has its own version
        # number series starting with 2...
        # I am not positive that other SVR4 systems won't match this,
        # I just have to hope.  -- rms.
        # Use sysv4.2uw... so that sysv4* matches it.
-       echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-sysv4.2uw$UNAME_VERSION
+       ;;
     i*86:OS/2:*:*)
        # If we were able to find `uname', then EMX Unix compatibility
        # is probably installed.
-       echo ${UNAME_MACHINE}-pc-os2-emx
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-os2-emx
+       ;;
     i*86:XTS-300:*:STOP)
-       echo ${UNAME_MACHINE}-unknown-stop
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-stop
+       ;;
     i*86:atheos:*:*)
-       echo ${UNAME_MACHINE}-unknown-atheos
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-atheos
+       ;;
     i*86:syllable:*:*)
-       echo ${UNAME_MACHINE}-pc-syllable
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-syllable
+       ;;
     i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
-       echo i386-unknown-lynxos${UNAME_RELEASE}
-       exit ;;
+       GUESS=i386-unknown-lynxos$UNAME_RELEASE
+       ;;
     i*86:*DOS:*:*)
-       echo ${UNAME_MACHINE}-pc-msdosdjgpp
-       exit ;;
-    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
-       UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+       GUESS=$UNAME_MACHINE-pc-msdosdjgpp
+       ;;
+    i*86:*:4.*:*)
+       UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'`
        if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-               echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+               GUESS=$UNAME_MACHINE-univel-sysv$UNAME_REL
        else
-               echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+               GUESS=$UNAME_MACHINE-pc-sysv$UNAME_REL
        fi
-       exit ;;
+       ;;
     i*86:*:5:[678]*)
        # UnixWare 7.x, OpenUNIX and OpenServer 6.
        case `/bin/uname -X | grep "^Machine"` in
@@ -1105,12 +1214,12 @@ EOF
            *Pentium)        UNAME_MACHINE=i586 ;;
            *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
        esac
-       echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+       ;;
     i*86:*:3.2:*)
        if test -f /usr/options/cb.name; then
                UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
-               echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+               GUESS=$UNAME_MACHINE-pc-isc$UNAME_REL
        elif /bin/uname -X 2>/dev/null >/dev/null ; then
                UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
                (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
@@ -1120,11 +1229,11 @@ EOF
                        && UNAME_MACHINE=i686
                (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
                        && UNAME_MACHINE=i686
-               echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+               GUESS=$UNAME_MACHINE-pc-sco$UNAME_REL
        else
-               echo ${UNAME_MACHINE}-pc-sysv32
+               GUESS=$UNAME_MACHINE-pc-sysv32
        fi
-       exit ;;
+       ;;
     pc:*:*:*)
        # Left here for compatibility:
        # uname -m prints for DJGPP always 'pc', but it prints nothing about
@@ -1132,31 +1241,31 @@ EOF
        # Note: whatever this is, it MUST be the same as what config.sub
        # prints for the "djgpp" host, or else GDB configure will decide that
        # this is a cross-build.
-       echo i586-pc-msdosdjgpp
-       exit ;;
+       GUESS=i586-pc-msdosdjgpp
+       ;;
     Intel:Mach:3*:*)
-       echo i386-pc-mach3
-       exit ;;
+       GUESS=i386-pc-mach3
+       ;;
     paragon:*:*:*)
-       echo i860-intel-osf1
-       exit ;;
+       GUESS=i860-intel-osf1
+       ;;
     i860:*:4.*:*) # i860-SVR4
        if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
-         echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+         GUESS=i860-stardent-sysv$UNAME_RELEASE    # Stardent Vistra i860-SVR4
        else # Add other i860-SVR4 vendors below as they are discovered.
-         echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+         GUESS=i860-unknown-sysv$UNAME_RELEASE     # Unknown i860-SVR4
        fi
-       exit ;;
+       ;;
     mini*:CTIX:SYS*5:*)
        # "miniframe"
-       echo m68010-convergent-sysv
-       exit ;;
+       GUESS=m68010-convergent-sysv
+       ;;
     mc68k:UNIX:SYSTEM5:3.51m)
-       echo m68k-convergent-sysv
-       exit ;;
+       GUESS=m68k-convergent-sysv
+       ;;
     M680?0:D-NIX:5.3:*)
-       echo m68k-diab-dnix
-       exit ;;
+       GUESS=m68k-diab-dnix
+       ;;
     M68*:*:R3V[5678]*:*)
        test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
     3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
@@ -1164,9 +1273,9 @@ EOF
        test -r /etc/.relid \
        && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-         && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+         && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
        /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-         && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+         && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
     3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
          && { echo i486-ncr-sysv4; exit; } ;;
@@ -1175,249 +1284,437 @@ EOF
        test -r /etc/.relid \
            && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-           && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+           && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
        /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-           && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
+           && { echo i586-ncr-sysv4.3"$OS_REL"; exit; }
        /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
-           && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+           && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
     m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
-       echo m68k-unknown-lynxos${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-unknown-lynxos$UNAME_RELEASE
+       ;;
     mc68030:UNIX_System_V:4.*:*)
-       echo m68k-atari-sysv4
-       exit ;;
+       GUESS=m68k-atari-sysv4
+       ;;
     TSUNAMI:LynxOS:2.*:*)
-       echo sparc-unknown-lynxos${UNAME_RELEASE}
-       exit ;;
+       GUESS=sparc-unknown-lynxos$UNAME_RELEASE
+       ;;
     rs6000:LynxOS:2.*:*)
-       echo rs6000-unknown-lynxos${UNAME_RELEASE}
-       exit ;;
+       GUESS=rs6000-unknown-lynxos$UNAME_RELEASE
+       ;;
     PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
-       echo powerpc-unknown-lynxos${UNAME_RELEASE}
-       exit ;;
+       GUESS=powerpc-unknown-lynxos$UNAME_RELEASE
+       ;;
     SM[BE]S:UNIX_SV:*:*)
-       echo mips-dde-sysv${UNAME_RELEASE}
-       exit ;;
+       GUESS=mips-dde-sysv$UNAME_RELEASE
+       ;;
     RM*:ReliantUNIX-*:*:*)
-       echo mips-sni-sysv4
-       exit ;;
+       GUESS=mips-sni-sysv4
+       ;;
     RM*:SINIX-*:*:*)
-       echo mips-sni-sysv4
-       exit ;;
+       GUESS=mips-sni-sysv4
+       ;;
     *:SINIX-*:*:*)
        if uname -p 2>/dev/null >/dev/null ; then
                UNAME_MACHINE=`(uname -p) 2>/dev/null`
-               echo ${UNAME_MACHINE}-sni-sysv4
+               GUESS=$UNAME_MACHINE-sni-sysv4
        else
-               echo ns32k-sni-sysv
+               GUESS=ns32k-sni-sysv
        fi
-       exit ;;
+       ;;
     PENTIUM:*:4.0*:*)  # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
                        # says <Richard.M.Bartel@ccMail.Census.GOV>
-       echo i586-unisys-sysv4
-       exit ;;
+       GUESS=i586-unisys-sysv4
+       ;;
     *:UNIX_System_V:4*:FTX*)
        # From Gerald Hewes <hewes@openmarket.com>.
        # How about differentiating between stratus architectures? -djm
-       echo hppa1.1-stratus-sysv4
-       exit ;;
+       GUESS=hppa1.1-stratus-sysv4
+       ;;
     *:*:*:FTX*)
        # From seanf@swdc.stratus.com.
-       echo i860-stratus-sysv4
-       exit ;;
+       GUESS=i860-stratus-sysv4
+       ;;
     i*86:VOS:*:*)
        # From Paul.Green@stratus.com.
-       echo ${UNAME_MACHINE}-stratus-vos
-       exit ;;
+       GUESS=$UNAME_MACHINE-stratus-vos
+       ;;
     *:VOS:*:*)
        # From Paul.Green@stratus.com.
-       echo hppa1.1-stratus-vos
-       exit ;;
+       GUESS=hppa1.1-stratus-vos
+       ;;
     mc68*:A/UX:*:*)
-       echo m68k-apple-aux${UNAME_RELEASE}
-       exit ;;
+       GUESS=m68k-apple-aux$UNAME_RELEASE
+       ;;
     news*:NEWS-OS:6*:*)
-       echo mips-sony-newsos6
-       exit ;;
+       GUESS=mips-sony-newsos6
+       ;;
     R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
-       if [ -d /usr/nec ]; then
-               echo mips-nec-sysv${UNAME_RELEASE}
+       if test -d /usr/nec; then
+               GUESS=mips-nec-sysv$UNAME_RELEASE
        else
-               echo mips-unknown-sysv${UNAME_RELEASE}
+               GUESS=mips-unknown-sysv$UNAME_RELEASE
        fi
-       exit ;;
+       ;;
     BeBox:BeOS:*:*)    # BeOS running on hardware made by Be, PPC only.
-       echo powerpc-be-beos
-       exit ;;
+       GUESS=powerpc-be-beos
+       ;;
     BeMac:BeOS:*:*)    # BeOS running on Mac or Mac clone, PPC only.
-       echo powerpc-apple-beos
-       exit ;;
+       GUESS=powerpc-apple-beos
+       ;;
     BePC:BeOS:*:*)     # BeOS running on Intel PC compatible.
-       echo i586-pc-beos
-       exit ;;
+       GUESS=i586-pc-beos
+       ;;
     BePC:Haiku:*:*)    # Haiku running on Intel PC compatible.
-       echo i586-pc-haiku
-       exit ;;
+       GUESS=i586-pc-haiku
+       ;;
     x86_64:Haiku:*:*)
-       echo x86_64-unknown-haiku
-       exit ;;
+       GUESS=x86_64-unknown-haiku
+       ;;
     SX-4:SUPER-UX:*:*)
-       echo sx4-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sx4-nec-superux$UNAME_RELEASE
+       ;;
     SX-5:SUPER-UX:*:*)
-       echo sx5-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sx5-nec-superux$UNAME_RELEASE
+       ;;
     SX-6:SUPER-UX:*:*)
-       echo sx6-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sx6-nec-superux$UNAME_RELEASE
+       ;;
     SX-7:SUPER-UX:*:*)
-       echo sx7-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sx7-nec-superux$UNAME_RELEASE
+       ;;
     SX-8:SUPER-UX:*:*)
-       echo sx8-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sx8-nec-superux$UNAME_RELEASE
+       ;;
     SX-8R:SUPER-UX:*:*)
-       echo sx8r-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sx8r-nec-superux$UNAME_RELEASE
+       ;;
     SX-ACE:SUPER-UX:*:*)
-       echo sxace-nec-superux${UNAME_RELEASE}
-       exit ;;
+       GUESS=sxace-nec-superux$UNAME_RELEASE
+       ;;
     Power*:Rhapsody:*:*)
-       echo powerpc-apple-rhapsody${UNAME_RELEASE}
-       exit ;;
+       GUESS=powerpc-apple-rhapsody$UNAME_RELEASE
+       ;;
     *:Rhapsody:*:*)
-       echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-apple-rhapsody$UNAME_RELEASE
+       ;;
+    arm64:Darwin:*:*)
+       GUESS=aarch64-apple-darwin$UNAME_RELEASE
+       ;;
     *:Darwin:*:*)
-       UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
-       eval $set_cc_for_build
-       if test "$UNAME_PROCESSOR" = unknown ; then
-           UNAME_PROCESSOR=powerpc
+       UNAME_PROCESSOR=`uname -p`
+       case $UNAME_PROCESSOR in
+           unknown) UNAME_PROCESSOR=powerpc ;;
+       esac
+       if command -v xcode-select > /dev/null 2> /dev/null && \
+               ! xcode-select --print-path > /dev/null 2> /dev/null ; then
+           # Avoid executing cc if there is no toolchain installed as
+           # cc will be a stub that puts up a graphical alert
+           # prompting the user to install developer tools.
+           CC_FOR_BUILD=no_compiler_found
+       else
+           set_cc_for_build
        fi
-       if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
-           if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
-               if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-                   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-                   grep IS_64BIT_ARCH >/dev/null
-               then
-                   case $UNAME_PROCESSOR in
-                       i386) UNAME_PROCESSOR=x86_64 ;;
-                       powerpc) UNAME_PROCESSOR=powerpc64 ;;
-                   esac
-               fi
+       if test "$CC_FOR_BUILD" != no_compiler_found; then
+           if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+                  (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+                  grep IS_64BIT_ARCH >/dev/null
+           then
+               case $UNAME_PROCESSOR in
+                   i386) UNAME_PROCESSOR=x86_64 ;;
+                   powerpc) UNAME_PROCESSOR=powerpc64 ;;
+               esac
+           fi
+           # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc
+           if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \
+                  (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+                  grep IS_PPC >/dev/null
+           then
+               UNAME_PROCESSOR=powerpc
            fi
        elif test "$UNAME_PROCESSOR" = i386 ; then
-           # Avoid executing cc on OS X 10.9, as it ships with a stub
-           # that puts up a graphical alert prompting to install
-           # developer tools.  Any system running Mac OS X 10.7 or
-           # later (Darwin 11 and later) is required to have a 64-bit
-           # processor. This is not true of the ARM version of Darwin
-           # that Apple uses in portable devices.
-           UNAME_PROCESSOR=x86_64
+           # uname -m returns i386 or x86_64
+           UNAME_PROCESSOR=$UNAME_MACHINE
        fi
-       echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_PROCESSOR-apple-darwin$UNAME_RELEASE
+       ;;
     *:procnto*:*:* | *:QNX:[0123456789]*:*)
        UNAME_PROCESSOR=`uname -p`
        if test "$UNAME_PROCESSOR" = x86; then
                UNAME_PROCESSOR=i386
                UNAME_MACHINE=pc
        fi
-       echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_PROCESSOR-$UNAME_MACHINE-nto-qnx$UNAME_RELEASE
+       ;;
     *:QNX:*:4*)
-       echo i386-pc-qnx
-       exit ;;
-    NEO-?:NONSTOP_KERNEL:*:*)
-       echo neo-tandem-nsk${UNAME_RELEASE}
-       exit ;;
+       GUESS=i386-pc-qnx
+       ;;
+    NEO-*:NONSTOP_KERNEL:*:*)
+       GUESS=neo-tandem-nsk$UNAME_RELEASE
+       ;;
     NSE-*:NONSTOP_KERNEL:*:*)
-       echo nse-tandem-nsk${UNAME_RELEASE}
-       exit ;;
-    NSR-?:NONSTOP_KERNEL:*:*)
-       echo nsr-tandem-nsk${UNAME_RELEASE}
-       exit ;;
+       GUESS=nse-tandem-nsk$UNAME_RELEASE
+       ;;
+    NSR-*:NONSTOP_KERNEL:*:*)
+       GUESS=nsr-tandem-nsk$UNAME_RELEASE
+       ;;
+    NSV-*:NONSTOP_KERNEL:*:*)
+       GUESS=nsv-tandem-nsk$UNAME_RELEASE
+       ;;
+    NSX-*:NONSTOP_KERNEL:*:*)
+       GUESS=nsx-tandem-nsk$UNAME_RELEASE
+       ;;
     *:NonStop-UX:*:*)
-       echo mips-compaq-nonstopux
-       exit ;;
+       GUESS=mips-compaq-nonstopux
+       ;;
     BS2000:POSIX*:*:*)
-       echo bs2000-siemens-sysv
-       exit ;;
+       GUESS=bs2000-siemens-sysv
+       ;;
     DS/*:UNIX_System_V:*:*)
-       echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
-       exit ;;
+       GUESS=$UNAME_MACHINE-$UNAME_SYSTEM-$UNAME_RELEASE
+       ;;
     *:Plan9:*:*)
        # "uname -m" is not consistent, so use $cputype instead. 386
        # is converted to i386 for consistency with other x86
        # operating systems.
-       if test "$cputype" = 386; then
+       if test "${cputype-}" = 386; then
            UNAME_MACHINE=i386
-       else
-           UNAME_MACHINE="$cputype"
+       elif test "x${cputype-}" != x; then
+           UNAME_MACHINE=$cputype
        fi
-       echo ${UNAME_MACHINE}-unknown-plan9
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-plan9
+       ;;
     *:TOPS-10:*:*)
-       echo pdp10-unknown-tops10
-       exit ;;
+       GUESS=pdp10-unknown-tops10
+       ;;
     *:TENEX:*:*)
-       echo pdp10-unknown-tenex
-       exit ;;
+       GUESS=pdp10-unknown-tenex
+       ;;
     KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
-       echo pdp10-dec-tops20
-       exit ;;
+       GUESS=pdp10-dec-tops20
+       ;;
     XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
-       echo pdp10-xkl-tops20
-       exit ;;
+       GUESS=pdp10-xkl-tops20
+       ;;
     *:TOPS-20:*:*)
-       echo pdp10-unknown-tops20
-       exit ;;
+       GUESS=pdp10-unknown-tops20
+       ;;
     *:ITS:*:*)
-       echo pdp10-unknown-its
-       exit ;;
+       GUESS=pdp10-unknown-its
+       ;;
     SEI:*:*:SEIUX)
-       echo mips-sei-seiux${UNAME_RELEASE}
-       exit ;;
+       GUESS=mips-sei-seiux$UNAME_RELEASE
+       ;;
     *:DragonFly:*:*)
-       echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
-       exit ;;
+       DRAGONFLY_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+       GUESS=$UNAME_MACHINE-unknown-dragonfly$DRAGONFLY_REL
+       ;;
     *:*VMS:*:*)
        UNAME_MACHINE=`(uname -p) 2>/dev/null`
-       case "${UNAME_MACHINE}" in
-           A*) echo alpha-dec-vms ; exit ;;
-           I*) echo ia64-dec-vms ; exit ;;
-           V*) echo vax-dec-vms ; exit ;;
+       case $UNAME_MACHINE in
+           A*) GUESS=alpha-dec-vms ;;
+           I*) GUESS=ia64-dec-vms ;;
+           V*) GUESS=vax-dec-vms ;;
        esac ;;
     *:XENIX:*:SysV)
-       echo i386-pc-xenix
-       exit ;;
+       GUESS=i386-pc-xenix
+       ;;
     i*86:skyos:*:*)
-       echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
-       exit ;;
+       SKYOS_REL=`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'`
+       GUESS=$UNAME_MACHINE-pc-skyos$SKYOS_REL
+       ;;
     i*86:rdos:*:*)
-       echo ${UNAME_MACHINE}-pc-rdos
-       exit ;;
-    i*86:AROS:*:*)
-       echo ${UNAME_MACHINE}-pc-aros
-       exit ;;
+       GUESS=$UNAME_MACHINE-pc-rdos
+       ;;
+    i*86:Fiwix:*:*)
+       GUESS=$UNAME_MACHINE-pc-fiwix
+       ;;
+    *:AROS:*:*)
+       GUESS=$UNAME_MACHINE-unknown-aros
+       ;;
     x86_64:VMkernel:*:*)
-       echo ${UNAME_MACHINE}-unknown-esx
-       exit ;;
+       GUESS=$UNAME_MACHINE-unknown-esx
+       ;;
     amd64:Isilon\ OneFS:*:*)
-       echo x86_64-unknown-onefs
-       exit ;;
+       GUESS=x86_64-unknown-onefs
+       ;;
+    *:Unleashed:*:*)
+       GUESS=$UNAME_MACHINE-unknown-unleashed$UNAME_RELEASE
+       ;;
+esac
+
+# Do we have a guess based on uname results?
+if test "x$GUESS" != x; then
+    echo "$GUESS"
+    exit
+fi
+
+# No uname command or uname output not recognized.
+set_cc_for_build
+cat > "$dummy.c" <<EOF
+#ifdef _SEQUENT_
+#include <sys/types.h>
+#include <sys/utsname.h>
+#endif
+#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
+#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
+#include <signal.h>
+#if defined(_SIZE_T_) || defined(SIGLOST)
+#include <sys/utsname.h>
+#endif
+#endif
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+  "4"
+#else
+  ""
+#endif
+  ); exit (0);
+#endif
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+  struct utsname un;
+
+  uname(&un);
+  if (strncmp(un.version, "V2", 2) == 0) {
+    printf ("i386-sequent-ptx2\n"); exit (0);
+  }
+  if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+    printf ("i386-sequent-ptx1\n"); exit (0);
+  }
+  printf ("i386-sequent-ptx\n"); exit (0);
+#endif
+
+#if defined (vax)
+#if !defined (ultrix)
+#include <sys/param.h>
+#if defined (BSD)
+#if BSD == 43
+  printf ("vax-dec-bsd4.3\n"); exit (0);
+#else
+#if BSD == 199006
+  printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#else
+  printf ("vax-dec-bsd\n"); exit (0);
+#endif
+#endif
+#else
+  printf ("vax-dec-bsd\n"); exit (0);
+#endif
+#else
+#if defined(_SIZE_T_) || defined(SIGLOST)
+  struct utsname un;
+  uname (&un);
+  printf ("vax-dec-ultrix%s\n", un.release); exit (0);
+#else
+  printf ("vax-dec-ultrix\n"); exit (0);
+#endif
+#endif
+#endif
+#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
+#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
+#if defined(_SIZE_T_) || defined(SIGLOST)
+  struct utsname *un;
+  uname (&un);
+  printf ("mips-dec-ultrix%s\n", un.release); exit (0);
+#else
+  printf ("mips-dec-ultrix\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`"$dummy"` &&
+       { echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; }
+
+echo "$0: unable to guess system type" >&2
+
+case $UNAME_MACHINE:$UNAME_SYSTEM in
+    mips:Linux | mips64:Linux)
+       # If we got here on MIPS GNU/Linux, output extra information.
+       cat >&2 <<EOF
+
+NOTE: MIPS GNU/Linux systems require a C compiler to fully recognize
+the system type. Please install a C compiler and try again.
+EOF
+       ;;
 esac
 
 cat >&2 <<EOF
-$0: unable to guess system type
 
 This script (version $timestamp), has failed to recognize the
-operating system you are using. If your script is old, overwrite
-config.guess and config.sub with the latest versions from:
+operating system you are using. If your script is old, overwrite *all*
+copies of config.guess and config.sub with the latest versions from:
 
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
 and
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+EOF
+
+our_year=`echo $timestamp | sed 's,-.*,,'`
+thisyear=`date +%Y`
+# shellcheck disable=SC2003
+script_age=`expr "$thisyear" - "$our_year"`
+if test "$script_age" -lt 3 ; then
+   cat >&2 <<EOF
 
 If $0 has already been updated, send the following data and any
 information you think might be pertinent to config-patches@gnu.org to
@@ -1440,16 +1737,17 @@ hostinfo               = `(hostinfo) 2>/dev/null`
 /usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
 /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
 
-UNAME_MACHINE = ${UNAME_MACHINE}
-UNAME_RELEASE = ${UNAME_RELEASE}
-UNAME_SYSTEM  = ${UNAME_SYSTEM}
-UNAME_VERSION = ${UNAME_VERSION}
+UNAME_MACHINE = "$UNAME_MACHINE"
+UNAME_RELEASE = "$UNAME_RELEASE"
+UNAME_SYSTEM  = "$UNAME_SYSTEM"
+UNAME_VERSION = "$UNAME_VERSION"
 EOF
+fi
 
 exit 1
 
 # Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
+# eval: (add-hook 'before-save-hook 'time-stamp)
 # time-stamp-start: "timestamp='"
 # time-stamp-format: "%:y-%02m-%02d"
 # time-stamp-end: "'"
index c38b914d6be36839c83ef7f79e804e45ee9cb736..24be79cfb6c1ea846bf7ec122effade871127dc4 100755 (executable)
@@ -2,7 +2,7 @@
 # Output a system dependent set of variables, describing how to set the
 # run time search path of shared libraries in an executable.
 #
-#   Copyright 1996-2013 Free Software Foundation, Inc.
+#   Copyright 1996-2020 Free Software Foundation, Inc.
 #   Taken from GNU libtool, 2001
 #   Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
 #
@@ -367,11 +367,7 @@ else
     dgux*)
       hardcode_libdir_flag_spec='-L$libdir'
       ;;
-    freebsd2.2*)
-      hardcode_libdir_flag_spec='-R$libdir'
-      hardcode_direct=yes
-      ;;
-    freebsd2*)
+    freebsd2.[01]*)
       hardcode_direct=yes
       hardcode_minus_L=yes
       ;;
@@ -548,13 +544,11 @@ case "$host_os" in
   dgux*)
     library_names_spec='$libname$shrext'
     ;;
+  freebsd[23].*)
+    library_names_spec='$libname$shrext$versuffix'
+    ;;
   freebsd* | dragonfly*)
-    case "$host_os" in
-      freebsd[123]*)
-        library_names_spec='$libname$shrext$versuffix' ;;
-      *)
-        library_names_spec='$libname$shrext' ;;
-    esac
+    library_names_spec='$libname$shrext'
     ;;
   gnu*)
     library_names_spec='$libname$shrext'
index 9feb73bf088fc285489d1e68aa0d6d4c96cb5768..dba16e84c77c7d25871d80c24deff717faf4c094 100755 (executable)
@@ -1,12 +1,14 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright 1992-2016 Free Software Foundation, Inc.
+#   Copyright 1992-2022 Free Software Foundation, Inc.
 
-timestamp='2016-06-20'
+# shellcheck disable=SC2006,SC2268 # see below for rationale
+
+timestamp='2022-01-03'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
@@ -15,7 +17,7 @@ timestamp='2016-06-20'
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
@@ -33,7 +35,7 @@ timestamp='2016-06-20'
 # Otherwise, we print the canonical config type on stdout and succeed.
 
 # You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
 
 # This file is supposed to be the same for all GNU packages
 # and recognize all the CPU types, system types and aliases
@@ -50,6 +52,13 @@ timestamp='2016-06-20'
 #      CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
 # It is wrong to echo any other type of specification.
 
+# The "shellcheck disable" line above the timestamp inhibits complaints
+# about features and limitations of the classic Bourne shell that were
+# superseded or lifted in POSIX.  However, this script identifies a wide
+# variety of pre-POSIX systems that do not have POSIX shells at all, and
+# even some reasonably current systems (Solaris 10 as case-in-point) still
+# have a pre-POSIX /bin/sh.
+
 me=`echo "$0" | sed -e 's,.*/,,'`
 
 usage="\
@@ -57,7 +66,7 @@ Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
 
 Canonicalize a configuration name.
 
-Operation modes:
+Options:
   -h, --help         print this help, then exit
   -t, --time-stamp   print date of last modification, then exit
   -v, --version      print version number, then exit
@@ -67,7 +76,7 @@ Report bugs and patches to <config-patches@gnu.org>."
 version="\
 GNU config.sub ($timestamp)
 
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright 1992-2022 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -89,12 +98,12 @@ while test $# -gt 0 ; do
     - )        # Use stdin as input.
        break ;;
     -* )
-       echo "$me: invalid option $1$help"
+       echo "$me: invalid option $1$help" >&2
        exit 1 ;;
 
     *local*)
        # First pass through any local machine types.
-       echo $1
+       echo "$1"
        exit ;;
 
     * )
@@ -110,1242 +119,1186 @@ case $# in
     exit 1;;
 esac
 
-# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
-# Here we must recognize all the valid KERNEL-OS combinations.
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
-case $maybe_os in
-  nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
-  linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
-  kopensolaris*-gnu* | \
-  storm-chaos* | os2-emx* | rtmk-nova*)
-    os=-$maybe_os
-    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
-    ;;
-  android-linux)
-    os=-linux-android
-    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown
-    ;;
-  *)
-    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
-    if [ $basic_machine != $1 ]
-    then os=`echo $1 | sed 's/.*-/-/'`
-    else os=; fi
-    ;;
-esac
+# Split fields of configuration type
+# shellcheck disable=SC2162
+saved_IFS=$IFS
+IFS="-" read field1 field2 field3 field4 <<EOF
+$1
+EOF
+IFS=$saved_IFS
 
-### Let's recognize common machines as not being operating systems so
-### that things like config.sub decstation-3100 work.  We also
-### recognize some manufacturers as not being operating systems, so we
-### can provide default operating systems below.
-case $os in
-       -sun*os*)
-               # Prevent following clause from handling this invalid input.
-               ;;
-       -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
-       -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
-       -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
-       -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-       -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-       -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-       -apple | -axis | -knuth | -cray | -microblaze*)
-               os=
-               basic_machine=$1
-               ;;
-       -bluegene*)
-               os=-cnk
-               ;;
-       -sim | -cisco | -oki | -wec | -winbond)
-               os=
-               basic_machine=$1
-               ;;
-       -scout)
-               ;;
-       -wrs)
-               os=-vxworks
-               basic_machine=$1
-               ;;
-       -chorusos*)
-               os=-chorusos
-               basic_machine=$1
-               ;;
-       -chorusrdb)
-               os=-chorusrdb
-               basic_machine=$1
-               ;;
-       -hiux*)
-               os=-hiuxwe2
-               ;;
-       -sco6)
-               os=-sco5v6
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -sco5)
-               os=-sco3.2v5
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -sco4)
-               os=-sco3.2v4
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -sco3.2.[4-9]*)
-               os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -sco3.2v[4-9]*)
-               # Don't forget version if it is 3.2v4 or newer.
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -sco5v6*)
-               # Don't forget version if it is 3.2v4 or newer.
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -sco*)
-               os=-sco3.2v2
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -udk*)
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -isc)
-               os=-isc2.2
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -clix*)
-               basic_machine=clipper-intergraph
-               ;;
-       -isc*)
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
-               ;;
-       -lynx*178)
-               os=-lynxos178
-               ;;
-       -lynx*5)
-               os=-lynxos5
-               ;;
-       -lynx*)
-               os=-lynxos
+# Separate into logical components for further validation
+case $1 in
+       *-*-*-*-*)
+               echo Invalid configuration \`"$1"\': more than four components >&2
+               exit 1
                ;;
-       -ptx*)
-               basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+       *-*-*-*)
+               basic_machine=$field1-$field2
+               basic_os=$field3-$field4
                ;;
-       -windowsnt*)
-               os=`echo $os | sed -e 's/windowsnt/winnt/'`
+       *-*-*)
+               # Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two
+               # parts
+               maybe_os=$field2-$field3
+               case $maybe_os in
+                       nto-qnx* | linux-* | uclinux-uclibc* \
+                       | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \
+                       | netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \
+                       | storm-chaos* | os2-emx* | rtmk-nova*)
+                               basic_machine=$field1
+                               basic_os=$maybe_os
+                               ;;
+                       android-linux)
+                               basic_machine=$field1-unknown
+                               basic_os=linux-android
+                               ;;
+                       *)
+                               basic_machine=$field1-$field2
+                               basic_os=$field3
+                               ;;
+               esac
                ;;
-       -psos*)
-               os=-psos
+       *-*)
+               # A lone config we happen to match not fitting any pattern
+               case $field1-$field2 in
+                       decstation-3100)
+                               basic_machine=mips-dec
+                               basic_os=
+                               ;;
+                       *-*)
+                               # Second component is usually, but not always the OS
+                               case $field2 in
+                                       # Prevent following clause from handling this valid os
+                                       sun*os*)
+                                               basic_machine=$field1
+                                               basic_os=$field2
+                                               ;;
+                                       zephyr*)
+                                               basic_machine=$field1-unknown
+                                               basic_os=$field2
+                                               ;;
+                                       # Manufacturers
+                                       dec* | mips* | sequent* | encore* | pc533* | sgi* | sony* \
+                                       | att* | 7300* | 3300* | delta* | motorola* | sun[234]* \
+                                       | unicom* | ibm* | next | hp | isi* | apollo | altos* \
+                                       | convergent* | ncr* | news | 32* | 3600* | 3100* \
+                                       | hitachi* | c[123]* | convex* | sun | crds | omron* | dg \
+                                       | ultra | tti* | harris | dolphin | highlevel | gould \
+                                       | cbm | ns | masscomp | apple | axis | knuth | cray \
+                                       | microblaze* | sim | cisco \
+                                       | oki | wec | wrs | winbond)
+                                               basic_machine=$field1-$field2
+                                               basic_os=
+                                               ;;
+                                       *)
+                                               basic_machine=$field1
+                                               basic_os=$field2
+                                               ;;
+                               esac
+                       ;;
+               esac
                ;;
-       -mint | -mint[0-9]*)
-               basic_machine=m68k-atari
-               os=-mint
+       *)
+               # Convert single-component short-hands not valid as part of
+               # multi-component configurations.
+               case $field1 in
+                       386bsd)
+                               basic_machine=i386-pc
+                               basic_os=bsd
+                               ;;
+                       a29khif)
+                               basic_machine=a29k-amd
+                               basic_os=udi
+                               ;;
+                       adobe68k)
+                               basic_machine=m68010-adobe
+                               basic_os=scout
+                               ;;
+                       alliant)
+                               basic_machine=fx80-alliant
+                               basic_os=
+                               ;;
+                       altos | altos3068)
+                               basic_machine=m68k-altos
+                               basic_os=
+                               ;;
+                       am29k)
+                               basic_machine=a29k-none
+                               basic_os=bsd
+                               ;;
+                       amdahl)
+                               basic_machine=580-amdahl
+                               basic_os=sysv
+                               ;;
+                       amiga)
+                               basic_machine=m68k-unknown
+                               basic_os=
+                               ;;
+                       amigaos | amigados)
+                               basic_machine=m68k-unknown
+                               basic_os=amigaos
+                               ;;
+                       amigaunix | amix)
+                               basic_machine=m68k-unknown
+                               basic_os=sysv4
+                               ;;
+                       apollo68)
+                               basic_machine=m68k-apollo
+                               basic_os=sysv
+                               ;;
+                       apollo68bsd)
+                               basic_machine=m68k-apollo
+                               basic_os=bsd
+                               ;;
+                       aros)
+                               basic_machine=i386-pc
+                               basic_os=aros
+                               ;;
+                       aux)
+                               basic_machine=m68k-apple
+                               basic_os=aux
+                               ;;
+                       balance)
+                               basic_machine=ns32k-sequent
+                               basic_os=dynix
+                               ;;
+                       blackfin)
+                               basic_machine=bfin-unknown
+                               basic_os=linux
+                               ;;
+                       cegcc)
+                               basic_machine=arm-unknown
+                               basic_os=cegcc
+                               ;;
+                       convex-c1)
+                               basic_machine=c1-convex
+                               basic_os=bsd
+                               ;;
+                       convex-c2)
+                               basic_machine=c2-convex
+                               basic_os=bsd
+                               ;;
+                       convex-c32)
+                               basic_machine=c32-convex
+                               basic_os=bsd
+                               ;;
+                       convex-c34)
+                               basic_machine=c34-convex
+                               basic_os=bsd
+                               ;;
+                       convex-c38)
+                               basic_machine=c38-convex
+                               basic_os=bsd
+                               ;;
+                       cray)
+                               basic_machine=j90-cray
+                               basic_os=unicos
+                               ;;
+                       crds | unos)
+                               basic_machine=m68k-crds
+                               basic_os=
+                               ;;
+                       da30)
+                               basic_machine=m68k-da30
+                               basic_os=
+                               ;;
+                       decstation | pmax | pmin | dec3100 | decstatn)
+                               basic_machine=mips-dec
+                               basic_os=
+                               ;;
+                       delta88)
+                               basic_machine=m88k-motorola
+                               basic_os=sysv3
+                               ;;
+                       dicos)
+                               basic_machine=i686-pc
+                               basic_os=dicos
+                               ;;
+                       djgpp)
+                               basic_machine=i586-pc
+                               basic_os=msdosdjgpp
+                               ;;
+                       ebmon29k)
+                               basic_machine=a29k-amd
+                               basic_os=ebmon
+                               ;;
+                       es1800 | OSE68k | ose68k | ose | OSE)
+                               basic_machine=m68k-ericsson
+                               basic_os=ose
+                               ;;
+                       gmicro)
+                               basic_machine=tron-gmicro
+                               basic_os=sysv
+                               ;;
+                       go32)
+                               basic_machine=i386-pc
+                               basic_os=go32
+                               ;;
+                       h8300hms)
+                               basic_machine=h8300-hitachi
+                               basic_os=hms
+                               ;;
+                       h8300xray)
+                               basic_machine=h8300-hitachi
+                               basic_os=xray
+                               ;;
+                       h8500hms)
+                               basic_machine=h8500-hitachi
+                               basic_os=hms
+                               ;;
+                       harris)
+                               basic_machine=m88k-harris
+                               basic_os=sysv3
+                               ;;
+                       hp300 | hp300hpux)
+                               basic_machine=m68k-hp
+                               basic_os=hpux
+                               ;;
+                       hp300bsd)
+                               basic_machine=m68k-hp
+                               basic_os=bsd
+                               ;;
+                       hppaosf)
+                               basic_machine=hppa1.1-hp
+                               basic_os=osf
+                               ;;
+                       hppro)
+                               basic_machine=hppa1.1-hp
+                               basic_os=proelf
+                               ;;
+                       i386mach)
+                               basic_machine=i386-mach
+                               basic_os=mach
+                               ;;
+                       isi68 | isi)
+                               basic_machine=m68k-isi
+                               basic_os=sysv
+                               ;;
+                       m68knommu)
+                               basic_machine=m68k-unknown
+                               basic_os=linux
+                               ;;
+                       magnum | m3230)
+                               basic_machine=mips-mips
+                               basic_os=sysv
+                               ;;
+                       merlin)
+                               basic_machine=ns32k-utek
+                               basic_os=sysv
+                               ;;
+                       mingw64)
+                               basic_machine=x86_64-pc
+                               basic_os=mingw64
+                               ;;
+                       mingw32)
+                               basic_machine=i686-pc
+                               basic_os=mingw32
+                               ;;
+                       mingw32ce)
+                               basic_machine=arm-unknown
+                               basic_os=mingw32ce
+                               ;;
+                       monitor)
+                               basic_machine=m68k-rom68k
+                               basic_os=coff
+                               ;;
+                       morphos)
+                               basic_machine=powerpc-unknown
+                               basic_os=morphos
+                               ;;
+                       moxiebox)
+                               basic_machine=moxie-unknown
+                               basic_os=moxiebox
+                               ;;
+                       msdos)
+                               basic_machine=i386-pc
+                               basic_os=msdos
+                               ;;
+                       msys)
+                               basic_machine=i686-pc
+                               basic_os=msys
+                               ;;
+                       mvs)
+                               basic_machine=i370-ibm
+                               basic_os=mvs
+                               ;;
+                       nacl)
+                               basic_machine=le32-unknown
+                               basic_os=nacl
+                               ;;
+                       ncr3000)
+                               basic_machine=i486-ncr
+                               basic_os=sysv4
+                               ;;
+                       netbsd386)
+                               basic_machine=i386-pc
+                               basic_os=netbsd
+                               ;;
+                       netwinder)
+                               basic_machine=armv4l-rebel
+                               basic_os=linux
+                               ;;
+                       news | news700 | news800 | news900)
+                               basic_machine=m68k-sony
+                               basic_os=newsos
+                               ;;
+                       news1000)
+                               basic_machine=m68030-sony
+                               basic_os=newsos
+                               ;;
+                       necv70)
+                               basic_machine=v70-nec
+                               basic_os=sysv
+                               ;;
+                       nh3000)
+                               basic_machine=m68k-harris
+                               basic_os=cxux
+                               ;;
+                       nh[45]000)
+                               basic_machine=m88k-harris
+                               basic_os=cxux
+                               ;;
+                       nindy960)
+                               basic_machine=i960-intel
+                               basic_os=nindy
+                               ;;
+                       mon960)
+                               basic_machine=i960-intel
+                               basic_os=mon960
+                               ;;
+                       nonstopux)
+                               basic_machine=mips-compaq
+                               basic_os=nonstopux
+                               ;;
+                       os400)
+                               basic_machine=powerpc-ibm
+                               basic_os=os400
+                               ;;
+                       OSE68000 | ose68000)
+                               basic_machine=m68000-ericsson
+                               basic_os=ose
+                               ;;
+                       os68k)
+                               basic_machine=m68k-none
+                               basic_os=os68k
+                               ;;
+                       paragon)
+                               basic_machine=i860-intel
+                               basic_os=osf
+                               ;;
+                       parisc)
+                               basic_machine=hppa-unknown
+                               basic_os=linux
+                               ;;
+                       psp)
+                               basic_machine=mipsallegrexel-sony
+                               basic_os=psp
+                               ;;
+                       pw32)
+                               basic_machine=i586-unknown
+                               basic_os=pw32
+                               ;;
+                       rdos | rdos64)
+                               basic_machine=x86_64-pc
+                               basic_os=rdos
+                               ;;
+                       rdos32)
+                               basic_machine=i386-pc
+                               basic_os=rdos
+                               ;;
+                       rom68k)
+                               basic_machine=m68k-rom68k
+                               basic_os=coff
+                               ;;
+                       sa29200)
+                               basic_machine=a29k-amd
+                               basic_os=udi
+                               ;;
+                       sei)
+                               basic_machine=mips-sei
+                               basic_os=seiux
+                               ;;
+                       sequent)
+                               basic_machine=i386-sequent
+                               basic_os=
+                               ;;
+                       sps7)
+                               basic_machine=m68k-bull
+                               basic_os=sysv2
+                               ;;
+                       st2000)
+                               basic_machine=m68k-tandem
+                               basic_os=
+                               ;;
+                       stratus)
+                               basic_machine=i860-stratus
+                               basic_os=sysv4
+                               ;;
+                       sun2)
+                               basic_machine=m68000-sun
+                               basic_os=
+                               ;;
+                       sun2os3)
+                               basic_machine=m68000-sun
+                               basic_os=sunos3
+                               ;;
+                       sun2os4)
+                               basic_machine=m68000-sun
+                               basic_os=sunos4
+                               ;;
+                       sun3)
+                               basic_machine=m68k-sun
+                               basic_os=
+                               ;;
+                       sun3os3)
+                               basic_machine=m68k-sun
+                               basic_os=sunos3
+                               ;;
+                       sun3os4)
+                               basic_machine=m68k-sun
+                               basic_os=sunos4
+                               ;;
+                       sun4)
+                               basic_machine=sparc-sun
+                               basic_os=
+                               ;;
+                       sun4os3)
+                               basic_machine=sparc-sun
+                               basic_os=sunos3
+                               ;;
+                       sun4os4)
+                               basic_machine=sparc-sun
+                               basic_os=sunos4
+                               ;;
+                       sun4sol2)
+                               basic_machine=sparc-sun
+                               basic_os=solaris2
+                               ;;
+                       sun386 | sun386i | roadrunner)
+                               basic_machine=i386-sun
+                               basic_os=
+                               ;;
+                       sv1)
+                               basic_machine=sv1-cray
+                               basic_os=unicos
+                               ;;
+                       symmetry)
+                               basic_machine=i386-sequent
+                               basic_os=dynix
+                               ;;
+                       t3e)
+                               basic_machine=alphaev5-cray
+                               basic_os=unicos
+                               ;;
+                       t90)
+                               basic_machine=t90-cray
+                               basic_os=unicos
+                               ;;
+                       toad1)
+                               basic_machine=pdp10-xkl
+                               basic_os=tops20
+                               ;;
+                       tpf)
+                               basic_machine=s390x-ibm
+                               basic_os=tpf
+                               ;;
+                       udi29k)
+                               basic_machine=a29k-amd
+                               basic_os=udi
+                               ;;
+                       ultra3)
+                               basic_machine=a29k-nyu
+                               basic_os=sym1
+                               ;;
+                       v810 | necv810)
+                               basic_machine=v810-nec
+                               basic_os=none
+                               ;;
+                       vaxv)
+                               basic_machine=vax-dec
+                               basic_os=sysv
+                               ;;
+                       vms)
+                               basic_machine=vax-dec
+                               basic_os=vms
+                               ;;
+                       vsta)
+                               basic_machine=i386-pc
+                               basic_os=vsta
+                               ;;
+                       vxworks960)
+                               basic_machine=i960-wrs
+                               basic_os=vxworks
+                               ;;
+                       vxworks68)
+                               basic_machine=m68k-wrs
+                               basic_os=vxworks
+                               ;;
+                       vxworks29k)
+                               basic_machine=a29k-wrs
+                               basic_os=vxworks
+                               ;;
+                       xbox)
+                               basic_machine=i686-pc
+                               basic_os=mingw32
+                               ;;
+                       ymp)
+                               basic_machine=ymp-cray
+                               basic_os=unicos
+                               ;;
+                       *)
+                               basic_machine=$1
+                               basic_os=
+                               ;;
+               esac
                ;;
 esac
 
-# Decode aliases for certain CPU-COMPANY combinations.
+# Decode 1-component or ad-hoc basic machines
 case $basic_machine in
-       # Recognize the basic CPU types without company name.
-       # Some are omitted here because they have special meanings below.
-       1750a | 580 \
-       | a29k \
-       | aarch64 | aarch64_be \
-       | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
-       | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
-       | am33_2.0 \
-       | arc | arceb \
-       | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
-       | avr | avr32 \
-       | ba \
-       | be32 | be64 \
-       | bfin \
-       | c4x | c8051 | clipper \
-       | d10v | d30v | dlx | dsp16xx \
-       | e2k | epiphany \
-       | fido | fr30 | frv | ft32 \
-       | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
-       | hexagon \
-       | i370 | i860 | i960 | ia64 \
-       | ip2k | iq2000 \
-       | k1om \
-       | le32 | le64 \
-       | lm32 \
-       | m32c | m32r | m32rle | m68000 | m68k | m88k \
-       | maxq | mb | microblaze | microblazeel | mcore | mep | metag \
-       | mips | mipsbe | mipseb | mipsel | mipsle \
-       | mips16 \
-       | mips64 | mips64el \
-       | mips64octeon | mips64octeonel \
-       | mips64orion | mips64orionel \
-       | mips64r5900 | mips64r5900el \
-       | mips64vr | mips64vrel \
-       | mips64vr4100 | mips64vr4100el \
-       | mips64vr4300 | mips64vr4300el \
-       | mips64vr5000 | mips64vr5000el \
-       | mips64vr5900 | mips64vr5900el \
-       | mipsisa32 | mipsisa32el \
-       | mipsisa32r2 | mipsisa32r2el \
-       | mipsisa32r6 | mipsisa32r6el \
-       | mipsisa64 | mipsisa64el \
-       | mipsisa64r2 | mipsisa64r2el \
-       | mipsisa64r6 | mipsisa64r6el \
-       | mipsisa64sb1 | mipsisa64sb1el \
-       | mipsisa64sr71k | mipsisa64sr71kel \
-       | mipsr5900 | mipsr5900el \
-       | mipstx39 | mipstx39el \
-       | mn10200 | mn10300 \
-       | moxie \
-       | mt \
-       | msp430 \
-       | nds32 | nds32le | nds32be \
-       | nios | nios2 | nios2eb | nios2el \
-       | ns16k | ns32k \
-       | open8 | or1k | or1knd | or32 \
-       | pdp10 | pdp11 | pj | pjl \
-       | powerpc | powerpc64 | powerpc64le | powerpcle \
-       | pyramid \
-       | riscv32 | riscv64 \
-       | rl78 | rx \
-       | score \
-       | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
-       | sh64 | sh64le \
-       | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
-       | sparcv8 | sparcv9 | sparcv9b | sparcv9v \
-       | spu \
-       | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
-       | ubicom32 \
-       | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
-       | visium \
-       | we32k \
-       | x86 | xc16x | xstormy16 | xtensa \
-       | z8k | z80)
-               basic_machine=$basic_machine-unknown
-               ;;
-       c54x)
-               basic_machine=tic54x-unknown
-               ;;
-       c55x)
-               basic_machine=tic55x-unknown
-               ;;
-       c6x)
-               basic_machine=tic6x-unknown
-               ;;
-       leon|leon[3-9])
-               basic_machine=sparc-$basic_machine
-               ;;
-       m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
-               basic_machine=$basic_machine-unknown
-               os=-none
+       # Here we handle the default manufacturer of certain CPU types.  It is in
+       # some cases the only manufacturer, in others, it is the most popular.
+       w89k)
+               cpu=hppa1.1
+               vendor=winbond
                ;;
-       m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
+       op50n)
+               cpu=hppa1.1
+               vendor=oki
                ;;
-       ms1)
-               basic_machine=mt-unknown
+       op60c)
+               cpu=hppa1.1
+               vendor=oki
                ;;
-
-       strongarm | thumb | xscale)
-               basic_machine=arm-unknown
+       ibm*)
+               cpu=i370
+               vendor=ibm
                ;;
-       xgate)
-               basic_machine=$basic_machine-unknown
-               os=-none
+       orion105)
+               cpu=clipper
+               vendor=highlevel
                ;;
-       xscaleeb)
-               basic_machine=armeb-unknown
+       mac | mpw | mac-mpw)
+               cpu=m68k
+               vendor=apple
                ;;
-
-       xscaleel)
-               basic_machine=armel-unknown
+       pmac | pmac-mpw)
+               cpu=powerpc
+               vendor=apple
                ;;
 
-       # We use `pc' rather than `unknown'
-       # because (1) that's what they normally are, and
-       # (2) the word "unknown" tends to confuse beginning users.
-       i*86 | x86_64)
-         basic_machine=$basic_machine-pc
-         ;;
-       # Object if more than one company name word.
-       *-*-*)
-               echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-               exit 1
-               ;;
-       # Recognize the basic CPU types with company name.
-       580-* \
-       | a29k-* \
-       | aarch64-* | aarch64_be-* \
-       | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
-       | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-       | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
-       | arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
-       | avr-* | avr32-* \
-       | ba-* \
-       | be32-* | be64-* \
-       | bfin-* | bs2000-* \
-       | c[123]* | c30-* | [cjt]90-* | c4x-* \
-       | c8051-* | clipper-* | craynv-* | cydra-* \
-       | d10v-* | d30v-* | dlx-* \
-       | e2k-* | elxsi-* \
-       | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
-       | h8300-* | h8500-* \
-       | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
-       | hexagon-* \
-       | i*86-* | i860-* | i960-* | ia64-* \
-       | ip2k-* | iq2000-* \
-       | k1om-* \
-       | le32-* | le64-* \
-       | lm32-* \
-       | m32c-* | m32r-* | m32rle-* \
-       | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-       | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
-       | microblaze-* | microblazeel-* \
-       | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
-       | mips16-* \
-       | mips64-* | mips64el-* \
-       | mips64octeon-* | mips64octeonel-* \
-       | mips64orion-* | mips64orionel-* \
-       | mips64r5900-* | mips64r5900el-* \
-       | mips64vr-* | mips64vrel-* \
-       | mips64vr4100-* | mips64vr4100el-* \
-       | mips64vr4300-* | mips64vr4300el-* \
-       | mips64vr5000-* | mips64vr5000el-* \
-       | mips64vr5900-* | mips64vr5900el-* \
-       | mipsisa32-* | mipsisa32el-* \
-       | mipsisa32r2-* | mipsisa32r2el-* \
-       | mipsisa32r6-* | mipsisa32r6el-* \
-       | mipsisa64-* | mipsisa64el-* \
-       | mipsisa64r2-* | mipsisa64r2el-* \
-       | mipsisa64r6-* | mipsisa64r6el-* \
-       | mipsisa64sb1-* | mipsisa64sb1el-* \
-       | mipsisa64sr71k-* | mipsisa64sr71kel-* \
-       | mipsr5900-* | mipsr5900el-* \
-       | mipstx39-* | mipstx39el-* \
-       | mmix-* \
-       | mt-* \
-       | msp430-* \
-       | nds32-* | nds32le-* | nds32be-* \
-       | nios-* | nios2-* | nios2eb-* | nios2el-* \
-       | none-* | np1-* | ns16k-* | ns32k-* \
-       | open8-* \
-       | or1k*-* \
-       | orion-* \
-       | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
-       | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
-       | pyramid-* \
-       | riscv32-* | riscv64-* \
-       | rl78-* | romp-* | rs6000-* | rx-* \
-       | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
-       | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
-       | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
-       | sparclite-* \
-       | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
-       | tahoe-* \
-       | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
-       | tile*-* \
-       | tron-* \
-       | ubicom32-* \
-       | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
-       | vax-* \
-       | visium-* \
-       | we32k-* \
-       | x86-* | x86_64-* | xc16x-* | xps100-* \
-       | xstormy16-* | xtensa*-* \
-       | ymp-* \
-       | z8k-* | z80-*)
-               ;;
-       # Recognize the basic CPU types without company name, with glob match.
-       xtensa*)
-               basic_machine=$basic_machine-unknown
-               ;;
        # Recognize the various machine names and aliases which stand
        # for a CPU type and a company and sometimes even an OS.
-       386bsd)
-               basic_machine=i386-unknown
-               os=-bsd
-               ;;
        3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
-               basic_machine=m68000-att
+               cpu=m68000
+               vendor=att
                ;;
        3b*)
-               basic_machine=we32k-att
-               ;;
-       a29khif)
-               basic_machine=a29k-amd
-               os=-udi
-               ;;
-       abacus)
-               basic_machine=abacus-unknown
-               ;;
-       adobe68k)
-               basic_machine=m68010-adobe
-               os=-scout
-               ;;
-       alliant | fx80)
-               basic_machine=fx80-alliant
-               ;;
-       altos | altos3068)
-               basic_machine=m68k-altos
-               ;;
-       am29k)
-               basic_machine=a29k-none
-               os=-bsd
-               ;;
-       amd64)
-               basic_machine=x86_64-pc
-               ;;
-       amd64-*)
-               basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       amdahl)
-               basic_machine=580-amdahl
-               os=-sysv
-               ;;
-       amiga | amiga-*)
-               basic_machine=m68k-unknown
-               ;;
-       amigaos | amigados)
-               basic_machine=m68k-unknown
-               os=-amigaos
-               ;;
-       amigaunix | amix)
-               basic_machine=m68k-unknown
-               os=-sysv4
-               ;;
-       apollo68)
-               basic_machine=m68k-apollo
-               os=-sysv
-               ;;
-       apollo68bsd)
-               basic_machine=m68k-apollo
-               os=-bsd
-               ;;
-       aros)
-               basic_machine=i386-pc
-               os=-aros
-               ;;
-       asmjs)
-               basic_machine=asmjs-unknown
-               ;;
-       aux)
-               basic_machine=m68k-apple
-               os=-aux
-               ;;
-       balance)
-               basic_machine=ns32k-sequent
-               os=-dynix
-               ;;
-       blackfin)
-               basic_machine=bfin-unknown
-               os=-linux
-               ;;
-       blackfin-*)
-               basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
-               os=-linux
+               cpu=we32k
+               vendor=att
                ;;
        bluegene*)
-               basic_machine=powerpc-ibm
-               os=-cnk
-               ;;
-       c54x-*)
-               basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       c55x-*)
-               basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       c6x-*)
-               basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       c90)
-               basic_machine=c90-cray
-               os=-unicos
-               ;;
-       cegcc)
-               basic_machine=arm-unknown
-               os=-cegcc
-               ;;
-       convex-c1)
-               basic_machine=c1-convex
-               os=-bsd
-               ;;
-       convex-c2)
-               basic_machine=c2-convex
-               os=-bsd
-               ;;
-       convex-c32)
-               basic_machine=c32-convex
-               os=-bsd
-               ;;
-       convex-c34)
-               basic_machine=c34-convex
-               os=-bsd
-               ;;
-       convex-c38)
-               basic_machine=c38-convex
-               os=-bsd
-               ;;
-       cray | j90)
-               basic_machine=j90-cray
-               os=-unicos
-               ;;
-       craynv)
-               basic_machine=craynv-cray
-               os=-unicosmp
-               ;;
-       cr16 | cr16-*)
-               basic_machine=cr16-unknown
-               os=-elf
-               ;;
-       crds | unos)
-               basic_machine=m68k-crds
-               ;;
-       crisv32 | crisv32-* | etraxfs*)
-               basic_machine=crisv32-axis
-               ;;
-       cris | cris-* | etrax*)
-               basic_machine=cris-axis
-               ;;
-       crx)
-               basic_machine=crx-unknown
-               os=-elf
-               ;;
-       da30 | da30-*)
-               basic_machine=m68k-da30
-               ;;
-       decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
-               basic_machine=mips-dec
+               cpu=powerpc
+               vendor=ibm
+               basic_os=cnk
                ;;
        decsystem10* | dec10*)
-               basic_machine=pdp10-dec
-               os=-tops10
+               cpu=pdp10
+               vendor=dec
+               basic_os=tops10
                ;;
        decsystem20* | dec20*)
-               basic_machine=pdp10-dec
-               os=-tops20
+               cpu=pdp10
+               vendor=dec
+               basic_os=tops20
                ;;
        delta | 3300 | motorola-3300 | motorola-delta \
              | 3300-motorola | delta-motorola)
-               basic_machine=m68k-motorola
-               ;;
-       delta88)
-               basic_machine=m88k-motorola
-               os=-sysv3
-               ;;
-       dicos)
-               basic_machine=i686-pc
-               os=-dicos
+               cpu=m68k
+               vendor=motorola
                ;;
-       djgpp)
-               basic_machine=i586-pc
-               os=-msdosdjgpp
-               ;;
-       dpx20 | dpx20-*)
-               basic_machine=rs6000-bull
-               os=-bosx
-               ;;
-       dpx2* | dpx2*-bull)
-               basic_machine=m68k-bull
-               os=-sysv3
-               ;;
-       e500v[12])
-               basic_machine=powerpc-unknown
-               os=$os"spe"
-               ;;
-       e500v[12]-*)
-               basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
-               os=$os"spe"
-               ;;
-       ebmon29k)
-               basic_machine=a29k-amd
-               os=-ebmon
-               ;;
-       elxsi)
-               basic_machine=elxsi-elxsi
-               os=-bsd
+       dpx2*)
+               cpu=m68k
+               vendor=bull
+               basic_os=sysv3
                ;;
        encore | umax | mmax)
-               basic_machine=ns32k-encore
+               cpu=ns32k
+               vendor=encore
                ;;
-       es1800 | OSE68k | ose68k | ose | OSE)
-               basic_machine=m68k-ericsson
-               os=-ose
+       elxsi)
+               cpu=elxsi
+               vendor=elxsi
+               basic_os=${basic_os:-bsd}
                ;;
        fx2800)
-               basic_machine=i860-alliant
+               cpu=i860
+               vendor=alliant
                ;;
        genix)
-               basic_machine=ns32k-ns
-               ;;
-       gmicro)
-               basic_machine=tron-gmicro
-               os=-sysv
-               ;;
-       go32)
-               basic_machine=i386-pc
-               os=-go32
+               cpu=ns32k
+               vendor=ns
                ;;
        h3050r* | hiux*)
-               basic_machine=hppa1.1-hitachi
-               os=-hiuxwe2
-               ;;
-       h8300hms)
-               basic_machine=h8300-hitachi
-               os=-hms
-               ;;
-       h8300xray)
-               basic_machine=h8300-hitachi
-               os=-xray
-               ;;
-       h8500hms)
-               basic_machine=h8500-hitachi
-               os=-hms
-               ;;
-       harris)
-               basic_machine=m88k-harris
-               os=-sysv3
-               ;;
-       hp300-*)
-               basic_machine=m68k-hp
-               ;;
-       hp300bsd)
-               basic_machine=m68k-hp
-               os=-bsd
-               ;;
-       hp300hpux)
-               basic_machine=m68k-hp
-               os=-hpux
+               cpu=hppa1.1
+               vendor=hitachi
+               basic_os=hiuxwe2
                ;;
        hp3k9[0-9][0-9] | hp9[0-9][0-9])
-               basic_machine=hppa1.0-hp
+               cpu=hppa1.0
+               vendor=hp
                ;;
        hp9k2[0-9][0-9] | hp9k31[0-9])
-               basic_machine=m68000-hp
+               cpu=m68000
+               vendor=hp
                ;;
        hp9k3[2-9][0-9])
-               basic_machine=m68k-hp
+               cpu=m68k
+               vendor=hp
                ;;
        hp9k6[0-9][0-9] | hp6[0-9][0-9])
-               basic_machine=hppa1.0-hp
+               cpu=hppa1.0
+               vendor=hp
                ;;
        hp9k7[0-79][0-9] | hp7[0-79][0-9])
-               basic_machine=hppa1.1-hp
+               cpu=hppa1.1
+               vendor=hp
                ;;
        hp9k78[0-9] | hp78[0-9])
                # FIXME: really hppa2.0-hp
-               basic_machine=hppa1.1-hp
+               cpu=hppa1.1
+               vendor=hp
                ;;
        hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
                # FIXME: really hppa2.0-hp
-               basic_machine=hppa1.1-hp
+               cpu=hppa1.1
+               vendor=hp
                ;;
        hp9k8[0-9][13679] | hp8[0-9][13679])
-               basic_machine=hppa1.1-hp
+               cpu=hppa1.1
+               vendor=hp
                ;;
        hp9k8[0-9][0-9] | hp8[0-9][0-9])
-               basic_machine=hppa1.0-hp
-               ;;
-       hppa-next)
-               os=-nextstep3
-               ;;
-       hppaosf)
-               basic_machine=hppa1.1-hp
-               os=-osf
-               ;;
-       hppro)
-               basic_machine=hppa1.1-hp
-               os=-proelf
-               ;;
-       i370-ibm* | ibm*)
-               basic_machine=i370-ibm
+               cpu=hppa1.0
+               vendor=hp
                ;;
        i*86v32)
-               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-               os=-sysv32
+               cpu=`echo "$1" | sed -e 's/86.*/86/'`
+               vendor=pc
+               basic_os=sysv32
                ;;
        i*86v4*)
-               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-               os=-sysv4
+               cpu=`echo "$1" | sed -e 's/86.*/86/'`
+               vendor=pc
+               basic_os=sysv4
                ;;
        i*86v)
-               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-               os=-sysv
+               cpu=`echo "$1" | sed -e 's/86.*/86/'`
+               vendor=pc
+               basic_os=sysv
                ;;
        i*86sol2)
-               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
-               os=-solaris2
+               cpu=`echo "$1" | sed -e 's/86.*/86/'`
+               vendor=pc
+               basic_os=solaris2
                ;;
-       i386mach)
-               basic_machine=i386-mach
-               os=-mach
-               ;;
-       i386-vsta | vsta)
-               basic_machine=i386-unknown
-               os=-vsta
+       j90 | j90-cray)
+               cpu=j90
+               vendor=cray
+               basic_os=${basic_os:-unicos}
                ;;
        iris | iris4d)
-               basic_machine=mips-sgi
-               case $os in
-                   -irix*)
+               cpu=mips
+               vendor=sgi
+               case $basic_os in
+                   irix*)
                        ;;
                    *)
-                       os=-irix4
+                       basic_os=irix4
                        ;;
                esac
                ;;
-       isi68 | isi)
-               basic_machine=m68k-isi
-               os=-sysv
-               ;;
-       leon-*|leon[3-9]-*)
-               basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
-               ;;
-       m68knommu)
-               basic_machine=m68k-unknown
-               os=-linux
-               ;;
-       m68knommu-*)
-               basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
-               os=-linux
-               ;;
-       m88k-omron*)
-               basic_machine=m88k-omron
-               ;;
-       magnum | m3230)
-               basic_machine=mips-mips
-               os=-sysv
-               ;;
-       merlin)
-               basic_machine=ns32k-utek
-               os=-sysv
-               ;;
-       microblaze*)
-               basic_machine=microblaze-xilinx
-               ;;
-       mingw64)
-               basic_machine=x86_64-pc
-               os=-mingw64
-               ;;
-       mingw32)
-               basic_machine=i686-pc
-               os=-mingw32
-               ;;
-       mingw32ce)
-               basic_machine=arm-unknown
-               os=-mingw32ce
-               ;;
        miniframe)
-               basic_machine=m68000-convergent
-               ;;
-       *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
-               basic_machine=m68k-atari
-               os=-mint
-               ;;
-       mips3*-*)
-               basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
-               ;;
-       mips3*)
-               basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
-               ;;
-       monitor)
-               basic_machine=m68k-rom68k
-               os=-coff
-               ;;
-       morphos)
-               basic_machine=powerpc-unknown
-               os=-morphos
-               ;;
-       moxiebox)
-               basic_machine=moxie-unknown
-               os=-moxiebox
-               ;;
-       msdos)
-               basic_machine=i386-pc
-               os=-msdos
-               ;;
-       ms1-*)
-               basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
-               ;;
-       msys)
-               basic_machine=i686-pc
-               os=-msys
-               ;;
-       mvs)
-               basic_machine=i370-ibm
-               os=-mvs
-               ;;
-       nacl)
-               basic_machine=le32-unknown
-               os=-nacl
+               cpu=m68000
+               vendor=convergent
                ;;
-       ncr3000)
-               basic_machine=i486-ncr
-               os=-sysv4
-               ;;
-       netbsd386)
-               basic_machine=i386-unknown
-               os=-netbsd
-               ;;
-       netwinder)
-               basic_machine=armv4l-rebel
-               os=-linux
-               ;;
-       news | news700 | news800 | news900)
-               basic_machine=m68k-sony
-               os=-newsos
-               ;;
-       news1000)
-               basic_machine=m68030-sony
-               os=-newsos
+       *mint | mint[0-9]* | *MiNT | *MiNT[0-9]*)
+               cpu=m68k
+               vendor=atari
+               basic_os=mint
                ;;
        news-3600 | risc-news)
-               basic_machine=mips-sony
-               os=-newsos
-               ;;
-       necv70)
-               basic_machine=v70-nec
-               os=-sysv
-               ;;
-       next | m*-next )
-               basic_machine=m68k-next
-               case $os in
-                   -nextstep* )
+               cpu=mips
+               vendor=sony
+               basic_os=newsos
+               ;;
+       next | m*-next)
+               cpu=m68k
+               vendor=next
+               case $basic_os in
+                   openstep*)
+                       ;;
+                   nextstep*)
                        ;;
-                   -ns2*)
-                     os=-nextstep2
+                   ns2*)
+                     basic_os=nextstep2
                        ;;
                    *)
-                     os=-nextstep3
+                     basic_os=nextstep3
                        ;;
                esac
                ;;
-       nh3000)
-               basic_machine=m68k-harris
-               os=-cxux
-               ;;
-       nh[45]000)
-               basic_machine=m88k-harris
-               os=-cxux
-               ;;
-       nindy960)
-               basic_machine=i960-intel
-               os=-nindy
-               ;;
-       mon960)
-               basic_machine=i960-intel
-               os=-mon960
-               ;;
-       nonstopux)
-               basic_machine=mips-compaq
-               os=-nonstopux
-               ;;
        np1)
-               basic_machine=np1-gould
-               ;;
-       neo-tandem)
-               basic_machine=neo-tandem
-               ;;
-       nse-tandem)
-               basic_machine=nse-tandem
-               ;;
-       nsr-tandem)
-               basic_machine=nsr-tandem
+               cpu=np1
+               vendor=gould
                ;;
        op50n-* | op60c-*)
-               basic_machine=hppa1.1-oki
-               os=-proelf
-               ;;
-       openrisc | openrisc-*)
-               basic_machine=or32-unknown
-               ;;
-       os400)
-               basic_machine=powerpc-ibm
-               os=-os400
-               ;;
-       OSE68000 | ose68000)
-               basic_machine=m68000-ericsson
-               os=-ose
-               ;;
-       os68k)
-               basic_machine=m68k-none
-               os=-os68k
+               cpu=hppa1.1
+               vendor=oki
+               basic_os=proelf
                ;;
        pa-hitachi)
-               basic_machine=hppa1.1-hitachi
-               os=-hiuxwe2
-               ;;
-       paragon)
-               basic_machine=i860-intel
-               os=-osf
-               ;;
-       parisc)
-               basic_machine=hppa-unknown
-               os=-linux
-               ;;
-       parisc-*)
-               basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
-               os=-linux
+               cpu=hppa1.1
+               vendor=hitachi
+               basic_os=hiuxwe2
                ;;
        pbd)
-               basic_machine=sparc-tti
+               cpu=sparc
+               vendor=tti
                ;;
        pbb)
-               basic_machine=m68k-tti
-               ;;
-       pc532 | pc532-*)
-               basic_machine=ns32k-pc532
-               ;;
-       pc98)
-               basic_machine=i386-pc
+               cpu=m68k
+               vendor=tti
                ;;
-       pc98-*)
-               basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       pentium | p5 | k5 | k6 | nexgen | viac3)
-               basic_machine=i586-pc
-               ;;
-       pentiumpro | p6 | 6x86 | athlon | athlon_*)
-               basic_machine=i686-pc
-               ;;
-       pentiumii | pentium2 | pentiumiii | pentium3)
-               basic_machine=i686-pc
-               ;;
-       pentium4)
-               basic_machine=i786-pc
-               ;;
-       pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
-               basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       pentiumpro-* | p6-* | 6x86-* | athlon-*)
-               basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
-               basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       pentium4-*)
-               basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
+       pc532)
+               cpu=ns32k
+               vendor=pc532
                ;;
        pn)
-               basic_machine=pn-gould
-               ;;
-       power)  basic_machine=power-ibm
-               ;;
-       ppc | ppcbe)    basic_machine=powerpc-unknown
+               cpu=pn
+               vendor=gould
                ;;
-       ppc-* | ppcbe-*)
-               basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       ppcle | powerpclittle | ppc-le | powerpc-little)
-               basic_machine=powerpcle-unknown
-               ;;
-       ppcle-* | powerpclittle-*)
-               basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       ppc64)  basic_machine=powerpc64-unknown
-               ;;
-       ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
-               ;;
-       ppc64le | powerpc64little | ppc64-le | powerpc64-little)
-               basic_machine=powerpc64le-unknown
-               ;;
-       ppc64le-* | powerpc64little-*)
-               basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
+       power)
+               cpu=power
+               vendor=ibm
                ;;
        ps2)
-               basic_machine=i386-ibm
-               ;;
-       pw32)
-               basic_machine=i586-unknown
-               os=-pw32
-               ;;
-       rdos | rdos64)
-               basic_machine=x86_64-pc
-               os=-rdos
-               ;;
-       rdos32)
-               basic_machine=i386-pc
-               os=-rdos
-               ;;
-       rom68k)
-               basic_machine=m68k-rom68k
-               os=-coff
+               cpu=i386
+               vendor=ibm
                ;;
        rm[46]00)
-               basic_machine=mips-siemens
+               cpu=mips
+               vendor=siemens
                ;;
        rtpc | rtpc-*)
-               basic_machine=romp-ibm
-               ;;
-       s390 | s390-*)
-               basic_machine=s390-ibm
+               cpu=romp
+               vendor=ibm
                ;;
-       s390x | s390x-*)
-               basic_machine=s390x-ibm
-               ;;
-       sa29200)
-               basic_machine=a29k-amd
-               os=-udi
+       sde)
+               cpu=mipsisa32
+               vendor=sde
+               basic_os=${basic_os:-elf}
                ;;
-       sb1)
-               basic_machine=mipsisa64sb1-unknown
+       simso-wrs)
+               cpu=sparclite
+               vendor=wrs
+               basic_os=vxworks
                ;;
-       sb1el)
-               basic_machine=mipsisa64sb1el-unknown
+       tower | tower-32)
+               cpu=m68k
+               vendor=ncr
                ;;
-       sde)
-               basic_machine=mipsisa32-sde
-               os=-elf
+       vpp*|vx|vx-*)
+               cpu=f301
+               vendor=fujitsu
                ;;
-       sei)
-               basic_machine=mips-sei
-               os=-seiux
+       w65)
+               cpu=w65
+               vendor=wdc
                ;;
-       sequent)
-               basic_machine=i386-sequent
+       w89k-*)
+               cpu=hppa1.1
+               vendor=winbond
+               basic_os=proelf
                ;;
-       sh)
-               basic_machine=sh-hitachi
-               os=-hms
+       none)
+               cpu=none
+               vendor=none
                ;;
-       sh5el)
-               basic_machine=sh5le-unknown
+       leon|leon[3-9])
+               cpu=sparc
+               vendor=$basic_machine
                ;;
-       sh64)
-               basic_machine=sh64-unknown
+       leon-*|leon[3-9]-*)
+               cpu=sparc
+               vendor=`echo "$basic_machine" | sed 's/-.*//'`
                ;;
-       sparclite-wrs | simso-wrs)
-               basic_machine=sparclite-wrs
-               os=-vxworks
+
+       *-*)
+               # shellcheck disable=SC2162
+               saved_IFS=$IFS
+               IFS="-" read cpu vendor <<EOF
+$basic_machine
+EOF
+               IFS=$saved_IFS
                ;;
-       sps7)
-               basic_machine=m68k-bull
-               os=-sysv2
+       # We use `pc' rather than `unknown'
+       # because (1) that's what they normally are, and
+       # (2) the word "unknown" tends to confuse beginning users.
+       i*86 | x86_64)
+               cpu=$basic_machine
+               vendor=pc
                ;;
-       spur)
-               basic_machine=spur-unknown
+       # These rules are duplicated from below for sake of the special case above;
+       # i.e. things that normalized to x86 arches should also default to "pc"
+       pc98)
+               cpu=i386
+               vendor=pc
                ;;
-       st2000)
-               basic_machine=m68k-tandem
+       x64 | amd64)
+               cpu=x86_64
+               vendor=pc
                ;;
-       stratus)
-               basic_machine=i860-stratus
-               os=-sysv4
+       # Recognize the basic CPU types without company name.
+       *)
+               cpu=$basic_machine
+               vendor=unknown
                ;;
-       strongarm-* | thumb-*)
-               basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'`
+esac
+
+unset -v basic_machine
+
+# Decode basic machines in the full and proper CPU-Company form.
+case $cpu-$vendor in
+       # Here we handle the default manufacturer of certain CPU types in canonical form. It is in
+       # some cases the only manufacturer, in others, it is the most popular.
+       craynv-unknown)
+               vendor=cray
+               basic_os=${basic_os:-unicosmp}
                ;;
-       sun2)
-               basic_machine=m68000-sun
+       c90-unknown | c90-cray)
+               vendor=cray
+               basic_os=${basic_os:-unicos}
                ;;
-       sun2os3)
-               basic_machine=m68000-sun
-               os=-sunos3
+       fx80-unknown)
+               vendor=alliant
                ;;
-       sun2os4)
-               basic_machine=m68000-sun
-               os=-sunos4
+       romp-unknown)
+               vendor=ibm
                ;;
-       sun3os3)
-               basic_machine=m68k-sun
-               os=-sunos3
+       mmix-unknown)
+               vendor=knuth
                ;;
-       sun3os4)
-               basic_machine=m68k-sun
-               os=-sunos4
+       microblaze-unknown | microblazeel-unknown)
+               vendor=xilinx
                ;;
-       sun4os3)
-               basic_machine=sparc-sun
-               os=-sunos3
+       rs6000-unknown)
+               vendor=ibm
                ;;
-       sun4os4)
-               basic_machine=sparc-sun
-               os=-sunos4
+       vax-unknown)
+               vendor=dec
                ;;
-       sun4sol2)
-               basic_machine=sparc-sun
-               os=-solaris2
+       pdp11-unknown)
+               vendor=dec
                ;;
-       sun3 | sun3-*)
-               basic_machine=m68k-sun
+       we32k-unknown)
+               vendor=att
                ;;
-       sun4)
-               basic_machine=sparc-sun
+       cydra-unknown)
+               vendor=cydrome
                ;;
-       sun386 | sun386i | roadrunner)
-               basic_machine=i386-sun
+       i370-ibm*)
+               vendor=ibm
                ;;
-       sv1)
-               basic_machine=sv1-cray
-               os=-unicos
+       orion-unknown)
+               vendor=highlevel
                ;;
-       symmetry)
-               basic_machine=i386-sequent
-               os=-dynix
+       xps-unknown | xps100-unknown)
+               cpu=xps100
+               vendor=honeywell
                ;;
-       t3e)
-               basic_machine=alphaev5-cray
-               os=-unicos
+
+       # Here we normalize CPU types with a missing or matching vendor
+       armh-unknown | armh-alt)
+               cpu=armv7l
+               vendor=alt
+               basic_os=${basic_os:-linux-gnueabihf}
                ;;
-       t90)
-               basic_machine=t90-cray
-               os=-unicos
+       dpx20-unknown | dpx20-bull)
+               cpu=rs6000
+               vendor=bull
+               basic_os=${basic_os:-bosx}
                ;;
-       tile*)
-               basic_machine=$basic_machine-unknown
-               os=-linux-gnu
+
+       # Here we normalize CPU types irrespective of the vendor
+       amd64-*)
+               cpu=x86_64
                ;;
-       tx39)
-               basic_machine=mipstx39-unknown
+       blackfin-*)
+               cpu=bfin
+               basic_os=linux
                ;;
-       tx39el)
-               basic_machine=mipstx39el-unknown
+       c54x-*)
+               cpu=tic54x
                ;;
-       toad1)
-               basic_machine=pdp10-xkl
-               os=-tops20
+       c55x-*)
+               cpu=tic55x
                ;;
-       tower | tower-32)
-               basic_machine=m68k-ncr
+       c6x-*)
+               cpu=tic6x
                ;;
-       tpf)
-               basic_machine=s390x-ibm
-               os=-tpf
+       e500v[12]-*)
+               cpu=powerpc
+               basic_os=${basic_os}"spe"
                ;;
-       udi29k)
-               basic_machine=a29k-amd
-               os=-udi
+       mips3*-*)
+               cpu=mips64
                ;;
-       ultra3)
-               basic_machine=a29k-nyu
-               os=-sym1
+       ms1-*)
+               cpu=mt
                ;;
-       v810 | necv810)
-               basic_machine=v810-nec
-               os=-none
+       m68knommu-*)
+               cpu=m68k
+               basic_os=linux
                ;;
-       vaxv)
-               basic_machine=vax-dec
-               os=-sysv
+       m9s12z-* | m68hcs12z-* | hcs12z-* | s12z-*)
+               cpu=s12z
                ;;
-       vms)
-               basic_machine=vax-dec
-               os=-vms
+       openrisc-*)
+               cpu=or32
                ;;
-       vpp*|vx|vx-*)
-               basic_machine=f301-fujitsu
+       parisc-*)
+               cpu=hppa
+               basic_os=linux
                ;;
-       vxworks960)
-               basic_machine=i960-wrs
-               os=-vxworks
+       pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+               cpu=i586
                ;;
-       vxworks68)
-               basic_machine=m68k-wrs
-               os=-vxworks
+       pentiumpro-* | p6-* | 6x86-* | athlon-* | athalon_*-*)
+               cpu=i686
                ;;
-       vxworks29k)
-               basic_machine=a29k-wrs
-               os=-vxworks
+       pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+               cpu=i686
                ;;
-       w65*)
-               basic_machine=w65-wdc
-               os=-none
+       pentium4-*)
+               cpu=i786
                ;;
-       w89k-*)
-               basic_machine=hppa1.1-winbond
-               os=-proelf
+       pc98-*)
+               cpu=i386
                ;;
-       xbox)
-               basic_machine=i686-pc
-               os=-mingw32
+       ppc-* | ppcbe-*)
+               cpu=powerpc
                ;;
-       xps | xps100)
-               basic_machine=xps100-honeywell
+       ppcle-* | powerpclittle-*)
+               cpu=powerpcle
                ;;
-       xscale-* | xscalee[bl]-*)
-               basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'`
+       ppc64-*)
+               cpu=powerpc64
                ;;
-       ymp)
-               basic_machine=ymp-cray
-               os=-unicos
+       ppc64le-* | powerpc64little-*)
+               cpu=powerpc64le
                ;;
-       z8k-*-coff)
-               basic_machine=z8k-unknown
-               os=-sim
+       sb1-*)
+               cpu=mipsisa64sb1
                ;;
-       z80-*-coff)
-               basic_machine=z80-unknown
-               os=-sim
+       sb1el-*)
+               cpu=mipsisa64sb1el
                ;;
-       none)
-               basic_machine=none-none
-               os=-none
+       sh5e[lb]-*)
+               cpu=`echo "$cpu" | sed 's/^\(sh.\)e\(.\)$/\1\2e/'`
                ;;
-
-# Here we handle the default manufacturer of certain CPU types.  It is in
-# some cases the only manufacturer, in others, it is the most popular.
-       w89k)
-               basic_machine=hppa1.1-winbond
+       spur-*)
+               cpu=spur
                ;;
-       op50n)
-               basic_machine=hppa1.1-oki
+       strongarm-* | thumb-*)
+               cpu=arm
                ;;
-       op60c)
-               basic_machine=hppa1.1-oki
+       tx39-*)
+               cpu=mipstx39
                ;;
-       romp)
-               basic_machine=romp-ibm
+       tx39el-*)
+               cpu=mipstx39el
                ;;
-       mmix)
-               basic_machine=mmix-knuth
+       x64-*)
+               cpu=x86_64
                ;;
-       rs6000)
-               basic_machine=rs6000-ibm
+       xscale-* | xscalee[bl]-*)
+               cpu=`echo "$cpu" | sed 's/^xscale/arm/'`
                ;;
-       vax)
-               basic_machine=vax-dec
+       arm64-* | aarch64le-*)
+               cpu=aarch64
                ;;
-       pdp10)
-               # there are many clones, so DEC is not a safe bet
-               basic_machine=pdp10-unknown
+
+       # Recognize the canonical CPU Types that limit and/or modify the
+       # company names they are paired with.
+       cr16-*)
+               basic_os=${basic_os:-elf}
                ;;
-       pdp11)
-               basic_machine=pdp11-dec
+       crisv32-* | etraxfs*-*)
+               cpu=crisv32
+               vendor=axis
                ;;
-       we32k)
-               basic_machine=we32k-att
+       cris-* | etrax*-*)
+               cpu=cris
+               vendor=axis
                ;;
-       sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
-               basic_machine=sh-unknown
+       crx-*)
+               basic_os=${basic_os:-elf}
                ;;
-       sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
-               basic_machine=sparc-sun
+       neo-tandem)
+               cpu=neo
+               vendor=tandem
                ;;
-       cydra)
-               basic_machine=cydra-cydrome
+       nse-tandem)
+               cpu=nse
+               vendor=tandem
                ;;
-       orion)
-               basic_machine=orion-highlevel
+       nsr-tandem)
+               cpu=nsr
+               vendor=tandem
                ;;
-       orion105)
-               basic_machine=clipper-highlevel
+       nsv-tandem)
+               cpu=nsv
+               vendor=tandem
                ;;
-       mac | mpw | mac-mpw)
-               basic_machine=m68k-apple
+       nsx-tandem)
+               cpu=nsx
+               vendor=tandem
                ;;
-       pmac | pmac-mpw)
-               basic_machine=powerpc-apple
+       mipsallegrexel-sony)
+               cpu=mipsallegrexel
+               vendor=sony
                ;;
-       *-unknown)
-               # Make sure to match an already-canonicalized machine name.
+       tile*-*)
+               basic_os=${basic_os:-linux-gnu}
                ;;
+
        *)
-               echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
-               exit 1
+               # Recognize the canonical CPU types that are allowed with any
+               # company name.
+               case $cpu in
+                       1750a | 580 \
+                       | a29k \
+                       | aarch64 | aarch64_be \
+                       | abacus \
+                       | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \
+                       | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \
+                       | alphapca5[67] | alpha64pca5[67] \
+                       | am33_2.0 \
+                       | amdgcn \
+                       | arc | arceb | arc32 | arc64 \
+                       | arm | arm[lb]e | arme[lb] | armv* \
+                       | avr | avr32 \
+                       | asmjs \
+                       | ba \
+                       | be32 | be64 \
+                       | bfin | bpf | bs2000 \
+                       | c[123]* | c30 | [cjt]90 | c4x \
+                       | c8051 | clipper | craynv | csky | cydra \
+                       | d10v | d30v | dlx | dsp16xx \
+                       | e2k | elxsi | epiphany \
+                       | f30[01] | f700 | fido | fr30 | frv | ft32 | fx80 \
+                       | h8300 | h8500 \
+                       | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+                       | hexagon \
+                       | i370 | i*86 | i860 | i960 | ia16 | ia64 \
+                       | ip2k | iq2000 \
+                       | k1om \
+                       | le32 | le64 \
+                       | lm32 \
+                       | loongarch32 | loongarch64 | loongarchx32 \
+                       | m32c | m32r | m32rle \
+                       | m5200 | m68000 | m680[012346]0 | m68360 | m683?2 | m68k \
+                       | m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x \
+                       | m88110 | m88k | maxq | mb | mcore | mep | metag \
+                       | microblaze | microblazeel \
+                       | mips | mipsbe | mipseb | mipsel | mipsle \
+                       | mips16 \
+                       | mips64 | mips64eb | mips64el \
+                       | mips64octeon | mips64octeonel \
+                       | mips64orion | mips64orionel \
+                       | mips64r5900 | mips64r5900el \
+                       | mips64vr | mips64vrel \
+                       | mips64vr4100 | mips64vr4100el \
+                       | mips64vr4300 | mips64vr4300el \
+                       | mips64vr5000 | mips64vr5000el \
+                       | mips64vr5900 | mips64vr5900el \
+                       | mipsisa32 | mipsisa32el \
+                       | mipsisa32r2 | mipsisa32r2el \
+                       | mipsisa32r3 | mipsisa32r3el \
+                       | mipsisa32r5 | mipsisa32r5el \
+                       | mipsisa32r6 | mipsisa32r6el \
+                       | mipsisa64 | mipsisa64el \
+                       | mipsisa64r2 | mipsisa64r2el \
+                       | mipsisa64r3 | mipsisa64r3el \
+                       | mipsisa64r5 | mipsisa64r5el \
+                       | mipsisa64r6 | mipsisa64r6el \
+                       | mipsisa64sb1 | mipsisa64sb1el \
+                       | mipsisa64sr71k | mipsisa64sr71kel \
+                       | mipsr5900 | mipsr5900el \
+                       | mipstx39 | mipstx39el \
+                       | mmix \
+                       | mn10200 | mn10300 \
+                       | moxie \
+                       | mt \
+                       | msp430 \
+                       | nds32 | nds32le | nds32be \
+                       | nfp \
+                       | nios | nios2 | nios2eb | nios2el \
+                       | none | np1 | ns16k | ns32k | nvptx \
+                       | open8 \
+                       | or1k* \
+                       | or32 \
+                       | orion \
+                       | picochip \
+                       | pdp10 | pdp11 | pj | pjl | pn | power \
+                       | powerpc | powerpc64 | powerpc64le | powerpcle | powerpcspe \
+                       | pru \
+                       | pyramid \
+                       | riscv | riscv32 | riscv32be | riscv64 | riscv64be \
+                       | rl78 | romp | rs6000 | rx \
+                       | s390 | s390x \
+                       | score \
+                       | sh | shl \
+                       | sh[1234] | sh[24]a | sh[24]ae[lb] | sh[23]e | she[lb] | sh[lb]e \
+                       | sh[1234]e[lb] |  sh[12345][lb]e | sh[23]ele | sh64 | sh64le \
+                       | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet \
+                       | sparclite \
+                       | sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \
+                       | spu \
+                       | tahoe \
+                       | thumbv7* \
+                       | tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \
+                       | tron \
+                       | ubicom32 \
+                       | v70 | v850 | v850e | v850e1 | v850es | v850e2 | v850e2v3 \
+                       | vax \
+                       | visium \
+                       | w65 \
+                       | wasm32 | wasm64 \
+                       | we32k \
+                       | x86 | x86_64 | xc16x | xgate | xps100 \
+                       | xstormy16 | xtensa* \
+                       | ymp \
+                       | z8k | z80)
+                               ;;
+
+                       *)
+                               echo Invalid configuration \`"$1"\': machine \`"$cpu-$vendor"\' not recognized 1>&2
+                               exit 1
+                               ;;
+               esac
                ;;
 esac
 
 # Here we canonicalize certain aliases for manufacturers.
-case $basic_machine in
-       *-digital*)
-               basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+case $vendor in
+       digital*)
+               vendor=dec
                ;;
-       *-commodore*)
-               basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+       commodore*)
+               vendor=cbm
                ;;
        *)
                ;;
@@ -1353,203 +1306,215 @@ esac
 
 # Decode manufacturer-specific aliases for certain operating systems.
 
-if [ x"$os" != x"" ]
+if test x$basic_os != x
 then
-case $os in
-       # First match some system type aliases
-       # that might get confused with valid system types.
-       # -solaris* is a basic system type, with this one exception.
-       -auroraux)
-               os=-auroraux
+
+# First recognize some ad-hoc cases, or perhaps split kernel-os, or else just
+# set os.
+case $basic_os in
+       gnu/linux*)
+               kernel=linux
+               os=`echo "$basic_os" | sed -e 's|gnu/linux|gnu|'`
+               ;;
+       os2-emx)
+               kernel=os2
+               os=`echo "$basic_os" | sed -e 's|os2-emx|emx|'`
+               ;;
+       nto-qnx*)
+               kernel=nto
+               os=`echo "$basic_os" | sed -e 's|nto-qnx|qnx|'`
+               ;;
+       *-*)
+               # shellcheck disable=SC2162
+               saved_IFS=$IFS
+               IFS="-" read kernel os <<EOF
+$basic_os
+EOF
+               IFS=$saved_IFS
+               ;;
+       # Default OS when just kernel was specified
+       nto*)
+               kernel=nto
+               os=`echo "$basic_os" | sed -e 's|nto|qnx|'`
+               ;;
+       linux*)
+               kernel=linux
+               os=`echo "$basic_os" | sed -e 's|linux|gnu|'`
                ;;
-       -solaris1 | -solaris1.*)
-               os=`echo $os | sed -e 's|solaris1|sunos4|'`
+       *)
+               kernel=
+               os=$basic_os
+               ;;
+esac
+
+# Now, normalize the OS (knowing we just have one component, it's not a kernel,
+# etc.)
+case $os in
+       # First match some system type aliases that might get confused
+       # with valid system types.
+       # solaris* is a basic system type, with this one exception.
+       auroraux)
+               os=auroraux
                ;;
-       -solaris)
-               os=-solaris2
+       bluegene*)
+               os=cnk
                ;;
-       -svr4*)
-               os=-sysv4
+       solaris1 | solaris1.*)
+               os=`echo "$os" | sed -e 's|solaris1|sunos4|'`
                ;;
-       -unixware*)
-               os=-sysv4.2uw
+       solaris)
+               os=solaris2
                ;;
-       -gnu/linux*)
-               os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+       unixware*)
+               os=sysv4.2uw
                ;;
-       # First accept the basic system types.
-       # The portable systems comes first.
-       # Each alternative MUST END IN A *, to match a version number.
-       # -sysv* is not here because it comes later, after sysvr4.
-       -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
-             | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
-             | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-             | -sym* | -kopensolaris* | -plan9* \
-             | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-             | -aos* | -aros* | -cloudabi* | -sortix* \
-             | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
-             | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
-             | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
-             | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \
-             | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
-             | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
-             | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
-             | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
-             | -chorusos* | -chorusrdb* | -cegcc* \
-             | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
-             | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
-             | -linux-newlib* | -linux-musl* | -linux-uclibc* \
-             | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
-             | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
-             | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
-             | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
-             | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
-             | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
-             | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-             | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
-             | -onefs* | -tirtos* | -phoenix*)
-       # Remember, each alternative MUST END IN *, to match a version number.
-               ;;
-       -qnx*)
-               case $basic_machine in
-                   x86-* | i*86-*)
-                       ;;
-                   *)
-                       os=-nto$os
-                       ;;
-               esac
+       # es1800 is here to avoid being matched by es* (a different OS)
+       es1800*)
+               os=ose
                ;;
-       -nto-qnx*)
+       # Some version numbers need modification
+       chorusos*)
+               os=chorusos
                ;;
-       -nto*)
-               os=`echo $os | sed -e 's|nto|nto-qnx|'`
+       isc)
+               os=isc2.2
                ;;
-       -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
-             | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
-             | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+       sco6)
+               os=sco5v6
                ;;
-       -mac*)
-               os=`echo $os | sed -e 's|mac|macos|'`
+       sco5)
+               os=sco3.2v5
                ;;
-       -linux-dietlibc)
-               os=-linux-dietlibc
+       sco4)
+               os=sco3.2v4
                ;;
-       -linux*)
-               os=`echo $os | sed -e 's|linux|linux-gnu|'`
+       sco3.2.[4-9]*)
+               os=`echo "$os" | sed -e 's/sco3.2./sco3.2v/'`
                ;;
-       -sunos5*)
-               os=`echo $os | sed -e 's|sunos5|solaris2|'`
+       sco*v* | scout)
+               # Don't match below
                ;;
-       -sunos6*)
-               os=`echo $os | sed -e 's|sunos6|solaris3|'`
+       sco*)
+               os=sco3.2v2
                ;;
-       -opened*)
-               os=-openedition
+       psos*)
+               os=psos
                ;;
-       -os400*)
-               os=-os400
+       qnx*)
+               os=qnx
                ;;
-       -wince*)
-               os=-wince
+       hiux*)
+               os=hiuxwe2
                ;;
-       -osfrose*)
-               os=-osfrose
+       lynx*178)
+               os=lynxos178
                ;;
-       -osf*)
-               os=-osf
+       lynx*5)
+               os=lynxos5
                ;;
-       -utek*)
-               os=-bsd
+       lynxos*)
+               # don't get caught up in next wildcard
                ;;
-       -dynix*)
-               os=-bsd
+       lynx*)
+               os=lynxos
                ;;
-       -acis*)
-               os=-aos
+       mac[0-9]*)
+               os=`echo "$os" | sed -e 's|mac|macos|'`
                ;;
-       -atheos*)
-               os=-atheos
+       opened*)
+               os=openedition
                ;;
-       -syllable*)
-               os=-syllable
+       os400*)
+               os=os400
                ;;
-       -386bsd)
-               os=-bsd
+       sunos5*)
+               os=`echo "$os" | sed -e 's|sunos5|solaris2|'`
                ;;
-       -ctix* | -uts*)
-               os=-sysv
+       sunos6*)
+               os=`echo "$os" | sed -e 's|sunos6|solaris3|'`
                ;;
-       -nova*)
-               os=-rtmk-nova
+       wince*)
+               os=wince
                ;;
-       -ns2 )
-               os=-nextstep2
+       utek*)
+               os=bsd
                ;;
-       -nsk*)
-               os=-nsk
+       dynix*)
+               os=bsd
                ;;
-       # Preserve the version number of sinix5.
-       -sinix5.*)
-               os=`echo $os | sed -e 's|sinix|sysv|'`
+       acis*)
+               os=aos
                ;;
-       -sinix*)
-               os=-sysv4
+       atheos*)
+               os=atheos
                ;;
-       -tpf*)
-               os=-tpf
+       syllable*)
+               os=syllable
                ;;
-       -triton*)
-               os=-sysv3
+       386bsd)
+               os=bsd
                ;;
-       -oss*)
-               os=-sysv3
+       ctix* | uts*)
+               os=sysv
                ;;
-       -svr4)
-               os=-sysv4
+       nova*)
+               os=rtmk-nova
                ;;
-       -svr3)
-               os=-sysv3
+       ns2)
+               os=nextstep2
                ;;
-       -sysvr4)
-               os=-sysv4
+       # Preserve the version number of sinix5.
+       sinix5.*)
+               os=`echo "$os" | sed -e 's|sinix|sysv|'`
                ;;
-       # This must come after -sysvr4.
-       -sysv*)
+       sinix*)
+               os=sysv4
                ;;
-       -ose*)
-               os=-ose
+       tpf*)
+               os=tpf
                ;;
-       -es1800*)
-               os=-ose
+       triton*)
+               os=sysv3
                ;;
-       -xenix)
-               os=-xenix
+       oss*)
+               os=sysv3
                ;;
-       -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
-               os=-mint
+       svr4*)
+               os=sysv4
                ;;
-       -aros*)
-               os=-aros
+       svr3)
+               os=sysv3
                ;;
-       -zvmoe)
-               os=-zvmoe
+       sysvr4)
+               os=sysv4
                ;;
-       -dicos*)
-               os=-dicos
+       ose*)
+               os=ose
                ;;
-       -nacl*)
+       *mint | mint[0-9]* | *MiNT | MiNT[0-9]*)
+               os=mint
                ;;
-       -ios)
+       dicos*)
+               os=dicos
                ;;
-       -none)
+       pikeos*)
+               # Until real need of OS specific support for
+               # particular features comes up, bare metal
+               # configurations are quite functional.
+               case $cpu in
+                   arm*)
+                       os=eabi
+                       ;;
+                   *)
+                       os=elf
+                       ;;
+               esac
                ;;
        *)
-               # Get rid of the `-' at the beginning of $os.
-               os=`echo $os | sed 's/[^-]*-//'`
-               echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
-               exit 1
+               # No normalization, but not necessarily accepted, that comes below.
                ;;
 esac
+
 else
 
 # Here we handle the default operating systems that come with various machines.
@@ -1562,261 +1527,363 @@ else
 # will signal an error saying that MANUFACTURER isn't an operating
 # system, and we'll never get to this point.
 
-case $basic_machine in
+kernel=
+case $cpu-$vendor in
        score-*)
-               os=-elf
+               os=elf
                ;;
        spu-*)
-               os=-elf
+               os=elf
                ;;
        *-acorn)
-               os=-riscix1.2
+               os=riscix1.2
                ;;
        arm*-rebel)
-               os=-linux
+               kernel=linux
+               os=gnu
                ;;
        arm*-semi)
-               os=-aout
+               os=aout
                ;;
        c4x-* | tic4x-*)
-               os=-coff
+               os=coff
                ;;
        c8051-*)
-               os=-elf
+               os=elf
+               ;;
+       clipper-intergraph)
+               os=clix
                ;;
        hexagon-*)
-               os=-elf
+               os=elf
                ;;
        tic54x-*)
-               os=-coff
+               os=coff
                ;;
        tic55x-*)
-               os=-coff
+               os=coff
                ;;
        tic6x-*)
-               os=-coff
+               os=coff
                ;;
        # This must come before the *-dec entry.
        pdp10-*)
-               os=-tops20
+               os=tops20
                ;;
        pdp11-*)
-               os=-none
+               os=none
                ;;
        *-dec | vax-*)
-               os=-ultrix4.2
+               os=ultrix4.2
                ;;
        m68*-apollo)
-               os=-domain
+               os=domain
                ;;
        i386-sun)
-               os=-sunos4.0.2
+               os=sunos4.0.2
                ;;
        m68000-sun)
-               os=-sunos3
+               os=sunos3
                ;;
        m68*-cisco)
-               os=-aout
+               os=aout
                ;;
        mep-*)
-               os=-elf
+               os=elf
                ;;
        mips*-cisco)
-               os=-elf
+               os=elf
                ;;
        mips*-*)
-               os=-elf
+               os=elf
                ;;
        or32-*)
-               os=-coff
+               os=coff
                ;;
        *-tti)  # must be before sparc entry or we get the wrong os.
-               os=-sysv3
+               os=sysv3
                ;;
        sparc-* | *-sun)
-               os=-sunos4.1.1
+               os=sunos4.1.1
                ;;
-       *-be)
-               os=-beos
+       pru-*)
+               os=elf
                ;;
-       *-haiku)
-               os=-haiku
+       *-be)
+               os=beos
                ;;
        *-ibm)
-               os=-aix
+               os=aix
                ;;
        *-knuth)
-               os=-mmixware
+               os=mmixware
                ;;
        *-wec)
-               os=-proelf
+               os=proelf
                ;;
        *-winbond)
-               os=-proelf
+               os=proelf
                ;;
        *-oki)
-               os=-proelf
+               os=proelf
                ;;
        *-hp)
-               os=-hpux
+               os=hpux
                ;;
        *-hitachi)
-               os=-hiux
+               os=hiux
                ;;
        i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
-               os=-sysv
+               os=sysv
                ;;
        *-cbm)
-               os=-amigaos
+               os=amigaos
                ;;
        *-dg)
-               os=-dgux
+               os=dgux
                ;;
        *-dolphin)
-               os=-sysv3
+               os=sysv3
                ;;
        m68k-ccur)
-               os=-rtu
+               os=rtu
                ;;
        m88k-omron*)
-               os=-luna
+               os=luna
                ;;
-       *-next )
-               os=-nextstep
+       *-next)
+               os=nextstep
                ;;
        *-sequent)
-               os=-ptx
+               os=ptx
                ;;
        *-crds)
-               os=-unos
+               os=unos
                ;;
        *-ns)
-               os=-genix
+               os=genix
                ;;
        i370-*)
-               os=-mvs
-               ;;
-       *-next)
-               os=-nextstep3
+               os=mvs
                ;;
        *-gould)
-               os=-sysv
+               os=sysv
                ;;
        *-highlevel)
-               os=-bsd
+               os=bsd
                ;;
        *-encore)
-               os=-bsd
+               os=bsd
                ;;
        *-sgi)
-               os=-irix
+               os=irix
                ;;
        *-siemens)
-               os=-sysv4
+               os=sysv4
                ;;
        *-masscomp)
-               os=-rtu
+               os=rtu
                ;;
        f30[01]-fujitsu | f700-fujitsu)
-               os=-uxpv
+               os=uxpv
                ;;
        *-rom68k)
-               os=-coff
+               os=coff
                ;;
        *-*bug)
-               os=-coff
+               os=coff
                ;;
        *-apple)
-               os=-macos
+               os=macos
                ;;
        *-atari*)
-               os=-mint
+               os=mint
+               ;;
+       *-wrs)
+               os=vxworks
                ;;
        *)
-               os=-none
+               os=none
                ;;
 esac
+
 fi
 
+# Now, validate our (potentially fixed-up) OS.
+case $os in
+       # Sometimes we do "kernel-libc", so those need to count as OSes.
+       musl* | newlib* | relibc* | uclibc*)
+               ;;
+       # Likewise for "kernel-abi"
+       eabi* | gnueabi*)
+               ;;
+       # VxWorks passes extra cpu info in the 4th filed.
+       simlinux | simwindows | spe)
+               ;;
+       # Now accept the basic system types.
+       # The portable systems comes first.
+       # Each alternative MUST end in a * to match a version number.
+       gnu* | android* | bsd* | mach* | minix* | genix* | ultrix* | irix* \
+            | *vms* | esix* | aix* | cnk* | sunos | sunos[34]* \
+            | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \
+            | sym* |  plan9* | psp* | sim* | xray* | os68k* | v88r* \
+            | hiux* | abug | nacl* | netware* | windows* \
+            | os9* | macos* | osx* | ios* \
+            | mpw* | magic* | mmixware* | mon960* | lnews* \
+            | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \
+            | aos* | aros* | cloudabi* | sortix* | twizzler* \
+            | nindy* | vxsim* | vxworks* | ebmon* | hms* | mvs* \
+            | clix* | riscos* | uniplus* | iris* | isc* | rtu* | xenix* \
+            | mirbsd* | netbsd* | dicos* | openedition* | ose* \
+            | bitrig* | openbsd* | secbsd* | solidbsd* | libertybsd* | os108* \
+            | ekkobsd* | freebsd* | riscix* | lynxos* | os400* \
+            | bosx* | nextstep* | cxux* | aout* | elf* | oabi* \
+            | ptx* | coff* | ecoff* | winnt* | domain* | vsta* \
+            | udi* | lites* | ieee* | go32* | aux* | hcos* \
+            | chorusrdb* | cegcc* | glidix* | serenity* \
+            | cygwin* | msys* | pe* | moss* | proelf* | rtems* \
+            | midipix* | mingw32* | mingw64* | mint* \
+            | uxpv* | beos* | mpeix* | udk* | moxiebox* \
+            | interix* | uwin* | mks* | rhapsody* | darwin* \
+            | openstep* | oskit* | conix* | pw32* | nonstopux* \
+            | storm-chaos* | tops10* | tenex* | tops20* | its* \
+            | os2* | vos* | palmos* | uclinux* | nucleus* | morphos* \
+            | scout* | superux* | sysv* | rtmk* | tpf* | windiss* \
+            | powermax* | dnix* | nx6 | nx7 | sei* | dragonfly* \
+            | skyos* | haiku* | rdos* | toppers* | drops* | es* \
+            | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \
+            | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \
+            | nsk* | powerunix* | genode* | zvmoe* | qnx* | emx* | zephyr* \
+            | fiwix* )
+               ;;
+       # This one is extra strict with allowed versions
+       sco3.2v2 | sco3.2v[4-9]* | sco5v6*)
+               # Don't forget version if it is 3.2v4 or newer.
+               ;;
+       none)
+               ;;
+       *)
+               echo Invalid configuration \`"$1"\': OS \`"$os"\' not recognized 1>&2
+               exit 1
+               ;;
+esac
+
+# As a final step for OS-related things, validate the OS-kernel combination
+# (given a valid OS), if there is a kernel.
+case $kernel-$os in
+       linux-gnu* | linux-dietlibc* | linux-android* | linux-newlib* \
+                  | linux-musl* | linux-relibc* | linux-uclibc* )
+               ;;
+       uclinux-uclibc* )
+               ;;
+       -dietlibc* | -newlib* | -musl* | -relibc* | -uclibc* )
+               # These are just libc implementations, not actual OSes, and thus
+               # require a kernel.
+               echo "Invalid configuration \`$1': libc \`$os' needs explicit kernel." 1>&2
+               exit 1
+               ;;
+       kfreebsd*-gnu* | kopensolaris*-gnu*)
+               ;;
+       vxworks-simlinux | vxworks-simwindows | vxworks-spe)
+               ;;
+       nto-qnx*)
+               ;;
+       os2-emx)
+               ;;
+       *-eabi* | *-gnueabi*)
+               ;;
+       -*)
+               # Blank kernel with real OS is always fine.
+               ;;
+       *-*)
+               echo "Invalid configuration \`$1': Kernel \`$kernel' not known to work with OS \`$os'." 1>&2
+               exit 1
+               ;;
+esac
+
 # Here we handle the case where we know the os, and the CPU type, but not the
 # manufacturer.  We pick the logical manufacturer.
-vendor=unknown
-case $basic_machine in
-       *-unknown)
-               case $os in
-                       -riscix*)
+case $vendor in
+       unknown)
+               case $cpu-$os in
+                       *-riscix*)
                                vendor=acorn
                                ;;
-                       -sunos*)
+                       *-sunos*)
                                vendor=sun
                                ;;
-                       -cnk*|-aix*)
+                       *-cnk* | *-aix*)
                                vendor=ibm
                                ;;
-                       -beos*)
+                       *-beos*)
                                vendor=be
                                ;;
-                       -hpux*)
+                       *-hpux*)
                                vendor=hp
                                ;;
-                       -mpeix*)
+                       *-mpeix*)
                                vendor=hp
                                ;;
-                       -hiux*)
+                       *-hiux*)
                                vendor=hitachi
                                ;;
-                       -unos*)
+                       *-unos*)
                                vendor=crds
                                ;;
-                       -dgux*)
+                       *-dgux*)
                                vendor=dg
                                ;;
-                       -luna*)
+                       *-luna*)
                                vendor=omron
                                ;;
-                       -genix*)
+                       *-genix*)
                                vendor=ns
                                ;;
-                       -mvs* | -opened*)
+                       *-clix*)
+                               vendor=intergraph
+                               ;;
+                       *-mvs* | *-opened*)
+                               vendor=ibm
+                               ;;
+                       *-os400*)
                                vendor=ibm
                                ;;
-                       -os400*)
+                       s390-* | s390x-*)
                                vendor=ibm
                                ;;
-                       -ptx*)
+                       *-ptx*)
                                vendor=sequent
                                ;;
-                       -tpf*)
+                       *-tpf*)
                                vendor=ibm
                                ;;
-                       -vxsim* | -vxworks* | -windiss*)
+                       *-vxsim* | *-vxworks* | *-windiss*)
                                vendor=wrs
                                ;;
-                       -aux*)
+                       *-aux*)
                                vendor=apple
                                ;;
-                       -hms*)
+                       *-hms*)
                                vendor=hitachi
                                ;;
-                       -mpw* | -macos*)
+                       *-mpw* | *-macos*)
                                vendor=apple
                                ;;
-                       -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+                       *-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*)
                                vendor=atari
                                ;;
-                       -vos*)
+                       *-vos*)
                                vendor=stratus
                                ;;
                esac
-               basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
                ;;
 esac
 
-echo $basic_machine$os
+echo "$cpu-$vendor-${kernel:+$kernel-}$os"
 exit
 
 # Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
+# eval: (add-hook 'before-save-hook 'time-stamp)
 # time-stamp-start: "timestamp='"
 # time-stamp-format: "%:y-%02m-%02d"
 # time-stamp-end: "'"
index bafa8fbb75c19c6916883daacd336f601f26d83c..859599aa3d6022204ef6a53aa0ffb9cdae50dae9 100644 (file)
@@ -5686,8 +5686,8 @@ func_mode_link ()
        *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-darwin* | *-cegcc*)
          # The PATH hackery in wrapper scripts is required on Windows
          # and Darwin in order for the loader to find any dlls it needs.
-         func_warning "\`-no-install' is ignored for $host"
-         func_warning "assuming \`-no-fast-install' instead"
+         func_warning "\`-no-install' is ignored for $host"
+         func_warning "assuming \`-no-fast-install' instead"
          fast_install=no
          ;;
        *) no_install=yes ;;
index c3d642b2ac7d49985599e21916c7dc64fe9c44e4..ea9014cc98c76890e856f0d518e5e7ad10fbe39b 100644 (file)
@@ -26,9 +26,9 @@ AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 
 AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
 
-EXTRA_DIST = gost-s-box.c
+EXTRA_DIST = gost-s-box.c kyber-common.c kyber-kdep.c
 
-CLEANFILES = gost-s-box
+CLEANFILES = gost-s-box$(EXEEXT_FOR_BUILD)
 DISTCLEANFILES = gost-sb.h
 
 noinst_LTLIBRARIES = libcipher.la
@@ -55,16 +55,18 @@ libcipher_la_SOURCES = \
        cipher-eax.c \
        cipher-siv.c \
        cipher-gcm-siv.c \
-       cipher-selftest.c cipher-selftest.h \
        pubkey.c pubkey-internal.h pubkey-util.c \
        md.c \
        mac.c mac-internal.h \
        mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
        poly1305.c poly1305-internal.h \
+       kem.c sntrup761.c sntrup761.h kyber.c kyber.h kem-ecc.c kem-ecc.h \
+       mceliece6688128f.c mceliece6688128f.h \
        kdf.c kdf-internal.h \
        bithelp.h  \
        bufhelp.h  \
-       primegen.c  \
+       bulkhelp.h \
+       primegen.c \
        hash-common.c hash-common.h \
        dsa-common.c rsa-common.c \
        sha1.h
@@ -72,17 +74,21 @@ libcipher_la_SOURCES = \
 EXTRA_libcipher_la_SOURCES = \
        asm-common-aarch64.h \
        asm-common-amd64.h \
+       asm-common-i386.h \
        asm-common-s390x.h \
        asm-inline-s390x.h \
        asm-poly1305-aarch64.h \
        asm-poly1305-amd64.h \
        asm-poly1305-s390x.h \
+       aria.c aria-aesni-avx-amd64.S aria-aesni-avx2-amd64.S \
+       aria-gfni-avx512-amd64.S \
        arcfour.c arcfour-amd64.S \
        blowfish.c blowfish-amd64.S blowfish-arm.S \
        cast5.c cast5-amd64.S cast5-arm.S \
        chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
-       chacha20-armv7-neon.S chacha20-aarch64.S \
+       chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
        chacha20-ppc.c chacha20-s390x.S \
+       chacha20-p10le-8x.s \
        cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
        cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
        crc.c crc-intel-pclmul.c crc-armv8-ce.c \
@@ -98,16 +104,18 @@ EXTRA_libcipher_la_SOURCES = \
        gostr3411-94.c \
        md4.c \
        md5.c \
-       poly1305-s390x.S \
+       poly1305-s390x.S poly1305-amd64-avx512.S \
+       poly1305-p10le.s \
        rijndael.c rijndael-internal.h rijndael-tables.h   \
        rijndael-aesni.c rijndael-padlock.c                \
        rijndael-amd64.S rijndael-arm.S                    \
        rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S  \
        rijndael-vaes.c rijndael-vaes-avx2-amd64.S         \
+       rijndael-vaes-i386.c rijndael-vaes-avx2-i386.S     \
        rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
        rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
        rijndael-ppc.c rijndael-ppc9le.c                   \
-       rijndael-p10le.c rijndael-gcm-p10le.s             \
+       rijndael-p10le.c rijndael-gcm-p10le.s              \
        rijndael-ppc-common.h rijndael-ppc-functions.h     \
        rijndael-s390x.c                                   \
        rmd160.c \
@@ -115,9 +123,12 @@ EXTRA_libcipher_la_SOURCES = \
        salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
        scrypt.c \
        seed.c \
-       serpent.c serpent-sse2-amd64.S \
+       serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
+       serpent-avx512-x86.c serpent-armv7-neon.S \
        sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
-       serpent-avx2-amd64.S serpent-armv7-neon.S \
+       sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
+       sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
+       sm4-ppc.c \
        sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
        sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
        sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
@@ -126,11 +137,12 @@ EXTRA_libcipher_la_SOURCES = \
        sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
        sha256-intel-shaext.c sha256-ppc.c \
        sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
-       sha512-avx2-bmi2-amd64.S \
-       sha512-armv7-neon.S sha512-arm.S \
+       sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
+       sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
        sha512-ppc.c sha512-ssse3-i386.c \
-       sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S \
-       keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+       sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
+       keccak.c keccak_permute_32.h keccak_permute_64.h \
+       keccak-armv7-neon.S keccak-amd64-avx512.S \
        stribog.c \
        tiger.c \
        whirlpool.c whirlpool-sse2-amd64.S \
@@ -138,22 +150,26 @@ EXTRA_libcipher_la_SOURCES = \
        twofish-avx2-amd64.S \
        rfc2268.c \
        camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-       camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \
-       camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+       camellia-aesni-avx2-amd64.h \
+       camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
+       camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
+       camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \
+       camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \
        blake2.c \
-       blake2b-amd64-avx2.S blake2s-amd64-avx.S
+       blake2b-amd64-avx2.S blake2b-amd64-avx512.S \
+       blake2s-amd64-avx.S blake2s-amd64-avx512.S
 
 gost28147.lo: gost-sb.h
-gost-sb.h: gost-s-box
-       ./gost-s-box $@
+gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD)
+       ./gost-s-box$(EXEEXT_FOR_BUILD) $@
 
-gost-s-box: gost-s-box.c
+gost-s-box$(EXEEXT_FOR_BUILD): gost-s-box.c
        $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
            $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
 
 
 if ENABLE_O_FLAG_MUNGING
-o_flag_munging = sed -e 's/-O\([2-9sgz][2-9sgz]*\)/-O1/' -e 's/-Ofast/-O1/g'
+o_flag_munging = sed -e 's/[[:blank:]]-O\([2-9sgz][2-9sgz]*\)/ -O1 /' -e 's/[[:blank:]]-Ofast/ -O1 /g'
 else
 o_flag_munging = cat
 endif
@@ -222,11 +238,17 @@ crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
        `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
 
 if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
-ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto
+ppc_vcrypto_cflags = -O2 -maltivec -mvsx -mcrypto
 else
 ppc_vcrypto_cflags =
 endif
 
+if ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS
+aarch64_neon_cflags = -O2 -march=armv8-a+crypto
+else
+aarch64_neon_cflags =
+endif
+
 rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
        `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
 
@@ -274,3 +296,40 @@ cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile
 
 cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile
        `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc8le.o: $(srcdir)/camellia-ppc8le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc8le.lo: $(srcdir)/camellia-ppc8le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
+       `echo $(COMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
+       `echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.o: $(srcdir)/sm4-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.lo: $(srcdir)/sm4-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+
+if ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS
+avx512f_cflags = -mavx512f
+else
+avx512f_cflags =
+endif
+
+serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile
+       `echo $(COMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
+
+serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile
+       `echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
index 5872c8874f50af195dafed54c8792b08d9925e32..22358f483272542ee8a80cf8aa7cc28805e452b0 100644 (file)
@@ -115,8 +115,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
@@ -130,10 +130,10 @@ am_libcipher_la_OBJECTS = cipher.lo cipher-cbc.lo cipher-cfb.lo \
        cipher-ofb.lo cipher-ctr.lo cipher-aeswrap.lo cipher-ccm.lo \
        cipher-cmac.lo cipher-gcm.lo cipher-poly1305.lo cipher-ocb.lo \
        cipher-xts.lo cipher-eax.lo cipher-siv.lo cipher-gcm-siv.lo \
-       cipher-selftest.lo pubkey.lo pubkey-util.lo md.lo mac.lo \
-       mac-hmac.lo mac-cmac.lo mac-gmac.lo mac-poly1305.lo \
-       poly1305.lo kdf.lo primegen.lo hash-common.lo dsa-common.lo \
-       rsa-common.lo
+       pubkey.lo pubkey-util.lo md.lo mac.lo mac-hmac.lo mac-cmac.lo \
+       mac-gmac.lo mac-poly1305.lo poly1305.lo kem.lo sntrup761.lo \
+       kyber.lo kem-ecc.lo mceliece6688128f.lo kdf.lo primegen.lo \
+       hash-common.lo dsa-common.lo rsa-common.lo
 libcipher_la_OBJECTS = $(am_libcipher_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -155,19 +155,29 @@ DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/build-aux/depcomp
 am__maybe_remake_depfiles = depfiles
 am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
-       ./$(DEPDIR)/arcfour.Plo ./$(DEPDIR)/blake2.Plo \
-       ./$(DEPDIR)/blake2b-amd64-avx2.Plo \
+       ./$(DEPDIR)/arcfour.Plo ./$(DEPDIR)/aria-aesni-avx-amd64.Plo \
+       ./$(DEPDIR)/aria-aesni-avx2-amd64.Plo \
+       ./$(DEPDIR)/aria-gfni-avx512-amd64.Plo ./$(DEPDIR)/aria.Plo \
+       ./$(DEPDIR)/blake2.Plo ./$(DEPDIR)/blake2b-amd64-avx2.Plo \
+       ./$(DEPDIR)/blake2b-amd64-avx512.Plo \
        ./$(DEPDIR)/blake2s-amd64-avx.Plo \
+       ./$(DEPDIR)/blake2s-amd64-avx512.Plo \
        ./$(DEPDIR)/blowfish-amd64.Plo ./$(DEPDIR)/blowfish-arm.Plo \
-       ./$(DEPDIR)/blowfish.Plo ./$(DEPDIR)/camellia-aarch64.Plo \
+       ./$(DEPDIR)/blowfish.Plo ./$(DEPDIR)/camellia-aarch64-ce.Plo \
+       ./$(DEPDIR)/camellia-aarch64.Plo \
        ./$(DEPDIR)/camellia-aesni-avx-amd64.Plo \
        ./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo \
-       ./$(DEPDIR)/camellia-arm.Plo ./$(DEPDIR)/camellia-glue.Plo \
+       ./$(DEPDIR)/camellia-arm.Plo \
+       ./$(DEPDIR)/camellia-gfni-avx2-amd64.Plo \
+       ./$(DEPDIR)/camellia-gfni-avx512-amd64.Plo \
+       ./$(DEPDIR)/camellia-glue.Plo ./$(DEPDIR)/camellia-ppc8le.Plo \
+       ./$(DEPDIR)/camellia-ppc9le.Plo \
        ./$(DEPDIR)/camellia-vaes-avx2-amd64.Plo \
        ./$(DEPDIR)/camellia.Plo ./$(DEPDIR)/cast5-amd64.Plo \
        ./$(DEPDIR)/cast5-arm.Plo ./$(DEPDIR)/cast5.Plo \
        ./$(DEPDIR)/chacha20-aarch64.Plo \
        ./$(DEPDIR)/chacha20-amd64-avx2.Plo \
+       ./$(DEPDIR)/chacha20-amd64-avx512.Plo \
        ./$(DEPDIR)/chacha20-amd64-ssse3.Plo \
        ./$(DEPDIR)/chacha20-armv7-neon.Plo \
        ./$(DEPDIR)/chacha20-ppc.Plo ./$(DEPDIR)/chacha20-s390x.Plo \
@@ -182,9 +192,8 @@ am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
        ./$(DEPDIR)/cipher-gcm-ppc.Plo ./$(DEPDIR)/cipher-gcm-siv.Plo \
        ./$(DEPDIR)/cipher-gcm.Plo ./$(DEPDIR)/cipher-ocb.Plo \
        ./$(DEPDIR)/cipher-ofb.Plo ./$(DEPDIR)/cipher-poly1305.Plo \
-       ./$(DEPDIR)/cipher-selftest.Plo ./$(DEPDIR)/cipher-siv.Plo \
-       ./$(DEPDIR)/cipher-xts.Plo ./$(DEPDIR)/cipher.Plo \
-       ./$(DEPDIR)/crc-armv8-aarch64-ce.Plo \
+       ./$(DEPDIR)/cipher-siv.Plo ./$(DEPDIR)/cipher-xts.Plo \
+       ./$(DEPDIR)/cipher.Plo ./$(DEPDIR)/crc-armv8-aarch64-ce.Plo \
        ./$(DEPDIR)/crc-armv8-ce.Plo ./$(DEPDIR)/crc-intel-pclmul.Plo \
        ./$(DEPDIR)/crc-ppc.Plo ./$(DEPDIR)/crc.Plo \
        ./$(DEPDIR)/des-amd64.Plo ./$(DEPDIR)/des.Plo \
@@ -196,14 +205,19 @@ am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
        ./$(DEPDIR)/elgamal.Plo ./$(DEPDIR)/gost28147.Plo \
        ./$(DEPDIR)/gostr3411-94.Plo ./$(DEPDIR)/hash-common.Plo \
        ./$(DEPDIR)/idea.Plo ./$(DEPDIR)/kdf.Plo \
+       ./$(DEPDIR)/keccak-amd64-avx512.Plo \
        ./$(DEPDIR)/keccak-armv7-neon.Plo ./$(DEPDIR)/keccak.Plo \
-       ./$(DEPDIR)/mac-cmac.Plo ./$(DEPDIR)/mac-gmac.Plo \
-       ./$(DEPDIR)/mac-hmac.Plo ./$(DEPDIR)/mac-poly1305.Plo \
-       ./$(DEPDIR)/mac.Plo ./$(DEPDIR)/md.Plo ./$(DEPDIR)/md4.Plo \
-       ./$(DEPDIR)/md5.Plo ./$(DEPDIR)/poly1305-s390x.Plo \
-       ./$(DEPDIR)/poly1305.Plo ./$(DEPDIR)/primegen.Plo \
-       ./$(DEPDIR)/pubkey-util.Plo ./$(DEPDIR)/pubkey.Plo \
-       ./$(DEPDIR)/rfc2268.Plo ./$(DEPDIR)/rijndael-aarch64.Plo \
+       ./$(DEPDIR)/kem-ecc.Plo ./$(DEPDIR)/kem.Plo \
+       ./$(DEPDIR)/kyber.Plo ./$(DEPDIR)/mac-cmac.Plo \
+       ./$(DEPDIR)/mac-gmac.Plo ./$(DEPDIR)/mac-hmac.Plo \
+       ./$(DEPDIR)/mac-poly1305.Plo ./$(DEPDIR)/mac.Plo \
+       ./$(DEPDIR)/mceliece6688128f.Plo ./$(DEPDIR)/md.Plo \
+       ./$(DEPDIR)/md4.Plo ./$(DEPDIR)/md5.Plo \
+       ./$(DEPDIR)/poly1305-amd64-avx512.Plo \
+       ./$(DEPDIR)/poly1305-s390x.Plo ./$(DEPDIR)/poly1305.Plo \
+       ./$(DEPDIR)/primegen.Plo ./$(DEPDIR)/pubkey-util.Plo \
+       ./$(DEPDIR)/pubkey.Plo ./$(DEPDIR)/rfc2268.Plo \
+       ./$(DEPDIR)/rijndael-aarch64.Plo \
        ./$(DEPDIR)/rijndael-aesni.Plo ./$(DEPDIR)/rijndael-amd64.Plo \
        ./$(DEPDIR)/rijndael-arm.Plo \
        ./$(DEPDIR)/rijndael-armv8-aarch32-ce.Plo \
@@ -215,6 +229,8 @@ am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
        ./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo \
        ./$(DEPDIR)/rijndael-ssse3-amd64.Plo \
        ./$(DEPDIR)/rijndael-vaes-avx2-amd64.Plo \
+       ./$(DEPDIR)/rijndael-vaes-avx2-i386.Plo \
+       ./$(DEPDIR)/rijndael-vaes-i386.Plo \
        ./$(DEPDIR)/rijndael-vaes.Plo ./$(DEPDIR)/rijndael.Plo \
        ./$(DEPDIR)/rmd160.Plo ./$(DEPDIR)/rsa-common.Plo \
        ./$(DEPDIR)/rsa.Plo ./$(DEPDIR)/salsa20-amd64.Plo \
@@ -222,6 +238,7 @@ am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
        ./$(DEPDIR)/scrypt.Plo ./$(DEPDIR)/seed.Plo \
        ./$(DEPDIR)/serpent-armv7-neon.Plo \
        ./$(DEPDIR)/serpent-avx2-amd64.Plo \
+       ./$(DEPDIR)/serpent-avx512-x86.Plo \
        ./$(DEPDIR)/serpent-sse2-amd64.Plo ./$(DEPDIR)/serpent.Plo \
        ./$(DEPDIR)/sha1-armv7-neon.Plo \
        ./$(DEPDIR)/sha1-armv8-aarch32-ce.Plo \
@@ -238,13 +255,23 @@ am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
        ./$(DEPDIR)/sha256-intel-shaext.Plo ./$(DEPDIR)/sha256-ppc.Plo \
        ./$(DEPDIR)/sha256-ssse3-amd64.Plo ./$(DEPDIR)/sha256.Plo \
        ./$(DEPDIR)/sha512-arm.Plo ./$(DEPDIR)/sha512-armv7-neon.Plo \
+       ./$(DEPDIR)/sha512-armv8-aarch64-ce.Plo \
        ./$(DEPDIR)/sha512-avx-amd64.Plo \
        ./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo \
-       ./$(DEPDIR)/sha512-ppc.Plo ./$(DEPDIR)/sha512-ssse3-amd64.Plo \
+       ./$(DEPDIR)/sha512-avx512-amd64.Plo ./$(DEPDIR)/sha512-ppc.Plo \
+       ./$(DEPDIR)/sha512-ssse3-amd64.Plo \
        ./$(DEPDIR)/sha512-ssse3-i386.Plo ./$(DEPDIR)/sha512.Plo \
-       ./$(DEPDIR)/sm3-aarch64.Plo ./$(DEPDIR)/sm3-avx-bmi2-amd64.Plo \
-       ./$(DEPDIR)/sm3.Plo ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo \
-       ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo ./$(DEPDIR)/sm4.Plo \
+       ./$(DEPDIR)/sm3-aarch64.Plo \
+       ./$(DEPDIR)/sm3-armv8-aarch64-ce.Plo \
+       ./$(DEPDIR)/sm3-avx-bmi2-amd64.Plo ./$(DEPDIR)/sm3.Plo \
+       ./$(DEPDIR)/sm4-aarch64.Plo \
+       ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo \
+       ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo \
+       ./$(DEPDIR)/sm4-armv8-aarch64-ce.Plo \
+       ./$(DEPDIR)/sm4-armv9-aarch64-sve-ce.Plo \
+       ./$(DEPDIR)/sm4-gfni-avx2-amd64.Plo \
+       ./$(DEPDIR)/sm4-gfni-avx512-amd64.Plo ./$(DEPDIR)/sm4-ppc.Plo \
+       ./$(DEPDIR)/sm4.Plo ./$(DEPDIR)/sntrup761.Plo \
        ./$(DEPDIR)/stribog.Plo ./$(DEPDIR)/tiger.Plo \
        ./$(DEPDIR)/twofish-aarch64.Plo ./$(DEPDIR)/twofish-amd64.Plo \
        ./$(DEPDIR)/twofish-arm.Plo ./$(DEPDIR)/twofish-avx2-amd64.Plo \
@@ -413,9 +440,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -489,8 +513,8 @@ top_srcdir = @top_srcdir@
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
-EXTRA_DIST = gost-s-box.c
-CLEANFILES = gost-s-box
+EXTRA_DIST = gost-s-box.c kyber-common.c kyber-kdep.c
+CLEANFILES = gost-s-box$(EXEEXT_FOR_BUILD)
 DISTCLEANFILES = gost-sb.h
 noinst_LTLIBRARIES = libcipher.la
 GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
@@ -514,16 +538,18 @@ libcipher_la_SOURCES = \
        cipher-eax.c \
        cipher-siv.c \
        cipher-gcm-siv.c \
-       cipher-selftest.c cipher-selftest.h \
        pubkey.c pubkey-internal.h pubkey-util.c \
        md.c \
        mac.c mac-internal.h \
        mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
        poly1305.c poly1305-internal.h \
+       kem.c sntrup761.c sntrup761.h kyber.c kyber.h kem-ecc.c kem-ecc.h \
+       mceliece6688128f.c mceliece6688128f.h \
        kdf.c kdf-internal.h \
        bithelp.h  \
        bufhelp.h  \
-       primegen.c  \
+       bulkhelp.h \
+       primegen.c \
        hash-common.c hash-common.h \
        dsa-common.c rsa-common.c \
        sha1.h
@@ -531,17 +557,21 @@ libcipher_la_SOURCES = \
 EXTRA_libcipher_la_SOURCES = \
        asm-common-aarch64.h \
        asm-common-amd64.h \
+       asm-common-i386.h \
        asm-common-s390x.h \
        asm-inline-s390x.h \
        asm-poly1305-aarch64.h \
        asm-poly1305-amd64.h \
        asm-poly1305-s390x.h \
+       aria.c aria-aesni-avx-amd64.S aria-aesni-avx2-amd64.S \
+       aria-gfni-avx512-amd64.S \
        arcfour.c arcfour-amd64.S \
        blowfish.c blowfish-amd64.S blowfish-arm.S \
        cast5.c cast5-amd64.S cast5-arm.S \
        chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
-       chacha20-armv7-neon.S chacha20-aarch64.S \
+       chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
        chacha20-ppc.c chacha20-s390x.S \
+       chacha20-p10le-8x.s \
        cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
        cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
        crc.c crc-intel-pclmul.c crc-armv8-ce.c \
@@ -557,16 +587,18 @@ EXTRA_libcipher_la_SOURCES = \
        gostr3411-94.c \
        md4.c \
        md5.c \
-       poly1305-s390x.S \
+       poly1305-s390x.S poly1305-amd64-avx512.S \
+       poly1305-p10le.s \
        rijndael.c rijndael-internal.h rijndael-tables.h   \
        rijndael-aesni.c rijndael-padlock.c                \
        rijndael-amd64.S rijndael-arm.S                    \
        rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S  \
        rijndael-vaes.c rijndael-vaes-avx2-amd64.S         \
+       rijndael-vaes-i386.c rijndael-vaes-avx2-i386.S     \
        rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
        rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
        rijndael-ppc.c rijndael-ppc9le.c                   \
-       rijndael-p10le.c rijndael-gcm-p10le.s             \
+       rijndael-p10le.c rijndael-gcm-p10le.s              \
        rijndael-ppc-common.h rijndael-ppc-functions.h     \
        rijndael-s390x.c                                   \
        rmd160.c \
@@ -574,9 +606,12 @@ EXTRA_libcipher_la_SOURCES = \
        salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
        scrypt.c \
        seed.c \
-       serpent.c serpent-sse2-amd64.S \
+       serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
+       serpent-avx512-x86.c serpent-armv7-neon.S \
        sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
-       serpent-avx2-amd64.S serpent-armv7-neon.S \
+       sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
+       sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
+       sm4-ppc.c \
        sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
        sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
        sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
@@ -585,11 +620,12 @@ EXTRA_libcipher_la_SOURCES = \
        sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
        sha256-intel-shaext.c sha256-ppc.c \
        sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
-       sha512-avx2-bmi2-amd64.S \
-       sha512-armv7-neon.S sha512-arm.S \
+       sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
+       sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
        sha512-ppc.c sha512-ssse3-i386.c \
-       sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S \
-       keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+       sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
+       keccak.c keccak_permute_32.h keccak_permute_64.h \
+       keccak-armv7-neon.S keccak-amd64-avx512.S \
        stribog.c \
        tiger.c \
        whirlpool.c whirlpool-sse2-amd64.S \
@@ -597,13 +633,17 @@ EXTRA_libcipher_la_SOURCES = \
        twofish-avx2-amd64.S \
        rfc2268.c \
        camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-       camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \
-       camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+       camellia-aesni-avx2-amd64.h \
+       camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
+       camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
+       camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \
+       camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \
        blake2.c \
-       blake2b-amd64-avx2.S blake2s-amd64-avx.S
+       blake2b-amd64-avx2.S blake2b-amd64-avx512.S \
+       blake2s-amd64-avx.S blake2s-amd64-avx512.S
 
 @ENABLE_O_FLAG_MUNGING_FALSE@o_flag_munging = cat
-@ENABLE_O_FLAG_MUNGING_TRUE@o_flag_munging = sed -e 's/-O\([2-9sgz][2-9sgz]*\)/-O1/' -e 's/-Ofast/-O1/g'
+@ENABLE_O_FLAG_MUNGING_TRUE@o_flag_munging = sed -e 's/[[:blank:]]-O\([2-9sgz][2-9sgz]*\)/ -O1 /' -e 's/[[:blank:]]-Ofast/ -O1 /g'
 @ENABLE_INSTRUMENTATION_MUNGING_FALSE@instrumentation_munging = cat
 
 # We need to disable instrumentation for these modules as they use cc as
@@ -615,7 +655,11 @@ EXTRA_libcipher_la_SOURCES = \
 @ENABLE_INSTRUMENTATION_MUNGING_TRUE@  -e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
 
 @ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_FALSE@ppc_vcrypto_cflags = 
-@ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_TRUE@ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto
+@ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_TRUE@ppc_vcrypto_cflags = -O2 -maltivec -mvsx -mcrypto
+@ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_FALSE@aarch64_neon_cflags = 
+@ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_TRUE@aarch64_neon_cflags = -O2 -march=armv8-a+crypto
+@ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_FALSE@avx512f_cflags = 
+@ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_TRUE@avx512f_cflags = -mavx512f
 all: all-am
 
 .SUFFIXES:
@@ -672,17 +716,28 @@ distclean-compile:
 
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/arcfour-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/arcfour.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aria-aesni-avx-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aria-aesni-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aria-gfni-avx512-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aria.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2b-amd64-avx2.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2b-amd64-avx512.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2s-amd64-avx.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2s-amd64-avx512.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blowfish-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blowfish-arm.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blowfish.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aarch64-ce.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aarch64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aesni-avx-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-gfni-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-gfni-avx512-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-glue.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-ppc8le.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-ppc9le.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-vaes-avx2-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cast5-amd64.Plo@am__quote@ # am--include-marker
@@ -690,6 +745,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cast5.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-aarch64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-amd64-avx2.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-amd64-avx512.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-amd64-ssse3.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-armv7-neon.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-ppc.Plo@am__quote@ # am--include-marker
@@ -712,7 +768,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-ocb.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-ofb.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-poly1305.Plo@am__quote@ # am--include-marker
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-selftest.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-siv.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-xts.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher.Plo@am__quote@ # am--include-marker
@@ -739,16 +794,22 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hash-common.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/idea.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdf.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/keccak-amd64-avx512.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/keccak-armv7-neon.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/keccak.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kem-ecc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kem.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kyber.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-cmac.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-gmac.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-hmac.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-poly1305.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mceliece6688128f.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md4.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md5.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/poly1305-amd64-avx512.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/poly1305-s390x.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/poly1305.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/primegen.Plo@am__quote@ # am--include-marker
@@ -770,6 +831,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-ssse3-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-vaes-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-vaes-avx2-i386.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-vaes-i386.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-vaes.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rmd160.Plo@am__quote@ # am--include-marker
@@ -782,6 +845,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/seed.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-armv7-neon.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-avx512-x86.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-sse2-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-armv7-neon.Plo@am__quote@ # am--include-marker
@@ -803,18 +867,28 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-arm.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-avx-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-avx512-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-ppc.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-ssse3-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-ssse3-i386.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm3-aarch64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm3-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm3-avx-bmi2-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm3.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-aarch64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-aesni-avx-amd64.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-armv9-aarch64-sve-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-gfni-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-gfni-avx512-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-ppc.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sntrup761.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stribog.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tiger.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twofish-aarch64.Plo@am__quote@ # am--include-marker
@@ -1016,17 +1090,28 @@ clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
 distclean: distclean-am
                -rm -f ./$(DEPDIR)/arcfour-amd64.Plo
        -rm -f ./$(DEPDIR)/arcfour.Plo
+       -rm -f ./$(DEPDIR)/aria-aesni-avx-amd64.Plo
+       -rm -f ./$(DEPDIR)/aria-aesni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/aria-gfni-avx512-amd64.Plo
+       -rm -f ./$(DEPDIR)/aria.Plo
        -rm -f ./$(DEPDIR)/blake2.Plo
        -rm -f ./$(DEPDIR)/blake2b-amd64-avx2.Plo
+       -rm -f ./$(DEPDIR)/blake2b-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/blake2s-amd64-avx.Plo
+       -rm -f ./$(DEPDIR)/blake2s-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/blowfish-amd64.Plo
        -rm -f ./$(DEPDIR)/blowfish-arm.Plo
        -rm -f ./$(DEPDIR)/blowfish.Plo
+       -rm -f ./$(DEPDIR)/camellia-aarch64-ce.Plo
        -rm -f ./$(DEPDIR)/camellia-aarch64.Plo
        -rm -f ./$(DEPDIR)/camellia-aesni-avx-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia-arm.Plo
+       -rm -f ./$(DEPDIR)/camellia-gfni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/camellia-gfni-avx512-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia-glue.Plo
+       -rm -f ./$(DEPDIR)/camellia-ppc8le.Plo
+       -rm -f ./$(DEPDIR)/camellia-ppc9le.Plo
        -rm -f ./$(DEPDIR)/camellia-vaes-avx2-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia.Plo
        -rm -f ./$(DEPDIR)/cast5-amd64.Plo
@@ -1034,6 +1119,7 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/cast5.Plo
        -rm -f ./$(DEPDIR)/chacha20-aarch64.Plo
        -rm -f ./$(DEPDIR)/chacha20-amd64-avx2.Plo
+       -rm -f ./$(DEPDIR)/chacha20-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/chacha20-amd64-ssse3.Plo
        -rm -f ./$(DEPDIR)/chacha20-armv7-neon.Plo
        -rm -f ./$(DEPDIR)/chacha20-ppc.Plo
@@ -1056,7 +1142,6 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/cipher-ocb.Plo
        -rm -f ./$(DEPDIR)/cipher-ofb.Plo
        -rm -f ./$(DEPDIR)/cipher-poly1305.Plo
-       -rm -f ./$(DEPDIR)/cipher-selftest.Plo
        -rm -f ./$(DEPDIR)/cipher-siv.Plo
        -rm -f ./$(DEPDIR)/cipher-xts.Plo
        -rm -f ./$(DEPDIR)/cipher.Plo
@@ -1083,16 +1168,22 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/hash-common.Plo
        -rm -f ./$(DEPDIR)/idea.Plo
        -rm -f ./$(DEPDIR)/kdf.Plo
+       -rm -f ./$(DEPDIR)/keccak-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/keccak-armv7-neon.Plo
        -rm -f ./$(DEPDIR)/keccak.Plo
+       -rm -f ./$(DEPDIR)/kem-ecc.Plo
+       -rm -f ./$(DEPDIR)/kem.Plo
+       -rm -f ./$(DEPDIR)/kyber.Plo
        -rm -f ./$(DEPDIR)/mac-cmac.Plo
        -rm -f ./$(DEPDIR)/mac-gmac.Plo
        -rm -f ./$(DEPDIR)/mac-hmac.Plo
        -rm -f ./$(DEPDIR)/mac-poly1305.Plo
        -rm -f ./$(DEPDIR)/mac.Plo
+       -rm -f ./$(DEPDIR)/mceliece6688128f.Plo
        -rm -f ./$(DEPDIR)/md.Plo
        -rm -f ./$(DEPDIR)/md4.Plo
        -rm -f ./$(DEPDIR)/md5.Plo
+       -rm -f ./$(DEPDIR)/poly1305-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/poly1305-s390x.Plo
        -rm -f ./$(DEPDIR)/poly1305.Plo
        -rm -f ./$(DEPDIR)/primegen.Plo
@@ -1114,6 +1205,8 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo
        -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64.Plo
        -rm -f ./$(DEPDIR)/rijndael-vaes-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/rijndael-vaes-avx2-i386.Plo
+       -rm -f ./$(DEPDIR)/rijndael-vaes-i386.Plo
        -rm -f ./$(DEPDIR)/rijndael-vaes.Plo
        -rm -f ./$(DEPDIR)/rijndael.Plo
        -rm -f ./$(DEPDIR)/rmd160.Plo
@@ -1126,6 +1219,7 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/seed.Plo
        -rm -f ./$(DEPDIR)/serpent-armv7-neon.Plo
        -rm -f ./$(DEPDIR)/serpent-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/serpent-avx512-x86.Plo
        -rm -f ./$(DEPDIR)/serpent-sse2-amd64.Plo
        -rm -f ./$(DEPDIR)/serpent.Plo
        -rm -f ./$(DEPDIR)/sha1-armv7-neon.Plo
@@ -1147,18 +1241,28 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/sha256.Plo
        -rm -f ./$(DEPDIR)/sha512-arm.Plo
        -rm -f ./$(DEPDIR)/sha512-armv7-neon.Plo
+       -rm -f ./$(DEPDIR)/sha512-armv8-aarch64-ce.Plo
        -rm -f ./$(DEPDIR)/sha512-avx-amd64.Plo
        -rm -f ./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo
+       -rm -f ./$(DEPDIR)/sha512-avx512-amd64.Plo
        -rm -f ./$(DEPDIR)/sha512-ppc.Plo
        -rm -f ./$(DEPDIR)/sha512-ssse3-amd64.Plo
        -rm -f ./$(DEPDIR)/sha512-ssse3-i386.Plo
        -rm -f ./$(DEPDIR)/sha512.Plo
        -rm -f ./$(DEPDIR)/sm3-aarch64.Plo
+       -rm -f ./$(DEPDIR)/sm3-armv8-aarch64-ce.Plo
        -rm -f ./$(DEPDIR)/sm3-avx-bmi2-amd64.Plo
        -rm -f ./$(DEPDIR)/sm3.Plo
+       -rm -f ./$(DEPDIR)/sm4-aarch64.Plo
        -rm -f ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo
        -rm -f ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/sm4-armv8-aarch64-ce.Plo
+       -rm -f ./$(DEPDIR)/sm4-armv9-aarch64-sve-ce.Plo
+       -rm -f ./$(DEPDIR)/sm4-gfni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/sm4-gfni-avx512-amd64.Plo
+       -rm -f ./$(DEPDIR)/sm4-ppc.Plo
        -rm -f ./$(DEPDIR)/sm4.Plo
+       -rm -f ./$(DEPDIR)/sntrup761.Plo
        -rm -f ./$(DEPDIR)/stribog.Plo
        -rm -f ./$(DEPDIR)/tiger.Plo
        -rm -f ./$(DEPDIR)/twofish-aarch64.Plo
@@ -1215,17 +1319,28 @@ installcheck-am:
 maintainer-clean: maintainer-clean-am
                -rm -f ./$(DEPDIR)/arcfour-amd64.Plo
        -rm -f ./$(DEPDIR)/arcfour.Plo
+       -rm -f ./$(DEPDIR)/aria-aesni-avx-amd64.Plo
+       -rm -f ./$(DEPDIR)/aria-aesni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/aria-gfni-avx512-amd64.Plo
+       -rm -f ./$(DEPDIR)/aria.Plo
        -rm -f ./$(DEPDIR)/blake2.Plo
        -rm -f ./$(DEPDIR)/blake2b-amd64-avx2.Plo
+       -rm -f ./$(DEPDIR)/blake2b-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/blake2s-amd64-avx.Plo
+       -rm -f ./$(DEPDIR)/blake2s-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/blowfish-amd64.Plo
        -rm -f ./$(DEPDIR)/blowfish-arm.Plo
        -rm -f ./$(DEPDIR)/blowfish.Plo
+       -rm -f ./$(DEPDIR)/camellia-aarch64-ce.Plo
        -rm -f ./$(DEPDIR)/camellia-aarch64.Plo
        -rm -f ./$(DEPDIR)/camellia-aesni-avx-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia-arm.Plo
+       -rm -f ./$(DEPDIR)/camellia-gfni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/camellia-gfni-avx512-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia-glue.Plo
+       -rm -f ./$(DEPDIR)/camellia-ppc8le.Plo
+       -rm -f ./$(DEPDIR)/camellia-ppc9le.Plo
        -rm -f ./$(DEPDIR)/camellia-vaes-avx2-amd64.Plo
        -rm -f ./$(DEPDIR)/camellia.Plo
        -rm -f ./$(DEPDIR)/cast5-amd64.Plo
@@ -1233,6 +1348,7 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/cast5.Plo
        -rm -f ./$(DEPDIR)/chacha20-aarch64.Plo
        -rm -f ./$(DEPDIR)/chacha20-amd64-avx2.Plo
+       -rm -f ./$(DEPDIR)/chacha20-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/chacha20-amd64-ssse3.Plo
        -rm -f ./$(DEPDIR)/chacha20-armv7-neon.Plo
        -rm -f ./$(DEPDIR)/chacha20-ppc.Plo
@@ -1255,7 +1371,6 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/cipher-ocb.Plo
        -rm -f ./$(DEPDIR)/cipher-ofb.Plo
        -rm -f ./$(DEPDIR)/cipher-poly1305.Plo
-       -rm -f ./$(DEPDIR)/cipher-selftest.Plo
        -rm -f ./$(DEPDIR)/cipher-siv.Plo
        -rm -f ./$(DEPDIR)/cipher-xts.Plo
        -rm -f ./$(DEPDIR)/cipher.Plo
@@ -1282,16 +1397,22 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/hash-common.Plo
        -rm -f ./$(DEPDIR)/idea.Plo
        -rm -f ./$(DEPDIR)/kdf.Plo
+       -rm -f ./$(DEPDIR)/keccak-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/keccak-armv7-neon.Plo
        -rm -f ./$(DEPDIR)/keccak.Plo
+       -rm -f ./$(DEPDIR)/kem-ecc.Plo
+       -rm -f ./$(DEPDIR)/kem.Plo
+       -rm -f ./$(DEPDIR)/kyber.Plo
        -rm -f ./$(DEPDIR)/mac-cmac.Plo
        -rm -f ./$(DEPDIR)/mac-gmac.Plo
        -rm -f ./$(DEPDIR)/mac-hmac.Plo
        -rm -f ./$(DEPDIR)/mac-poly1305.Plo
        -rm -f ./$(DEPDIR)/mac.Plo
+       -rm -f ./$(DEPDIR)/mceliece6688128f.Plo
        -rm -f ./$(DEPDIR)/md.Plo
        -rm -f ./$(DEPDIR)/md4.Plo
        -rm -f ./$(DEPDIR)/md5.Plo
+       -rm -f ./$(DEPDIR)/poly1305-amd64-avx512.Plo
        -rm -f ./$(DEPDIR)/poly1305-s390x.Plo
        -rm -f ./$(DEPDIR)/poly1305.Plo
        -rm -f ./$(DEPDIR)/primegen.Plo
@@ -1313,6 +1434,8 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo
        -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64.Plo
        -rm -f ./$(DEPDIR)/rijndael-vaes-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/rijndael-vaes-avx2-i386.Plo
+       -rm -f ./$(DEPDIR)/rijndael-vaes-i386.Plo
        -rm -f ./$(DEPDIR)/rijndael-vaes.Plo
        -rm -f ./$(DEPDIR)/rijndael.Plo
        -rm -f ./$(DEPDIR)/rmd160.Plo
@@ -1325,6 +1448,7 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/seed.Plo
        -rm -f ./$(DEPDIR)/serpent-armv7-neon.Plo
        -rm -f ./$(DEPDIR)/serpent-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/serpent-avx512-x86.Plo
        -rm -f ./$(DEPDIR)/serpent-sse2-amd64.Plo
        -rm -f ./$(DEPDIR)/serpent.Plo
        -rm -f ./$(DEPDIR)/sha1-armv7-neon.Plo
@@ -1346,18 +1470,28 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/sha256.Plo
        -rm -f ./$(DEPDIR)/sha512-arm.Plo
        -rm -f ./$(DEPDIR)/sha512-armv7-neon.Plo
+       -rm -f ./$(DEPDIR)/sha512-armv8-aarch64-ce.Plo
        -rm -f ./$(DEPDIR)/sha512-avx-amd64.Plo
        -rm -f ./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo
+       -rm -f ./$(DEPDIR)/sha512-avx512-amd64.Plo
        -rm -f ./$(DEPDIR)/sha512-ppc.Plo
        -rm -f ./$(DEPDIR)/sha512-ssse3-amd64.Plo
        -rm -f ./$(DEPDIR)/sha512-ssse3-i386.Plo
        -rm -f ./$(DEPDIR)/sha512.Plo
        -rm -f ./$(DEPDIR)/sm3-aarch64.Plo
+       -rm -f ./$(DEPDIR)/sm3-armv8-aarch64-ce.Plo
        -rm -f ./$(DEPDIR)/sm3-avx-bmi2-amd64.Plo
        -rm -f ./$(DEPDIR)/sm3.Plo
+       -rm -f ./$(DEPDIR)/sm4-aarch64.Plo
        -rm -f ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo
        -rm -f ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/sm4-armv8-aarch64-ce.Plo
+       -rm -f ./$(DEPDIR)/sm4-armv9-aarch64-sve-ce.Plo
+       -rm -f ./$(DEPDIR)/sm4-gfni-avx2-amd64.Plo
+       -rm -f ./$(DEPDIR)/sm4-gfni-avx512-amd64.Plo
+       -rm -f ./$(DEPDIR)/sm4-ppc.Plo
        -rm -f ./$(DEPDIR)/sm4.Plo
+       -rm -f ./$(DEPDIR)/sntrup761.Plo
        -rm -f ./$(DEPDIR)/stribog.Plo
        -rm -f ./$(DEPDIR)/tiger.Plo
        -rm -f ./$(DEPDIR)/twofish-aarch64.Plo
@@ -1405,10 +1539,10 @@ uninstall-am:
 
 
 gost28147.lo: gost-sb.h
-gost-sb.h: gost-s-box
-       ./gost-s-box $@
+gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD)
+       ./gost-s-box$(EXEEXT_FOR_BUILD) $@
 
-gost-s-box: gost-s-box.c
+gost-s-box$(EXEEXT_FOR_BUILD): gost-s-box.c
        $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
            $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
 
@@ -1509,6 +1643,36 @@ cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile
 cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile
        `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
 
+camellia-ppc8le.o: $(srcdir)/camellia-ppc8le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc8le.lo: $(srcdir)/camellia-ppc8le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
+       `echo $(COMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
+       `echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.o: $(srcdir)/sm4-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.lo: $(srcdir)/sm4-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile
+       `echo $(COMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
+
+serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile
+       `echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
+
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
index 353de00bd7190e11da442c6fcd291abb1e87df75..c8d22c70176ccfcdb394b1d5a5d03b3f2b049d51 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * For a description of the algorithm, see:
  *   Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
diff --git a/cipher/aria-aesni-avx-amd64.S b/cipher/aria-aesni-avx-amd64.S
new file mode 100644 (file)
index 0000000..2a88c1e
--- /dev/null
@@ -0,0 +1,1440 @@
+/* aria-aesni-avx-amd64.S  -  AESNI/GFNI/AVX implementation of ARIA cipher
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX_SUPPORT) && defined(ENABLE_AESNI_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#ifdef ENABLE_GFNI_SUPPORT
+#  define CONFIG_AS_GFNI 1
+#endif
+
+/* struct ARIA_context: */
+#define ARIA_BLOCK_SIZE  16
+#define ARIA_MAX_RD_KEYS 17
+#define ARIA_CTX_enc_key 0
+#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+
+/* register macros */
+#define CTX %rdi
+
+/* helper macros */
+#define STACK_DEPTH (2 * 8 + 16 * 16 + 15)
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)            \
+       ( (((a0) & 1) << 0) |                           \
+         (((a1) & 1) << 1) |                           \
+         (((a2) & 1) << 2) |                           \
+         (((a3) & 1) << 3) |                           \
+         (((a4) & 1) << 4) |                           \
+         (((a5) & 1) << 5) |                           \
+         (((a6) & 1) << 6) |                           \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)          \
+       ( ((l7) << (0 * 8)) |                           \
+         ((l6) << (1 * 8)) |                           \
+         ((l5) << (2 * 8)) |                           \
+         ((l4) << (3 * 8)) |                           \
+         ((l3) << (4 * 8)) |                           \
+         ((l2) << (5 * 8)) |                           \
+         ((l1) << (6 * 8)) |                           \
+         ((l0) << (7 * 8)) )
+
+/* asm macros */
+#define inc_le128(x, minus_one, tmp)                   \
+       vpcmpeqq minus_one, x, tmp;                     \
+       vpsubq minus_one, x, x;                         \
+       vpslldq $8, tmp, tmp;                           \
+       vpsubq tmp, x, x;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)     \
+       vpand x, mask4bit, tmp0;                        \
+       vpandn x, mask4bit, x;                          \
+       vpsrld $4, x, x;                                \
+                                                       \
+       vpshufb tmp0, lo_t, tmp0;                       \
+       vpshufb x, hi_t, x;                             \
+       vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2)          \
+       vpunpckhdq x1, x0, t2;                          \
+       vpunpckldq x1, x0, x0;                          \
+                                                       \
+       vpunpckldq x3, x2, t1;                          \
+       vpunpckhdq x3, x2, x2;                          \
+                                                       \
+       vpunpckhqdq t1, x0, x1;                         \
+       vpunpcklqdq t1, x0, x0;                         \
+                                                       \
+       vpunpckhqdq x2, t2, x3;                         \
+       vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0,               \
+                        a1, b1, c1, d1,                \
+                        a2, b2, c2, d2,                \
+                        a3, b3, c3, d3,                \
+                        st0, st1)                      \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vmovdqu .Lshufb_16x16b rRIP, a0;                \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3);          \
+       transpose_4x4(a1, b1, c1, d1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(a2, b2, c2, d2, b0, b1);          \
+       transpose_4x4(a3, b3, c3, d3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0,             \
+                          a1, b1, c1, d1,              \
+                          a2, b2, c2, d2,              \
+                          a3, b3, c3, d3,              \
+                          st0, st1)                    \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vmovdqu .Lshufb_16x16b rRIP, a0;                \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(c0, d0, a0, b0, d2, d3);          \
+       transpose_4x4(c1, d1, a1, b1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(c2, d2, a2, b2, b0, b1);          \
+       transpose_4x4(c3, d3, a3, b3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers */
+#define inpack16_pre(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    rio)                               \
+       vmovdqu (0 * 16)(rio), x0;                      \
+       vmovdqu (1 * 16)(rio), x1;                      \
+       vmovdqu (2 * 16)(rio), x2;                      \
+       vmovdqu (3 * 16)(rio), x3;                      \
+       vmovdqu (4 * 16)(rio), x4;                      \
+       vmovdqu (5 * 16)(rio), x5;                      \
+       vmovdqu (6 * 16)(rio), x6;                      \
+       vmovdqu (7 * 16)(rio), x7;                      \
+       vmovdqu (8 * 16)(rio), y0;                      \
+       vmovdqu (9 * 16)(rio), y1;                      \
+       vmovdqu (10 * 16)(rio), y2;                     \
+       vmovdqu (11 * 16)(rio), y3;                     \
+       vmovdqu (12 * 16)(rio), y4;                     \
+       vmovdqu (13 * 16)(rio), y5;                     \
+       vmovdqu (14 * 16)(rio), y6;                     \
+       vmovdqu (15 * 16)(rio), y7;
+
+/* byteslice blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     y0, y1, y2, y3,                   \
+                     y4, y5, y6, y7,                   \
+                     mem_ab, mem_cd)                   \
+       byteslice_16x16b(x0, x1, x2, x3,                \
+                        x4, x5, x6, x7,                \
+                        y0, y1, y2, y3,                \
+                        y4, y5, y6, y7,                \
+                        (mem_ab), (mem_cd));           \
+                                                       \
+       vmovdqu x0, 0 * 16(mem_ab);                     \
+       vmovdqu x1, 1 * 16(mem_ab);                     \
+       vmovdqu x2, 2 * 16(mem_ab);                     \
+       vmovdqu x3, 3 * 16(mem_ab);                     \
+       vmovdqu x4, 4 * 16(mem_ab);                     \
+       vmovdqu x5, 5 * 16(mem_ab);                     \
+       vmovdqu x6, 6 * 16(mem_ab);                     \
+       vmovdqu x7, 7 * 16(mem_ab);                     \
+       vmovdqu y0, 0 * 16(mem_cd);                     \
+       vmovdqu y1, 1 * 16(mem_cd);                     \
+       vmovdqu y2, 2 * 16(mem_cd);                     \
+       vmovdqu y3, 3 * 16(mem_cd);                     \
+       vmovdqu y4, 4 * 16(mem_cd);                     \
+       vmovdqu y5, 5 * 16(mem_cd);                     \
+       vmovdqu y6, 6 * 16(mem_cd);                     \
+       vmovdqu y7, 7 * 16(mem_cd);
+
+#define write_output(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem)                               \
+       vmovdqu x0, 0 * 16(mem);                        \
+       vmovdqu x1, 1 * 16(mem);                        \
+       vmovdqu x2, 2 * 16(mem);                        \
+       vmovdqu x3, 3 * 16(mem);                        \
+       vmovdqu x4, 4 * 16(mem);                        \
+       vmovdqu x5, 5 * 16(mem);                        \
+       vmovdqu x6, 6 * 16(mem);                        \
+       vmovdqu x7, 7 * 16(mem);                        \
+       vmovdqu y0, 8 * 16(mem);                        \
+       vmovdqu y1, 9 * 16(mem);                        \
+       vmovdqu y2, 10 * 16(mem);                       \
+       vmovdqu y3, 11 * 16(mem);                       \
+       vmovdqu y4, 12 * 16(mem);                       \
+       vmovdqu y5, 13 * 16(mem);                       \
+       vmovdqu y6, 14 * 16(mem);                       \
+       vmovdqu y7, 15 * 16(mem);
+
+#define vload_if_enough_nblks(blk_offs, rnblks, rio, v)        \
+       vpxor v, v, v;                                  \
+       cmp $(blk_offs), rnblks;                        \
+       jbe 1f;                                         \
+       vmovdqu (blk_offs * 16)(rio), v;                \
+       1:;
+
+#define vstore_if_enough_nblks(blk_offs, rnblks, mem, v)\
+       cmp $(blk_offs), rnblks;                        \
+       jbe 1f;                                         \
+       vmovdqu v, (blk_offs * 16)(mem);                \
+       1:;
+
+#define inpack_1_15_pre(x0, x1, x2, x3,                        \
+                       x4, x5, x6, x7,                 \
+                       y0, y1, y2, y3,                 \
+                       y4, y5, y6, y7,                 \
+                       rio, rnblks)                    \
+       vmovdqu (0 * 16)(rio), x0;                      \
+       vload_if_enough_nblks(1, rnblks, rio, x1);      \
+       vload_if_enough_nblks(2, rnblks, rio, x2);      \
+       vload_if_enough_nblks(3, rnblks, rio, x3);      \
+       vload_if_enough_nblks(4, rnblks, rio, x4);      \
+       vload_if_enough_nblks(5, rnblks, rio, x5);      \
+       vload_if_enough_nblks(6, rnblks, rio, x6);      \
+       vload_if_enough_nblks(7, rnblks, rio, x7);      \
+       vload_if_enough_nblks(8, rnblks, rio, y0);      \
+       vload_if_enough_nblks(9, rnblks, rio, y1);      \
+       vload_if_enough_nblks(10, rnblks, rio, y2);     \
+       vload_if_enough_nblks(11, rnblks, rio, y3);     \
+       vload_if_enough_nblks(12, rnblks, rio, y4);     \
+       vload_if_enough_nblks(13, rnblks, rio, y5);     \
+       vload_if_enough_nblks(14, rnblks, rio, y6);     \
+       vpxor y7, y7, y7;
+
+#define write_output_1_15(x0, x1, x2, x3,              \
+                         x4, x5, x6, x7,               \
+                         y0, y1, y2, y3,               \
+                         y4, y5, y6, y7,               \
+                         mem, rnblks)                  \
+       vmovdqu x0, (0 * 16)(mem);                      \
+       vstore_if_enough_nblks(1, rnblks, mem, x1);     \
+       vstore_if_enough_nblks(2, rnblks, mem, x2);     \
+       vstore_if_enough_nblks(3, rnblks, mem, x3);     \
+       vstore_if_enough_nblks(4, rnblks, mem, x4);     \
+       vstore_if_enough_nblks(5, rnblks, mem, x5);     \
+       vstore_if_enough_nblks(6, rnblks, mem, x6);     \
+       vstore_if_enough_nblks(7, rnblks, mem, x7);     \
+       vstore_if_enough_nblks(8, rnblks, mem, y0);     \
+       vstore_if_enough_nblks(9, rnblks, mem, y1);     \
+       vstore_if_enough_nblks(10, rnblks, mem, y2);    \
+       vstore_if_enough_nblks(11, rnblks, mem, y3);    \
+       vstore_if_enough_nblks(12, rnblks, mem, y4);    \
+       vstore_if_enough_nblks(13, rnblks, mem, y5);    \
+       vstore_if_enough_nblks(14, rnblks, mem, y6);
+
+#define aria_store_state_8way(x0, x1, x2, x3,          \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, idx)             \
+       vmovdqu x0, ((idx + 0) * 16)(mem_tmp);          \
+       vmovdqu x1, ((idx + 1) * 16)(mem_tmp);          \
+       vmovdqu x2, ((idx + 2) * 16)(mem_tmp);          \
+       vmovdqu x3, ((idx + 3) * 16)(mem_tmp);          \
+       vmovdqu x4, ((idx + 4) * 16)(mem_tmp);          \
+       vmovdqu x5, ((idx + 5) * 16)(mem_tmp);          \
+       vmovdqu x6, ((idx + 6) * 16)(mem_tmp);          \
+       vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, idx)              \
+       vmovdqu ((idx + 0) * 16)(mem_tmp), x0;          \
+       vmovdqu ((idx + 1) * 16)(mem_tmp), x1;          \
+       vmovdqu ((idx + 2) * 16)(mem_tmp), x2;          \
+       vmovdqu ((idx + 3) * 16)(mem_tmp), x3;          \
+       vmovdqu ((idx + 4) * 16)(mem_tmp), x4;          \
+       vmovdqu ((idx + 5) * 16)(mem_tmp), x5;          \
+       vmovdqu ((idx + 6) * 16)(mem_tmp), x6;          \
+       vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     t0, t1, t2, rk,                   \
+                     idx, round)                       \
+       /* AddRoundKey */                               \
+       vmovd ((round * 16) + idx + 0)(rk), t0;         \
+       vpshufb .Lthree_x16 rRIP, t0, t2;               \
+       vpxor t2, x0, x0;                               \
+       vpshufb .Ltwo_x16 rRIP, t0, t2;                 \
+       vpxor t2, x1, x1;                               \
+       vpshufb .Lone_x16 rRIP, t0, t2;                 \
+       vpxor t2, x2, x2;                               \
+       vpshufb t1, t0, t2;                             \
+       vpxor t2, x3, x3;                               \
+       vmovd ((round * 16) + idx + 4)(rk), t0;         \
+       vpshufb .Lthree_x16 rRIP, t0, t2;               \
+       vpxor t2, x4, x4;                               \
+       vpshufb .Ltwo_x16 rRIP, t0, t2;                 \
+       vpxor t2, x5, x5;                               \
+       vpshufb .Lone_x16 rRIP, t0, t2;                 \
+       vpxor t2, x6, x6;                               \
+       vpshufb t1, t0, t2;                             \
+       vpxor t2, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
+#define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vmovddup .Ltf_s2_bitmatrix rRIP, t0;            \
+       vmovddup .Ltf_inv_bitmatrix rRIP, t1;           \
+       vmovddup .Ltf_id_bitmatrix rRIP, t2;            \
+       vmovddup .Ltf_aff_bitmatrix rRIP, t3;           \
+       vmovddup .Ltf_x2_bitmatrix rRIP, t4;            \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7
+#endif /* CONFIG_AS_GFNI */
+
+#define aria_sbox_8way(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      t0, t1, t2, t3,                  \
+                      t4, t5, t6, t7)                  \
+       vmovdqa .Linv_shift_row rRIP, t0;               \
+       vmovdqa .Lshift_row rRIP, t1;                   \
+       vbroadcastss .L0f0f0f0f rRIP, t6;               \
+       vmovdqa .Ltf_lo__inv_aff__and__s2 rRIP, t2;     \
+       vmovdqa .Ltf_hi__inv_aff__and__s2 rRIP, t3;     \
+       vmovdqa .Ltf_lo__x2__and__fwd_aff rRIP, t4;     \
+       vmovdqa .Ltf_hi__x2__and__fwd_aff rRIP, t5;     \
+                                                       \
+       vaesenclast t7, x0, x0;                         \
+       vaesenclast t7, x4, x4;                         \
+       vaesenclast t7, x1, x1;                         \
+       vaesenclast t7, x5, x5;                         \
+       vaesdeclast t7, x2, x2;                         \
+       vaesdeclast t7, x6, x6;                         \
+                                                       \
+       /* AES inverse shift rows */                    \
+       vpshufb t0, x0, x0;                             \
+       vpshufb t0, x4, x4;                             \
+       vpshufb t0, x1, x1;                             \
+       vpshufb t0, x5, x5;                             \
+       vpshufb t1, x3, x3;                             \
+       vpshufb t1, x7, x7;                             \
+       vpshufb t1, x2, x2;                             \
+       vpshufb t1, x6, x6;                             \
+                                                       \
+       /* affine transformation for S2 */              \
+       filter_8bit(x1, t2, t3, t6, t0);                \
+       /* affine transformation for S2 */              \
+       filter_8bit(x5, t2, t3, t6, t0);                \
+                                                       \
+       /* affine transformation for X2 */              \
+       filter_8bit(x3, t4, t5, t6, t0);                \
+       /* affine transformation for X2 */              \
+       filter_8bit(x7, t4, t5, t6, t0);                \
+       vaesdeclast t7, x3, x3;                         \
+       vaesdeclast t7, x7, x7;
+
+#define aria_diff_m(x0, x1, x2, x3,                    \
+                   t0, t1, t2, t3)                     \
+       /* T = rotr32(X, 8); */                         \
+       /* X ^= T */                                    \
+       vpxor x0, x3, t0;                               \
+       vpxor x1, x0, t1;                               \
+       vpxor x2, x1, t2;                               \
+       vpxor x3, x2, t3;                               \
+       /* X = T ^ rotr(X, 16); */                      \
+       vpxor t2, x0, x0;                               \
+       vpxor x1, t3, t3;                               \
+       vpxor t0, x2, x2;                               \
+       vpxor t1, x3, x1;                               \
+       vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7)                  \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;                               \
+                                                       \
+       /* t2 ^= t3; */                                 \
+       vpxor y4, y0, y0;                               \
+       vpxor y5, y1, y1;                               \
+       vpxor y6, y2, y2;                               \
+       vpxor y7, y3, y3;                               \
+                                                       \
+       /* t0 ^= t1; */                                 \
+       vpxor x4, x0, x0;                               \
+       vpxor x5, x1, x1;                               \
+       vpxor x6, x2, x2;                               \
+       vpxor x7, x3, x3;                               \
+                                                       \
+       /* t3 ^= t1; */                                 \
+       vpxor x4, y4, y4;                               \
+       vpxor x5, y5, y5;                               \
+       vpxor x6, y6, y6;                               \
+       vpxor x7, y7, y7;                               \
+                                                       \
+       /* t2 ^= t0; */                                 \
+       vpxor x0, y0, y0;                               \
+       vpxor x1, y1, y1;                               \
+       vpxor x2, y2, y2;                               \
+       vpxor x3, y3, y3;                               \
+                                                       \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* ARIA final-round function (FF) for the byte-sliced 16-way state,
+ * AESNI variant.  Unlike the fe/fo round macros there is no diffusion
+ * layer here: each 8-register half gets aria_ark_8way() with `round`,
+ * the S-box layer, then aria_ark_8way() with `last_round`.  The upper
+ * half (round-key offset 8) is processed first and parked in mem_tmp;
+ * the lower half (offset 0) is then reloaded and processed the same
+ * way.  y7 is zeroed up front as an all-zero operand for
+ * aria_ark_8way() -- NOTE(review): that macro is defined above this
+ * chunk; confirm the zero-register role there.  */
+#define aria_ff(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, last_round);   \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, last_round);   \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+
+#ifdef CONFIG_AS_GFNI
+/* ARIA even-round function, GFNI variant: identical in structure to
+ * the AESNI aria_fe() above, but the substitution layer goes through
+ * aria_sbox_8way_gfni() (GF(2^8) affine instructions) instead of
+ * AESENCLAST-based S-boxes.  Per half: ARK, S-box, then the byte
+ * diffusion aria_diff_m(); after both halves, aria_diff_word() twice
+ * realises the word permutation described in the inline comment
+ * (T0->CDAB, T1->DCBA, T3->BADC).  Result is written back to
+ * mem_tmp offset 0.  */
+#define aria_fe_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* ARIA odd-round function, GFNI variant.  Same skeleton as
+ * aria_fe_gfni() but the S-box layer is applied in the direct
+ * x0..x7 register order (odd rounds use the other S-box ordering),
+ * and the final word permutation differs: T1->BADC, T2->CDAB,
+ * T3->DCBA, per the inline comment.  Result goes to mem_tmp
+ * offset 0 with the x-registers stored in swapped pair order.  */
+#define aria_fo_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* ARIA final-round function (FF), GFNI variant.  Mirrors aria_ff()
+ * exactly -- ARK(round), S-box, ARK(last_round) on each half, no
+ * diffusion layer -- but substitutes aria_sbox_8way_gfni() for the
+ * AESNI S-box macro.  */
+#define aria_ff_gfni(x0, x1, x2, x3,                   \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, last_round);   \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, last_round);   \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+
+#endif /* CONFIG_AS_GFNI */
+
+
+/* Read-only tables: shuffle masks for the byte-slicing transpose and
+ * AES ShiftRows isolation, CTR-mode masks/add constants, and the
+ * affine-transform constants for the AESNI and GFNI S-box paths.  */
+SECTION_RODATA
+.align 16
+
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+/* pshufb mask for the 16x16 byte-slicing transpose
+ * (NOTE(review): consumed by the pack/unpack macros above this chunk). */
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+       .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+       .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+       .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+       .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+       .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ *      1 1 0 0 0 0 0 1     x0     0
+ *      0 1 0 0 1 0 0 0     x1     0
+ *      1 1 0 0 1 1 1 1     x2     0
+ *      0 1 1 0 1 0 0 1     x3     1
+ *      0 1 0 0 1 1 0 0  *  x4  +  0
+ *      0 1 0 1 1 0 0 0     x5     0
+ *      0 0 0 0 0 1 0 1     x6     0
+ *      1 1 1 0 0 1 1 1     x7     1
+ */
+.Ltf_lo__inv_aff__and__s2:
+       .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+       .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ *      1 0 1 1 0 0 0 1     x0     0
+ *      0 1 1 1 1 0 1 1     x1     0
+ *      0 0 0 1 1 0 1 0     x2     1
+ *      0 1 0 0 0 1 0 0     x3     0
+ *      0 0 1 1 1 0 1 1  *  x4  +  0
+ *      0 1 0 0 1 0 0 0     x5     0
+ *      1 1 0 1 0 0 1 1     x6     0
+ *      0 1 0 0 1 0 1 0     x7     0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+       .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+       .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+/* Broadcast byte constants 3/2/1 (one per lane). */
+.Lthree_x16:
+       .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+.Ltwo_x16:
+       .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+.Lone_x16:
+       .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+/* Big-endian byte-add constants +1..+15 for the CTR fast path
+ * (vpaddb onto the 128-bit big-endian counter's low byte). */
+.Lbige_addb_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
+#ifdef CONFIG_AS_GFNI
+/* 8x8 bit matrices for vgf2p8affineqb-based S-box evaluation. */
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+                   BV8(1, 1, 0, 0, 0, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 0, 0, 1),
+                   BV8(1, 1, 1, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 0, 1, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+       .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(1, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(0, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 1, 1, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 1, 0),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 0),
+                   BV8(0, 1, 1, 0, 1, 0, 1, 1),
+                   BV8(1, 0, 1, 1, 1, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 1, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 1, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+#endif /* CONFIG_AS_GFNI */
+
+/* 4-bit mask */
+.align 4
+.L0f0f0f0f:
+       .long 0x0f0f0f0f
+
+.text
+
+/* Core 16-block ARIA transform (AESNI/AVX).  Expects the 16 blocks
+ * already loaded into %xmm0..%xmm15 and a 16*16-byte scratch area at
+ * %rsi; leaves the byte-sliced result written back via
+ * debyteslice_16x16b.  Round-key pointer advances by one 16-byte key
+ * per round consumed.  */
+.align 16
+ELF(.type __aria_aesni_avx_crypt_16way,@function;)
+__aria_aesni_avx_crypt_16way:
+       /* input:
+       *      %r9: rk
+       *      %rsi: dst
+       *      %rdx: src
+       *      %xmm0..%xmm15: 16 byte-sliced blocks
+       */
+       CFI_STARTPROC();
+
+       /* %rax / %r8: lower / upper 8*16-byte halves of the scratch area. */
+       movq %rsi, %rax;
+       leaq 8 * 16(%rax), %r8;
+
+       /* %r10d = rounds - 2: one round is done by the aria_fo() below,
+        * the loop covers (rounds - 2) rounds (two per iteration), and
+        * aria_ff() finishes with the last two round keys. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                     %xmm15, %rax, %r8);
+       aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+               %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+               %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_aesni:
+       /* One even + one odd round per iteration; note the swapped
+        * register pairs, matching the output order of the previous
+        * round macro. */
+       aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+               %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+               %xmm15, %rax, %r9, 0);
+       aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+               %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+               %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_aesni;
+
+       aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+               %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+               %xmm15, %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+                          %xmm9, %xmm13, %xmm0, %xmm5,
+                          %xmm10, %xmm14, %xmm3, %xmm6,
+                          %xmm11, %xmm15, %xmm2, %xmm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx_crypt_16way,.-__aria_aesni_avx_crypt_16way;)
+
+/* Public ECB entry point: encrypt/decrypt 1..16 blocks with the given
+ * round-key array (caller selects enc_key or dec_key, so the same
+ * routine serves both directions).  Returns the stack burn depth in
+ * %eax (STACK_DEPTH is defined earlier in this file).  */
+.align 16
+.globl _gcry_aria_aesni_avx_ecb_crypt_blk1_16
+ELF(.type _gcry_aria_aesni_avx_ecb_crypt_blk1_16,@function;)
+_gcry_aria_aesni_avx_ecb_crypt_blk1_16:
+       /* input:
+       *      %rdi: ctx, CTX
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: round keys
+       *      %r8: num blocks
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 16-byte-aligned 256-byte scratch area for the byte-slicing. */
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       cmpq $16, %r8;
+       jb .Lecb_less_than_16;
+
+       /* Full 16-block path. */
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx);
+
+       call __aria_aesni_avx_crypt_16way;
+
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %r11);
+
+.Lecb_end:
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall; /* clear register contents (key material hygiene) */
+       ret_spec_stop;
+
+.Lecb_less_than_16:
+       /* Partial path: save block count across the call, restore it
+        * into %rax for the partial write-out. */
+       pushq %r8;
+       inpack_1_15_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                       %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                       %xmm15, %rdx, %r8d);
+
+       call __aria_aesni_avx_crypt_16way;
+
+       popq %rax;
+       write_output_1_15(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6,
+                         %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13,
+                         %xmm14, %xmm15, %r11, %eax);
+
+       jmp .Lecb_end;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx_ecb_crypt_blk1_16,
+         .-_gcry_aria_aesni_avx_ecb_crypt_blk1_16;)
+
+/* Generate 16 consecutive big-endian 128-bit CTR counter blocks.
+ * Blocks 0..7 are written to the keystream buffer at %rcx and
+ * reloaded into %xmm0..%xmm7; blocks 8..15 stay in %xmm8..%xmm15.
+ * The IV at (%r8) is advanced by 16 on the slow path (fast path
+ * updates only the low byte).  */
+.align 16
+ELF(.type __aria_aesni_avx_ctr_gen_keystream_16way,@function;)
+__aria_aesni_avx_ctr_gen_keystream_16way:
+       /* input:
+       *      %rdi: ctx
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: keystream
+       *      %r8: iv (big endian, 128bit)
+       */
+       CFI_STARTPROC();
+
+       /* load IV */
+       vmovdqu (%r8), %xmm8;
+       /* Fast path: if adding 16 to the low counter byte cannot carry
+        * out of that byte (byte <= 0xf0), the 16 counters can be built
+        * with plain per-byte adds; the == 0xf0 boundary case still
+        * needs a carry sweep (.Lctr_byteadd_full_ctr_carry). */
+       cmpb $(0x100 - 16), 15(%r8);
+       jbe .Lctr_byteadd;
+
+       /* Slow path: full 128-bit big-endian increments. */
+       /* byteswap */
+       vmovdqa .Lbswap128_mask rRIP, %xmm1;
+       vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
+
+       vpcmpeqd %xmm0, %xmm0, %xmm0;
+       vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
+
+       /* construct IVs */
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm9;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm10;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm11;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm12;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm13;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm14;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm15;
+       vmovdqu %xmm8, (0 * 16)(%rcx);
+       vmovdqu %xmm9, (1 * 16)(%rcx);
+       vmovdqu %xmm10, (2 * 16)(%rcx);
+       vmovdqu %xmm11, (3 * 16)(%rcx);
+       vmovdqu %xmm12, (4 * 16)(%rcx);
+       vmovdqu %xmm13, (5 * 16)(%rcx);
+       vmovdqu %xmm14, (6 * 16)(%rcx);
+       vmovdqu %xmm15, (7 * 16)(%rcx);
+
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm8;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm9;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm10;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm11;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm12;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm13;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm14;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm15;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm4;
+       /* store updated IV (counter + 16) */
+       vmovdqu %xmm4, (%r8);
+
+       vmovdqu (0 * 16)(%rcx), %xmm0;
+       vmovdqu (1 * 16)(%rcx), %xmm1;
+       vmovdqu (2 * 16)(%rcx), %xmm2;
+       vmovdqu (3 * 16)(%rcx), %xmm3;
+       vmovdqu (4 * 16)(%rcx), %xmm4;
+       vmovdqu (5 * 16)(%rcx), %xmm5;
+       vmovdqu (6 * 16)(%rcx), %xmm6;
+       vmovdqu (7 * 16)(%rcx), %xmm7;
+
+       ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       /* Low byte wraps exactly: add 16 there, then ripple the carry
+        * through bytes 14..0 of the big-endian counter in memory. */
+       addb $16, 15(%r8);
+       pushq %rcx;
+       movl $14, %ecx;
+       1:
+         adcb $0, (%r8, %rcx);
+         jnc 2f;
+         loop 1b;
+       2:
+       popq %rcx;
+       jmp .Lctr_byteadd_xmm;
+.align 8
+.Lctr_byteadd:
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $16, 15(%r8);
+.Lctr_byteadd_xmm:
+       /* Build the 16 counters from the pre-increment IV in %xmm8 by
+        * per-byte adds of 1..15 (no cross-byte carry can occur here). */
+       vmovdqa %xmm8, %xmm0;
+       vpaddb .Lbige_addb_1 rRIP, %xmm8, %xmm1;
+       vpaddb .Lbige_addb_2 rRIP, %xmm8, %xmm2;
+       vpaddb .Lbige_addb_3 rRIP, %xmm8, %xmm3;
+       vpaddb .Lbige_addb_4 rRIP, %xmm8, %xmm4;
+       vpaddb .Lbige_addb_5 rRIP, %xmm8, %xmm5;
+       vpaddb .Lbige_addb_6 rRIP, %xmm8, %xmm6;
+       vpaddb .Lbige_addb_7 rRIP, %xmm8, %xmm7;
+       vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm8;
+       vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm9;
+       vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm10;
+       vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm11;
+       vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm12;
+       vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm13;
+       vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm14;
+       vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm15;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx_ctr_gen_keystream_16way,.-__aria_aesni_avx_ctr_gen_keystream_16way;)
+
+/* Public CTR entry point: generate 16 counter blocks on the stack,
+ * encrypt them with the enc_key schedule, then XOR the keystream with
+ * the 16 source blocks into dst.  IV at (%rcx) is advanced by 16.
+ * Returns the stack burn depth in %eax.  */
+.align 16
+.globl _gcry_aria_aesni_avx_ctr_crypt_blk16
+ELF(.type _gcry_aria_aesni_avx_ctr_crypt_blk16,@function;)
+_gcry_aria_aesni_avx_ctr_crypt_blk16:
+       /* input:
+       *      %rdi: ctx
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: iv (big endian, 128bit)
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 16-byte-aligned 256-byte keystream buffer on the stack. */
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx_ctr_gen_keystream_16way;
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9; /* CTR always encrypts the counters */
+
+       call __aria_aesni_avx_crypt_16way;
+
+       popq %rsi;
+       /* XOR encrypted counters with plaintext/ciphertext at %r11
+        * (register order matches __aria_aesni_avx_crypt_16way output). */
+       vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+       vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+       vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+       vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+       vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+       vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+       vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+       vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+       vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+       vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+       vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+       vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+       vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+       vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+       vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+       vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall; /* clear key/keystream material from registers */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx_ctr_crypt_blk16,.-_gcry_aria_aesni_avx_ctr_crypt_blk16;)
+
+#ifdef CONFIG_AS_GFNI
+/* Core 16-block ARIA transform, GFNI variant.  Same contract and
+ * round structure as __aria_aesni_avx_crypt_16way, but built from the
+ * aria_*_gfni round macros.  */
+.align 16
+ELF(.type __aria_gfni_avx_crypt_16way,@function;)
+__aria_gfni_avx_crypt_16way:
+       /* input:
+       *      %r9: rk
+       *      %rsi: dst
+       *      %rdx: src
+       *      %xmm0..%xmm15: 16 byte-sliced blocks
+       */
+       CFI_STARTPROC();
+
+       /* %rax / %r8: lower / upper 8*16-byte halves of the scratch area. */
+       movq %rsi, %rax;
+       leaq 8 * 16(%rax), %r8;
+
+       /* %r10d = rounds - 2; loop below does two rounds per iteration. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
+                     %xmm4, %xmm5, %xmm6, %xmm7,
+                     %xmm8, %xmm9, %xmm10, %xmm11,
+                     %xmm12, %xmm13, %xmm14,
+                     %xmm15, %rax, %r8);
+       aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
+                    %xmm12, %xmm13, %xmm14, %xmm15,
+                    %xmm0, %xmm1, %xmm2, %xmm3,
+                    %xmm4, %xmm5, %xmm6, %xmm7,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_gfni:
+       aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+                    %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11,
+                    %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rax, %r9, 0);
+       aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+                    %xmm12, %xmm13, %xmm14, %xmm15,
+                    %xmm0, %xmm1, %xmm2, %xmm3,
+                    %xmm4, %xmm5, %xmm6, %xmm7,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_gfni;
+
+       aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+                          %xmm9, %xmm13, %xmm0, %xmm5,
+                          %xmm10, %xmm14, %xmm3, %xmm6,
+                          %xmm11, %xmm15, %xmm2, %xmm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx_crypt_16way,.-__aria_gfni_avx_crypt_16way;)
+
+/* Public ECB entry point, GFNI variant.  Identical flow to
+ * _gcry_aria_aesni_avx_ecb_crypt_blk1_16 but dispatches into
+ * __aria_gfni_avx_crypt_16way.  Returns stack burn depth in %eax.  */
+.align 16
+.globl _gcry_aria_gfni_avx_ecb_crypt_blk1_16
+ELF(.type _gcry_aria_gfni_avx_ecb_crypt_blk1_16,@function;)
+_gcry_aria_gfni_avx_ecb_crypt_blk1_16:
+       /* input:
+       *      %rdi: ctx, CTX
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: round keys
+       *      %r8: num blocks
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 16-byte-aligned 256-byte scratch area for the byte-slicing. */
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       cmpq $16, %r8;
+       jb .Lecb_less_than_16_gfni;
+
+       /* Full 16-block path. */
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx);
+
+       call __aria_gfni_avx_crypt_16way;
+
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %r11);
+
+.Lecb_end_gfni:
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall; /* clear register contents (key material hygiene) */
+       ret_spec_stop;
+
+.Lecb_less_than_16_gfni:
+       /* Partial path: block count is saved across the call and popped
+        * into %rax for the partial write-out. */
+       pushq %r8;
+       inpack_1_15_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                       %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                       %xmm15, %rdx, %r8d);
+
+       call __aria_gfni_avx_crypt_16way;
+
+       popq %rax;
+       write_output_1_15(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6,
+                         %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13,
+                         %xmm14, %xmm15, %r11, %eax);
+
+       jmp .Lecb_end_gfni;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx_ecb_crypt_blk1_16,
+         .-_gcry_aria_gfni_avx_ecb_crypt_blk1_16;)
+
+/* Public CTR entry point, GFNI variant.  Identical flow to
+ * _gcry_aria_aesni_avx_ctr_crypt_blk16 (the keystream generator is
+ * shared with the AESNI path) but encrypts the counters via
+ * __aria_gfni_avx_crypt_16way.  Returns the stack burn depth in %eax.
+ * Fix vs. original: the call to the keystream generator was the only
+ * statement in the file without a terminating ';' -- harmless to GAS
+ * at end-of-line, but inconsistent and fragile should the sequence
+ * ever be folded into a macro; terminator added.  */
+.align 16
+.globl _gcry_aria_gfni_avx_ctr_crypt_blk16
+ELF(.type _gcry_aria_gfni_avx_ctr_crypt_blk16,@function;)
+_gcry_aria_gfni_avx_ctr_crypt_blk16:
+       /* input:
+       *      %rdi: ctx
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: iv (big endian, 128bit)
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 16-byte-aligned 256-byte keystream buffer on the stack. */
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx_ctr_gen_keystream_16way;
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9; /* CTR always encrypts the counters */
+
+       call __aria_gfni_avx_crypt_16way;
+
+       popq %rsi;
+       /* XOR encrypted counters with plaintext/ciphertext at %r11
+        * (register order matches __aria_gfni_avx_crypt_16way output). */
+       vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+       vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+       vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+       vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+       vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+       vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+       vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+       vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+       vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+       vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+       vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+       vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+       vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+       vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+       vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+       vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall; /* clear key/keystream material from registers */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx_ctr_crypt_blk16,.-_gcry_aria_gfni_avx_ctr_crypt_blk16;)
+#endif /* CONFIG_AS_GFNI */
+
+#endif /* ENABLE_AVX_SUPPORT && ENABLE_AESNI_SUPPORT */
+#endif /* __x86_64 */
diff --git a/cipher/aria-aesni-avx2-amd64.S b/cipher/aria-aesni-avx2-amd64.S
new file mode 100644 (file)
index 0000000..d33fa54
--- /dev/null
@@ -0,0 +1,1830 @@
+/* aria-aesni-avx2-amd64.S  -  AESNI/GFNI/AVX2 implementation of ARIA cipher
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/* Optional ISA extensions: when the toolchain/build enables them, this file
+ * additionally emits GFNI- and VAES-based S-box variants. */
+#ifdef ENABLE_GFNI_SUPPORT
+#  define CONFIG_AS_GFNI 1
+#endif
+#ifdef HAVE_GCC_INLINE_ASM_VAES_VPCLMUL
+#  define CONFIG_AS_VAES 1
+#endif
+
+/* struct ARIA_context: */
+/* Byte offsets into struct ARIA_context (enc_key[17][16], dec_key[17][16],
+ * rounds).  NOTE(review): these must stay in sync with the C-side struct
+ * layout in cipher/aria.c — confirm when either side changes. */
+#define ARIA_BLOCK_SIZE  16
+#define ARIA_MAX_RD_KEYS 17
+#define ARIA_CTX_enc_key 0
+#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+
+/* register macros */
+/* CTX: first function argument (SysV ABI) — pointer to ARIA_context. */
+#define CTX %rdi
+
+/* ymmN_x: name of the low 128-bit half of each ymm register; used via the
+ * `reg##_x` token-pasting in macros below to issue xmm-width instructions
+ * (e.g. 128-bit vaesenclast) on a ymm operand's lower lane. */
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/* helper macros */
+/* Stack bytes consumed by the bulk routines; presumably 2 saved qwords +
+ * 16 x 32-byte spill area + 31 bytes alignment slack.  Returned to callers
+ * as the stack burn depth — TODO confirm against the function epilogues. */
+#define STACK_DEPTH (2 * 8 + 16 * 32 + 31)
+
+/* BV8: pack eight bits a0..a7 (a0 = LSB) into one byte.  Used to build the
+ * 8x8 bit-matrix constants consumed by the GFNI vgf2p8affineqb paths. */
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)            \
+       ( (((a0) & 1) << 0) |                           \
+         (((a1) & 1) << 1) |                           \
+         (((a2) & 1) << 2) |                           \
+         (((a3) & 1) << 3) |                           \
+         (((a4) & 1) << 4) |                           \
+         (((a5) & 1) << 5) |                           \
+         (((a6) & 1) << 6) |                           \
+         (((a7) & 1) << 7) )
+
+/* BM8X8: pack eight matrix-row bytes l0..l7 into a 64-bit bit-matrix;
+ * row l0 lands in the most-significant byte, as vgf2p8affineqb expects. */
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)          \
+       ( ((l7) << (0 * 8)) |                           \
+         ((l6) << (1 * 8)) |                           \
+         ((l5) << (2 * 8)) |                           \
+         ((l4) << (3 * 8)) |                           \
+         ((l3) << (4 * 8)) |                           \
+         ((l2) << (5 * 8)) |                           \
+         ((l1) << (6 * 8)) |                           \
+         ((l0) << (7 * 8)) )
+
+/* asm macros */
+/* inc_le128: x += 1 treating x as a 128-bit little-endian integer.
+ * vpcmpeqq flags a low qword equal to -1 (i.e. about to carry); vpsubq of
+ * minus_one adds 1 to both qwords' view; the shifted mask then propagates
+ * the carry into the high qword. */
+#define inc_le128(x, minus_one, tmp)                   \
+       vpcmpeqq minus_one, x, tmp;                     \
+       vpsubq minus_one, x, x;                         \
+       vpslldq $8, tmp, tmp;                           \
+       vpsubq tmp, x, x;
+
+/* filter_8bit: per-byte affine transform via two 16-entry nibble lookup
+ * tables — low nibble through lo_t, high nibble through hi_t (vpshufb),
+ * results XORed.  mask4bit must hold 0x0f in every byte; tmp0 is clobbered. */
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)     \
+       vpand x, mask4bit, tmp0;                        \
+       vpandn x, mask4bit, x;                          \
+       vpsrld $4, x, x;                                \
+                                                       \
+       vpshufb tmp0, lo_t, tmp0;                       \
+       vpshufb x, hi_t, x;                             \
+       vpxor tmp0, x, x;
+
+/* transpose_4x4: transpose a 4x4 matrix of 32-bit elements held in
+ * x0..x3 (per 128-bit lane) using unpack operations; t1/t2 are scratch. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2)          \
+       vpunpckhdq x1, x0, t2;                          \
+       vpunpckldq x1, x0, x0;                          \
+                                                       \
+       vpunpckldq x3, x2, t1;                          \
+       vpunpckhdq x3, x2, x2;                          \
+                                                       \
+       vpunpckhqdq t1, x0, x1;                         \
+       vpunpcklqdq t1, x0, x0;                         \
+                                                       \
+       vpunpckhqdq x2, t2, x3;                         \
+       vpunpcklqdq x2, t2, x2;
+
+/* byteslice_16x16b: convert 16 loaded blocks (one per register a0..d3)
+ * into byte-sliced form, so that register k holds byte k of every block.
+ * Implemented as a 16x16 byte transpose built from transpose_4x4 steps plus
+ * a .Lshufb_16x16b byte shuffle; st0/st1 are memory scratch slots used to
+ * free registers mid-transpose.  Intricate spill/reload ordering — do not
+ * reorder statements. */
+#define byteslice_16x16b(a0, b0, c0, d0,               \
+                        a1, b1, c1, d1,                \
+                        a2, b2, c2, d2,                \
+                        a3, b3, c3, d3,                \
+                        st0, st1)                      \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti128 .Lshufb_16x16b rRIP, a0;         \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3);          \
+       transpose_4x4(a1, b1, c1, d1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(a2, b2, c2, d2, b0, b1);          \
+       transpose_4x4(a3, b3, c3, d3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+/* debyteslice_16x16b: inverse of byteslice_16x16b — turn byte-sliced
+ * registers back into one-block-per-register form.  Same transpose network
+ * but with the final transpose_4x4 register rotation (c,d,a,b) so the
+ * output ordering undoes the slicing.  st0/st1 are memory scratch. */
+#define debyteslice_16x16b(a0, b0, c0, d0,             \
+                          a1, b1, c1, d1,              \
+                          a2, b2, c2, d2,              \
+                          a3, b3, c3, d3,              \
+                          st0, st1)                    \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti128 .Lshufb_16x16b rRIP, a0;         \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(c0, d0, a0, b0, d2, d3);          \
+       transpose_4x4(c1, d1, a1, b1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(c2, d2, a2, b2, b0, b1);          \
+       transpose_4x4(c3, d3, a3, b3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers.
+ * NOTE(review): despite the upstream naming ("pre-whitening"), this macro
+ * only performs the 16 x 32-byte loads from rio into x0..x7/y0..y7; no key
+ * XOR happens here. */
+#define inpack16_pre(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    rio)                               \
+       vmovdqu (0 * 32)(rio), x0;                      \
+       vmovdqu (1 * 32)(rio), x1;                      \
+       vmovdqu (2 * 32)(rio), x2;                      \
+       vmovdqu (3 * 32)(rio), x3;                      \
+       vmovdqu (4 * 32)(rio), x4;                      \
+       vmovdqu (5 * 32)(rio), x5;                      \
+       vmovdqu (6 * 32)(rio), x6;                      \
+       vmovdqu (7 * 32)(rio), x7;                      \
+       vmovdqu (8 * 32)(rio), y0;                      \
+       vmovdqu (9 * 32)(rio), y1;                      \
+       vmovdqu (10 * 32)(rio), y2;                     \
+       vmovdqu (11 * 32)(rio), y3;                     \
+       vmovdqu (12 * 32)(rio), y4;                     \
+       vmovdqu (13 * 32)(rio), y5;                     \
+       vmovdqu (14 * 32)(rio), y6;                     \
+       vmovdqu (15 * 32)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory.
+ * Byte-slices the 16 registers (using mem_ab/mem_cd as the transpose's
+ * scratch slots), then spills the x half to mem_ab and the y half to
+ * mem_cd, 8 x 32 bytes each. */
+#define inpack16_post(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     y0, y1, y2, y3,                   \
+                     y4, y5, y6, y7,                   \
+                     mem_ab, mem_cd)                   \
+       byteslice_16x16b(x0, x1, x2, x3,                \
+                        x4, x5, x6, x7,                \
+                        y0, y1, y2, y3,                \
+                        y4, y5, y6, y7,                \
+                        (mem_ab), (mem_cd));           \
+                                                       \
+       vmovdqu x0, 0 * 32(mem_ab);                     \
+       vmovdqu x1, 1 * 32(mem_ab);                     \
+       vmovdqu x2, 2 * 32(mem_ab);                     \
+       vmovdqu x3, 3 * 32(mem_ab);                     \
+       vmovdqu x4, 4 * 32(mem_ab);                     \
+       vmovdqu x5, 5 * 32(mem_ab);                     \
+       vmovdqu x6, 6 * 32(mem_ab);                     \
+       vmovdqu x7, 7 * 32(mem_ab);                     \
+       vmovdqu y0, 0 * 32(mem_cd);                     \
+       vmovdqu y1, 1 * 32(mem_cd);                     \
+       vmovdqu y2, 2 * 32(mem_cd);                     \
+       vmovdqu y3, 3 * 32(mem_cd);                     \
+       vmovdqu y4, 4 * 32(mem_cd);                     \
+       vmovdqu y5, 5 * 32(mem_cd);                     \
+       vmovdqu y6, 6 * 32(mem_cd);                     \
+       vmovdqu y7, 7 * 32(mem_cd);
+
+/* write_output: store all 16 registers to 16 consecutive 32-byte slots at
+ * `mem`.  (The trailing `\` after the last store deliberately folds the
+ * following blank line into the macro — upstream formatting, harmless.) */
+#define write_output(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem)                               \
+       vmovdqu x0, 0 * 32(mem);                        \
+       vmovdqu x1, 1 * 32(mem);                        \
+       vmovdqu x2, 2 * 32(mem);                        \
+       vmovdqu x3, 3 * 32(mem);                        \
+       vmovdqu x4, 4 * 32(mem);                        \
+       vmovdqu x5, 5 * 32(mem);                        \
+       vmovdqu x6, 6 * 32(mem);                        \
+       vmovdqu x7, 7 * 32(mem);                        \
+       vmovdqu y0, 8 * 32(mem);                        \
+       vmovdqu y1, 9 * 32(mem);                        \
+       vmovdqu y2, 10 * 32(mem);                       \
+       vmovdqu y3, 11 * 32(mem);                       \
+       vmovdqu y4, 12 * 32(mem);                       \
+       vmovdqu y5, 13 * 32(mem);                       \
+       vmovdqu y6, 14 * 32(mem);                       \
+       vmovdqu y7, 15 * 32(mem);                       \
+
+/* aria_store_state_8way: spill one 8-register half of the bit-sliced state
+ * to mem_tmp starting at 32-byte slot `idx`. */
+#define aria_store_state_8way(x0, x1, x2, x3,          \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, idx)             \
+       vmovdqu x0, ((idx + 0) * 32)(mem_tmp);          \
+       vmovdqu x1, ((idx + 1) * 32)(mem_tmp);          \
+       vmovdqu x2, ((idx + 2) * 32)(mem_tmp);          \
+       vmovdqu x3, ((idx + 3) * 32)(mem_tmp);          \
+       vmovdqu x4, ((idx + 4) * 32)(mem_tmp);          \
+       vmovdqu x5, ((idx + 5) * 32)(mem_tmp);          \
+       vmovdqu x6, ((idx + 6) * 32)(mem_tmp);          \
+       vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
+
+/* aria_load_state_8way: reload one 8-register half of the bit-sliced state
+ * from mem_tmp starting at 32-byte slot `idx` (inverse of the store above). */
+#define aria_load_state_8way(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, idx)              \
+       vmovdqu ((idx + 0) * 32)(mem_tmp), x0;          \
+       vmovdqu ((idx + 1) * 32)(mem_tmp), x1;          \
+       vmovdqu ((idx + 2) * 32)(mem_tmp), x2;          \
+       vmovdqu ((idx + 3) * 32)(mem_tmp), x3;          \
+       vmovdqu ((idx + 4) * 32)(mem_tmp), x4;          \
+       vmovdqu ((idx + 5) * 32)(mem_tmp), x5;          \
+       vmovdqu ((idx + 6) * 32)(mem_tmp), x6;          \
+       vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
+
+/* aria_ark_8way: AddRoundKey on a byte-sliced half-state.  Each register
+ * x0..x7 holds one byte position of all blocks, so the matching round-key
+ * byte is broadcast (vpbroadcastb) and XORed in.  Key bytes are taken in
+ * reversed order within each 32-bit word (idx+3 -> x0 ... idx+0 -> x3,
+ * idx+7 -> x4 ... idx+4 -> x7), matching the byte-slice ordering produced
+ * by byteslice_16x16b.  rk points at the round-key array; `round` selects
+ * the 16-byte round key; t0 is scratch. */
+#define aria_ark_8way(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     t0, rk, idx, round)               \
+       /* AddRoundKey */                               \
+       vpbroadcastb ((round * 16) + idx + 3)(rk), t0;  \
+       vpxor t0, x0, x0;                               \
+       vpbroadcastb ((round * 16) + idx + 2)(rk), t0;  \
+       vpxor t0, x1, x1;                               \
+       vpbroadcastb ((round * 16) + idx + 1)(rk), t0;  \
+       vpxor t0, x2, x2;                               \
+       vpbroadcastb ((round * 16) + idx + 0)(rk), t0;  \
+       vpxor t0, x3, x3;                               \
+       vpbroadcastb ((round * 16) + idx + 7)(rk), t0;  \
+       vpxor t0, x4, x4;                               \
+       vpbroadcastb ((round * 16) + idx + 6)(rk), t0;  \
+       vpxor t0, x5, x5;                               \
+       vpbroadcastb ((round * 16) + idx + 5)(rk), t0;  \
+       vpxor t0, x6, x6;                               \
+       vpbroadcastb ((round * 16) + idx + 4)(rk), t0;  \
+       vpxor t0, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
+/* aria_sbox_8way_gfni: ARIA substitution layer on 8 byte-sliced registers
+ * using GFNI affine / affine-inverse instructions with the .Ltf_*_bitmatrix
+ * constants (defined elsewhere in this file).  x0/x4 get the .Ltf_aff
+ * transform, x1/x5 .Ltf_s2, x2/x6 .Ltf_inv then identity-matrix inversion,
+ * x3/x7 .Ltf_x2 then identity-matrix inversion — presumably the four ARIA
+ * S-boxes S1/S2 and their inverses; confirm against RFC 5794.  Only t0-t4
+ * are used as scratch; t5-t7 are accepted for signature parity with the
+ * other aria_sbox_8way variants. */
+#define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0;        \
+       vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1;       \
+       vpbroadcastq .Ltf_id_bitmatrix rRIP, t2;        \
+       vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3;       \
+       vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4;        \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7
+#endif /* CONFIG_AS_GFNI */
+
+#ifdef CONFIG_AS_VAES
+/* aria_sbox_8way_vaes: ARIA substitution layer using 256-bit VAES
+ * instructions.  t7 is zeroed and used as a null round key, so
+ * vaesenclast/vaesdeclast reduce to AES (Inv)SubBytes plus (Inv)ShiftRows;
+ * the .Linv_shift_row / .Lshift_row vpshufb shuffles then cancel the row
+ * shift.  The remaining affine corrections (S2 and X2 tables) are applied
+ * with nibble-LUT filter_8bit using the .Ltf_* tables.  t0-t7 are all
+ * clobbered as scratch. */
+#define aria_sbox_8way_vaes(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vpxor t7, t7, t7;                               \
+       vpxor t6, t6, t6;                               \
+       vbroadcasti128 .Linv_shift_row rRIP, t0;        \
+       vbroadcasti128 .Lshift_row rRIP, t1;            \
+       vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\
+       vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\
+       vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\
+       vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\
+                                                       \
+       vaesenclast t7, x0, x0;                         \
+       vaesenclast t7, x4, x4;                         \
+       vaesenclast t7, x1, x1;                         \
+       vaesenclast t7, x5, x5;                         \
+       vaesdeclast t7, x2, x2;                         \
+       vaesdeclast t7, x6, x6;                         \
+                                                       \
+       vpbroadcastd .L0f0f0f0f rRIP, t6;               \
+                                                       \
+       /* AES inverse shift rows */                    \
+       vpshufb t0, x0, x0;                             \
+       vpshufb t0, x4, x4;                             \
+       vpshufb t0, x1, x1;                             \
+       vpshufb t0, x5, x5;                             \
+       vpshufb t1, x3, x3;                             \
+       vpshufb t1, x7, x7;                             \
+       vpshufb t1, x2, x2;                             \
+       vpshufb t1, x6, x6;                             \
+                                                       \
+       /* affine transformation for S2 */              \
+       filter_8bit(x1, t2, t3, t6, t0);                \
+       /* affine transformation for S2 */              \
+       filter_8bit(x5, t2, t3, t6, t0);                \
+                                                       \
+       /* affine transformation for X2 */              \
+       filter_8bit(x3, t4, t5, t6, t0);                \
+       /* affine transformation for X2 */              \
+       filter_8bit(x7, t4, t5, t6, t0);                \
+                                                       \
+       vaesdeclast t7, x3, x3;                         \
+       vaesdeclast t7, x7, x7;
+#endif /* CONFIG_AS_VAES */
+
+/* aria_sbox_8way: AESNI fallback for the substitution layer.  Same scheme
+ * as aria_sbox_8way_vaes (zero round key in t7 so aesenclast/aesdeclast
+ * yield only (Inv)SubBytes + (Inv)ShiftRows, shuffles undo the row shift,
+ * filter_8bit applies the S2/X2 affine corrections) — but plain AESNI only
+ * operates on 128-bit xmm registers, so each 256-bit operand is processed
+ * as two lanes: vextracti128 the high lane into t6, run the xmm-width
+ * instruction on both halves (via the `##_x` low-half aliases), then
+ * vinserti128 the high lane back.  t0-t7 are clobbered.  Statement order
+ * is register-pressure critical — do not reorder. */
+#define aria_sbox_8way(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      t0, t1, t2, t3,                  \
+                      t4, t5, t6, t7)                  \
+       vpxor t7, t7, t7;                               \
+       vpxor t6, t6, t6;                               \
+       vbroadcasti128 .Linv_shift_row rRIP, t0;        \
+       vbroadcasti128 .Lshift_row rRIP, t1;            \
+       vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\
+       vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\
+       vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\
+       vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\
+                                                       \
+       vextracti128 $1, x0, t6##_x;                    \
+       vaesenclast t7##_x, x0##_x, x0##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x0, x0;                 \
+                                                       \
+       vextracti128 $1, x4, t6##_x;                    \
+       vaesenclast t7##_x, x4##_x, x4##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x4, x4;                 \
+                                                       \
+       vextracti128 $1, x1, t6##_x;                    \
+       vaesenclast t7##_x, x1##_x, x1##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x1, x1;                 \
+                                                       \
+       vextracti128 $1, x5, t6##_x;                    \
+       vaesenclast t7##_x, x5##_x, x5##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x5, x5;                 \
+                                                       \
+       vextracti128 $1, x2, t6##_x;                    \
+       vaesdeclast t7##_x, x2##_x, x2##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x2, x2;                 \
+                                                       \
+       vextracti128 $1, x6, t6##_x;                    \
+       vaesdeclast t7##_x, x6##_x, x6##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x6, x6;                 \
+                                                       \
+       vpbroadcastd .L0f0f0f0f rRIP, t6;               \
+                                                       \
+       /* AES inverse shift rows */                    \
+       vpshufb t0, x0, x0;                             \
+       vpshufb t0, x4, x4;                             \
+       vpshufb t0, x1, x1;                             \
+       vpshufb t0, x5, x5;                             \
+       vpshufb t1, x3, x3;                             \
+       vpshufb t1, x7, x7;                             \
+       vpshufb t1, x2, x2;                             \
+       vpshufb t1, x6, x6;                             \
+                                                       \
+       /* affine transformation for S2 */              \
+       filter_8bit(x1, t2, t3, t6, t0);                \
+       /* affine transformation for S2 */              \
+       filter_8bit(x5, t2, t3, t6, t0);                \
+                                                       \
+       /* affine transformation for X2 */              \
+       filter_8bit(x3, t4, t5, t6, t0);                \
+       /* affine transformation for X2 */              \
+       filter_8bit(x7, t4, t5, t6, t0);                \
+                                                       \
+       /* t6 was repurposed as the 0x0f mask above; re-zero it for the     \
+        * lane-split aesdeclast below. */              \
+       vpxor t6, t6, t6;                               \
+       vextracti128 $1, x3, t6##_x;                    \
+       vaesdeclast t7##_x, x3##_x, x3##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x3, x3;                 \
+                                                       \
+       vextracti128 $1, x7, t6##_x;                    \
+       vaesdeclast t7##_x, x7##_x, x7##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x7, x7;
+
+/* aria_diff_m: byte-sliced equivalent of ARIA's per-word diffusion
+ * X ^= rotr32(X, 8) ^ rotr32(X, 16), expressed as XOR combinations of the
+ * four byte-slices x0..x3 (rotations become slice permutations).  t0-t3
+ * are scratch; results land back in x0..x3. */
+#define aria_diff_m(x0, x1, x2, x3,                    \
+                   t0, t1, t2, t3)                     \
+       /* T = rotr32(X, 8); */                         \
+       /* X ^= T */                                    \
+       vpxor x0, x3, t0;                               \
+       vpxor x1, x0, t1;                               \
+       vpxor x2, x1, t2;                               \
+       vpxor x3, x2, t3;                               \
+       /* X = T ^ rotr(X, 16); */                      \
+       vpxor t2, x0, x0;                               \
+       vpxor x1, t3, t3;                               \
+       vpxor t0, x2, x2;                               \
+       vpxor t1, x3, x1;                               \
+       vmovdqu t3, x3;
+
+/* aria_diff_word: XOR-mixing across the four 4-slice quadrants
+ * (t0 = x0..x3, t1 = x4..x7, t2 = y0..y3, t3 = y4..y7) implementing the
+ * word-level part of ARIA's diffusion layer; the sequence of quadrant
+ * XORs (t1^=t2; t2^=t3; t0^=t1; t3^=t1; t2^=t0; t1^=t2) is encoded in the
+ * inline comments below.  All 16 registers are inputs and outputs. */
+#define aria_diff_word(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7)                  \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;                               \
+                                                       \
+       /* t2 ^= t3; */                                 \
+       vpxor y4, y0, y0;                               \
+       vpxor y5, y1, y1;                               \
+       vpxor y6, y2, y2;                               \
+       vpxor y7, y3, y3;                               \
+                                                       \
+       /* t0 ^= t1; */                                 \
+       vpxor x4, x0, x0;                               \
+       vpxor x5, x1, x1;                               \
+       vpxor x6, x2, x2;                               \
+       vpxor x7, x3, x3;                               \
+                                                       \
+       /* t3 ^= t1; */                                 \
+       vpxor x4, y4, y4;                               \
+       vpxor x5, y5, y5;                               \
+       vpxor x6, y6, y6;                               \
+       vpxor x7, y7, y7;                               \
+                                                       \
+       /* t2 ^= t0; */                                 \
+       vpxor x0, y0, y0;                               \
+       vpxor x1, y1, y1;                               \
+       vpxor x2, y2, y2;                               \
+       vpxor x3, y3, y3;                               \
+                                                       \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;
+
+/* aria_fe: one even-position ARIA round (FE) on the full 16-slice state.
+ * The state is processed as two 8-register halves, the inactive half
+ * parked in mem_tmp at slot offsets 8 and 0.  Per half: AddRoundKey, then
+ * the S-box layer — note the swapped register order (x2,x3,x0,x1,...)
+ * versus aria_fo, presumably selecting substitution-layer type 2 (confirm
+ * against RFC 5794) — then aria_diff_m.  Finally both halves are recombined
+ * with two aria_diff_word applications whose argument permutations encode
+ * the byte-diffusion mapping documented in the inline comment, and the
+ * result is stored back to mem_tmp slot 0 in permuted order. */
+#define aria_fe(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* ARIA round function FO (even-position variant of this pair of macros):
+ * add round key, apply the s-box layer, then the byte/word diffusion,
+ * over both byte-sliced 8-register half-states parked at mem_tmp+8 and
+ * mem_tmp+0.  The merged, diffused result is stored back at mem_tmp+0
+ * with the register reordering shown in the aria_diff_byte() comment. */
+#define aria_fo(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1        \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* ARIA final round FF: round-key addition, s-box layer (note the rotated
+ * register order fed to aria_sbox_8way), then addition of the last round
+ * key.  Unlike aria_fe/aria_fo there is no diffusion step — this matches
+ * ARIA's last round.  Both half-states (mem_tmp+8 then mem_tmp+0) are
+ * processed; on exit x* hold the low half and y* the high half. */
+#define aria_ff(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, last_round);           \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, last_round);           \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+
+#ifdef CONFIG_AS_GFNI
+/* GFNI variant of the aria_fe round macro: identical structure, but the
+ * s-box layer is computed with aria_sbox_8way_gfni (GF2P8AFFINE-based)
+ * instead of the AES-NI based s-box. */
+#define aria_fe_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* GFNI variant of the aria_fo round macro: same structure as aria_fo,
+ * with the s-box layer computed by aria_sbox_8way_gfni. */
+#define aria_fo_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1        \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* GFNI variant of the aria_ff final-round macro: ark + GFNI s-box layer
+ * + last-round-key add, for both half-states; no diffusion step. */
+#define aria_ff_gfni(x0, x1, x2, x3,                   \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, last_round);           \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, last_round);           \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+#endif /* CONFIG_AS_GFNI */
+
+#ifdef CONFIG_AS_VAES
+/* VAES variant of the aria_fe round macro: identical structure, with the
+ * s-box layer computed by aria_sbox_8way_vaes (256-bit AES instructions). */
+#define aria_fe_vaes(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* VAES variant of the aria_fo round macro: same structure as aria_fo,
+ * with the s-box layer computed by aria_sbox_8way_vaes. */
+#define aria_fo_vaes(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \
+                           x7, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \
+                           x7, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1        \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/* VAES variant of the aria_ff final-round macro: ark + VAES s-box layer
+ * + last-round-key add, for both half-states; no diffusion step. */
+#define aria_ff_vaes(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round, last_round)    \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, last_round);           \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);    \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, last_round);           \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+#endif /* CONFIG_AS_VAES */
+
+SECTION_RODATA
+.align 32
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+/* vpshufb mask gathering every 4th byte of a 16-byte lane
+ * (0,4,8,12, 1,5,9,13, ...).  NOTE(review): presumably consumed by the
+ * 16x16b byteslice/debyteslice transpose macros defined earlier in this
+ * file — confirm. */
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+/* CTR-mode fast path: constants added byte-wise (vpaddb) to the last byte
+ * of each big-endian 128-bit counter lane, generating counters +0..+15.
+ * Only valid when the low counter byte cannot wrap (checked by caller). */
+.align 32
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+/* Adds +16 to both lanes: advances an already-built counter pair. */
+.Lbige_addb_16_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
+.align 16
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+       .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+       .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+/* AES ShiftRows permutation (forward). */
+.Lshift_row:
+       .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+       .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+       .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ *      1 1 0 0 0 0 0 1     x0     0
+ *      0 1 0 0 1 0 0 0     x1     0
+ *      1 1 0 0 1 1 1 1     x2     0
+ *      0 1 1 0 1 0 0 1     x3     1
+ *      0 1 0 0 1 1 0 0  *  x4  +  0
+ *      0 1 0 1 1 0 0 0     x5     0
+ *      0 0 0 0 0 1 0 1     x6     0
+ *      1 1 1 0 0 1 1 1     x7     1
+ */
+.Ltf_lo__inv_aff__and__s2:
+       .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+       .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ *      1 0 1 1 0 0 0 1     x0     0
+ *      0 1 1 1 1 0 1 1     x1     0
+ *      0 0 0 1 1 0 1 0     x2     1
+ *      0 1 0 0 0 1 0 0     x3     0
+ *      0 0 1 1 1 0 1 1  *  x4  +  0
+ *      0 1 0 0 1 0 0 0     x5     0
+ *      1 1 0 1 0 0 1 1     x6     0
+ *      0 1 0 0 1 0 1 0     x7     0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+       .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+       .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+/* GF2P8AFFINEQB bit-matrices for the GFNI-based s-box layer. */
+#ifdef CONFIG_AS_GFNI
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+                   BV8(1, 1, 0, 0, 0, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 0, 0, 1),
+                   BV8(1, 1, 1, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 0, 1, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+       .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(1, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(0, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 1, 1, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 1, 0),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 0),
+                   BV8(0, 1, 1, 0, 1, 0, 1, 1),
+                   BV8(1, 0, 1, 1, 1, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 1, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 1, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#endif /* CONFIG_AS_GFNI */
+
+/* 4-bit mask */
+.align 4
+.L0f0f0f0f:
+       .long 0x0f0f0f0f
+
+.text
+
+.align 16
+ELF(.type __aria_aesni_avx2_crypt_32way,@function;)
+/* Core 32-block ARIA transform: runs the full round schedule over the
+ * byte-sliced state in %ymm0..%ymm15.  Shared by the ECB/CTR entry
+ * points; expects round keys in %r9 and scratch space via %rsi. */
+__aria_aesni_avx2_crypt_32way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %ymm0..%ymm15: byte-sliced blocks
+        */
+       CFI_STARTPROC();
+
+       /* %rax/%r8: lower/upper 8*32-byte halves of the work area.
+        * NOTE(review): %rsi is used as scratch here (the ECB entry point
+        * passes stack space), despite the "dst" label above — confirm. */
+       movq %rsi, %rax;
+       leaq 8 * 32(%rax), %r8;
+
+       /* Middle-round count: total rounds minus the first aria_fo and the
+        * final aria_fe+aria_ff handled outside the loop. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                     %ymm15, %rax, %r8);
+       aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+               %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+               %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+/* Two rounds (fe + fo) per iteration; %r9 advances 2*16 bytes of rk. */
+.align 16
+.Loop_aesni:
+       aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+               %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+               %ymm15, %rax, %r9, 0);
+       aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+               %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+               %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_aesni;
+
+       /* Final round pair: aria_ff applies round key, s-box and the last
+        * round key (no diffusion). */
+       aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+               %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+               %ymm15, %rax, %r9, 0, 1);
+
+       /* Undo the byte-slicing so registers hold whole blocks again. */
+       debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+                          %ymm9, %ymm13, %ymm0, %ymm5,
+                          %ymm10, %ymm14, %ymm3, %ymm6,
+                          %ymm11, %ymm15, %ymm2, %ymm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx2_crypt_32way,.-__aria_aesni_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_aesni_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_aesni_avx2_ecb_crypt_blk32,@function;)
+/* ECB encrypt/decrypt of 32 blocks (direction is determined solely by
+ * which round-key schedule is passed in %rcx). */
+_gcry_aria_aesni_avx2_ecb_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* Reserve a 16*32-byte scratch area, 32-byte aligned for ymm
+        * stores/loads by the byte-slicing helpers. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx);
+
+       call __aria_aesni_avx2_crypt_32way;
+
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %r11);
+
+       /* Return the stack depth consumed, presumably so the caller can
+        * burn it (NOTE(review): STACK_DEPTH is defined earlier in this
+        * file — confirm it covers the 16*32 temp area). */
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Clear all ymm registers to avoid leaking key-dependent state. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx2_ecb_crypt_blk32,
+         .-_gcry_aria_aesni_avx2_ecb_crypt_blk32;)
+
+.align 16
+ELF(.type __aria_aesni_avx2_ctr_gen_keystream_32way,@function;)
+__aria_aesni_avx2_ctr_gen_keystream_32way:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: keystream
+        *      %r8: iv (big endian, 128bit)
+        *
+        * Note: %rsi/%rdx are never touched here; they are passed
+        * through unchanged for the caller.
+        *
+        * output:
+        *      %ymm0..%ymm15: the 32 big-endian counter blocks
+        *                     (+0..+31), two 128-bit blocks per ymm
+        *      (%r8):  iv advanced by 32
+        *      (%rcx): scratch; written only on the slow paths below
+        */
+       CFI_STARTPROC();
+
+       /* Fast byte-add path: if the last IV byte is <= 0xe0, adding 32
+        * cannot step past 0x100 within this call; the equal case
+        * (== 0xe0) still produces one carry out of byte 15 and is
+        * dispatched to the full carry-propagation handler below. */
+       cmpb $(0x100 - 32), 15(%r8);
+       jbe .Lctr_byteadd;
+
+       /* %r11 = low 64 bits of the big-endian counter, byte-swapped to
+        * a native little-endian integer for the overflow check below. */
+       movq 8(%r8), %r11;
+       bswapq %r11;
+
+       /* %ymm0 = per-128-bit-lane constant -1 (low qword), 0 (high);
+        * %ymm5 = the same doubled, i.e. -2, so one vpsubq advances the
+        * little-endian counter pair by +2. */
+       vbroadcasti128 .Lbswap128_mask rRIP, %ymm6;
+       vpcmpeqd %ymm0, %ymm0, %ymm0;
+       vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
+       vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
+
+       /* load IV and byteswap */
+       vmovdqu (%r8), %xmm7;
+       vpshufb %xmm6, %xmm7, %xmm7;
+       vmovdqa %xmm7, %xmm3;
+       /* inc_le128 (defined earlier in this file) increments a
+        * little-endian 128-bit value by one. */
+       inc_le128(%xmm7, %xmm0, %xmm4);
+       vinserti128 $1, %xmm7, %ymm3, %ymm3;
+       vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 32), %r11;
+       ja .Lhandle_ctr_carry;
+
+       /* construct IVs */
+       vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
+       vpshufb %ymm6, %ymm3, %ymm9;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
+       vpshufb %ymm6, %ymm3, %ymm10;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
+       vpshufb %ymm6, %ymm3, %ymm11;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
+       vpshufb %ymm6, %ymm3, %ymm12;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
+       vpshufb %ymm6, %ymm3, %ymm13;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
+       vpshufb %ymm6, %ymm3, %ymm14;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
+       vpshufb %ymm6, %ymm3, %ymm15;
+       /* Spill the first 16 counter blocks so %ymm8..%ymm15 can be
+        * reused for the second half. */
+       vmovdqu %ymm8, (0 * 32)(%rcx);
+       vmovdqu %ymm9, (1 * 32)(%rcx);
+       vmovdqu %ymm10, (2 * 32)(%rcx);
+       vmovdqu %ymm11, (3 * 32)(%rcx);
+       vmovdqu %ymm12, (4 * 32)(%rcx);
+       vmovdqu %ymm13, (5 * 32)(%rcx);
+       vmovdqu %ymm14, (6 * 32)(%rcx);
+       vmovdqu %ymm15, (7 * 32)(%rcx);
+
+       vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
+       vpshufb %ymm6, %ymm3, %ymm8;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
+       vpshufb %ymm6, %ymm3, %ymm9;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
+       vpshufb %ymm6, %ymm3, %ymm10;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
+       vpshufb %ymm6, %ymm3, %ymm11;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
+       vpshufb %ymm6, %ymm3, %ymm12;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
+       vpshufb %ymm6, %ymm3, %ymm13;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
+       vpshufb %ymm6, %ymm3, %ymm14;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
+       vpshufb %ymm6, %ymm3, %ymm15;
+       /* Store the post-increment IV (+32) back for the next call. */
+       vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
+       vpshufb %xmm6, %xmm3, %xmm3;
+       vmovdqu %xmm3, (%r8);
+       /* Reload counters +0..+15 into %ymm0..%ymm7; +16..+31 are
+        * already live in %ymm8..%ymm15. */
+       vmovdqu (0 * 32)(%rcx), %ymm0;
+       vmovdqu (1 * 32)(%rcx), %ymm1;
+       vmovdqu (2 * 32)(%rcx), %ymm2;
+       vmovdqu (3 * 32)(%rcx), %ymm3;
+       vmovdqu (4 * 32)(%rcx), %ymm4;
+       vmovdqu (5 * 32)(%rcx), %ymm5;
+       vmovdqu (6 * 32)(%rcx), %ymm6;
+       vmovdqu (7 * 32)(%rcx), %ymm7;
+       jmp .Lctr_carry_done;
+
+       /* Slow path: the low 64-bit half may overflow within the next
+        * 32 increments, so step one counter at a time with full
+        * 128-bit carry handling. */
+       .Lhandle_ctr_carry:
+       /* construct IVs */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
+       /* Spill the first 16 counter blocks, as in the fast path. */
+       vmovdqu %ymm8, (0 * 32)(%rcx);
+       vmovdqu %ymm9, (1 * 32)(%rcx);
+       vmovdqu %ymm10, (2 * 32)(%rcx);
+       vmovdqu %ymm11, (3 * 32)(%rcx);
+       vmovdqu %ymm12, (4 * 32)(%rcx);
+       vmovdqu %ymm13, (5 * 32)(%rcx);
+       vmovdqu %ymm14, (6 * 32)(%rcx);
+       vmovdqu %ymm15, (7 * 32)(%rcx);
+
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
+       /* Write back the post-increment IV (+32). */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vextracti128 $1, %ymm3, %xmm3;
+       vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
+       vmovdqu %xmm3, (%r8);
+       /* Reload counters +0..+15 into %ymm0..%ymm7. */
+       vmovdqu (0 * 32)(%rcx), %ymm0;
+       vmovdqu (1 * 32)(%rcx), %ymm1;
+       vmovdqu (2 * 32)(%rcx), %ymm2;
+       vmovdqu (3 * 32)(%rcx), %ymm3;
+       vmovdqu (4 * 32)(%rcx), %ymm4;
+       vmovdqu (5 * 32)(%rcx), %ymm5;
+       vmovdqu (6 * 32)(%rcx), %ymm6;
+       vmovdqu (7 * 32)(%rcx), %ymm7;
+
+.Lctr_carry_done:
+       ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       /* Last byte is exactly 0xe0: addb sets CF, which the adcb loop
+        * then ripples through IV bytes 14 down to 1 (the `loop`
+        * instruction decrements %rcx without touching flags; adcb
+        * consumes/produces CF; jnc exits once the carry dies out). */
+       addb $32, 15(%r8);
+       pushq %rcx;
+       movl $14, %ecx;
+       1:
+         adcb $0, (%r8, %rcx);
+         jnc 2f;
+         loop 1b;
+       2:
+       popq %rcx;
+       jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+       /* Broadcast the *pre-increment* IV; counters are then built by
+        * plain big-endian byte adds (no cross-byte carries possible
+        * below 0xe0).  Flags are still those of the cmpb at entry. */
+       vbroadcasti128 (%r8), %ymm8;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $32, 15(%r8);
+.Lctr_byteadd_ymm:
+       /* %ymm8 = (+0;+0); %ymm15 = (+16;+16); each .Lbige_addb_N_M adds
+        * N to the low 128-bit lane and M to the high lane. */
+       vpaddb .Lbige_addb_16_16 rRIP, %ymm8, %ymm15;
+       vpaddb .Lbige_addb_0_1 rRIP, %ymm8, %ymm0;
+       vpaddb .Lbige_addb_2_3 rRIP, %ymm8, %ymm1;
+       vpaddb .Lbige_addb_4_5 rRIP, %ymm8, %ymm2;
+       vpaddb .Lbige_addb_6_7 rRIP, %ymm8, %ymm3;
+       vpaddb .Lbige_addb_8_9 rRIP, %ymm8, %ymm4;
+       vpaddb .Lbige_addb_10_11 rRIP, %ymm8, %ymm5;
+       vpaddb .Lbige_addb_12_13 rRIP, %ymm8, %ymm6;
+       vpaddb .Lbige_addb_14_15 rRIP, %ymm8, %ymm7;
+       vpaddb .Lbige_addb_0_1 rRIP, %ymm15, %ymm8;
+       vpaddb .Lbige_addb_2_3 rRIP, %ymm15, %ymm9;
+       vpaddb .Lbige_addb_4_5 rRIP, %ymm15, %ymm10;
+       vpaddb .Lbige_addb_6_7 rRIP, %ymm15, %ymm11;
+       vpaddb .Lbige_addb_8_9 rRIP, %ymm15, %ymm12;
+       vpaddb .Lbige_addb_10_11 rRIP, %ymm15, %ymm13;
+       vpaddb .Lbige_addb_12_13 rRIP, %ymm15, %ymm14;
+       vpaddb .Lbige_addb_14_15 rRIP, %ymm15, %ymm15;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx2_ctr_gen_keystream_32way,
+         .-__aria_aesni_avx2_ctr_gen_keystream_32way;)
+
<br>
+.align 16
+.globl _gcry_aria_aesni_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_aesni_avx2_ctr_crypt_blk32,@function;)
+_gcry_aria_aesni_avx2_ctr_crypt_blk32:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        *
+        * CTR en/decrypt 32 blocks (512 bytes): generate 32 keystream
+        * counters, encrypt them with the ARIA encryption key schedule,
+        * xor with src and write to dst.  Returns %eax = STACK_DEPTH
+        * (NOTE(review): presumably the stack burn depth per libgcrypt
+        * convention -- confirm against the C glue in aria.c).
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 32-byte-aligned 16*32-byte stack buffer for the keystream. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+       /* Counters are now in %ymm0..%ymm15; encrypt them in place,
+        * using the stack buffer as the core's byte-slice scratch. */
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_aesni_avx2_crypt_32way;
+
+       popq %rsi;
+       /* Xor src into the encrypted keystream; register order matches
+        * the write_output order produced by the 32-way core
+        * (1,0,3,2,4..15). */
+       vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+       vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+       vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+       vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+       vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+       vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+       vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+       vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+       vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+       vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+       vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+       vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+       vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+       vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+       vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+       vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Wipe vector registers: they held keystream material. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32,
+         .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;)
+
+#ifdef CONFIG_AS_VAES
+.align 16
+ELF(.type __aria_vaes_avx2_crypt_32way,@function;)
+__aria_vaes_avx2_crypt_32way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %ymm0..%ymm15: byte-sliced blocks
+        *
+        * ARIA 32-block core using VAES for the S-layer.  %rsi doubles
+        * as a 16*32-byte byte-slice scratch area (first half at %rax,
+        * second half at %r8).  Rounds are taken from the context; the
+        * round-key pointer %r9 advances 16 bytes per round.  Results
+        * are left in %ymm0..%ymm15; the caller stores them.
+        */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 32(%rax), %r8;
+
+       /* Loop count: (rounds - 2) / 2 iterations of fe+fo; the first
+        * fo and the final ff are peeled outside the loop. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                     %ymm15, %rax, %r8);
+       aria_fo_vaes(%ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_vaes:
+       /* Two ARIA rounds (even/odd) per iteration. */
+       aria_fe_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %rax, %r9, 0);
+       aria_fo_vaes(%ymm9, %ymm8, %ymm11, %ymm10,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_vaes;
+
+       /* Final round with output whitening. */
+       aria_ff_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+                          %ymm9, %ymm13, %ymm0, %ymm5,
+                          %ymm10, %ymm14, %ymm3, %ymm6,
+                          %ymm11, %ymm15, %ymm2, %ymm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_vaes_avx2_crypt_32way,.-__aria_vaes_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_vaes_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_vaes_avx2_ecb_crypt_blk32,@function;)
+_gcry_aria_vaes_avx2_ecb_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        *
+        * ECB en/decrypt 32 blocks with the VAES core.  The round-key
+        * pointer selects enc or dec key schedule.  Returns %eax =
+        * STACK_DEPTH (stack bytes to burn).
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 32-byte-aligned 16*32-byte scratch for byte-slicing. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx);
+
+       call __aria_vaes_avx2_crypt_32way;
+
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %r11);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Wipe vector registers holding plaintext/ciphertext state. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_vaes_avx2_ecb_crypt_blk32,
+         .-_gcry_aria_vaes_avx2_ecb_crypt_blk32;)
+
+.align 16
+.globl _gcry_aria_vaes_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_vaes_avx2_ctr_crypt_blk32,@function;)
+_gcry_aria_vaes_avx2_ctr_crypt_blk32:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        *
+        * Same flow as _gcry_aria_aesni_avx2_ctr_crypt_blk32, but the
+        * keystream is encrypted with the VAES 32-way core.  Returns
+        * %eax = STACK_DEPTH (stack bytes to burn).
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 32-byte-aligned 16*32-byte stack buffer for the keystream. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+       /* Counters now in %ymm0..%ymm15; encrypt them in place. */
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_vaes_avx2_crypt_32way;
+
+       popq %rsi;
+       /* Xor src into keystream; register order matches write_output
+        * order of the 32-way core (1,0,3,2,4..15). */
+       vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+       vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+       vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+       vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+       vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+       vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+       vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+       vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+       vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+       vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+       vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+       vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+       vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+       vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+       vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+       vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Wipe vector registers: they held keystream material. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_vaes_avx2_ctr_crypt_blk32,
+         .-_gcry_aria_vaes_avx2_ctr_crypt_blk32;)
+#endif /* CONFIG_AS_VAES */
+
+#ifdef CONFIG_AS_GFNI
+.align 16
+ELF(.type __aria_gfni_avx2_crypt_32way,@function;)
+__aria_gfni_avx2_crypt_32way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %ymm0..%ymm15: byte-sliced blocks
+        *
+        * ARIA 32-block core using GFNI for the S-layer.  Same register
+        * and scratch-memory protocol as the AESNI/VAES cores: %rsi is
+        * a 16*32-byte scratch (halves at %rax and %r8), %r9 advances
+        * 16 bytes per round, results are left in %ymm0..%ymm15.
+        */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 32(%rax), %r8;
+
+       /* Loop count: (rounds - 2) / 2 iterations of fe+fo; first fo
+        * and final ff are peeled. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
+                     %ymm4, %ymm5, %ymm6, %ymm7,
+                     %ymm8, %ymm9, %ymm10, %ymm11,
+                     %ymm12, %ymm13, %ymm14,
+                     %ymm15, %rax, %r8);
+       aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_gfni:
+       /* Two ARIA rounds (even/odd) per iteration. */
+       aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rax, %r9, 0);
+       aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_gfni;
+
+       /* Final round with output whitening. */
+       aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+                          %ymm9, %ymm13, %ymm0, %ymm5,
+                          %ymm10, %ymm14, %ymm3, %ymm6,
+                          %ymm11, %ymm15, %ymm2, %ymm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx2_crypt_32way,.-__aria_gfni_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_gfni_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_gfni_avx2_ecb_crypt_blk32,@function;)
+_gcry_aria_gfni_avx2_ecb_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        *
+        * ECB en/decrypt 32 blocks with the GFNI core.  The round-key
+        * pointer selects enc or dec key schedule.  Returns %eax =
+        * STACK_DEPTH (stack bytes to burn).
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 32-byte-aligned 16*32-byte scratch for byte-slicing. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx);
+
+       call __aria_gfni_avx2_crypt_32way;
+
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %r11);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Wipe vector registers holding plaintext/ciphertext state. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx2_ecb_crypt_blk32,
+         .-_gcry_aria_gfni_avx2_ecb_crypt_blk32;)
+
+.align 16
+.globl _gcry_aria_gfni_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_gfni_avx2_ctr_crypt_blk32,@function;)
+_gcry_aria_gfni_avx2_ctr_crypt_blk32:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        *
+        * Same flow as _gcry_aria_aesni_avx2_ctr_crypt_blk32, but the
+        * keystream is encrypted with the GFNI 32-way core.  Returns
+        * %eax = STACK_DEPTH (stack bytes to burn).
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 32-byte-aligned 16*32-byte stack buffer for the keystream. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+       /* Counters now in %ymm0..%ymm15; encrypt them in place. */
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_gfni_avx2_crypt_32way;
+
+       popq %rsi;
+       /* Xor src into keystream; register order matches write_output
+        * order of the 32-way core (1,0,3,2,4..15). */
+       vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+       vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+       vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+       vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+       vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+       vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+       vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+       vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+       vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+       vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+       vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+       vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+       vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+       vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+       vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+       vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Wipe vector registers: they held keystream material. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx2_ctr_crypt_blk32,
+         .-_gcry_aria_gfni_avx2_ctr_crypt_blk32;)
+#endif /* CONFIG_AS_GFNI */
+
+#endif /* ENABLE_AVX2_SUPPORT && ENABLE_AESNI_SUPPORT */
+#endif /* __x86_64 */
diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S
new file mode 100644 (file)
index 0000000..0eaa2de
--- /dev/null
@@ -0,0 +1,1010 @@
+/* aria-gfni-avx512-amd64.S  -  GFNI/AVX512 implementation of ARIA cipher
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX512_SUPPORT) && defined(ENABLE_GFNI_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/* struct ARIA_context: */
+/* Byte offsets into the C-side context.  NOTE(review): these must stay
+ * in sync with struct ARIA_context in cipher/aria.c -- confirm. */
+#define ARIA_BLOCK_SIZE  16
+#define ARIA_MAX_RD_KEYS 17
+#define ARIA_CTX_enc_key 0
+#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+
+/* register macros */
+#define CTX %rdi
+
+/* helper macros */
+/* Stack bytes reported to callers for burning: saved %rbp plus one push
+ * slot (2 * 8), a 16 * 64-byte block scratch area, and up to 63 bytes
+ * lost to 64-byte alignment. */
+#define STACK_DEPTH (2 * 8 + 16 * 64 + 63)
+
+/* Assemble a byte constant from eight individual bit values
+ * (a0 = bit 0 ... a7 = bit 7). */
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)            \
+       ( (((a0) & 1) << 0) |                           \
+         (((a1) & 1) << 1) |                           \
+         (((a2) & 1) << 2) |                           \
+         (((a3) & 1) << 3) |                           \
+         (((a4) & 1) << 4) |                           \
+         (((a5) & 1) << 5) |                           \
+         (((a6) & 1) << 6) |                           \
+         (((a7) & 1) << 7) )
+
+/* Pack eight byte rows (l0 = top row) into one 64-bit constant, i.e.
+ * an 8x8 bit-matrix.  NOTE(review): presumably consumed by the GFNI
+ * affine instructions later in this file -- confirm at use sites. */
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)          \
+       ( ((l7) << (0 * 8)) |                           \
+         ((l6) << (1 * 8)) |                           \
+         ((l5) << (2 * 8)) |                           \
+         ((l4) << (3 * 8)) |                           \
+         ((l3) << (4 * 8)) |                           \
+         ((l2) << (5 * 8)) |                           \
+         ((l1) << (6 * 8)) |                           \
+         ((l0) << (7 * 8)) )
+
+/* asm macros */
+/* Zero four vector registers. */
+#define clear_vec4(v0,v1,v2,v3) \
+       vpxord v0, v0, v0; \
+       vpxord v1, v1, v1; \
+       vpxord v2, v2, v2; \
+       vpxord v3, v3, v3
+
+/* Wipe zmm16..zmm31: an EVEX vpxord on the ymm form zeroes the upper
+ * bits of the corresponding zmm register as well. */
+#define clear_zmm16_zmm31() \
+       clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
+       clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
+       clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
+       clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31)
+
+/* Wipe all vector state used here: mask register %k1, zmm0..zmm15
+ * (via vzeroall) and zmm16..zmm31. */
+#define clear_regs() \
+       kxorq %k1, %k1, %k1; \
+       vzeroall; \
+       clear_zmm16_zmm31()
+
+/* 128-bit little-endian add across each qword pair:
+ * out = in + lo_counter per qword; vpcmpuq $1 (lt) detects carry-out
+ * of each low qword into %k1; kaddb doubles the mask, moving every
+ * carry bit up to select the adjacent high qword; the masked vpaddq
+ * then applies hi_counter1 there.  Clobbers %k1. */
+#define add_le128(out, in, lo_counter, hi_counter1)    \
+       vpaddq lo_counter, in, out;                     \
+       vpcmpuq $1, lo_counter, out, %k1;               \
+       kaddb %k1, %k1, %k1;                            \
+       vpaddq hi_counter1, out, out{%k1};
+
+/* Per-byte 8-bit filter via two 4-bit vpshufb table lookups: low
+ * nibble through lo_t, high nibble through hi_t, results xored.
+ * NOTE(review): mask4bit is expected to hold 0x0f in every byte --
+ * confirm at call sites. */
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)     \
+       vpandq x, mask4bit, tmp0;                       \
+       vpandqn x, mask4bit, x;                         \
+       vpsrld $4, x, x;                                \
+                                                       \
+       vpshufb tmp0, lo_t, tmp0;                       \
+       vpshufb x, hi_t, x;                             \
+       vpxorq tmp0, x, x;
+
+/* Transpose a 4x4 matrix of 32-bit elements held in x0..x3; the
+ * punpck instructions operate per 128-bit lane, so the transpose is
+ * done independently within every lane.  t1/t2 are clobbered. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2)          \
+       vpunpckhdq x1, x0, t2;                          \
+       vpunpckldq x1, x0, x0;                          \
+                                                       \
+       vpunpckldq x3, x2, t1;                          \
+       vpunpckhdq x3, x2, x2;                          \
+                                                       \
+       vpunpckhqdq t1, x0, x1;                         \
+       vpunpcklqdq t1, x0, x0;                         \
+                                                       \
+       vpunpckhqdq x2, t2, x3;                         \
+       vpunpcklqdq x2, t2, x2;
+
+/* Byte-slice a 16x16-byte matrix spread over 16 vector registers:
+ * combined 4x4 dword transposes plus a per-lane byte shuffle
+ * (.Lshufb_16x16b), using two memory slots st0/st1 to free registers
+ * for scratch.  All 16 registers are rewritten in place. */
+#define byteslice_16x16b(a0, b0, c0, d0,               \
+                        a1, b1, c1, d1,                \
+                        a2, b2, c2, d2,                \
+                        a3, b3, c3, d3,                \
+                        st0, st1)                      \
+       vmovdqu64 d2, st0;                              \
+       vmovdqu64 d3, st1;                              \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 a0, st0;                              \
+       vmovdqu64 a1, st1;                              \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti64x2 .Lshufb_16x16b rRIP, a0;        \
+       vmovdqu64 st1, a1;                              \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu64 d3, st1;                              \
+       vmovdqu64 st0, d3;                              \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu64 d2, st0;                              \
+                                                       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3);          \
+       transpose_4x4(a1, b1, c1, d1, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 b0, st0;                              \
+       vmovdqu64 b1, st1;                              \
+       transpose_4x4(a2, b2, c2, d2, b0, b1);          \
+       transpose_4x4(a3, b3, c3, d3, b0, b1);          \
+       vmovdqu64 st0, b0;                              \
+       vmovdqu64 st1, b1;                              \
+       /* does not adjust output bytes inside vectors */
+
+/* Inverse of byteslice_16x16b: same shuffle/transpose machinery, but
+ * the final transposes use the rotated register grouping (c,d,a,b) to
+ * undo the slicing.  Uses two memory slots st0/st1 as scratch; all 16
+ * registers are rewritten in place. */
+#define debyteslice_16x16b(a0, b0, c0, d0,             \
+                          a1, b1, c1, d1,              \
+                          a2, b2, c2, d2,              \
+                          a3, b3, c3, d3,              \
+                          st0, st1)                    \
+       vmovdqu64 d2, st0;                              \
+       vmovdqu64 d3, st1;                              \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 a0, st0;                              \
+       vmovdqu64 a1, st1;                              \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti64x2 .Lshufb_16x16b rRIP, a0;        \
+       vmovdqu64 st1, a1;                              \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu64 d3, st1;                              \
+       vmovdqu64 st0, d3;                              \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu64 d2, st0;                              \
+                                                       \
+       transpose_4x4(c0, d0, a0, b0, d2, d3);          \
+       transpose_4x4(c1, d1, a1, b1, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 b0, st0;                              \
+       vmovdqu64 b1, st1;                              \
+       transpose_4x4(c2, d2, a2, b2, b0, b1);          \
+       transpose_4x4(c3, d3, a3, b3, b0, b1);          \
+       vmovdqu64 st0, b0;                              \
+       vmovdqu64 st1, b1;                              \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+/* Load 16 * 64 bytes (64 blocks) from rio into the x/y registers.
+ * NOTE(review): despite the inherited comment above, no whitening xor
+ * is performed here -- this variant only loads. */
+#define inpack16_pre(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    rio)                               \
+       vmovdqu64 (0 * 64)(rio), x0;                    \
+       vmovdqu64 (1 * 64)(rio), x1;                    \
+       vmovdqu64 (2 * 64)(rio), x2;                    \
+       vmovdqu64 (3 * 64)(rio), x3;                    \
+       vmovdqu64 (4 * 64)(rio), x4;                    \
+       vmovdqu64 (5 * 64)(rio), x5;                    \
+       vmovdqu64 (6 * 64)(rio), x6;                    \
+       vmovdqu64 (7 * 64)(rio), x7;                    \
+       vmovdqu64 (8 * 64)(rio), y0;                    \
+       vmovdqu64 (9 * 64)(rio), y1;                    \
+       vmovdqu64 (10 * 64)(rio), y2;                   \
+       vmovdqu64 (11 * 64)(rio), y3;                   \
+       vmovdqu64 (12 * 64)(rio), y4;                   \
+       vmovdqu64 (13 * 64)(rio), y5;                   \
+       vmovdqu64 (14 * 64)(rio), y6;                   \
+       vmovdqu64 (15 * 64)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+/* Byte-slice the 16 registers, then spill the x half to mem_ab and the
+ * y half to mem_cd (8 * 64 bytes each). */
+#define inpack16_post(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     y0, y1, y2, y3,                   \
+                     y4, y5, y6, y7,                   \
+                     mem_ab, mem_cd)                   \
+       byteslice_16x16b(x0, x1, x2, x3,                \
+                        x4, x5, x6, x7,                \
+                        y0, y1, y2, y3,                \
+                        y4, y5, y6, y7,                \
+                        (mem_ab), (mem_cd));           \
+                                                       \
+       vmovdqu64 x0, 0 * 64(mem_ab);                   \
+       vmovdqu64 x1, 1 * 64(mem_ab);                   \
+       vmovdqu64 x2, 2 * 64(mem_ab);                   \
+       vmovdqu64 x3, 3 * 64(mem_ab);                   \
+       vmovdqu64 x4, 4 * 64(mem_ab);                   \
+       vmovdqu64 x5, 5 * 64(mem_ab);                   \
+       vmovdqu64 x6, 6 * 64(mem_ab);                   \
+       vmovdqu64 x7, 7 * 64(mem_ab);                   \
+       vmovdqu64 y0, 0 * 64(mem_cd);                   \
+       vmovdqu64 y1, 1 * 64(mem_cd);                   \
+       vmovdqu64 y2, 2 * 64(mem_cd);                   \
+       vmovdqu64 y3, 3 * 64(mem_cd);                   \
+       vmovdqu64 y4, 4 * 64(mem_cd);                   \
+       vmovdqu64 y5, 5 * 64(mem_cd);                   \
+       vmovdqu64 y6, 6 * 64(mem_cd);                   \
+       vmovdqu64 y7, 7 * 64(mem_cd);
+
+/* Store all 16 registers (16 * 64 bytes) contiguously to mem. */
+#define write_output(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem)                               \
+       vmovdqu64 x0, 0 * 64(mem);                      \
+       vmovdqu64 x1, 1 * 64(mem);                      \
+       vmovdqu64 x2, 2 * 64(mem);                      \
+       vmovdqu64 x3, 3 * 64(mem);                      \
+       vmovdqu64 x4, 4 * 64(mem);                      \
+       vmovdqu64 x5, 5 * 64(mem);                      \
+       vmovdqu64 x6, 6 * 64(mem);                      \
+       vmovdqu64 x7, 7 * 64(mem);                      \
+       vmovdqu64 y0, 8 * 64(mem);                      \
+       vmovdqu64 y1, 9 * 64(mem);                      \
+       vmovdqu64 y2, 10 * 64(mem);                     \
+       vmovdqu64 y3, 11 * 64(mem);                     \
+       vmovdqu64 y4, 12 * 64(mem);                     \
+       vmovdqu64 y5, 13 * 64(mem);                     \
+       vmovdqu64 y6, 14 * 64(mem);                     \
+       vmovdqu64 y7, 15 * 64(mem);                     \
+
+/* Spill eight state registers to scratch starting at 64-byte line idx. */
+#define aria_store_state_8way(x0, x1, x2, x3,          \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, idx)             \
+       vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp);        \
+       vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp);        \
+       vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp);        \
+       vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp);        \
+       vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp);        \
+       vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp);        \
+       vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp);        \
+       vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
+
+/* Reload eight state registers from scratch starting at 64-byte line idx
+ * (inverse of aria_store_state_8way). */
+#define aria_load_state_8way(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, idx)              \
+       vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0;        \
+       vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1;        \
+       vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2;        \
+       vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3;        \
+       vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4;        \
+       vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5;        \
+       vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6;        \
+       vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
+
+/*
+ * AddRoundKey over 16 byte-sliced registers: broadcast one round-key
+ * byte from rk[round * 16 + i] into t0 and XOR it into the matching
+ * state register.  Key bytes are consumed in 3,2,1,0 / 7,6,5,4 / ...
+ * order, i.e. each 32-bit key word is read byte-swapped to match the
+ * byte-sliced state layout.  t0 is clobbered.
+ */
+#define aria_ark_16way(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7,                  \
+                      t0, rk, round)                   \
+       /* AddRoundKey */                               \
+       vpbroadcastb ((round * 16) + 3)(rk), t0;        \
+       vpxorq t0, x0, x0;                              \
+       vpbroadcastb ((round * 16) + 2)(rk), t0;        \
+       vpxorq t0, x1, x1;                              \
+       vpbroadcastb ((round * 16) + 1)(rk), t0;        \
+       vpxorq t0, x2, x2;                              \
+       vpbroadcastb ((round * 16) + 0)(rk), t0;        \
+       vpxorq t0, x3, x3;                              \
+       vpbroadcastb ((round * 16) + 7)(rk), t0;        \
+       vpxorq t0, x4, x4;                              \
+       vpbroadcastb ((round * 16) + 6)(rk), t0;        \
+       vpxorq t0, x5, x5;                              \
+       vpbroadcastb ((round * 16) + 5)(rk), t0;        \
+       vpxorq t0, x6, x6;                              \
+       vpbroadcastb ((round * 16) + 4)(rk), t0;        \
+       vpxorq t0, x7, x7;                              \
+       vpbroadcastb ((round * 16) + 11)(rk), t0;       \
+       vpxorq t0, y0, y0;                              \
+       vpbroadcastb ((round * 16) + 10)(rk), t0;       \
+       vpxorq t0, y1, y1;                              \
+       vpbroadcastb ((round * 16) + 9)(rk), t0;        \
+       vpxorq t0, y2, y2;                              \
+       vpbroadcastb ((round * 16) + 8)(rk), t0;        \
+       vpxorq t0, y3, y3;                              \
+       vpbroadcastb ((round * 16) + 15)(rk), t0;       \
+       vpxorq t0, y4, y4;                              \
+       vpbroadcastb ((round * 16) + 14)(rk), t0;       \
+       vpxorq t0, y5, y5;                              \
+       vpbroadcastb ((round * 16) + 13)(rk), t0;       \
+       vpxorq t0, y6, y6;                              \
+       vpbroadcastb ((round * 16) + 12)(rk), t0;       \
+       vpxorq t0, y7, y7;
+
+/*
+ * ARIA S-box layer for 8 byte-sliced registers using GFNI affine
+ * transforms: S1 (AES affine, x0/x4), S2 (x1/x5), S1^-1 (inverse
+ * affine + identity inversion, x2/x6) and S2^-1 (X2 + inversion,
+ * x3/x7).  t0..t4 are clobbered with the broadcast bit-matrices;
+ * t5, t6 and t7 are accepted only for interface symmetry with
+ * aria_sbox_16way_gfni and are not used.
+ */
+#define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0;        \
+       vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1;       \
+       vpbroadcastq .Ltf_id_bitmatrix rRIP, t2;        \
+       vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3;       \
+       vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4;        \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7;
+
+/*
+ * Same S-box layer as aria_sbox_8way_gfni, applied to all 16 state
+ * registers (x0..x7 and y0..y7) with a single set of broadcast
+ * bit-matrices.  t0..t4 are clobbered; t5..t7 are unused.
+ */
+#define aria_sbox_16way_gfni(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            t0, t1, t2, t3,            \
+                            t4, t5, t6, t7)            \
+       vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0;        \
+       vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1;       \
+       vpbroadcastq .Ltf_id_bitmatrix rRIP, t2;        \
+       vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3;       \
+       vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4;        \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7;               \
+       vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, y2, y2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, y6, y6;     \
+       vgf2p8affineinvqb $0, t2, y2, y2;               \
+       vgf2p8affineinvqb $0, t2, y6, y6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, y3, y3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, y7, y7;      \
+       vgf2p8affineinvqb $0, t2, y3, y3;               \
+       vgf2p8affineinvqb $0, t2, y7, y7;
+
+/*
+ * Byte-rotation diffusion over one 4-register group.  vpternlogq with
+ * immediate 0x96 is a three-input XOR, so after saving the original
+ * x0/x3 into t0/t3 this yields x0^x1^x2, x3^x1^x2, x2^t0^t3 and
+ * x1^t0^t3 — equivalent to the rotr-based formulation sketched below.
+ * t0 and t3 are clobbered; t1 and t2 are unused.
+ */
+#define aria_diff_m(x0, x1, x2, x3,                    \
+                   t0, t1, t2, t3)                     \
+       /* T = rotr32(X, 8); */                         \
+       /* X ^= T */                                    \
+       /* X = T ^ rotr(X, 16); */                      \
+       vmovdqa64 x0, t0;                               \
+       vmovdqa64 x3, t3;                               \
+       vpternlogq $0x96, x2, x1, x0;                   \
+       vpternlogq $0x96, x2, x1, x3;                   \
+       vpternlogq $0x96, t0, t3, x2;                   \
+       vpternlogq $0x96, t0, t3, x1;
+
+/*
+ * Word-level diffusion: treat (x0..x3), (x4..x7), (y0..y3), (y4..y7)
+ * as the groups t0..t3 of the reference implementation and apply the
+ * XOR-mixing sequence annotated inline below.  All sixteen registers
+ * are updated in place.
+ */
+#define aria_diff_word(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7)                  \
+       /* t1 ^= t2; */                                 \
+       vpxorq y0, x4, x4;                              \
+       vpxorq y1, x5, x5;                              \
+       vpxorq y2, x6, x6;                              \
+       vpxorq y3, x7, x7;                              \
+                                                       \
+       /* t2 ^= t3; */                                 \
+       vpxorq y4, y0, y0;                              \
+       vpxorq y5, y1, y1;                              \
+       vpxorq y6, y2, y2;                              \
+       vpxorq y7, y3, y3;                              \
+                                                       \
+       /* t0 ^= t1; */                                 \
+       vpxorq x4, x0, x0;                              \
+       vpxorq x5, x1, x1;                              \
+       vpxorq x6, x2, x2;                              \
+       vpxorq x7, x3, x3;                              \
+                                                       \
+       /* t3 ^= t1; */                                 \
+       vpxorq x4, y4, y4;                              \
+       vpxorq x5, y5, y5;                              \
+       vpxorq x6, y6, y6;                              \
+       vpxorq x7, y7, y7;                              \
+                                                       \
+       /* t2 ^= t0; */                                 \
+       vpxorq x0, y0, y0;                              \
+       vpxorq x1, y1, y1;                              \
+       vpxorq x2, y2, y2;                              \
+       vpxorq x3, y3, y3;                              \
+                                                       \
+       /* t1 ^= t2; */                                 \
+       vpxorq y0, x4, x4;                              \
+       vpxorq y1, x5, x5;                              \
+       vpxorq y2, x6, x6;                              \
+       vpxorq y3, x7, x7;
+
+/*
+ * ARIA even (Fe) round: AddRoundKey, S-box layer (with the register
+ * permutation selecting the even-round S-box order), byte/word
+ * diffusion.  z0..z7 are clobbered as temporaries; mem_tmp is unused
+ * and kept only for interface symmetry with the non-GFNI variants.
+ * Fix: the original definition ended with a stray line-continuation
+ * backslash, silently extending the macro onto the following blank
+ * line; the trailing '\' is removed here.
+ */
+#define aria_fe_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    z0, z1, z2, z3,                    \
+                    z4, z5, z6, z7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7,  \
+                      z0, rk, round);                  \
+                                                       \
+       aria_sbox_16way_gfni(x2, x3, x0, x1,            \
+                            x6, x7, x4, x5,            \
+                            y2, y3, y0, y1,            \
+                            y6, y7, y4, y5,            \
+                            z0, z1, z2, z3,            \
+                            z4, z5, z6, z7);           \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);    \
+       aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);    \
+       aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);    \
+       aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);    \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);
+
+
+/*
+ * ARIA odd (Fo) round: AddRoundKey, S-box layer (odd-round order,
+ * no register permutation on input), byte/word diffusion with the
+ * odd-round byte permutation.  z0..z7 are clobbered; mem_tmp is
+ * unused and kept only for interface symmetry.
+ */
+#define aria_fo_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    z0, z1, z2, z3,                    \
+                    z4, z5, z6, z7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7,  \
+                      z0, rk, round);                  \
+                                                       \
+       aria_sbox_16way_gfni(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            z0, z1, z2, z3,            \
+                            z4, z5, z6, z7);           \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);    \
+       aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);    \
+       aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);    \
+       aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);    \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);
+
+/*
+ * ARIA final (Ff) round: AddRoundKey with 'round', S-box layer, then
+ * a second AddRoundKey with 'last_round' (no diffusion layer in the
+ * final round).  z0..z7 are clobbered; mem_tmp is unused and kept
+ * only for interface symmetry.
+ */
+#define aria_ff_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    z0, z1, z2, z3,                    \
+                    z4, z5, z6, z7,                    \
+                    mem_tmp, rk, round, last_round)    \
+       aria_ark_16way(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7,                  \
+                      z0, rk, round);                  \
+       aria_sbox_16way_gfni(x2, x3, x0, x1,            \
+                            x6, x7, x4, x5,            \
+                            y2, y3, y0, y1,            \
+                            y6, y7, y4, y5,            \
+                            z0, z1, z2, z3,            \
+                            z4, z5, z6, z7);           \
+       aria_ark_16way(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7,                  \
+                      z0, rk, last_round);
+
+SECTION_RODATA
+
+/* 128-bit-lane counter increments for CTR mode: low quadwords +0..+3,
+ * one pair per 128-bit lane of a 512-bit register. */
+.align 64
+.Lcounter0123_lo:
+       .quad 0, 0
+       .quad 1, 0
+       .quad 2, 0
+       .quad 3, 0
+
+/* Shuffle mask used by the 16x16 byte-slicing transpose. */
+.align 32
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+/* Counter increment constants (+4, +8, +16 low; +1 high for carry). */
+.align 16
+.Lcounter4444_lo:
+       .quad 4, 0
+.Lcounter8888_lo:
+       .quad 8, 0
+.Lcounter16161616_lo:
+       .quad 16, 0
+.Lcounter1111_hi:
+       .quad 0, 1
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+       .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* GFNI 8x8 bit-matrices and affine constants for the ARIA S-boxes,
+ * used via vgf2p8affine(inv)qb in aria_sbox_*_gfni. */
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+                   BV8(1, 1, 0, 0, 0, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 0, 0, 1),
+                   BV8(1, 1, 1, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 0, 1, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+       .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(1, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(0, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 1, 1, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 1, 0),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 0),
+                   BV8(0, 1, 1, 0, 1, 0, 1, 1),
+                   BV8(1, 0, 1, 1, 1, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 1, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 1, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* CTR byte addition constants */
+/* Each 16-byte vector is zero except the last byte, so vpaddb against a
+ * big-endian IV adds N to the IV's least-significant byte (valid only
+ * when no cross-byte carry can occur; the caller checks for that). */
+.align 64
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
+.text
+
+.align 16
+ELF(.type __aria_gfni_avx512_crypt_64way,@function;)
+__aria_gfni_avx512_crypt_64way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %zmm0..%zmm15: byte-sliced blocks
+        */
+       CFI_STARTPROC();
+
+       /* dst doubles as 16*64-byte byte-slice scratch: first half at
+        * %rax, second half at %r8. */
+       movq %rsi, %rax;
+       leaq 8 * 64(%rax), %r8;
+
+       /* %r10d = rounds - 2: the initial Fo and final Fe/Ff pair are
+        * outside the two-rounds-per-iteration loop below. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
+                     %zmm4, %zmm5, %zmm6, %zmm7,
+                     %zmm8, %zmm9, %zmm10, %zmm11,
+                     %zmm12, %zmm13, %zmm14,
+                     %zmm15, %rax, %r8);
+       aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+                    %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 0);
+       /* advance %r9 past the consumed round key */
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_gfni:
+       /* two rounds (Fe + Fo) per iteration */
+       aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 0);
+       aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+                    %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_gfni;
+
+       /* final round pair: AddRoundKey / S-box / AddRoundKey */
+       aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 0, 1);
+
+       /* undo the byte-slicing; note the permuted register order,
+        * which the callers' write_output calls mirror */
+       debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
+                          %zmm8, %zmm13, %zmm2, %zmm7,
+                          %zmm11, %zmm14, %zmm1, %zmm4,
+                          %zmm10, %zmm15, %zmm0, %zmm5,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx512_crypt_64way,.-__aria_gfni_avx512_crypt_64way;)
+
+.align 16
+.globl _gcry_aria_gfni_avx512_ecb_crypt_blk64
+ELF(.type _gcry_aria_gfni_avx512_ecb_crypt_blk64,@function;)
+_gcry_aria_gfni_avx512_ecb_crypt_blk64:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* reserve 16*64 bytes of 64-byte-aligned stack scratch */
+       subq $(16 * 64), %rsp;
+       andq $~63, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       /* load 64 blocks from src into %zmm0..%zmm15 */
+       inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %rdx);
+
+       call __aria_gfni_avx512_crypt_64way;
+
+       /* register order matches debyteslice_16x16b output order */
+       write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %r11);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       clear_regs();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx512_ecb_crypt_blk64,
+         .-_gcry_aria_gfni_avx512_ecb_crypt_blk64;)
+
+.align 16
+ELF(.type __aria_gfni_avx512_ctr_gen_keystream_64way,@function;)
+__aria_gfni_avx512_ctr_gen_keystream_64way:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: keystream
+        *      %r8: iv (big endian, 128bit)
+        *
+        * Generates 64 counter blocks in %zmm0..%zmm15 and advances the
+        * IV at (%r8) by 64.
+        */
+       CFI_STARTPROC();
+
+       /* Fast path: if adding 64 to the IV's low byte cannot carry into
+        * higher bytes (byte < 0xc0; the == 0xc0 boundary is handled at
+        * .Lctr_byteadd_full_ctr_carry), use cheap vpaddb increments. */
+       cmpb $(0x100 - 64), 15(%r8);
+       jbe .Lctr_byteadd;
+
+       vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19;
+       vmovdqa64 .Lcounter0123_lo rRIP, %zmm21;
+       vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22;
+       vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23;
+       vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24;
+       vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25;
+
+       /* load IV and byteswap */
+       movq 8(%r8), %r11;
+       movq (%r8), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       vbroadcasti64x2 (%r8), %zmm20;
+       vpshufb %zmm19, %zmm20, %zmm20;
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 64), %r11;
+       ja .Lload_ctr_carry;
+
+       /* construct IVs */
+       vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
+       vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
+       vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
+       vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
+       vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
+       vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
+       vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
+       vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
+       vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
+       vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
+       vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
+       vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
+       vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
+       vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
+       vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
+       vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
+       jmp .Lload_ctr_done;
+
+.Lload_ctr_carry:
+       /* construct IVs with full 128-bit little-endian additions */
+       add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
+       add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+       add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+       add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+       add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
+       add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
+       add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
+       add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
+       add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
+       add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
+       add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
+       add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
+       add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
+       add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
+       add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
+       add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
+
+.Lload_ctr_done:
+       /* Byte-swap IVs and update counter. */
+       addq $64, %r11;
+       adcq $0, %r10;
+       vpshufb %zmm19, %zmm15, %zmm15;
+       vpshufb %zmm19, %zmm14, %zmm14;
+       vpshufb %zmm19, %zmm13, %zmm13;
+       vpshufb %zmm19, %zmm12, %zmm12;
+       vpshufb %zmm19, %zmm11, %zmm11;
+       vpshufb %zmm19, %zmm10, %zmm10;
+       vpshufb %zmm19, %zmm9, %zmm9;
+       vpshufb %zmm19, %zmm8, %zmm8;
+       bswapq %r11;
+       bswapq %r10;
+       vpshufb %zmm19, %zmm7, %zmm7;
+       vpshufb %zmm19, %zmm6, %zmm6;
+       vpshufb %zmm19, %zmm5, %zmm5;
+       vpshufb %zmm19, %zmm4, %zmm4;
+       vpshufb %zmm19, %zmm3, %zmm3;
+       vpshufb %zmm19, %zmm2, %zmm2;
+       vpshufb %zmm19, %zmm1, %zmm1;
+       vpshufb %zmm19, %zmm0, %zmm0;
+       movq %r11, 8(%r8);
+       movq %r10, (%r8);
+
+       ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry:
+       /* low IV byte is exactly 0xc0: adding 64 wraps the byte, so do a
+        * full scalar 128-bit increment of the stored IV instead */
+       movq 8(%r8), %r11;
+       movq (%r8), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $64, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%r8);
+       movq %r10, (%r8);
+       jmp .Lctr_byteadd_zmm;
+.align 16
+.Lctr_byteadd:
+       /* flags from the cmpb above are still live here (vector moves do
+        * not modify RFLAGS) */
+       vbroadcasti64x2 (%r8), %zmm3;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $64, 15(%r8);
+.Lctr_byteadd_zmm:
+       /* build counters +0..+63 by repeated byte additions to the IV */
+       vbroadcasti64x2 .Lbige_addb_16 rRIP, %zmm16;
+       vmovdqa64 .Lbige_addb_0_1 rRIP, %zmm17;
+       vmovdqa64 .Lbige_addb_4_5 rRIP, %zmm18;
+       vmovdqa64 .Lbige_addb_8_9 rRIP, %zmm19;
+       vmovdqa64 .Lbige_addb_12_13 rRIP, %zmm20;
+       vpaddb %zmm16, %zmm3, %zmm7;
+       vpaddb %zmm17, %zmm3, %zmm0;
+       vpaddb %zmm18, %zmm3, %zmm1;
+       vpaddb %zmm19, %zmm3, %zmm2;
+       vpaddb %zmm20, %zmm3, %zmm3;
+       vpaddb %zmm16, %zmm7, %zmm11;
+       vpaddb %zmm17, %zmm7, %zmm4;
+       vpaddb %zmm18, %zmm7, %zmm5;
+       vpaddb %zmm19, %zmm7, %zmm6;
+       vpaddb %zmm20, %zmm7, %zmm7;
+       vpaddb %zmm16, %zmm11, %zmm15;
+       vpaddb %zmm17, %zmm11, %zmm8;
+       vpaddb %zmm18, %zmm11, %zmm9;
+       vpaddb %zmm19, %zmm11, %zmm10;
+       vpaddb %zmm20, %zmm11, %zmm11;
+       vpaddb %zmm17, %zmm15, %zmm12;
+       vpaddb %zmm18, %zmm15, %zmm13;
+       vpaddb %zmm19, %zmm15, %zmm14;
+       vpaddb %zmm20, %zmm15, %zmm15;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx512_ctr_gen_keystream_64way,
+         .-__aria_gfni_avx512_ctr_gen_keystream_64way;)
+
+.align 16
+.globl _gcry_aria_gfni_avx512_ctr_crypt_blk64
+ELF(.type _gcry_aria_gfni_avx512_ctr_crypt_blk64,@function;)
+_gcry_aria_gfni_avx512_ctr_crypt_blk64:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        *
+        * CTR mode: generate 64 counter blocks, encrypt them with the
+        * ARIA encryption key schedule, XOR with src into dst.
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 16*64-byte aligned stack area for the keystream scratch */
+       subq $(16 * 64), %rsp;
+       andq $~63, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       /* NOTE(review): missing trailing ';' after this call; harmless
+        * to GAS but inconsistent with the rest of the file. */
+       call __aria_gfni_avx512_ctr_gen_keystream_64way
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi;
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_gfni_avx512_crypt_64way;
+
+       popq %rsi;
+       /* XOR the encrypted keystream with the source; register order
+        * matches the debyteslice output order of the crypt routine */
+       vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
+       vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
+       vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
+       vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
+       vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
+       vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
+       vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
+       vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
+       vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
+       vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
+       vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
+       vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
+       vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
+       vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
+       vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
+       vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
+       write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       clear_regs();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx512_ctr_crypt_blk64,
+         .-_gcry_aria_gfni_avx512_ctr_crypt_blk64;)
+
+#endif /* ENABLE_AVX512_SUPPORT && ENABLE_GFNI_SUPPORT */
+#endif /* __x86_64 */
diff --git a/cipher/aria.c b/cipher/aria.c
new file mode 100644 (file)
index 0000000..bc2d438
--- /dev/null
@@ -0,0 +1,1768 @@
+/* aria.c  -  ARIA Cipher Algorithm
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "bulkhelp.h"
+
+/* Attribute macro to force alignment to 64 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+/* Attribute macro to force inlining of function. */
+#if __GNUC__ >= 4
+#  define ALWAYS_INLINE inline __attribute__ ((always_inline))
+#else
+#  define ALWAYS_INLINE inline
+#endif
+
+/* Attribute macro to prevent inlining of function. */
+#if __GNUC__ >= 4
+#  define NO_INLINE __attribute__ ((noinline))
+#else
+#  define NO_INLINE
+#endif
+
+
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_AESNI_AVX 1
+# endif
+#endif
+
+/* USE_GFNI_AVX indicates whether to compile with Intel GFNI/AVX code. */
+#undef USE_GFNI_AVX
+#if defined(USE_AESNI_AVX) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX 1
+#endif
+
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
+#undef USE_AESNI_AVX2
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_AESNI_AVX2 1
+# endif
+#endif
+
+/* USE_VAES_AVX2 indicates whether to compile with Intel VAES/AVX2 code. */
+#undef USE_VAES_AVX2
+#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define USE_VAES_AVX2 1
+#endif
+
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
+/* USE_GFNI_AVX512 indicates whether to compile with Intel GFNI/AVX512 code. */
+#undef USE_GFNI_AVX512
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_GFNI_AVX512 1
+# endif
+#endif
+
+/* How many parallel blocks to handle in bulk processing functions.
+ * Matches the widest enabled SIMD implementation (64 blocks for AVX512,
+ * 32 for AVX2, 16 for AVX, 8 for the plain C bulk path). */
+#if defined(USE_GFNI_AVX512)
+# define MAX_PARALLEL_BLKS 64
+#elif defined(USE_AESNI_AVX2)
+# define MAX_PARALLEL_BLKS 32
+#elif defined(USE_AESNI_AVX)
+# define MAX_PARALLEL_BLKS 16
+#else
+# define MAX_PARALLEL_BLKS 8
+#endif
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \
+    defined(USE_GFNI_AVX512)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+#  define ASM_EXTRA_STACK (10 * 16)
+# else
+#  define ASM_FUNC_ABI
+#  define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
+/* Forward declaration; the self-test is defined later in this file. */
+static const char *aria_selftest (void);
+
+
+/* Key and block sizes in bytes.  ARIA-256 uses 16 rounds, hence at most
+ * 16 + 1 = 17 round keys of one block (four 32-bit words) each. */
+#define ARIA_MIN_KEY_SIZE      16
+#define ARIA_MAX_KEY_SIZE      32
+#define ARIA_BLOCK_SIZE                16
+#define ARIA_MAX_RD_KEYS       17
+#define ARIA_RD_KEY_WORDS      (ARIA_BLOCK_SIZE / sizeof(u32))
+
+
+/* Per-handle cipher context. */
+typedef struct
+{
+  u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; /* Encryption round keys. */
+  u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; /* Decryption round keys. */
+  int rounds; /* 12, 14 or 16 for 128/192/256-bit keys. */
+
+  unsigned int decryption_prepared:1; /* The decryption key is set up. */
+  unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for
+                                      * current bulk operation. */
+
+  /* Accelerated-implementation selection flags.  NOTE(review): presumably
+   * set from detected CPU features at key setup; the setter is not visible
+   * in this chunk — confirm. */
+#ifdef USE_AESNI_AVX
+  unsigned int use_aesni_avx:1;
+  unsigned int use_gfni_avx:1;
+#endif
+#ifdef USE_AESNI_AVX2
+  unsigned int use_aesni_avx2:1;
+  unsigned int use_vaes_avx2:1;
+  unsigned int use_gfni_avx2:1;
+#endif
+#ifdef USE_GFNI_AVX512
+  unsigned int use_gfni_avx512:1;
+#endif
+} ARIA_context;
+
+
+/* ARIA key-schedule round constants (CK1, CK2, CK3 of RFC 5794, 128 bits
+ * each).  CK1 and CK2 are repeated at the end so that the twelve words
+ * read as ck[0..11] in aria_set_encrypt_key stay in bounds for any
+ * starting offset (0, 4 or 8) selected by the key length. */
+static const u32 key_rc[20] =
+  {
+    0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0,
+    0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0,
+    0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e,
+    0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0,
+    0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0
+  };
+
+
+static struct
+{
+  volatile u32 counter_head;
+  u32 cacheline_align[64 / 4 - 1];
+  u32 s1[256];
+  u32 s2[256];
+  u32 x1[256];
+  u32 x2[256];
+  volatile u32 counter_tail;
+} sboxes ATTR_ALIGNED_64 =
+  {
+    0,
+    { 0, },
+
+    { /* s1 */
+      0x00636363, 0x007c7c7c, 0x00777777, 0x007b7b7b,
+      0x00f2f2f2, 0x006b6b6b, 0x006f6f6f, 0x00c5c5c5,
+      0x00303030, 0x00010101, 0x00676767, 0x002b2b2b,
+      0x00fefefe, 0x00d7d7d7, 0x00ababab, 0x00767676,
+      0x00cacaca, 0x00828282, 0x00c9c9c9, 0x007d7d7d,
+      0x00fafafa, 0x00595959, 0x00474747, 0x00f0f0f0,
+      0x00adadad, 0x00d4d4d4, 0x00a2a2a2, 0x00afafaf,
+      0x009c9c9c, 0x00a4a4a4, 0x00727272, 0x00c0c0c0,
+      0x00b7b7b7, 0x00fdfdfd, 0x00939393, 0x00262626,
+      0x00363636, 0x003f3f3f, 0x00f7f7f7, 0x00cccccc,
+      0x00343434, 0x00a5a5a5, 0x00e5e5e5, 0x00f1f1f1,
+      0x00717171, 0x00d8d8d8, 0x00313131, 0x00151515,
+      0x00040404, 0x00c7c7c7, 0x00232323, 0x00c3c3c3,
+      0x00181818, 0x00969696, 0x00050505, 0x009a9a9a,
+      0x00070707, 0x00121212, 0x00808080, 0x00e2e2e2,
+      0x00ebebeb, 0x00272727, 0x00b2b2b2, 0x00757575,
+      0x00090909, 0x00838383, 0x002c2c2c, 0x001a1a1a,
+      0x001b1b1b, 0x006e6e6e, 0x005a5a5a, 0x00a0a0a0,
+      0x00525252, 0x003b3b3b, 0x00d6d6d6, 0x00b3b3b3,
+      0x00292929, 0x00e3e3e3, 0x002f2f2f, 0x00848484,
+      0x00535353, 0x00d1d1d1, 0x00000000, 0x00ededed,
+      0x00202020, 0x00fcfcfc, 0x00b1b1b1, 0x005b5b5b,
+      0x006a6a6a, 0x00cbcbcb, 0x00bebebe, 0x00393939,
+      0x004a4a4a, 0x004c4c4c, 0x00585858, 0x00cfcfcf,
+      0x00d0d0d0, 0x00efefef, 0x00aaaaaa, 0x00fbfbfb,
+      0x00434343, 0x004d4d4d, 0x00333333, 0x00858585,
+      0x00454545, 0x00f9f9f9, 0x00020202, 0x007f7f7f,
+      0x00505050, 0x003c3c3c, 0x009f9f9f, 0x00a8a8a8,
+      0x00515151, 0x00a3a3a3, 0x00404040, 0x008f8f8f,
+      0x00929292, 0x009d9d9d, 0x00383838, 0x00f5f5f5,
+      0x00bcbcbc, 0x00b6b6b6, 0x00dadada, 0x00212121,
+      0x00101010, 0x00ffffff, 0x00f3f3f3, 0x00d2d2d2,
+      0x00cdcdcd, 0x000c0c0c, 0x00131313, 0x00ececec,
+      0x005f5f5f, 0x00979797, 0x00444444, 0x00171717,
+      0x00c4c4c4, 0x00a7a7a7, 0x007e7e7e, 0x003d3d3d,
+      0x00646464, 0x005d5d5d, 0x00191919, 0x00737373,
+      0x00606060, 0x00818181, 0x004f4f4f, 0x00dcdcdc,
+      0x00222222, 0x002a2a2a, 0x00909090, 0x00888888,
+      0x00464646, 0x00eeeeee, 0x00b8b8b8, 0x00141414,
+      0x00dedede, 0x005e5e5e, 0x000b0b0b, 0x00dbdbdb,
+      0x00e0e0e0, 0x00323232, 0x003a3a3a, 0x000a0a0a,
+      0x00494949, 0x00060606, 0x00242424, 0x005c5c5c,
+      0x00c2c2c2, 0x00d3d3d3, 0x00acacac, 0x00626262,
+      0x00919191, 0x00959595, 0x00e4e4e4, 0x00797979,
+      0x00e7e7e7, 0x00c8c8c8, 0x00373737, 0x006d6d6d,
+      0x008d8d8d, 0x00d5d5d5, 0x004e4e4e, 0x00a9a9a9,
+      0x006c6c6c, 0x00565656, 0x00f4f4f4, 0x00eaeaea,
+      0x00656565, 0x007a7a7a, 0x00aeaeae, 0x00080808,
+      0x00bababa, 0x00787878, 0x00252525, 0x002e2e2e,
+      0x001c1c1c, 0x00a6a6a6, 0x00b4b4b4, 0x00c6c6c6,
+      0x00e8e8e8, 0x00dddddd, 0x00747474, 0x001f1f1f,
+      0x004b4b4b, 0x00bdbdbd, 0x008b8b8b, 0x008a8a8a,
+      0x00707070, 0x003e3e3e, 0x00b5b5b5, 0x00666666,
+      0x00484848, 0x00030303, 0x00f6f6f6, 0x000e0e0e,
+      0x00616161, 0x00353535, 0x00575757, 0x00b9b9b9,
+      0x00868686, 0x00c1c1c1, 0x001d1d1d, 0x009e9e9e,
+      0x00e1e1e1, 0x00f8f8f8, 0x00989898, 0x00111111,
+      0x00696969, 0x00d9d9d9, 0x008e8e8e, 0x00949494,
+      0x009b9b9b, 0x001e1e1e, 0x00878787, 0x00e9e9e9,
+      0x00cecece, 0x00555555, 0x00282828, 0x00dfdfdf,
+      0x008c8c8c, 0x00a1a1a1, 0x00898989, 0x000d0d0d,
+      0x00bfbfbf, 0x00e6e6e6, 0x00424242, 0x00686868,
+      0x00414141, 0x00999999, 0x002d2d2d, 0x000f0f0f,
+      0x00b0b0b0, 0x00545454, 0x00bbbbbb, 0x00161616
+    },
+    { /* s2 */
+      0xe200e2e2, 0x4e004e4e, 0x54005454, 0xfc00fcfc,
+      0x94009494, 0xc200c2c2, 0x4a004a4a, 0xcc00cccc,
+      0x62006262, 0x0d000d0d, 0x6a006a6a, 0x46004646,
+      0x3c003c3c, 0x4d004d4d, 0x8b008b8b, 0xd100d1d1,
+      0x5e005e5e, 0xfa00fafa, 0x64006464, 0xcb00cbcb,
+      0xb400b4b4, 0x97009797, 0xbe00bebe, 0x2b002b2b,
+      0xbc00bcbc, 0x77007777, 0x2e002e2e, 0x03000303,
+      0xd300d3d3, 0x19001919, 0x59005959, 0xc100c1c1,
+      0x1d001d1d, 0x06000606, 0x41004141, 0x6b006b6b,
+      0x55005555, 0xf000f0f0, 0x99009999, 0x69006969,
+      0xea00eaea, 0x9c009c9c, 0x18001818, 0xae00aeae,
+      0x63006363, 0xdf00dfdf, 0xe700e7e7, 0xbb00bbbb,
+      0x00000000, 0x73007373, 0x66006666, 0xfb00fbfb,
+      0x96009696, 0x4c004c4c, 0x85008585, 0xe400e4e4,
+      0x3a003a3a, 0x09000909, 0x45004545, 0xaa00aaaa,
+      0x0f000f0f, 0xee00eeee, 0x10001010, 0xeb00ebeb,
+      0x2d002d2d, 0x7f007f7f, 0xf400f4f4, 0x29002929,
+      0xac00acac, 0xcf00cfcf, 0xad00adad, 0x91009191,
+      0x8d008d8d, 0x78007878, 0xc800c8c8, 0x95009595,
+      0xf900f9f9, 0x2f002f2f, 0xce00cece, 0xcd00cdcd,
+      0x08000808, 0x7a007a7a, 0x88008888, 0x38003838,
+      0x5c005c5c, 0x83008383, 0x2a002a2a, 0x28002828,
+      0x47004747, 0xdb00dbdb, 0xb800b8b8, 0xc700c7c7,
+      0x93009393, 0xa400a4a4, 0x12001212, 0x53005353,
+      0xff00ffff, 0x87008787, 0x0e000e0e, 0x31003131,
+      0x36003636, 0x21002121, 0x58005858, 0x48004848,
+      0x01000101, 0x8e008e8e, 0x37003737, 0x74007474,
+      0x32003232, 0xca00caca, 0xe900e9e9, 0xb100b1b1,
+      0xb700b7b7, 0xab00abab, 0x0c000c0c, 0xd700d7d7,
+      0xc400c4c4, 0x56005656, 0x42004242, 0x26002626,
+      0x07000707, 0x98009898, 0x60006060, 0xd900d9d9,
+      0xb600b6b6, 0xb900b9b9, 0x11001111, 0x40004040,
+      0xec00ecec, 0x20002020, 0x8c008c8c, 0xbd00bdbd,
+      0xa000a0a0, 0xc900c9c9, 0x84008484, 0x04000404,
+      0x49004949, 0x23002323, 0xf100f1f1, 0x4f004f4f,
+      0x50005050, 0x1f001f1f, 0x13001313, 0xdc00dcdc,
+      0xd800d8d8, 0xc000c0c0, 0x9e009e9e, 0x57005757,
+      0xe300e3e3, 0xc300c3c3, 0x7b007b7b, 0x65006565,
+      0x3b003b3b, 0x02000202, 0x8f008f8f, 0x3e003e3e,
+      0xe800e8e8, 0x25002525, 0x92009292, 0xe500e5e5,
+      0x15001515, 0xdd00dddd, 0xfd00fdfd, 0x17001717,
+      0xa900a9a9, 0xbf00bfbf, 0xd400d4d4, 0x9a009a9a,
+      0x7e007e7e, 0xc500c5c5, 0x39003939, 0x67006767,
+      0xfe00fefe, 0x76007676, 0x9d009d9d, 0x43004343,
+      0xa700a7a7, 0xe100e1e1, 0xd000d0d0, 0xf500f5f5,
+      0x68006868, 0xf200f2f2, 0x1b001b1b, 0x34003434,
+      0x70007070, 0x05000505, 0xa300a3a3, 0x8a008a8a,
+      0xd500d5d5, 0x79007979, 0x86008686, 0xa800a8a8,
+      0x30003030, 0xc600c6c6, 0x51005151, 0x4b004b4b,
+      0x1e001e1e, 0xa600a6a6, 0x27002727, 0xf600f6f6,
+      0x35003535, 0xd200d2d2, 0x6e006e6e, 0x24002424,
+      0x16001616, 0x82008282, 0x5f005f5f, 0xda00dada,
+      0xe600e6e6, 0x75007575, 0xa200a2a2, 0xef00efef,
+      0x2c002c2c, 0xb200b2b2, 0x1c001c1c, 0x9f009f9f,
+      0x5d005d5d, 0x6f006f6f, 0x80008080, 0x0a000a0a,
+      0x72007272, 0x44004444, 0x9b009b9b, 0x6c006c6c,
+      0x90009090, 0x0b000b0b, 0x5b005b5b, 0x33003333,
+      0x7d007d7d, 0x5a005a5a, 0x52005252, 0xf300f3f3,
+      0x61006161, 0xa100a1a1, 0xf700f7f7, 0xb000b0b0,
+      0xd600d6d6, 0x3f003f3f, 0x7c007c7c, 0x6d006d6d,
+      0xed00eded, 0x14001414, 0xe000e0e0, 0xa500a5a5,
+      0x3d003d3d, 0x22002222, 0xb300b3b3, 0xf800f8f8,
+      0x89008989, 0xde00dede, 0x71007171, 0x1a001a1a,
+      0xaf00afaf, 0xba00baba, 0xb500b5b5, 0x81008181
+    },
+    { /* x1 */
+      0x52520052, 0x09090009, 0x6a6a006a, 0xd5d500d5,
+      0x30300030, 0x36360036, 0xa5a500a5, 0x38380038,
+      0xbfbf00bf, 0x40400040, 0xa3a300a3, 0x9e9e009e,
+      0x81810081, 0xf3f300f3, 0xd7d700d7, 0xfbfb00fb,
+      0x7c7c007c, 0xe3e300e3, 0x39390039, 0x82820082,
+      0x9b9b009b, 0x2f2f002f, 0xffff00ff, 0x87870087,
+      0x34340034, 0x8e8e008e, 0x43430043, 0x44440044,
+      0xc4c400c4, 0xdede00de, 0xe9e900e9, 0xcbcb00cb,
+      0x54540054, 0x7b7b007b, 0x94940094, 0x32320032,
+      0xa6a600a6, 0xc2c200c2, 0x23230023, 0x3d3d003d,
+      0xeeee00ee, 0x4c4c004c, 0x95950095, 0x0b0b000b,
+      0x42420042, 0xfafa00fa, 0xc3c300c3, 0x4e4e004e,
+      0x08080008, 0x2e2e002e, 0xa1a100a1, 0x66660066,
+      0x28280028, 0xd9d900d9, 0x24240024, 0xb2b200b2,
+      0x76760076, 0x5b5b005b, 0xa2a200a2, 0x49490049,
+      0x6d6d006d, 0x8b8b008b, 0xd1d100d1, 0x25250025,
+      0x72720072, 0xf8f800f8, 0xf6f600f6, 0x64640064,
+      0x86860086, 0x68680068, 0x98980098, 0x16160016,
+      0xd4d400d4, 0xa4a400a4, 0x5c5c005c, 0xcccc00cc,
+      0x5d5d005d, 0x65650065, 0xb6b600b6, 0x92920092,
+      0x6c6c006c, 0x70700070, 0x48480048, 0x50500050,
+      0xfdfd00fd, 0xeded00ed, 0xb9b900b9, 0xdada00da,
+      0x5e5e005e, 0x15150015, 0x46460046, 0x57570057,
+      0xa7a700a7, 0x8d8d008d, 0x9d9d009d, 0x84840084,
+      0x90900090, 0xd8d800d8, 0xabab00ab, 0x00000000,
+      0x8c8c008c, 0xbcbc00bc, 0xd3d300d3, 0x0a0a000a,
+      0xf7f700f7, 0xe4e400e4, 0x58580058, 0x05050005,
+      0xb8b800b8, 0xb3b300b3, 0x45450045, 0x06060006,
+      0xd0d000d0, 0x2c2c002c, 0x1e1e001e, 0x8f8f008f,
+      0xcaca00ca, 0x3f3f003f, 0x0f0f000f, 0x02020002,
+      0xc1c100c1, 0xafaf00af, 0xbdbd00bd, 0x03030003,
+      0x01010001, 0x13130013, 0x8a8a008a, 0x6b6b006b,
+      0x3a3a003a, 0x91910091, 0x11110011, 0x41410041,
+      0x4f4f004f, 0x67670067, 0xdcdc00dc, 0xeaea00ea,
+      0x97970097, 0xf2f200f2, 0xcfcf00cf, 0xcece00ce,
+      0xf0f000f0, 0xb4b400b4, 0xe6e600e6, 0x73730073,
+      0x96960096, 0xacac00ac, 0x74740074, 0x22220022,
+      0xe7e700e7, 0xadad00ad, 0x35350035, 0x85850085,
+      0xe2e200e2, 0xf9f900f9, 0x37370037, 0xe8e800e8,
+      0x1c1c001c, 0x75750075, 0xdfdf00df, 0x6e6e006e,
+      0x47470047, 0xf1f100f1, 0x1a1a001a, 0x71710071,
+      0x1d1d001d, 0x29290029, 0xc5c500c5, 0x89890089,
+      0x6f6f006f, 0xb7b700b7, 0x62620062, 0x0e0e000e,
+      0xaaaa00aa, 0x18180018, 0xbebe00be, 0x1b1b001b,
+      0xfcfc00fc, 0x56560056, 0x3e3e003e, 0x4b4b004b,
+      0xc6c600c6, 0xd2d200d2, 0x79790079, 0x20200020,
+      0x9a9a009a, 0xdbdb00db, 0xc0c000c0, 0xfefe00fe,
+      0x78780078, 0xcdcd00cd, 0x5a5a005a, 0xf4f400f4,
+      0x1f1f001f, 0xdddd00dd, 0xa8a800a8, 0x33330033,
+      0x88880088, 0x07070007, 0xc7c700c7, 0x31310031,
+      0xb1b100b1, 0x12120012, 0x10100010, 0x59590059,
+      0x27270027, 0x80800080, 0xecec00ec, 0x5f5f005f,
+      0x60600060, 0x51510051, 0x7f7f007f, 0xa9a900a9,
+      0x19190019, 0xb5b500b5, 0x4a4a004a, 0x0d0d000d,
+      0x2d2d002d, 0xe5e500e5, 0x7a7a007a, 0x9f9f009f,
+      0x93930093, 0xc9c900c9, 0x9c9c009c, 0xefef00ef,
+      0xa0a000a0, 0xe0e000e0, 0x3b3b003b, 0x4d4d004d,
+      0xaeae00ae, 0x2a2a002a, 0xf5f500f5, 0xb0b000b0,
+      0xc8c800c8, 0xebeb00eb, 0xbbbb00bb, 0x3c3c003c,
+      0x83830083, 0x53530053, 0x99990099, 0x61610061,
+      0x17170017, 0x2b2b002b, 0x04040004, 0x7e7e007e,
+      0xbaba00ba, 0x77770077, 0xd6d600d6, 0x26260026,
+      0xe1e100e1, 0x69690069, 0x14140014, 0x63630063,
+      0x55550055, 0x21210021, 0x0c0c000c, 0x7d7d007d
+    },
+    { /* x2 */
+      0x30303000, 0x68686800, 0x99999900, 0x1b1b1b00,
+      0x87878700, 0xb9b9b900, 0x21212100, 0x78787800,
+      0x50505000, 0x39393900, 0xdbdbdb00, 0xe1e1e100,
+      0x72727200, 0x09090900, 0x62626200, 0x3c3c3c00,
+      0x3e3e3e00, 0x7e7e7e00, 0x5e5e5e00, 0x8e8e8e00,
+      0xf1f1f100, 0xa0a0a000, 0xcccccc00, 0xa3a3a300,
+      0x2a2a2a00, 0x1d1d1d00, 0xfbfbfb00, 0xb6b6b600,
+      0xd6d6d600, 0x20202000, 0xc4c4c400, 0x8d8d8d00,
+      0x81818100, 0x65656500, 0xf5f5f500, 0x89898900,
+      0xcbcbcb00, 0x9d9d9d00, 0x77777700, 0xc6c6c600,
+      0x57575700, 0x43434300, 0x56565600, 0x17171700,
+      0xd4d4d400, 0x40404000, 0x1a1a1a00, 0x4d4d4d00,
+      0xc0c0c000, 0x63636300, 0x6c6c6c00, 0xe3e3e300,
+      0xb7b7b700, 0xc8c8c800, 0x64646400, 0x6a6a6a00,
+      0x53535300, 0xaaaaaa00, 0x38383800, 0x98989800,
+      0x0c0c0c00, 0xf4f4f400, 0x9b9b9b00, 0xededed00,
+      0x7f7f7f00, 0x22222200, 0x76767600, 0xafafaf00,
+      0xdddddd00, 0x3a3a3a00, 0x0b0b0b00, 0x58585800,
+      0x67676700, 0x88888800, 0x06060600, 0xc3c3c300,
+      0x35353500, 0x0d0d0d00, 0x01010100, 0x8b8b8b00,
+      0x8c8c8c00, 0xc2c2c200, 0xe6e6e600, 0x5f5f5f00,
+      0x02020200, 0x24242400, 0x75757500, 0x93939300,
+      0x66666600, 0x1e1e1e00, 0xe5e5e500, 0xe2e2e200,
+      0x54545400, 0xd8d8d800, 0x10101000, 0xcecece00,
+      0x7a7a7a00, 0xe8e8e800, 0x08080800, 0x2c2c2c00,
+      0x12121200, 0x97979700, 0x32323200, 0xababab00,
+      0xb4b4b400, 0x27272700, 0x0a0a0a00, 0x23232300,
+      0xdfdfdf00, 0xefefef00, 0xcacaca00, 0xd9d9d900,
+      0xb8b8b800, 0xfafafa00, 0xdcdcdc00, 0x31313100,
+      0x6b6b6b00, 0xd1d1d100, 0xadadad00, 0x19191900,
+      0x49494900, 0xbdbdbd00, 0x51515100, 0x96969600,
+      0xeeeeee00, 0xe4e4e400, 0xa8a8a800, 0x41414100,
+      0xdadada00, 0xffffff00, 0xcdcdcd00, 0x55555500,
+      0x86868600, 0x36363600, 0xbebebe00, 0x61616100,
+      0x52525200, 0xf8f8f800, 0xbbbbbb00, 0x0e0e0e00,
+      0x82828200, 0x48484800, 0x69696900, 0x9a9a9a00,
+      0xe0e0e000, 0x47474700, 0x9e9e9e00, 0x5c5c5c00,
+      0x04040400, 0x4b4b4b00, 0x34343400, 0x15151500,
+      0x79797900, 0x26262600, 0xa7a7a700, 0xdedede00,
+      0x29292900, 0xaeaeae00, 0x92929200, 0xd7d7d700,
+      0x84848400, 0xe9e9e900, 0xd2d2d200, 0xbababa00,
+      0x5d5d5d00, 0xf3f3f300, 0xc5c5c500, 0xb0b0b000,
+      0xbfbfbf00, 0xa4a4a400, 0x3b3b3b00, 0x71717100,
+      0x44444400, 0x46464600, 0x2b2b2b00, 0xfcfcfc00,
+      0xebebeb00, 0x6f6f6f00, 0xd5d5d500, 0xf6f6f600,
+      0x14141400, 0xfefefe00, 0x7c7c7c00, 0x70707000,
+      0x5a5a5a00, 0x7d7d7d00, 0xfdfdfd00, 0x2f2f2f00,
+      0x18181800, 0x83838300, 0x16161600, 0xa5a5a500,
+      0x91919100, 0x1f1f1f00, 0x05050500, 0x95959500,
+      0x74747400, 0xa9a9a900, 0xc1c1c100, 0x5b5b5b00,
+      0x4a4a4a00, 0x85858500, 0x6d6d6d00, 0x13131300,
+      0x07070700, 0x4f4f4f00, 0x4e4e4e00, 0x45454500,
+      0xb2b2b200, 0x0f0f0f00, 0xc9c9c900, 0x1c1c1c00,
+      0xa6a6a600, 0xbcbcbc00, 0xececec00, 0x73737300,
+      0x90909000, 0x7b7b7b00, 0xcfcfcf00, 0x59595900,
+      0x8f8f8f00, 0xa1a1a100, 0xf9f9f900, 0x2d2d2d00,
+      0xf2f2f200, 0xb1b1b100, 0x00000000, 0x94949400,
+      0x37373700, 0x9f9f9f00, 0xd0d0d000, 0x2e2e2e00,
+      0x9c9c9c00, 0x6e6e6e00, 0x28282800, 0x3f3f3f00,
+      0x80808000, 0xf0f0f000, 0x3d3d3d00, 0xd3d3d300,
+      0x25252500, 0x8a8a8a00, 0xb5b5b500, 0xe7e7e700,
+      0x42424200, 0xb3b3b300, 0xc7c7c700, 0xeaeaea00,
+      0xf7f7f700, 0x4c4c4c00, 0x11111100, 0x33333300,
+      0x03030300, 0xa2a2a200, 0xacacac00, 0x60606000
+    },
+    0
+  };
+
+#ifdef USE_AESNI_AVX
+/* AES-NI/AVX assembly implementations (cipher/aria-aesni-avx-amd64.S).
+ * NOTE(review): the unsigned int return value appears to be a stack-burn
+ * depth reported back to the caller — confirm against the call sites. */
+extern unsigned int
+_gcry_aria_aesni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out,
+                                      const byte *in, const void *key,
+                                      u64 nblks) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_aesni_avx_ctr_crypt_blk16(const void *ctx, byte *out,
+                                    const byte *in, byte *iv) ASM_FUNC_ABI;
+
+#ifdef USE_GFNI_AVX
+/* GFNI/AVX variants of the same entry points. */
+extern unsigned int
+_gcry_aria_gfni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out,
+                                     const byte *in, const void *key,
+                                     u64 nblks) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx_ctr_crypt_blk16(const void *ctx, byte *out,
+                                   const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_GFNI_AVX */
+
+/* ECB-crypt 1..16 blocks, dispatching to GFNI when available, else
+ * AES-NI.  ASM_EXTRA_STACK accounts for the Win64 ABI-conversion frame. */
+static inline unsigned int
+aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in,
+                          const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks)
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX
+  else if (ctx->use_gfni_avx)
+    return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX */
+  else
+    return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
+               + ASM_EXTRA_STACK;
+}
+
+/* CTR-crypt 16 blocks with the same dispatch; 'iv' is the big-endian
+ * 128-bit counter block (presumably advanced in place by the assembly —
+ * confirm). */
+static inline unsigned int
+aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in,
+                        byte *iv)
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX
+  else if (ctx->use_gfni_avx)
+    return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX */
+  else
+    return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+}
+#endif /* USE_AESNI_AVX */
+
+#ifdef USE_AESNI_AVX2
+/* AES-NI/AVX2 assembly implementations (32 blocks per call). */
+extern unsigned int
+_gcry_aria_aesni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+                                     const byte *in,
+                                     const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+                                     const byte *in, byte *iv) ASM_FUNC_ABI;
+
+#ifdef USE_VAES_AVX2
+/* VAES/AVX2 variants. */
+extern unsigned int
+_gcry_aria_vaes_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in,
+                                    const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_vaes_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_VAES_AVX2 */
+
+#ifdef USE_GFNI_AVX2
+/* GFNI/AVX2 variants. */
+extern unsigned int
+_gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in,
+                                    const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_GFNI_AVX2 */
+
+/* ECB-crypt 32 blocks; dispatch preference is GFNI > VAES > AES-NI. */
+static inline unsigned int
+aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
+                         const u32 key[][ARIA_RD_KEY_WORDS])
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX2
+  else if (ctx->use_gfni_avx2)
+    return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX2 */
+#ifdef USE_VAES_AVX2
+  else if (ctx->use_vaes_avx2)
+    return _gcry_aria_vaes_avx2_ecb_crypt_blk32(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+#endif /* USE_VAES_AVX2 */
+  else
+    return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+}
+
+/* CTR-crypt 32 blocks with the same dispatch; 'iv' is the big-endian
+ * 128-bit counter block. */
+static inline unsigned int
+aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
+                         byte *iv)
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX2
+  else if (ctx->use_gfni_avx2)
+    return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX2 */
+#ifdef USE_VAES_AVX2
+  else if (ctx->use_vaes_avx2)
+    return _gcry_aria_vaes_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+#endif /* USE_VAES_AVX2 */
+  else
+    return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+}
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_GFNI_AVX512
+/* GFNI/AVX512 assembly implementations (64 blocks per call, see
+ * cipher/aria-gfni-avx512-amd64.S). */
+extern unsigned int
+_gcry_aria_gfni_avx512_ecb_crypt_blk64(const void *ctx, byte *out,
+                                      const byte *in,
+                                      const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx512_ctr_crypt_blk64(const void *ctx, byte *out,
+                                      const byte *in, byte *iv) ASM_FUNC_ABI;
+
+/* ECB-crypt 64 blocks; ASM_EXTRA_STACK accounts for the Win64 ABI frame. */
+static inline unsigned int
+aria_gfni_avx512_ecb_crypt_blk64(const ARIA_context *ctx, byte *out,
+                                const byte *in,
+                                const u32 key[][ARIA_RD_KEY_WORDS])
+{
+  return _gcry_aria_gfni_avx512_ecb_crypt_blk64(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+}
+
+/* CTR-crypt 64 blocks; 'iv' is the big-endian 128-bit counter block. */
+static inline unsigned int
+aria_gfni_avx512_ctr_crypt_blk64(const ARIA_context *ctx, byte *out,
+                                const byte *in, byte *iv)
+{
+  return _gcry_aria_gfni_avx512_ctr_crypt_blk64(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+}
+#endif /* USE_GFNI_AVX512 */
+
+/* Prefetching for sbox tables.  Touches one byte per 32-byte span so that
+ * every cache line of the table gets loaded; the main loop issues eight
+ * independent reads per iteration to keep them pipelined. */
+static inline void
+prefetch_table(const volatile byte *tab, size_t len)
+{
+  size_t i;
+
+  for (i = 0; len - i >= 8 * 32; i += 8 * 32)
+    {
+      (void)tab[i + 0 * 32];
+      (void)tab[i + 1 * 32];
+      (void)tab[i + 2 * 32];
+      (void)tab[i + 3 * 32];
+      (void)tab[i + 4 * 32];
+      (void)tab[i + 5 * 32];
+      (void)tab[i + 6 * 32];
+      (void)tab[i + 7 * 32];
+    }
+  /* Remaining tail of the table. */
+  for (; i < len; i += 32)
+    {
+      (void)tab[i];
+    }
+
+  /* Make sure the final byte (and its cache line) is touched too. */
+  (void)tab[len - 1];
+}
+
+/* Pull the whole sbox structure into cache before table-based processing. */
+static inline void
+prefetch_sboxes(void)
+{
+  /* Modify counters to trigger copy-on-write and unsharing if physical pages
+   * of look-up table are shared between processes.  Modifying counters also
+   * causes checksums for pages to change and hint same-page merging algorithm
+   * that these pages are frequently changing.  */
+  sboxes.counter_head++;
+  sboxes.counter_tail++;
+
+  /* Prefetch look-up tables to cache.  */
+  prefetch_table((const void *)&sboxes, sizeof(sboxes));
+}
+
+
+/* Rotate 32-bit value right by r bits (thin wrapper over ror). */
+static ALWAYS_INLINE
+u32 rotr32(u32 v, u32 r)
+{
+  return ror(v, r);
+}
+
+/* Reverse the byte order of a 32-bit value. */
+static ALWAYS_INLINE
+u32 bswap32(u32 v)
+{
+  return _gcry_bswap32(v);
+}
+
+/* Extract byte y (0 = most significant) from big-endian word x. */
+static ALWAYS_INLINE u32
+get_u8(u32 x, u32 y)
+{
+  return (x >> ((3 - y) * 8)) & 0xFF;
+}
+
+/* Assemble a 32-bit word from four bytes, v0 being most significant. */
+static ALWAYS_INLINE u32
+make_u32(byte v0, byte v1, byte v2, byte v3)
+{
+  return ((u32)v0 << 24) | ((u32)v1 << 16) | ((u32)v2 <<  8) | ((u32)v3);
+}
+
+/* Diffusion helper M; algebraically equal to
+ * rotr(t0, 8) ^ rotr(t0, 16) ^ rotr(t0, 24). */
+static ALWAYS_INLINE u32
+aria_m(u32 t0)
+{
+  return rotr32(t0, 8) ^ rotr32(t0 ^ rotr32(t0, 8), 16);
+}
+
+/* S-Box Layer 1 + M: substitute each byte of the four state words through
+ * the odd-round S-box order (S1, S2, X1, X2) using tables whose entries
+ * already carry the byte replication needed by the diffusion layer. */
+static ALWAYS_INLINE void
+aria_sbox_layer1_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 = sboxes.s1[get_u8(*t0, 0)] ^
+       sboxes.s2[get_u8(*t0, 1)] ^
+       sboxes.x1[get_u8(*t0, 2)] ^
+       sboxes.x2[get_u8(*t0, 3)];
+  *t1 = sboxes.s1[get_u8(*t1, 0)] ^
+       sboxes.s2[get_u8(*t1, 1)] ^
+       sboxes.x1[get_u8(*t1, 2)] ^
+       sboxes.x2[get_u8(*t1, 3)];
+  *t2 = sboxes.s1[get_u8(*t2, 0)] ^
+       sboxes.s2[get_u8(*t2, 1)] ^
+       sboxes.x1[get_u8(*t2, 2)] ^
+       sboxes.x2[get_u8(*t2, 3)];
+  *t3 = sboxes.s1[get_u8(*t3, 0)] ^
+       sboxes.s2[get_u8(*t3, 1)] ^
+       sboxes.x1[get_u8(*t3, 2)] ^
+       sboxes.x2[get_u8(*t3, 3)];
+}
+
+/* S-Box Layer 2 + M: same as layer 1 but with the even-round S-box order
+ * (X1, X2, S1, S2), i.e. the inverse boxes first. */
+static ALWAYS_INLINE void
+aria_sbox_layer2_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 = sboxes.x1[get_u8(*t0, 0)] ^
+       sboxes.x2[get_u8(*t0, 1)] ^
+       sboxes.s1[get_u8(*t0, 2)] ^
+       sboxes.s2[get_u8(*t0, 3)];
+  *t1 = sboxes.x1[get_u8(*t1, 0)] ^
+       sboxes.x2[get_u8(*t1, 1)] ^
+       sboxes.s1[get_u8(*t1, 2)] ^
+       sboxes.s2[get_u8(*t1, 3)];
+  *t2 = sboxes.x1[get_u8(*t2, 0)] ^
+       sboxes.x2[get_u8(*t2, 1)] ^
+       sboxes.s1[get_u8(*t2, 2)] ^
+       sboxes.s2[get_u8(*t2, 3)];
+  *t3 = sboxes.x1[get_u8(*t3, 0)] ^
+       sboxes.x2[get_u8(*t3, 1)] ^
+       sboxes.s1[get_u8(*t3, 2)] ^
+       sboxes.s2[get_u8(*t3, 3)];
+}
+
+/* Word-level diffusion: mix the four 32-bit state words with a fixed
+ * sequence of XORs (part of ARIA's 16x16 binary diffusion matrix A). */
+static ALWAYS_INLINE void
+aria_diff_word(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t1 ^= *t2;
+  *t2 ^= *t3;
+  *t0 ^= *t1;
+
+  *t3 ^= *t1;
+  *t2 ^= *t0;
+  *t1 ^= *t2;
+}
+
+/* Byte-level diffusion: t1 swaps the bytes within each 16-bit half,
+ * t2 is rotated by 16 bits, t3 is fully byte-reversed. */
+static inline void aria_diff_byte(u32 *t1, u32 *t2, u32 *t3)
+{
+  *t1 = ((*t1 << 8) & 0xff00ff00) ^ ((*t1 >> 8) & 0x00ff00ff);
+  *t2 = rotr32(*t2, 16);
+  *t3 = bswap32(*t3);
+}
+
+/* Key XOR Layer: XOR one 128-bit round key into the state words. */
+static ALWAYS_INLINE void
+aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 ^= rk[0];
+  *t1 ^= rk[1];
+  *t2 ^= rk[2];
+  *t3 ^= rk[3];
+}
+
+/* Odd round Substitution & Diffusion: layer-1 S-boxes followed by the
+ * word/byte/word diffusion sequence. */
+static ALWAYS_INLINE void
+aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  aria_sbox_layer1_with_pre_diff(t0, t1, t2, t3);
+  aria_diff_word(t0, t1, t2, t3);
+  aria_diff_byte(t1, t2, t3);
+  aria_diff_word(t0, t1, t2, t3);
+}
+
+/* Even round Substitution & Diffusion: layer-2 S-boxes, and the byte
+ * diffusion applied to the other word triple (t3, t0, t1). */
+static ALWAYS_INLINE void
+aria_subst_diff_even(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  aria_sbox_layer2_with_pre_diff(t0, t1, t2, t3);
+  aria_diff_word(t0, t1, t2, t3);
+  aria_diff_byte(t3, t0, t1);
+  aria_diff_word(t0, t1, t2, t3);
+}
+
+/* Last round: substitution only, no diffusion.  The plain substituted
+ * byte is recovered from the combined tables: s1/s2/x1 entries hold it in
+ * their low byte, x2 entries hold it in bits 31..24 (hence the >> 24). */
+static ALWAYS_INLINE void
+aria_last_round(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 = make_u32((byte)(sboxes.x1[get_u8(*t0, 0)]),
+                (byte)(sboxes.x2[get_u8(*t0, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t0, 2)]),
+                (byte)(sboxes.s2[get_u8(*t0, 3)]));
+  *t1 = make_u32((byte)(sboxes.x1[get_u8(*t1, 0)]),
+                (byte)(sboxes.x2[get_u8(*t1, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t1, 2)]),
+                (byte)(sboxes.s2[get_u8(*t1, 3)]));
+  *t2 = make_u32((byte)(sboxes.x1[get_u8(*t2, 0)]),
+                (byte)(sboxes.x2[get_u8(*t2, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t2, 2)]),
+                (byte)(sboxes.s2[get_u8(*t2, 3)]));
+  *t3 = make_u32((byte)(sboxes.x1[get_u8(*t3, 0)]),
+                (byte)(sboxes.x2[get_u8(*t3, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t3, 2)]),
+                (byte)(sboxes.s2[get_u8(*t3, 3)]));
+}
+
+/* Q, R Macro expanded ARIA GSRK: rk = x XOR (y rotated right by n bits),
+ * computed over the 128-bit value held in four words.  q selects the word
+ * offset, r the bit offset.  All callers pass n in {19, 31, 67, 97, 109},
+ * so r is never 0 and the (32 - r) shifts stay well defined. */
+static ALWAYS_INLINE void
+aria_gsrk(u32 *rk, u32 *x, u32 *y, u32 n)
+{
+  int q = 4 - (n / 32);
+  int r = n % 32;
+
+  rk[0] = (x[0]) ^
+         ((y[q % 4]) >> r) ^
+         ((y[(q + 3) % 4]) << (32 - r));
+  rk[1] = (x[1]) ^
+         ((y[(q + 1) % 4]) >> r) ^
+         ((y[q % 4]) << (32 - r));
+  rk[2] = (x[2]) ^
+         ((y[(q + 2) % 4]) >> r) ^
+         ((y[(q + 1) % 4]) << (32 - r));
+  rk[3] = (x[3]) ^
+         ((y[(q + 3) % 4]) >> r) ^
+         ((y[(q + 2) % 4]) << (32 - r));
+}
+
+
+/* Expand 'in_key' (key_len = 16, 24 or 32 bytes) into the encryption
+ * round keys ctx->enc_key[0 .. rounds].  Following the ARIA key schedule:
+ * derive W0..W3 from the key with three substitution/diffusion rounds,
+ * then form each round key as GSRK(Wi, Wj, n) for fixed rotations n. */
+static NO_INLINE void
+aria_set_encrypt_key(ARIA_context *ctx, const byte *in_key, u32 key_len)
+{
+  u32 w0[4], w1[4], w2[4], w3[4];
+  u32 reg0, reg1, reg2, reg3;
+  const u32 *ck;
+  int rkidx = 0;
+
+  /* 12, 14 or 16 rounds for 128/192/256-bit keys. */
+  ctx->rounds = (key_len + 32) / 4;
+  prefetch_sboxes();
+
+  /* Starting round constant: offset 0, 4 or 8 into key_rc (CK1/CK2/CK3). */
+  ck = &key_rc[(key_len - 16) / 2];
+
+  /* KL: first 128 bits of the key. */
+  w0[0] = buf_get_be32(in_key + 0);
+  w0[1] = buf_get_be32(in_key + 4);
+  w0[2] = buf_get_be32(in_key + 8);
+  w0[3] = buf_get_be32(in_key + 12);
+
+  reg0 = w0[0] ^ ck[0];
+  reg1 = w0[1] ^ ck[1];
+  reg2 = w0[2] ^ ck[2];
+  reg3 = w0[3] ^ ck[3];
+
+  aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+
+  /* KR: remaining key bytes, zero-padded for shorter keys. */
+  if (key_len > 16)
+    {
+      w1[0] = buf_get_be32(in_key + 16);
+      w1[1] = buf_get_be32(in_key + 20);
+      if (key_len > 24)
+       {
+         w1[2] = buf_get_be32(in_key + 24);
+         w1[3] = buf_get_be32(in_key + 28);
+       }
+      else
+       {
+         w1[2] = 0;
+         w1[3] = 0;
+       }
+    }
+  else
+    {
+      w1[0] = 0;
+      w1[1] = 0;
+      w1[2] = 0;
+      w1[3] = 0;
+    }
+
+  /* W1 = FO(W0, CK1) ^ KR. */
+  w1[0] ^= reg0;
+  w1[1] ^= reg1;
+  w1[2] ^= reg2;
+  w1[3] ^= reg3;
+
+  reg0 = w1[0];
+  reg1 = w1[1];
+  reg2 = w1[2];
+  reg3 = w1[3];
+
+  reg0 ^= ck[4];
+  reg1 ^= ck[5];
+  reg2 ^= ck[6];
+  reg3 ^= ck[7];
+
+  aria_subst_diff_even(&reg0, &reg1, &reg2, &reg3);
+
+  /* W2 = FE(W1, CK2) ^ W0. */
+  reg0 ^= w0[0];
+  reg1 ^= w0[1];
+  reg2 ^= w0[2];
+  reg3 ^= w0[3];
+
+  w2[0] = reg0;
+  w2[1] = reg1;
+  w2[2] = reg2;
+  w2[3] = reg3;
+
+  reg0 ^= ck[8];
+  reg1 ^= ck[9];
+  reg2 ^= ck[10];
+  reg3 ^= ck[11];
+
+  aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+
+  /* W3 = FO(W2, CK3) ^ W1. */
+  w3[0] = reg0 ^ w1[0];
+  w3[1] = reg1 ^ w1[1];
+  w3[2] = reg2 ^ w1[2];
+  w3[3] = reg3 ^ w1[3];
+
+  /* Round keys: cycle (W0,W1), (W1,W2), (W2,W3), (W3,W0) with rotation
+   * amounts 19, 31, 67, 97 and finally 109. */
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 19);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w1, w2, 19);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w2, w3, 19);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w3, w0, 19);
+
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 31);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w1, w2, 31);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w2, w3, 31);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w3, w0, 31);
+
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 67);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w1, w2, 67);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w2, w3, 67);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w3, w0, 67);
+
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 97);
+  /* Longer keys need additional round keys (13 for 128-bit, 15 for
+   * 192-bit, 17 for 256-bit keys). */
+  if (key_len > 16)
+    {
+      rkidx++;
+      aria_gsrk(ctx->enc_key[rkidx], w1, w2, 97);
+      rkidx++;
+      aria_gsrk(ctx->enc_key[rkidx], w2, w3, 97);
+
+      if (key_len > 24)
+       {
+         rkidx++;
+         aria_gsrk(ctx->enc_key[rkidx], w3, w0, 97);
+
+         rkidx++;
+         aria_gsrk(ctx->enc_key[rkidx], w0, w1, 109);
+       }
+    }
+
+  /* Clear key-derived material from the stack. */
+  wipememory(w0, sizeof(w0));
+  wipememory(w1, sizeof(w1));
+  wipememory(w2, sizeof(w2));
+  wipememory(w3, sizeof(w3));
+}
+
+/* Derive the decryption round keys from the encryption keys: reverse the
+ * key order (first and last keys swap) and apply the diffusion function
+ * (aria_m plus the word/byte diffusions) to the interior keys, so that
+ * decryption can run through the same round structure as encryption. */
+static void
+aria_set_decrypt_key(ARIA_context *ctx)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    {
+      ctx->dec_key[0][i] = ctx->enc_key[ctx->rounds][i];
+      ctx->dec_key[ctx->rounds][i] = ctx->enc_key[0][i];
+    }
+
+  for (i = 1; i < ctx->rounds; i++)
+    {
+      ctx->dec_key[i][0] = aria_m(ctx->enc_key[ctx->rounds - i][0]);
+      ctx->dec_key[i][1] = aria_m(ctx->enc_key[ctx->rounds - i][1]);
+      ctx->dec_key[i][2] = aria_m(ctx->enc_key[ctx->rounds - i][2]);
+      ctx->dec_key[i][3] = aria_m(ctx->enc_key[ctx->rounds - i][3]);
+
+      aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1],
+                    &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+      aria_diff_byte(&ctx->dec_key[i][1],
+                    &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+      aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1],
+                    &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+    }
+}
+
+/* Process one 16-byte block with the given round-key schedule (works for
+ * both directions, depending on whether enc_key or dec_key is passed).
+ * Alternates odd and even substitution/diffusion rounds; the loop exits
+ * after the odd round that consumes key index rounds-1, and the final
+ * round uses the last-round substitution with key index rounds.
+ * Returns an estimate of the stack depth to burn afterwards. */
+static NO_INLINE unsigned int
+aria_crypt(ARIA_context *ctx, byte *out, const byte *in,
+          u32 key[][ARIA_RD_KEY_WORDS])
+{
+  u32 reg0, reg1, reg2, reg3;
+  int rounds = ctx->rounds;
+  int rkidx = 0;
+
+  reg0 = buf_get_be32(in + 0);
+  reg1 = buf_get_be32(in + 4);
+  reg2 = buf_get_be32(in + 8);
+  reg3 = buf_get_be32(in + 12);
+
+  /* Initial key whitening. */
+  aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+  rkidx++;
+
+  while (1)
+    {
+      aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+      aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+      rkidx++;
+
+      if (rkidx >= rounds)
+       break;
+
+      aria_subst_diff_even(&reg0, &reg1, &reg2, &reg3);
+      aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+      rkidx++;
+    }
+
+  aria_last_round(&reg0, &reg1, &reg2, &reg3);
+  aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+
+  buf_put_be32(out + 0, reg0);
+  buf_put_be32(out + 4, reg1);
+  buf_put_be32(out + 8, reg2);
+  buf_put_be32(out + 12, reg3);
+
+  return 4 * sizeof(void *) + 4 * sizeof(u32); /* stack burn depth */
+}
+
+/* Single-block encryption entry point used by the cipher spec below.
+ * Returns the stack burn depth reported by aria_crypt. */
+unsigned int
+aria_encrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  prefetch_sboxes ();
+
+  return aria_crypt (ctx, outbuf, inbuf, ctx->enc_key);
+}
+
+/* Single-block decryption entry point used by the cipher spec below.
+ * The decryption key schedule is derived lazily on first use. */
+unsigned int
+aria_decrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  if (!ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  prefetch_sboxes ();
+
+  return aria_crypt (ctx, outbuf, inbuf, ctx->dec_key);
+}
+
+
+/* Process two adjacent 16-byte blocks in parallel.  Same round structure
+ * as aria_crypt, but the a- and b-block operations are interleaved to
+ * expose instruction-level parallelism to the compiler/CPU. */
+static unsigned int
+aria_crypt_2blks(ARIA_context *ctx, byte *out, const byte *in,
+                u32 key[][ARIA_RD_KEY_WORDS])
+{
+  u32 ra0, ra1, ra2, ra3;
+  u32 rb0, rb1, rb2, rb3;
+  int rounds = ctx->rounds;
+  int rkidx = 0;
+
+  ra0 = buf_get_be32(in + 0);
+  ra1 = buf_get_be32(in + 4);
+  ra2 = buf_get_be32(in + 8);
+  ra3 = buf_get_be32(in + 12);
+  rb0 = buf_get_be32(in + 16);
+  rb1 = buf_get_be32(in + 20);
+  rb2 = buf_get_be32(in + 24);
+  rb3 = buf_get_be32(in + 28);
+
+  while (1)
+    {
+      aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+      aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+      rkidx++;
+
+      aria_subst_diff_odd(&ra0, &ra1, &ra2, &ra3);
+      aria_subst_diff_odd(&rb0, &rb1, &rb2, &rb3);
+      aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+      aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+      rkidx++;
+
+      if (rkidx >= rounds)
+       break;
+
+      aria_subst_diff_even(&ra0, &ra1, &ra2, &ra3);
+      aria_subst_diff_even(&rb0, &rb1, &rb2, &rb3);
+    }
+
+  aria_last_round(&ra0, &ra1, &ra2, &ra3);
+  aria_last_round(&rb0, &rb1, &rb2, &rb3);
+  aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+  aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+
+  buf_put_be32(out + 0, ra0);
+  buf_put_be32(out + 4, ra1);
+  buf_put_be32(out + 8, ra2);
+  buf_put_be32(out + 12, ra3);
+  buf_put_be32(out + 16, rb0);
+  buf_put_be32(out + 20, rb1);
+  buf_put_be32(out + 24, rb2);
+  buf_put_be32(out + 28, rb3);
+
+  return 4 * sizeof(void *) + 8 * sizeof(u32); /* stack burn depth */
+}
+
+/* ECB-style bulk helper: process NUM_BLKS blocks with the widest
+ * available implementation, largest chunks first: GFNI/AVX512 64-block,
+ * AVX2 32-block, AVX 1..16-block (used only for 3 or more blocks; one or
+ * two blocks are cheaper in the scalar code below).  Returns the largest
+ * stack burn depth of the paths actually taken. */
+static unsigned int
+aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in,
+                  size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS])
+{
+  unsigned int burn_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      unsigned int nburn = 0;
+
+      while (num_blks >= 64)
+       {
+         nburn = aria_gfni_avx512_ecb_crypt_blk64 (ctx, out, in, key);
+         in += 64 * ARIA_BLOCK_SIZE;
+         out += 64 * ARIA_BLOCK_SIZE;
+         num_blks -= 64;
+       }
+
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      if (num_blks == 0)
+       return burn_depth;
+    }
+#endif /* USE_GFNI_AVX512 */
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2)
+    {
+      unsigned int nburn = 0;
+
+      while (num_blks >= 32)
+       {
+         nburn = aria_avx2_ecb_crypt_blk32 (ctx, out, in, key);
+         in += 32 * ARIA_BLOCK_SIZE;
+         out += 32 * ARIA_BLOCK_SIZE;
+         num_blks -= 32;
+       }
+
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      if (num_blks == 0)
+       return burn_depth;
+    }
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx || ctx->use_gfni_avx)
+    {
+      unsigned int nburn = 0;
+
+      while (num_blks >= 3)
+       {
+         size_t curr_blks = num_blks < 16 ? num_blks : 16;
+         nburn = aria_avx_ecb_crypt_blk1_16 (ctx, out, in, key, curr_blks);
+         in += curr_blks * ARIA_BLOCK_SIZE;
+         out += curr_blks * ARIA_BLOCK_SIZE;
+         num_blks -= curr_blks;
+       }
+
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      if (num_blks == 0)
+       return burn_depth;
+    }
+#endif /* USE_AESNI_AVX */
+
+  /* Scalar fallback; prefetch the s-box tables once per bulk call. */
+  if (!ctx->bulk_prefetch_ready)
+    {
+      prefetch_sboxes();
+      ctx->bulk_prefetch_ready = 1;
+    }
+
+  while (num_blks >= 2)
+    {
+      unsigned int nburn = aria_crypt_2blks (ctx, out, in, key);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      out += 2 * ARIA_BLOCK_SIZE;
+      in += 2 * ARIA_BLOCK_SIZE;
+      num_blks -= 2;
+    }
+
+  while (num_blks)
+    {
+      unsigned int nburn = aria_crypt (ctx, out, in, key);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      out += ARIA_BLOCK_SIZE;
+      in += ARIA_BLOCK_SIZE;
+      num_blks--;
+    }
+
+  /* Account for this function's own call frame. */
+  if (burn_depth)
+    burn_depth += sizeof(void *) * 5;
+  return burn_depth;
+}
+
+/* bulk_crypt_fn_t adapter: encrypt NUM_BLKS blocks in ECB fashion. */
+static unsigned int
+aria_enc_blocks (void *c, byte *out, const byte *in, size_t num_blks)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  return aria_crypt_blocks (ctx, out, in, num_blks, ctx->enc_key);
+}
+
+/* bulk_crypt_fn_t adapter: decrypt NUM_BLKS blocks in ECB fashion.
+ * Callers must have prepared the decryption key schedule already. */
+static unsigned int
+aria_dec_blocks (void *c, byte *out, const byte *in, size_t num_blks)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  return aria_crypt_blocks (ctx, out, in, num_blks, ctx->dec_key);
+}
+
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size 16. */
+static void
+_gcry_aria_ctr_enc(void *context, unsigned char *ctr,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      size_t nburn = 0;
+
+      while (nblocks >= 64)
+       {
+         nburn = aria_gfni_avx512_ctr_crypt_blk64 (ctx, outbuf, inbuf, ctr);
+         inbuf += 64 * ARIA_BLOCK_SIZE;
+         outbuf += 64 * ARIA_BLOCK_SIZE;
+         nblocks -= 64;
+       }
+
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+#endif /* USE_GFNI_AVX512 */
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2)
+    {
+      size_t nburn = 0;
+
+      while (nblocks >= 32)
+       {
+         nburn = aria_avx2_ctr_crypt_blk32 (ctx, outbuf, inbuf, ctr);
+         inbuf += 32 * ARIA_BLOCK_SIZE;
+         outbuf += 32 * ARIA_BLOCK_SIZE;
+         nblocks -= 32;
+       }
+
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx || ctx->use_gfni_avx)
+    {
+      size_t nburn = 0;
+
+      while (nblocks >= 16)
+       {
+         nburn = aria_avx_ctr_crypt_blk16 (ctx, outbuf, inbuf, ctr);
+         inbuf += 16 * ARIA_BLOCK_SIZE;
+         outbuf += 16 * ARIA_BLOCK_SIZE;
+         nblocks -= 16;
+       }
+
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+#endif /* USE_AESNI_AVX */
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn = 0;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_ctr_enc_128(ctx, aria_enc_blocks, outbuf, inbuf,
+                              nblocks, ctr, tmpbuf,
+                              sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CBC mode.  CBC encryption is
+   inherently serial (each block depends on the previous ciphertext), so
+   this always uses the scalar single-block function.  With cbc_mac set,
+   only the final block (the MAC) is kept at *outbuf. */
+static void
+_gcry_aria_cbc_enc (void *context, unsigned char *iv,
+                   void *outbuf_arg, const void *inbuf_arg,
+                   size_t nblocks, int cbc_mac)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char *last_iv;
+  unsigned int burn_depth = 0;
+
+  prefetch_sboxes();
+
+  last_iv = iv;
+
+  for (; nblocks; nblocks--)
+    {
+      cipher_block_xor (outbuf, inbuf, last_iv, ARIA_BLOCK_SIZE);
+
+      burn_depth = aria_crypt (ctx, outbuf, outbuf, ctx->enc_key);
+
+      /* The just-produced ciphertext becomes the next chaining value. */
+      last_iv = outbuf;
+      inbuf += ARIA_BLOCK_SIZE;
+      if (!cbc_mac)
+       outbuf += ARIA_BLOCK_SIZE;
+    }
+
+  /* Write the final chaining value back into the caller's IV. */
+  if (last_iv != iv)
+    cipher_block_cpy (iv, last_iv, ARIA_BLOCK_SIZE);
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CBC decryption
+   parallelizes, so all blocks are handed to the generic bulk helper which
+   dispatches through aria_dec_blocks. */
+static void
+_gcry_aria_cbc_dec(void *context, unsigned char *iv,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  if (!ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_cbc_dec_128(ctx, aria_dec_blocks, outbuf, inbuf,
+                              nblocks, iv, tmpbuf,
+                              sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CFB mode.  CFB encryption is
+   serial (the previous ciphertext feeds the next block cipher call), so
+   this uses the scalar single-block function throughout. */
+static void
+_gcry_aria_cfb_enc (void *context, unsigned char *iv,
+                   void *outbuf_arg, const void *inbuf_arg,
+                   size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int burn_depth = 0;
+
+  prefetch_sboxes();
+
+  for (; nblocks; nblocks--)
+    {
+      /* Encrypt the IV. */
+      burn_depth = aria_crypt (ctx, iv, iv, ctx->enc_key);
+      /* XOR the input with the IV and store input into IV.  */
+      cipher_block_xor_2dst(outbuf, iv, inbuf, ARIA_BLOCK_SIZE);
+      outbuf += ARIA_BLOCK_SIZE;
+      inbuf += ARIA_BLOCK_SIZE;
+    }
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CFB decryption
+   parallelizes (the keystream uses the ENcryption direction on prior
+   ciphertext), so all blocks go through the generic bulk helper with
+   aria_enc_blocks. */
+static void
+_gcry_aria_cfb_dec(void *context, unsigned char *iv,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_cfb_dec_128(ctx, aria_enc_blocks, outbuf, inbuf,
+                              nblocks, iv, tmpbuf,
+                              sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption in ECB mode.  Direction is selected by
+   ENCRYPT; the decryption key schedule is derived lazily on first use. */
+static void
+_gcry_aria_ecb_crypt (void *context, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      bulk_crypt_fn_t crypt_blk1_n;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+      crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+      nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_n,
+                                outbuf, inbuf, nblocks, MAX_PARALLEL_BLKS);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in XTS mode.  TWEAK is
+   updated in place by the bulk helper; the decryption key schedule is
+   derived lazily on first use. */
+static void
+_gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      bulk_crypt_fn_t crypt_blk1_n;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+      crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+      nburn = bulk_xts_crypt_128(ctx, crypt_blk1_n,
+                                outbuf, inbuf, nblocks,
+                                tweak, tmpbuf,
+                                sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+                                &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV):
+   only the low 32 bits of CTR increment, little-endian. */
+static void
+_gcry_aria_ctr32le_enc(void *context, unsigned char *ctr,
+                      void *outbuf_arg, const void *inbuf_arg,
+                      size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_ctr32le_enc_128 (ctx, aria_enc_blocks, outbuf, inbuf,
+                                   nblocks, ctr, tmpbuf,
+                                   sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+                                   &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode.  The data
+   block counter is read from and written back to the cipher handle;
+   always returns 0 (all blocks consumed). */
+static size_t
+_gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  ARIA_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+  int burn_stack_depth = 0;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      bulk_crypt_fn_t crypt_blk1_n;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+      crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+      nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_n, outbuf, inbuf, nblocks,
+                                 &blkn, encrypt, tmpbuf,
+                                 sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+
+  return 0;
+}
+
+/* Bulk authentication of complete blocks in OCB mode.  Uses the
+   encryption direction on the AAD; the AAD block counter is read from
+   and written back to the cipher handle.  Always returns 0. */
+static size_t
+_gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
+{
+  ARIA_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+  int burn_stack_depth = 0;
+
+  /* Process all blocks with the bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_ocb_auth_128 (c, ctx, aria_enc_blocks, abuf, nblocks,
+                                &blkn, tmpbuf,
+                                sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+
+  return 0;
+}
+
+
+/* setkey handler for the cipher spec: validate the key length, run the
+ * one-time self test, select the hardware-accelerated implementations,
+ * register the bulk-mode handlers, and expand the encryption key.
+ * Returns GPG_ERR_INV_KEYLEN / GPG_ERR_SELFTEST_FAILED / 0. */
+static gcry_err_code_t
+aria_setkey(void *c, const byte *key, unsigned keylen,
+           cipher_bulk_ops_t *bulk_ops)
+{
+  ARIA_context *ctx = c;
+  /* NOTE(review): the one-time selftest latch is not synchronized; a
+   * concurrent first setkey could run the selftest twice.  Presumably
+   * benign (the test is deterministic) -- confirm against libgcrypt's
+   * initialization guarantees. */
+  static int initialized = 0;
+  static const char *selftest_failed = NULL;
+  unsigned int hwf = _gcry_get_hw_features ();
+
+  (void)hwf;
+
+  if (keylen != 16 && keylen != 24 && keylen != 32)
+    return GPG_ERR_INV_KEYLEN;
+
+  if (!initialized)
+    {
+      initialized = 1;
+      selftest_failed = aria_selftest ();
+      if (selftest_failed)
+       log_error("%s\n", selftest_failed);
+    }
+
+  if (selftest_failed)
+    return GPG_ERR_SELFTEST_FAILED;
+
+  /* Enable each SIMD implementation only when the CPU reports all the
+   * features it needs. */
+#ifdef USE_GFNI_AVX512
+  ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
+#endif
+#ifdef USE_AESNI_AVX2
+  ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_GFNI_AVX2
+  ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_VAES_AVX2
+  ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_AESNI_AVX
+  ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+#ifdef USE_GFNI_AVX
+  ctx->use_gfni_avx = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX);
+#endif
+
+  /* Setup bulk encryption routines.  */
+  memset (bulk_ops, 0, sizeof(*bulk_ops));
+  bulk_ops->cbc_enc = _gcry_aria_cbc_enc;
+  bulk_ops->cbc_dec = _gcry_aria_cbc_dec;
+  bulk_ops->cfb_enc = _gcry_aria_cfb_enc;
+  bulk_ops->cfb_dec = _gcry_aria_cfb_dec;
+  bulk_ops->ctr_enc = _gcry_aria_ctr_enc;
+  bulk_ops->ctr32le_enc = _gcry_aria_ctr32le_enc;
+  bulk_ops->ecb_crypt = _gcry_aria_ecb_crypt;
+  bulk_ops->xts_crypt = _gcry_aria_xts_crypt;
+  bulk_ops->ocb_crypt = _gcry_aria_ocb_crypt;
+  bulk_ops->ocb_auth = _gcry_aria_ocb_auth;
+
+  /* Setup context and encryption key. */
+  ctx->decryption_prepared = 0;
+  aria_set_encrypt_key (ctx, key, keylen);
+
+  _gcry_burn_stack (3 * sizeof(void *) + 5 * 4 * sizeof(u32));
+  return 0;
+}
+
+
+/* Known-answer self test: one encrypt/decrypt round trip with the
+ * 128-bit key test vector from RFC 5794 Appendix A.  Returns NULL on
+ * success or a static error string on failure. */
+static const char *
+aria_selftest (void)
+{
+  ARIA_context ctx;
+  byte scratch[16];
+
+  static const byte key[16] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+  };
+  static const byte plaintext[16] = {
+    0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+    0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+  };
+  static const byte ciphertext[16] = {
+    0xd7, 0x18, 0xfb, 0xd6, 0xab, 0x64, 0x4c, 0x73,
+    0x9d, 0xa9, 0x5f, 0x3b, 0xe6, 0x45, 0x17, 0x78
+  };
+
+  /* Zeroed context also disables all SIMD paths for the test. */
+  memset (&ctx, 0, sizeof(ctx));
+
+  aria_set_encrypt_key (&ctx, key, 16);
+  aria_encrypt (&ctx, scratch, plaintext);
+  if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
+    return "ARIA test encryption failed.";
+  aria_decrypt (&ctx, scratch, scratch);
+  if (memcmp (scratch, plaintext, sizeof (plaintext)))
+    return "ARIA test decryption failed.";
+
+  return NULL;
+}
+
+
+/* OBJECT IDENTIFIER assignments for ARIA under the KISA arc
+ * 1.2.410.200046.1.1 (see RFC 5794, section 4): one OID per key size
+ * and mode of operation. */
+static const gcry_cipher_oid_spec_t aria128_oids[] =
+  {
+    { "1.2.410.200046.1.1.1", GCRY_CIPHER_MODE_ECB },
+    { "1.2.410.200046.1.1.2", GCRY_CIPHER_MODE_CBC },
+    { "1.2.410.200046.1.1.3", GCRY_CIPHER_MODE_CFB },
+    { "1.2.410.200046.1.1.4", GCRY_CIPHER_MODE_OFB },
+    { "1.2.410.200046.1.1.5", GCRY_CIPHER_MODE_CTR },
+    { "1.2.410.200046.1.1.34", GCRY_CIPHER_MODE_GCM },
+    { "1.2.410.200046.1.1.37", GCRY_CIPHER_MODE_CCM },
+    { NULL }
+  };
+
+static const gcry_cipher_oid_spec_t aria192_oids[] =
+  {
+    { "1.2.410.200046.1.1.6", GCRY_CIPHER_MODE_ECB },
+    { "1.2.410.200046.1.1.7", GCRY_CIPHER_MODE_CBC },
+    { "1.2.410.200046.1.1.8", GCRY_CIPHER_MODE_CFB },
+    { "1.2.410.200046.1.1.9", GCRY_CIPHER_MODE_OFB },
+    { "1.2.410.200046.1.1.10", GCRY_CIPHER_MODE_CTR },
+    { "1.2.410.200046.1.1.35", GCRY_CIPHER_MODE_GCM },
+    { "1.2.410.200046.1.1.38", GCRY_CIPHER_MODE_CCM },
+    { NULL }
+  };
+
+static const gcry_cipher_oid_spec_t aria256_oids[] =
+  {
+    { "1.2.410.200046.1.1.11", GCRY_CIPHER_MODE_ECB },
+    { "1.2.410.200046.1.1.12", GCRY_CIPHER_MODE_CBC },
+    { "1.2.410.200046.1.1.13", GCRY_CIPHER_MODE_CFB },
+    { "1.2.410.200046.1.1.14", GCRY_CIPHER_MODE_OFB },
+    { "1.2.410.200046.1.1.15", GCRY_CIPHER_MODE_CTR },
+    { "1.2.410.200046.1.1.36", GCRY_CIPHER_MODE_GCM },
+    { "1.2.410.200046.1.1.39", GCRY_CIPHER_MODE_CCM },
+    { NULL }
+  };
+
+/* Cipher specifications registered with cipher.c: 16-byte block,
+ * 128/192/256-bit keys. */
+gcry_cipher_spec_t _gcry_cipher_spec_aria128 =
+  {
+    GCRY_CIPHER_ARIA128, { 0, 0 },
+    "ARIA128", NULL, aria128_oids, ARIA_BLOCK_SIZE, 128,
+    sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt
+  };
+
+gcry_cipher_spec_t _gcry_cipher_spec_aria192 =
+  {
+    GCRY_CIPHER_ARIA192, { 0, 0 },
+    "ARIA192",NULL,aria192_oids, ARIA_BLOCK_SIZE, 192,
+    sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt
+  };
+
+gcry_cipher_spec_t _gcry_cipher_spec_aria256 =
+  {
+    GCRY_CIPHER_ARIA256, { 0, 0 },
+    "ARIA256", NULL, aria256_oids, ARIA_BLOCK_SIZE, 256,
+    sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt
+  };
index 451539e829af1f166371f4db4eac873e9c836916..3a72d7c45cc6ae18e6213ab93433e04f2eeef495 100644 (file)
 # define ELF(...) /*_*/
 #endif
 
+/* Read-only data section: PE/COFF targets use .rdata, ELF uses .rodata. */
+#ifdef _WIN32
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
 #ifdef __APPLE__
 #define GET_DATA_POINTER(reg, name) \
        adrp    reg, name@GOTPAGE ; \
        add     reg, reg, name@GOTPAGEOFF ;
-#elif defined(_WIN32)
+#else
 #define GET_DATA_POINTER(reg, name) \
        adrp    reg, name ; \
        add     reg, reg, #:lo12:name ;
-#else
-#define GET_DATA_POINTER(reg, name) \
-       adrp    reg, :got:name ; \
-       ldr     reg, [reg, #:got_lo12:name] ;
 #endif
 
 #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
 #define ret_spec_stop \
        ret; dsb sy; isb;
 
+/* Zeroize a 128-bit vector register. */
+#define CLEAR_REG(reg) movi reg.16b, #0;
+
+/* Save/restore the callee-saved low halves of v8-v15 (d8-d15, per
+ * AAPCS64) around vector code, keeping the CFI frame offset in sync. */
+#define VPUSH_ABI \
+	stp d8, d9, [sp, #-16]!; \
+	CFI_ADJUST_CFA_OFFSET(16); \
+	stp d10, d11, [sp, #-16]!; \
+	CFI_ADJUST_CFA_OFFSET(16); \
+	stp d12, d13, [sp, #-16]!; \
+	CFI_ADJUST_CFA_OFFSET(16); \
+	stp d14, d15, [sp, #-16]!; \
+	CFI_ADJUST_CFA_OFFSET(16);
+
+#define VPOP_ABI \
+	ldp d14, d15, [sp], #16; \
+	CFI_ADJUST_CFA_OFFSET(-16); \
+	ldp d12, d13, [sp], #16; \
+	CFI_ADJUST_CFA_OFFSET(-16); \
+	ldp d10, d11, [sp], #16; \
+	CFI_ADJUST_CFA_OFFSET(-16); \
+	ldp d8, d9, [sp], #16; \
+	CFI_ADJUST_CFA_OFFSET(-16);
+
 #endif /* GCRY_ASM_COMMON_AARCH64_H */
index 97912b1b14751143833a7189e2e3a9784c35eb30..870fef9aa486b8a57c3430b217017fd2012c3de2 100644 (file)
 # define ELF(...) /*_*/
 #endif
 
+/* Read-only data section: PE/COFF targets use .rdata, ELF uses .rodata. */
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
 #ifdef __PIC__
 #  define rRIP (%rip)
 #else
 # define EXIT_SYSV_FUNC
 #endif
 
-/* 'ret' instruction replacement for straight-line speculation mitigation */
+/* 'ret' instruction replacement for straight-line speculation mitigation. */
 #define ret_spec_stop \
        ret; int3;
 
+/* This prevents speculative execution on old AVX512 CPUs, to prevent
+ * speculative execution to AVX512 code. The vpopcntb instruction is
+ * available on newer CPUs that do not suffer from significant frequency
+ * drop when 512-bit vectors are utilized. */
+#define spec_stop_avx512 \
+       vpxord %ymm16, %ymm16, %ymm16; \
+       vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord %ymm16, %ymm16, %ymm16;
+
+#define spec_stop_avx512_intel_syntax \
+       vpxord ymm16, ymm16, ymm16; \
+       vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord ymm16, ymm16, ymm16;
+
 #endif /* GCRY_ASM_COMMON_AMD64_H */
diff --git a/cipher/asm-common-i386.h b/cipher/asm-common-i386.h
new file mode 100644 (file)
index 0000000..d746ebc
--- /dev/null
@@ -0,0 +1,161 @@
+/* asm-common-i386.h  -  Common macros for i386 assembly
+ *
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_I386_H
+#define GCRY_ASM_COMMON_I386_H
+
+#include <config.h>
+
+#ifdef HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+/* Read-only data section: PE/COFF targets use .rdata, ELF uses .rodata. */
+#ifdef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
+/* Win32 symbol names carry a leading underscore. */
+#ifdef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+# define SYM_NAME(name) _##name
+#else
+# define SYM_NAME(name) name
+#endif
+
+/* PIC data addressing: on ELF, go through a get-PC thunk and the GOT;
+ * on Win32 a plain absolute lea suffices. */
+#ifdef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+# define DECL_GET_PC_THUNK(reg)
+# define GET_DATA_POINTER(name, reg) leal name, %reg
+#else
+# define DECL_GET_PC_THUNK(reg) \
+      .type __gcry_get_pc_thunk_##reg, @function; \
+      .align 16; \
+      __gcry_get_pc_thunk_##reg:; \
+       CFI_STARTPROC(); \
+       movl (%esp), %reg; \
+       ret_spec_stop; \
+       CFI_ENDPROC()
+# define GET_DATA_POINTER(name, reg) \
+       call __gcry_get_pc_thunk_##reg; \
+       addl $_GLOBAL_OFFSET_TABLE_, %reg; \
+       movl name##@GOT(%reg), %reg;
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_REMEMBER_STATE()       .cfi_remember_state
+# define CFI_RESTORE_STATE()        .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)  .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn)        .cfi_register ro, rn
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+# define CFI_PUSH(reg) \
+       CFI_ADJUST_CFA_OFFSET(4); CFI_REL_OFFSET(reg, 0)
+# define CFI_POP(reg) \
+       CFI_ADJUST_CFA_OFFSET(-4); CFI_RESTORE(reg)
+# define CFI_POP_TMP_REG() \
+       CFI_ADJUST_CFA_OFFSET(-4);
+# define CFI_LEAVE() \
+       CFI_ADJUST_CFA_OFFSET(-4); CFI_DEF_CFA_REGISTER(%esp)
+
+/* CFA expressions are used for pointing CFA and registers to
+ * %esp relative offsets. */
+/* NOTE(review): these register numbers follow the x86-64 DWARF layout
+ * (dx=1, cx=2, si=4, di=5, bp=6, sp=7), not the i386 SysV psABI DWARF
+ * numbering (eax=0, ecx=1, edx=2, ebx=3, esp=4, ebp=5, esi=6, edi=7).
+ * Confirm which numbering the unwinder consuming these expressions
+ * expects; DW_OP_breg7 below likewise assumes reg7 == esp. */
+# define DW_REGNO_eax 0
+# define DW_REGNO_edx 1
+# define DW_REGNO_ecx 2
+# define DW_REGNO_ebx 3
+# define DW_REGNO_esi 4
+# define DW_REGNO_edi 5
+# define DW_REGNO_ebp 6
+# define DW_REGNO_esp 7
+
+# define DW_REGNO(reg) DW_REGNO_ ## reg
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+       0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+       0x80|((value)&0x7f), \
+       0x80|(((value)>>7)&0x7f), \
+       0x80|(((value)>>14)&0x7f), \
+       0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(esp_offs,cfa_depth) \
+       .cfi_escape \
+         0x0f, /* DW_CFA_def_cfa_expression */ \
+           DW_SLEB128_7BIT(11), /* length */ \
+         0x77, /* DW_OP_breg7, esp + constant */ \
+           DW_SLEB128_28BIT(esp_offs), \
+         0x06, /* DW_OP_deref */ \
+         0x23, /* DW_OP_plus_constu */ \
+           DW_SLEB128_28BIT((cfa_depth)+4)
+
+# define CFI_REG_ON_STACK(reg,esp_offs) \
+       .cfi_escape \
+         0x10, /* DW_CFA_expression */ \
+           DW_SLEB128_7BIT(DW_REGNO(reg)), \
+           DW_SLEB128_7BIT(5), /* length */ \
+         0x77, /* DW_OP_breg7, esp + constant */ \
+           DW_SLEB128_28BIT(esp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_PUSH(reg)
+# define CFI_POP(reg)
+# define CFI_POP_TMP_REG()
+# define CFI_LEAVE()
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+/* 'ret' instruction replacement for straight-line speculation mitigation. */
+#define ret_spec_stop \
+       ret; int3;
+
+/* This prevents speculative execution on old AVX512 CPUs, to prevent
+ * speculative execution to AVX512 code. The vpopcntb instruction is
+ * available on newer CPUs that do not suffer from significant frequency
+ * drop when 512-bit vectors are utilized. */
+#define spec_stop_avx512 \
+       vpxord %ymm7, %ymm7, %ymm7; \
+       vpopcntb %xmm7, %xmm7; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord %ymm7, %ymm7, %ymm7;
+
+#define spec_stop_avx512_intel_syntax \
+       vpxord ymm7, ymm7, ymm7; \
+       vpopcntb xmm7, xmm7; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord ymm7, ymm7, ymm7;
+
+#endif /* GCRY_ASM_COMMON_I386_H */
index d7f9a7e468286af151f47f5d10b97eb63ab234b3..451e71f64ed390b2757ef2435e1f7d661e953f25 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 # define USE_AVX2 1
 #endif
 
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
 /* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #undef ASM_EXTRA_STACK
-#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+#if (defined(USE_AVX) || defined(USE_AVX2) || defined(USE_AVX512)) \
+    && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
 # define ASM_FUNC_ABI __attribute__((sysv_abi))
 # define ASM_EXTRA_STACK (10 * 16)
 #else
@@ -98,6 +107,9 @@ typedef struct BLAKE2B_CONTEXT_S
 #ifdef USE_AVX2
   unsigned int use_avx2:1;
 #endif
+#ifdef USE_AVX512
+  unsigned int use_avx512:1;
+#endif
 } BLAKE2B_CONTEXT;
 
 typedef struct
@@ -132,6 +144,9 @@ typedef struct BLAKE2S_CONTEXT_S
 #ifdef USE_AVX
   unsigned int use_avx:1;
 #endif
+#ifdef USE_AVX512
+  unsigned int use_avx512:1;
+#endif
 } BLAKE2S_CONTEXT;
 
 typedef unsigned int (*blake2_transform_t)(void *S, const void *inblk,
@@ -346,6 +361,12 @@ unsigned int _gcry_blake2b_transform_amd64_avx2(BLAKE2B_STATE *S,
                                                 size_t nblks) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_AVX512
+unsigned int _gcry_blake2b_transform_amd64_avx512(BLAKE2B_STATE *S,
+                                                  const void *inblks,
+                                                  size_t nblks) ASM_FUNC_ABI;
+#endif
+
 static unsigned int blake2b_transform(void *ctx, const void *inblks,
                                       size_t nblks)
 {
@@ -354,8 +375,12 @@ static unsigned int blake2b_transform(void *ctx, const void *inblks,
 
   if (0)
     {}
+#ifdef USE_AVX512
+  else if (c->use_avx512)
+    nburn = _gcry_blake2b_transform_amd64_avx512(&c->state, inblks, nblks);
+#endif
 #ifdef USE_AVX2
-  if (c->use_avx2)
+  else if (c->use_avx2)
     nburn = _gcry_blake2b_transform_amd64_avx2(&c->state, inblks, nblks);
 #endif
   else
@@ -468,6 +493,9 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
 #ifdef USE_AVX2
   c->use_avx2 = !!(features & HWF_INTEL_AVX2);
 #endif
+#ifdef USE_AVX512
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+#endif
 
   c->outlen = dbits / 8;
   c->buflen = 0;
@@ -670,6 +698,12 @@ unsigned int _gcry_blake2s_transform_amd64_avx(BLAKE2S_STATE *S,
                                                size_t nblks) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_AVX512
+unsigned int _gcry_blake2s_transform_amd64_avx512(BLAKE2S_STATE *S,
+                                                  const void *inblks,
+                                                  size_t nblks) ASM_FUNC_ABI;
+#endif
+
 static unsigned int blake2s_transform(void *ctx, const void *inblks,
                                       size_t nblks)
 {
@@ -677,9 +711,13 @@ static unsigned int blake2s_transform(void *ctx, const void *inblks,
   unsigned int nburn;
 
   if (0)
-    {}
+    { }
+#ifdef USE_AVX512
+  else if (c->use_avx512)
+    nburn = _gcry_blake2s_transform_amd64_avx512(&c->state, inblks, nblks);
+#endif
 #ifdef USE_AVX
-  if (c->use_avx)
+  else if (c->use_avx)
     nburn = _gcry_blake2s_transform_amd64_avx(&c->state, inblks, nblks);
 #endif
   else
@@ -792,6 +830,9 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
 #ifdef USE_AVX
   c->use_avx = !!(features & HWF_INTEL_AVX);
 #endif
+#ifdef USE_AVX512
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+#endif
 
   c->outlen = dbits / 8;
   c->buflen = 0;
index 3601b65f300a35fc5f8065df3242ff02f28f0c4f..43c2cce187a4cd7063d8f67a1a15e92ad1ae6466 100644 (file)
@@ -31,8 +31,6 @@
 
 #include "asm-common-amd64.h"
 
-.text
-
 /* register macros */
 #define RSTATE  %rdi
 #define RINBLKS %rsi
         G2(ROW1, ROW2, ROW3, ROW4, m4); \
         UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
 
-blake2b_data:
+SECTION_RODATA
 .align 32
+ELF(.type _blake2b_avx2_data,@object;)
+_blake2b_avx2_data:
 .Liv:
         .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
         .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
@@ -197,6 +197,7 @@ blake2b_data:
 .Lshuf_ror24:
         .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10
 
+.text
 .align 64
 .globl _gcry_blake2b_transform_amd64_avx2
 ELF(.type _gcry_blake2b_transform_amd64_avx2,@function;)
diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S
new file mode 100644 (file)
index 0000000..b030849
--- /dev/null
@@ -0,0 +1,429 @@
+/* blake2b-amd64-avx512.S  -  AVX512 implementation of BLAKE2b
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 8)
+#define STATE_F (STATE_T + 2 * 8)
+
+/* vector registers */
+#define ROW1  %ymm0
+#define ROW2  %ymm1
+#define ROW3  %ymm2
+#define ROW4  %ymm3
+#define TMP1  %ymm4
+#define TMP1x %xmm4
+#define R16   %ymm13
+
+#define MA1   %ymm5
+#define MA2   %ymm6
+#define MA3   %ymm7
+#define MA4   %ymm8
+#define MA1x  %xmm5
+#define MA2x  %xmm6
+#define MA3x  %xmm7
+#define MA4x  %xmm8
+
+#define MB1   %ymm9
+#define MB2   %ymm10
+#define MB3   %ymm11
+#define MB4   %ymm12
+#define MB1x  %xmm9
+#define MB2x  %xmm10
+#define MB3x  %xmm11
+#define MB4x  %xmm12
+
+/**********************************************************************
+  blake2b/AVX512
+ **********************************************************************/
+
+#define VPINSRQ_KMASK(kpos, qpos, mem, vreg) \
+        vmovdqu64 -((qpos) * 8) + mem, vreg {kpos}
+
+#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovdqu (s0)*8(RINBLKS), m1x; /* merged load */ \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovdqu64 (s8)*8(RINBLKS), m3 {%k4}{z}; /* merged load */ \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k5, 1, (s2)*8(RINBLKS), m1); /* merged load */ \
+          VPINSRQ_KMASK(%k6, 1, (s3)*8(RINBLKS), m2); /* merged load */ \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovdqu64 (s0)*8(RINBLKS), m1 {%k4}{z}; /* merged load */; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k5, 1, (s10)*8(RINBLKS), m3); /* merged load */ \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovdqu (s8)*8(RINBLKS), m3x; /* merged load */ \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              vinserti64x2 $1, (s13)*8(RINBLKS), m4, m4; /* merged load */ \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3);
+
+#define GATHER_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovdqu64 (s1)*8(RINBLKS), m2 {%k7}{z}; /* merged load */; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) \
+        LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x)
+
+#define ROR_32(in, out) vpshufd $0xb1, in, out
+
+#define ROR_24(in, out) vprorq $24, in, out
+
+#define ROR_16(in, out) vpshufb R16, in, out
+
+#define ROR_63(in, out) vprorq $63, in, out
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddq m, r1, r1; \
+        vpaddq r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddq r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2)
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_32, ROR_24)
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_63)
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(2,1,0,3), r4, r4
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(0,3,2,1), r4, r4
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4)
+
+SECTION_RODATA
+
+.align 32
+ELF(.type _blake2b_avx512_data,@object;)
+_blake2b_avx512_data:
+.Liv:
+        .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+        .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+        .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+        .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+.Lshuf_ror16:
+        .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9
+.Lk1_mask:
+       .byte (1 << 1)
+.Lk4_mask:
+       .byte (1 << 0) + (1 << 2)
+.Lk5_mask:
+       .byte (1 << 1) + (1 << 3)
+.Lk6_mask:
+       .byte (1 << 1) + (1 << 2)
+.Lk7_mask:
+       .byte (1 << 0) + (1 << 3)
+
+.text
+
+.align 64
+.globl _gcry_blake2b_transform_amd64_avx512
+ELF(.type _gcry_blake2b_transform_amd64_avx512,@function;)
+
+_gcry_blake2b_transform_amd64_avx512:
+        /* input:
+         *     %rdi: state
+         *     %rsi: blks
+         *     %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        spec_stop_avx512;
+
+        kmovb .Lk1_mask rRIP, %k1;
+        kshiftlb $1, %k1, %k2;
+        kshiftlb $2, %k1, %k3;
+        kmovb .Lk4_mask rRIP, %k4;
+        kmovb .Lk5_mask rRIP, %k5;
+        kmovb .Lk6_mask rRIP, %k6;
+        kmovb .Lk7_mask rRIP, %k7;
+
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        vbroadcasti128 .Lshuf_ror16 rRIP, R16;
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.align 16
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(10, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(11, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 128(RINBLKS), RINBLKS;
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        ROUND(10, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2;
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.align 16
+.Loop_end:
+        ROUND(10, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        xorl %eax, %eax;
+        kxord %k1, %k1, %k1;
+        kxord %k2, %k2, %k2;
+        kxord %k3, %k3, %k3;
+        kxord %k4, %k4, %k4;
+        kxord %k5, %k5, %k5;
+        kxord %k6, %k6, %k6;
+        kxord %k7, %k7, %k7;
+
+        vzeroall;
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2b_transform_amd64_avx512,
+    .-_gcry_blake2b_transform_amd64_avx512;)
+
+#endif /*defined(HAVE_GCC_INLINE_ASM_AVX512) && (AMD64 or WIN64 platform AS)*/
+#endif /*__x86_64*/
index 5094b4c1d0de045f37712de4e62f4ecc6eac10df..44b82ab2d1fe2dcc633b98ee5dcea4f0fc3ceb8d 100644 (file)
@@ -31,8 +31,6 @@
 
 #include "asm-common-amd64.h"
 
-.text
-
 /* register macros */
 #define RSTATE  %rdi
 #define RINBLKS %rsi
         G2(ROW1, ROW2, ROW3, ROW4, m4); \
         UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
 
-blake2s_data:
+SECTION_RODATA
+
 .align 16
+ELF(.type _blake2s_avx_data,@object;)
+_blake2s_avx_data:
 .Liv:
         .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
         .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
@@ -181,6 +182,8 @@ blake2s_data:
 .Lshuf_ror8:
         .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12
 
+.text
+
 .align 64
 .globl _gcry_blake2s_transform_amd64_avx
 ELF(.type _gcry_blake2s_transform_amd64_avx,@function;)
diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S
new file mode 100644 (file)
index 0000000..543944b
--- /dev/null
@@ -0,0 +1,397 @@
+/* blake2s-amd64-avx512.S  -  AVX512 implementation of BLAKE2s
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 4)
+#define STATE_F (STATE_T + 2 * 4)
+
+/* vector registers */
+#define ROW1  %xmm0
+#define ROW2  %xmm1
+#define ROW3  %xmm2
+#define ROW4  %xmm3
+#define TMP1  %xmm4
+#define TMP1x %xmm4
+
+#define MA1   %xmm5
+#define MA2   %xmm6
+#define MA3   %xmm7
+#define MA4   %xmm8
+
+#define MB1   %xmm9
+#define MB2   %xmm10
+#define MB3   %xmm11
+#define MB4   %xmm12
+
+/**********************************************************************
+  blake2s/AVX512
+ **********************************************************************/
+
+#define VPINSRD_KMASK(kpos, dpos, mem, vreg) \
+        vmovdqu32 -((dpos) * 4) + mem, vreg {kpos}
+
+#define GATHER_MSG(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_2(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*4(RINBLKS), m1; /* merged load */ \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_3(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovdqu32 (s8)*4(RINBLKS), m3 {%k4}{z}; /* merged load */ \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_5(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        VPINSRD_KMASK(%k5, 1, (s2)*4(RINBLKS), m1); /* merged load */ \
+          VPINSRD_KMASK(%k6, 1, (s3)*4(RINBLKS), m2); /* merged load */ \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_6(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovdqu32 (s0)*4(RINBLKS), m1 {%k4}{z}; /* merged load */; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            VPINSRD_KMASK(%k5, 1, (s10)*4(RINBLKS), m3); /* merged load */ \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_8(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovq (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrq $1, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3;
+
+#define GATHER_MSG_9(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovdqu32 (s1)*4(RINBLKS), m2 {%k7}{z}; /* merged load */; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4) \
+        GATHER_MSG_2(m1, m2, m3, m4, \
+                     11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4) \
+        GATHER_MSG_3(m1, m2, m3, m4, \
+                     7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4) \
+        GATHER_MSG_5(m1, m2, m3, m4, \
+                     2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4) \
+        GATHER_MSG_6(m1, m2, m3, m4, \
+                     12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4) \
+        GATHER_MSG_8(m1, m2, m3, m4, \
+                     6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4) \
+        GATHER_MSG_9(m1, m2, m3, m4, \
+                     10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4)
+
+#define ROR_16(in, out) vprord $16, in, out;
+
+#define ROR_8(in, out)  vprord $8, in, out;
+
+#define ROR_12(in, out) vprord $12, in, out;
+
+#define ROR_7(in, out) vprord $7, in, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddd m, r1, r1; \
+        vpaddd r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddd r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_12);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_8, ROR_7);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+SECTION_RODATA
+
+ELF(.type _blake2s_avx512_data,@object;)
+.align 16
+_blake2s_avx512_data:
+.Liv:
+        .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+        .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+.Lk4_mask:
+       .byte (1 << 0) + (1 << 2)
+.Lk5_mask:
+       .byte (1 << 1) + (1 << 3)
+.Lk6_mask:
+       .byte (1 << 1) + (1 << 2)
+.Lk7_mask:
+       .byte (1 << 0) + (1 << 3)
+
+.text
+
+.align 64
+.globl _gcry_blake2s_transform_amd64_avx512
+ELF(.type _gcry_blake2s_transform_amd64_avx512,@function;)
+
+_gcry_blake2s_transform_amd64_avx512:
+        /* input:
+         *     %rdi: state
+         *     %rsi: blks
+         *     %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        spec_stop_avx512;
+
+        kmovb .Lk4_mask rRIP, %k4;
+        kmovb .Lk5_mask rRIP, %k5;
+        kmovb .Lk6_mask rRIP, %k6;
+        kmovb .Lk7_mask rRIP, %k7;
+
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 64(RINBLKS), RINBLKS;
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2;
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop_end:
+        ROUND(8, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        xorl %eax, %eax;
+        kxord %k4, %k4, %k4;
+        kxord %k5, %k5, %k5;
+        kxord %k6, %k6, %k6;
+        kxord %k7, %k7, %k7;
+
+        vzeroall;
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2s_transform_amd64_avx512,
+    .-_gcry_blake2s_transform_amd64_avx512;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
index 2b4ffa1a3fd4f8f2cf9a13759f86ac89da058bcd..95d57a99b117bb15646dec2481f8397798cc6ddc 100644 (file)
        bswapq                  RX0; \
        movq RX0,               (RIO);
 
-.align 8
+.align 16
 ELF(.type   __blowfish_enc_blk1,@function;)
 
 __blowfish_enc_blk1:
@@ -155,7 +155,7 @@ __blowfish_enc_blk1:
        CFI_ENDPROC();
 ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
 
-.align 8
+.align 16
 .globl  _gcry_blowfish_amd64_do_encrypt
 ELF(.type   _gcry_blowfish_amd64_do_encrypt,@function;)
 
@@ -186,7 +186,7 @@ _gcry_blowfish_amd64_do_encrypt:
        CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
 
-.align 8
+.align 16
 .globl  _gcry_blowfish_amd64_encrypt_block
 ELF(.type   _gcry_blowfish_amd64_encrypt_block,@function;)
 
@@ -214,7 +214,7 @@ _gcry_blowfish_amd64_encrypt_block:
        CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
 
-.align 8
+.align 16
 .globl  _gcry_blowfish_amd64_decrypt_block
 ELF(.type   _gcry_blowfish_amd64_decrypt_block,@function;)
 
@@ -342,7 +342,7 @@ ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_bloc
        bswapq                  RX2; \
        bswapq                  RX3;
 
-.align 8
+.align 16
 ELF(.type   __blowfish_enc_blk4,@function;)
 
 __blowfish_enc_blk4:
@@ -371,7 +371,7 @@ __blowfish_enc_blk4:
        CFI_ENDPROC();
 ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
 
-.align 8
+.align 16
 ELF(.type   __blowfish_dec_blk4,@function;)
 
 __blowfish_dec_blk4:
@@ -402,7 +402,7 @@ __blowfish_dec_blk4:
        CFI_ENDPROC();
 ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
 
-.align 8
+.align 16
 .globl  _gcry_blowfish_amd64_ctr_enc
 ELF(.type   _gcry_blowfish_amd64_ctr_enc,@function;)
 _gcry_blowfish_amd64_ctr_enc:
@@ -472,7 +472,7 @@ _gcry_blowfish_amd64_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
 
-.align 8
+.align 16
 .globl  _gcry_blowfish_amd64_cbc_dec
 ELF(.type   _gcry_blowfish_amd64_cbc_dec,@function;)
 _gcry_blowfish_amd64_cbc_dec:
@@ -533,7 +533,7 @@ _gcry_blowfish_amd64_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
 
-.align 8
+.align 16
 .globl  _gcry_blowfish_amd64_cfb_dec
 ELF(.type   _gcry_blowfish_amd64_cfb_dec,@function;)
 _gcry_blowfish_amd64_cfb_dec:
index b30aa31f1df179d885c3fddeaa2fac33915aa31d..a5101b5c01c3580111743175b05fdc1899cba596 100644 (file)
 #define p      (s3 + (1 * 256) * 4)
 
 /* register macros */
-#define CTXs0 %r0
-#define CTXs1 %r9
-#define CTXs2 %r8
-#define CTXs3 %r10
-#define RMASK %lr
-#define RKEYL %r2
-#define RKEYR %ip
+#define CTXs0 r0
+#define CTXs1 r9
+#define CTXs2 r8
+#define CTXs3 r10
+#define RMASK lr
+#define RKEYL r2
+#define RKEYR ip
 
-#define RL0 %r3
-#define RR0 %r4
+#define RL0 r3
+#define RR0 r4
 
-#define RL1 %r9
-#define RR1 %r10
+#define RL1 r9
+#define RR1 r10
 
-#define RT0 %r11
-#define RT1 %r7
-#define RT2 %r5
-#define RT3 %r6
+#define RT0 r11
+#define RT1 r7
+#define RT2 r5
+#define RT3 r6
 
 /* helper macros */
 #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
@@ -250,7 +250,7 @@ __blowfish_enc_blk1:
         * output:
         *      [RR0, RL0]: dst
         */
-       push {%lr};
+       push {lr};
 
        add CTXs1, CTXs0, #(s1 - s0);
        add CTXs2, CTXs0, #(s2 - s0);
@@ -268,7 +268,7 @@ __blowfish_enc_blk1:
        round_enc(16);
        add_roundkey_enc();
 
-       pop {%pc};
+       pop {pc};
 .size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
 
 .align 8
@@ -277,22 +277,22 @@ __blowfish_enc_blk1:
 
 _gcry_blowfish_arm_do_encrypt:
        /* input:
-        *      %r0: ctx, CTX
-        *      %r1: u32 *ret_xl
-        *      %r2: u32 *ret_xr
+        *      r0: ctx, CTX
+        *      r1: u32 *ret_xl
+        *      r2: u32 *ret_xr
         */
-       push {%r2, %r4-%r11, %ip, %lr};
+       push {r2, r4-r11, ip, lr};
 
-       ldr RL0, [%r1];
-       ldr RR0, [%r2];
+       ldr RL0, [r1];
+       ldr RR0, [r2];
 
        bl __blowfish_enc_blk1;
 
-       pop {%r2};
-       str RR0, [%r1];
-       str RL0, [%r2];
+       pop {r2};
+       str RR0, [r1];
+       str RL0, [r2];
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
 
 .align 3
@@ -301,19 +301,19 @@ _gcry_blowfish_arm_do_encrypt:
 
 _gcry_blowfish_arm_encrypt_block:
        /* input:
-        *      %r0: ctx, CTX
-        *      %r1: dst
-        *      %r2: src
+        *      r0: ctx, CTX
+        *      r1: dst
+        *      r2: src
         */
-       push {%r4-%r11, %ip, %lr};
+       push {r4-r11, ip, lr};
 
-       read_block(%r2, 0, RL0, RR0, RT0);
+       read_block(r2, 0, RL0, RR0, RT0);
 
        bl __blowfish_enc_blk1;
 
-       write_block(%r1, 0, RR0, RL0, RT0, RT1);
+       write_block(r1, 0, RR0, RL0, RT0, RT1);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
 
 .align 3
@@ -322,18 +322,18 @@ _gcry_blowfish_arm_encrypt_block:
 
 _gcry_blowfish_arm_decrypt_block:
        /* input:
-        *      %r0: ctx, CTX
-        *      %r1: dst
-        *      %r2: src
+        *      r0: ctx, CTX
+        *      r1: dst
+        *      r2: src
         */
-       push {%r4-%r11, %ip, %lr};
+       push {r4-r11, ip, lr};
 
        add CTXs1, CTXs0, #(s1 - s0);
        add CTXs2, CTXs0, #(s2 - s0);
        mov RMASK, #(0xff << 2); /* byte mask */
        add CTXs3, CTXs1, #(s3 - s1);
 
-       read_block(%r2, 0, RL0, RR0, RT0);
+       read_block(r2, 0, RL0, RR0, RT0);
 
        load_roundkey_dec(17);
        round_dec(15);
@@ -346,9 +346,9 @@ _gcry_blowfish_arm_decrypt_block:
        round_dec(1);
        add_roundkey_dec();
 
-       write_block(%r1, 0, RR0, RL0, RT0, RT1);
+       write_block(r1, 0, RR0, RL0, RT0, RT1);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
 
 /***********************************************************************
@@ -548,7 +548,7 @@ _gcry_blowfish_arm_enc_blk2:
         * output:
         *      [RR0, RL0], [RR1, RL1]: dst
         */
-       push {RT0,%lr};
+       push {RT0,lr};
 
        add CTXs2, CTXs0, #(s2 - s0);
        mov RMASK, #(0xff << 2); /* byte mask */
@@ -568,7 +568,7 @@ _gcry_blowfish_arm_enc_blk2:
        host_to_be(RR1, RT0);
        host_to_be(RL1, RT0);
 
-       pop {RT0,%pc};
+       pop {RT0,pc};
 .size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
 
 .align 3
@@ -577,40 +577,40 @@ _gcry_blowfish_arm_enc_blk2:
 
 _gcry_blowfish_arm_cfb_dec:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst (2 blocks)
-        *      %r2: src (2 blocks)
-        *      %r3: iv (64bit)
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit)
         */
-       push {%r2, %r4-%r11, %ip, %lr};
+       push {r2, r4-r11, ip, lr};
 
-       mov %lr, %r3;
+       mov lr, r3;
 
-       /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
-       ldm %r3, {RL0, RR0};
+       /* Load input (iv/r3 is aligned, src/r2 might not be) */
+       ldm r3, {RL0, RR0};
        host_to_be(RL0, RT0);
        host_to_be(RR0, RT0);
-       read_block(%r2, 0, RL1, RR1, RT0);
+       read_block(r2, 0, RL1, RR1, RT0);
 
        /* Update IV, load src[1] and save to iv[0] */
-       read_block_host(%r2, 8, %r5, %r6, RT0);
-       stm %lr, {%r5, %r6};
+       read_block_host(r2, 8, r5, r6, RT0);
+       stm lr, {r5, r6};
 
        bl _gcry_blowfish_arm_enc_blk2;
-       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
 
-       /* %r1: dst, %r0: %src */
-       pop {%r0};
+       /* r1: dst, r0: src */
+       pop {r0};
 
        /* dst = src ^ result */
-       read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
-       eor %r5, %r4;
-       eor %r6, %r3;
-       eor %r7, %r10;
-       eor %r8, %r9;
-       write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
-
-       pop {%r4-%r11, %ip, %pc};
+       read_block2_host(r0, r5, r6, r7, r8, lr);
+       eor r5, r4;
+       eor r6, r3;
+       eor r7, r10;
+       eor r8, r9;
+       write_block2_host(r1, r5, r6, r7, r8, r9, r10);
+
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
 
@@ -620,42 +620,42 @@ _gcry_blowfish_arm_cfb_dec:
 
 _gcry_blowfish_arm_ctr_enc:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst (2 blocks)
-        *      %r2: src (2 blocks)
-        *      %r3: iv (64bit, big-endian)
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit, big-endian)
         */
-       push {%r2, %r4-%r11, %ip, %lr};
+       push {r2, r4-r11, ip, lr};
 
-       mov %lr, %r3;
+       mov lr, r3;
 
        /* Load IV (big => host endian) */
-       read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0);
+       read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT0);
 
        /* Construct IVs */
        adds RR1, RR0, #1; /* +1 */
        adc RL1, RL0, #0;
-       adds %r6, RR1, #1; /* +2 */
-       adc %r5, RL1, #0;
+       adds r6, RR1, #1; /* +2 */
+       adc r5, RL1, #0;
 
        /* Store new IV (host => big-endian) */
-       write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0);
+       write_block_aligned(lr, 0, r5, r6, host_to_be, RT0);
 
        bl _gcry_blowfish_arm_enc_blk2;
-       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
 
-       /* %r1: dst, %r0: %src */
-       pop {%r0};
+       /* r1: dst, r0: src */
+       pop {r0};
 
        /* XOR key-stream with plaintext */
-       read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
-       eor %r5, %r4;
-       eor %r6, %r3;
-       eor %r7, %r10;
-       eor %r8, %r9;
-       write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
-
-       pop {%r4-%r11, %ip, %pc};
+       read_block2_host(r0, r5, r6, r7, r8, lr);
+       eor r5, r4;
+       eor r6, r3;
+       eor r7, r10;
+       eor r8, r9;
+       write_block2_host(r1, r5, r6, r7, r8, r9, r10);
+
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
 
@@ -697,45 +697,45 @@ _gcry_blowfish_arm_dec_blk2:
 
 _gcry_blowfish_arm_cbc_dec:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst (2 blocks)
-        *      %r2: src (2 blocks)
-        *      %r3: iv (64bit)
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit)
         */
-       push {%r2-%r11, %ip, %lr};
+       push {r2-r11, ip, lr};
 
-       read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+       read_block2(r2, RL0, RR0, RL1, RR1, RT0);
 
        /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
         * of function call. */
        b _gcry_blowfish_arm_dec_blk2;
 .Ldec_cbc_tail:
-       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
 
-       /* %r0: %src, %r1: dst, %r2: iv */
-       pop {%r0, %r2};
+       /* r0: src, r1: dst, r2: iv */
+       pop {r0, r2};
 
-       /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
-       read_block_host(%r0, 0, %r7, %r8, %r5);
-       /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
-       ldm %r2, {%r5, %r6};
+       /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */
+       read_block_host(r0, 0, r7, r8, r5);
+       /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */
+       ldm r2, {r5, r6};
 
        /* out[1] ^= IV+1 */
-       eor %r10, %r7;
-       eor %r9, %r8;
+       eor r10, r7;
+       eor r9, r8;
        /* out[0] ^= IV */
-       eor %r4, %r5;
-       eor %r3, %r6;
+       eor r4, r5;
+       eor r3, r6;
 
-       /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
-       read_block_host(%r0, 8, %r7, %r8, %r5);
+       /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */
+       read_block_host(r0, 8, r7, r8, r5);
        /* store IV+2 to iv[0] (aligned). */
-       stm %r2, {%r7, %r8};
+       stm r2, {r7, r8};
 
        /* store result to dst[0-3]. Might be unaligned. */
-       write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
+       write_block2_host(r1, r4, r3, r10, r9, r5, r6);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
 
index 7b001306c7bb6f7185fc8ff4dd9ba071339abdd3..87abd563a8492c6496ba3ac6773d050c5b70e696 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * For a description of the algorithm, see:
  *   Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
@@ -38,7 +38,6 @@
 #include "cipher.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
 
 #define BLOWFISH_BLOCKSIZE 8
 #define BLOWFISH_KEY_MIN_BITS 8
@@ -856,48 +855,6 @@ _gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 }
 
 
-/* Run the self-tests for BLOWFISH-CTR, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
-  const int nblocks = 4+1;
-  const int blocksize = BLOWFISH_BLOCKSIZE;
-  const int context_size = sizeof(BLOWFISH_context);
-
-  return _gcry_selftest_helper_ctr("BLOWFISH", &bf_setkey,
-           &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for BLOWFISH-CBC, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
-  const int nblocks = 4+2;
-  const int blocksize = BLOWFISH_BLOCKSIZE;
-  const int context_size = sizeof(BLOWFISH_context);
-
-  return _gcry_selftest_helper_cbc("BLOWFISH", &bf_setkey,
-           &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for BLOWFISH-CFB, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
-  const int nblocks = 4+2;
-  const int blocksize = BLOWFISH_BLOCKSIZE;
-  const int context_size = sizeof(BLOWFISH_context);
-
-  return _gcry_selftest_helper_cfb("BLOWFISH", &bf_setkey,
-           &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
 static const char*
 selftest(void)
 {
@@ -911,7 +868,6 @@ selftest(void)
     { 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 };
   static const byte cipher3[] =
     { 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 };
-  const char *r;
 
   bf_setkey( (void *) &c,
              (const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26,
@@ -931,15 +887,6 @@ selftest(void)
   if( memcmp( buffer, plain3, 8 ) )
     return "Blowfish selftest failed (4).";
 
-  if ( (r = selftest_cbc ()) )
-    return r;
-
-  if ( (r = selftest_cfb ()) )
-    return r;
-
-  if ( (r = selftest_ctr ()) )
-    return r;
-
   return NULL;
 }
 
diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h
new file mode 100644 (file)
index 0000000..833262e
--- /dev/null
@@ -0,0 +1,493 @@
+/* bulkhelp.h  -  Some bulk processing helpers
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GCRYPT_BULKHELP_H
+#define GCRYPT_BULKHELP_H
+
+
+#include "g10lib.h"
+#include "cipher-internal.h"
+
+
+#ifdef __x86_64__
+/* Use u64 to store pointers for x32 support (assembly function assumes
+ * 64-bit pointers). */
+typedef u64 ocb_L_uintptr_t;
+#else
+typedef uintptr_t ocb_L_uintptr_t;
+#endif
+
+typedef unsigned int (*bulk_crypt_fn_t) (void *ctx, byte *out,
+                                         const byte *in,
+                                         size_t num_blks);
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk64 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[64], u64 blkn)
+{
+  unsigned int n = 64 - (blkn % 64);
+  unsigned int i;
+
+  for (i = 0; i < 64; i += 8)
+    {
+      Ls[(i + 0 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(15 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(23 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(31 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[5];
+  Ls[(39 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(47 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(55 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(63 + n) % 64];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk32 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[32], u64 blkn)
+{
+  unsigned int n = 32 - (blkn % 32);
+  unsigned int i;
+
+  for (i = 0; i < 32; i += 8)
+    {
+      Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(31 + n) % 32];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk16 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[16], u64 blkn)
+{
+  unsigned int n = 16 - (blkn % 16);
+  unsigned int i;
+
+  for (i = 0; i < 16; i += 8)
+    {
+      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(15 + n) % 16];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk8 (gcry_cipher_hd_t c,
+                                        ocb_L_uintptr_t Ls[8], u64 blkn)
+{
+  unsigned int n = 8 - (blkn % 8);
+
+  Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+  Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+  Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+  Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+
+  return &Ls[(7 + n) % 8];
+}
+
+
+static inline unsigned int
+bulk_ctr_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                  const byte *inbuf, size_t nblocks, byte *ctr,
+                  byte *tmpbuf, size_t tmpbuf_nblocks,
+                  unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+      for (i = 1; i < curr_blks; i++)
+        {
+          cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
+          cipher_block_add (&tmpbuf[i * 16], i, 16);
+        }
+      cipher_block_add (ctr, curr_blks, 16);
+
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_ctr32le_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                      const byte *inbuf, size_t nblocks, byte *ctr,
+                      byte *tmpbuf, size_t tmpbuf_nblocks,
+                      unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      u64 ctr_lo = buf_get_le64(ctr + 0 * 8);
+      u64 ctr_hi = buf_get_he64(ctr + 1 * 8);
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+      for (i = 1; i < curr_blks; i++)
+        {
+          u32 lo_u32 = (u32)ctr_lo + i;
+          u64 lo_u64 = ctr_lo & ~(u64)(u32)-1;
+          lo_u64 += lo_u32;
+          buf_put_le64(&tmpbuf[0 * 8 + i * 16], lo_u64);
+          buf_put_he64(&tmpbuf[1 * 8 + i * 16], ctr_hi);
+        }
+      buf_put_le32(ctr, (u32)ctr_lo + curr_blks);
+
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_cbc_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                  const byte *inbuf, size_t nblocks, byte *iv,
+                  byte *tmpbuf, size_t tmpbuf_nblocks,
+                  unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      nburn = crypt_fn (priv, tmpbuf, inbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor_n_copy_2(outbuf, &tmpbuf[i * 16], iv, inbuf, 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+/* Bulk CFB decryption for 16-byte-block ciphers.  CRYPT_FN is the
+ * cipher's bulk *forward* (encrypt) function — CFB runs the block
+ * cipher forward in both directions.  The keystream input for block i
+ * is the previous ciphertext block, with IV standing in before the
+ * first one; IV is updated to the last input ciphertext block.  On
+ * return, *NUM_USED_TMPBLOCKS holds the number of TMPBUF bytes written
+ * (a byte count despite the name) for the caller to wipe.  Returns the
+ * largest stack burn depth reported by CRYPT_FN.  */
+static inline unsigned int
+bulk_cfb_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                  const byte *inbuf, size_t nblocks, byte *iv,
+                  byte *tmpbuf, size_t tmpbuf_nblocks,
+                  unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      /* Gather keystream inputs: IV followed by the first
+       * curr_blks - 1 ciphertext blocks.  */
+      cipher_block_cpy (&tmpbuf[0 * 16], iv, 16);
+      if (curr_blks > 1)
+        memcpy (&tmpbuf[1 * 16], &inbuf[(1 - 1) * 16], 16 * curr_blks - 16);
+      /* Last ciphertext block becomes the IV for the next chunk.  */
+      cipher_block_cpy (iv, &inbuf[(curr_blks - 1) * 16], 16);
+
+      /* Encrypt the gathered blocks in place to produce keystream.  */
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          /* P_i = C_i xor E(C_{i-1})  */
+          cipher_block_xor (outbuf, inbuf, &tmpbuf[i * 16], 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+/* Bulk OCB encryption/decryption for 16-byte-block ciphers.  CRYPT_FN
+ * is the cipher's bulk forward function for ENCRYPT != 0 and its bulk
+ * decrypt function otherwise (selected by the caller).  *BLKN is the
+ * running OCB block counter, pre-incremented once per block to fetch
+ * L_{ntz(i)} via ocb_get_l().  The running offset lives in c->u_iv.iv
+ * and the running checksum in c->u_ctr.ctr; per-block Offset_i values
+ * are staged in TMPBUF for the post-cipher xor.  On return,
+ * *NUM_USED_TMPBLOCKS is the number of TMPBUF bytes written (a byte
+ * count despite the name).  Returns the largest stack burn depth
+ * reported by CRYPT_FN.  */
+static inline unsigned int
+bulk_ocb_crypt_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
+                    byte *outbuf, const byte *inbuf, size_t nblocks, u64 *blkn,
+                    int encrypt, byte *tmpbuf, size_t tmpbuf_nblocks,
+                    unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          const unsigned char *l = ocb_get_l(c, ++*blkn);
+
+          /* Checksum_i = Checksum_{i-1} xor P_i  (plaintext is the
+           * input when encrypting)  */
+          if (encrypt)
+            cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)}; keep a copy of
+           * Offset_i in TMPBUF and pre-whiten the input block.  */
+          cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
+          cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
+                            c->u_iv.iv, 16);
+        }
+
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          /* Post-whiten with the saved Offset_i.  */
+          cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
+
+          /* Checksum_i = Checksum_{i-1} xor P_i  (plaintext is the
+           * output when decrypting)  */
+          if (!encrypt)
+              cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
+        }
+
+      outbuf += curr_blks * 16;
+      inbuf  += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+/* Bulk OCB AAD authentication for 16-byte-block ciphers.  Processes
+ * NBLOCKS blocks of associated data from ABUF.  *BLKN is the running
+ * AAD block counter, pre-incremented per block to fetch L_{ntz(i)}.
+ * The running AAD offset lives in c->u_mode.ocb.aad_offset and every
+ * enciphered block is folded into c->u_mode.ocb.aad_sum.  On return,
+ * *NUM_USED_TMPBLOCKS is the number of TMPBUF bytes written (a byte
+ * count despite the name).  Returns the largest stack burn depth
+ * reported by CRYPT_FN.  */
+static inline unsigned int
+bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
+                   const byte *abuf, size_t nblocks, u64 *blkn, byte *tmpbuf,
+                   size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          const unsigned char *l = ocb_get_l(c, ++*blkn);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)}; then whiten the
+           * AAD block with Offset_i in TMPBUF.  */
+          cipher_block_xor_2dst (&tmpbuf[i * 16],
+                                  c->u_mode.ocb.aad_offset, l, 16);
+          cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
+        }
+
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+          cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
+        }
+
+      abuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+/* Bulk XTS encryption/decryption for 16-byte-block ciphers.  CRYPT_FN
+ * is the cipher's bulk forward or inverse function (caller's choice).
+ * TWEAK is the 16-byte current tweak stored little-endian; it is read
+ * once, advanced by one GF(2^128) doubling per block, and written back
+ * past the processed blocks.  Per-block tweak values are staged in
+ * TMPBUF for the post-cipher xor.  On return, *NUM_USED_TMPBLOCKS is
+ * the number of TMPBUF bytes written (a byte count despite the name).
+ * Returns the largest stack burn depth reported by CRYPT_FN.  */
+static inline unsigned int
+bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                    const byte *inbuf, size_t nblocks, byte *tweak,
+                    byte *tmpbuf, size_t tmpbuf_nblocks,
+                    unsigned int *num_used_tmpblocks)
+{
+  u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  tweak_next_lo = buf_get_le64 (tweak + 0);
+  tweak_next_hi = buf_get_le64 (tweak + 8);
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          tweak_lo = tweak_next_lo;
+          tweak_hi = tweak_next_hi;
+
+          /* Generate next tweak: multiply by x in GF(2^128), folding a
+           * carry out of the top bit back in as 0x87 (branchless).  */
+          carry = -(tweak_next_hi >> 63) & 0x87;
+          tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+          tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+          /* Xor-Encrypt/Decrypt-Xor block.  The saved tweak is stored
+           * host-endian (buf_put_he64) since it is only re-read below
+           * with buf_get_he64 — no byteswap needed.  */
+          tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo;
+          tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi;
+          buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo);
+          buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi);
+          buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+          buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+        }
+
+      /* Cipher the pre-whitened blocks in place.  */
+      nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          /* Xor-Encrypt/Decrypt-Xor block: post-whiten with the saved
+           * per-block tweak.  */
+          tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]);
+          tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]);
+          tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo;
+          tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi;
+          buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+          buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+        }
+
+      inbuf += curr_blks * 16;
+      outbuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  /* Store the advanced tweak for the next call.  */
+  buf_put_le64 (tweak + 0, tweak_next_lo);
+  buf_put_le64 (tweak + 8, tweak_next_hi);
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+/* Bulk ECB processing for 16-byte-block ciphers.  Feeds CRYPT_FN (the
+ * cipher's bulk encrypt or decrypt function, caller's choice) chunks
+ * of at most FN_MAX_NBLOCKS blocks until NBLOCKS are done.  No
+ * temporary buffer is needed since ECB has no chaining.  Returns the
+ * largest stack burn depth reported by CRYPT_FN.  */
+static inline unsigned int
+bulk_ecb_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                   const byte *inbuf, size_t nblocks, size_t fn_max_nblocks)
+{
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > fn_max_nblocks ? fn_max_nblocks : nblocks;
+      nburn = crypt_fn (priv, outbuf, inbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      inbuf += curr_blks * 16;
+      outbuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  return burn_depth;
+}
+
+#endif /*GCRYPT_BULKHELP_H*/
diff --git a/cipher/camellia-aarch64-ce.c b/cipher/camellia-aarch64-ce.c
new file mode 100644 (file)
index 0000000..76813e9
--- /dev/null
@@ -0,0 +1,42 @@
+/* camellia-aarch64-ce.c - ARMv8/CE Camellia implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+/* Build only for little-endian AArch64 when the toolchain supports the
+ * Crypto Extensions and NEON intrinsics (per the HAVE_* configure
+ * checks below).  */
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+    (__GNUC__ >= 4)
+
+/* Force -O2 on these functions when the compiler has the optimize
+ * attribute; otherwise compile at whatever level was requested.  */
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+
+/* Architecture-specific names for the 16-block encrypt/decrypt and
+ * key-setup entry points — presumably consumed by camellia-simd128.h
+ * included below (NOTE(review): confirm against that header).  */
+#define FUNC_ENC_BLK16 _gcry_camellia_aarch64ce_encrypt_blk16
+#define FUNC_DEC_BLK16 _gcry_camellia_aarch64ce_decrypt_blk16
+#define FUNC_KEY_SETUP _gcry_camellia_aarch64ce_keygen
+
+#include "camellia-simd128.h"
+
+#endif /* __AARCH64EL__ */
index 30b568d39a0b19555a8e02ac40b93d58f6c60b51..1d820553cf66cfa023ff89a2dfbe62516702c11d 100644 (file)
 .globl _gcry_camellia_arm_encrypt_block
 ELF(.type   _gcry_camellia_arm_encrypt_block,@function;)
 
+.align 4
 _gcry_camellia_arm_encrypt_block:
        CFI_STARTPROC()
        stp x19, x30, [sp, #-16]!
@@ -214,7 +215,7 @@ _gcry_camellia_arm_encrypt_block:
         *      w3: keybitlen
         */
 
-       adr RTAB1,  _gcry_camellia_arm_tables;
+       GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables);
        mov RMASK, #(0xff<<4); /* byte mask */
        add RTAB2, RTAB1, #(1 * 4);
        add RTAB3, RTAB1, #(2 * 4);
@@ -240,7 +241,6 @@ _gcry_camellia_arm_encrypt_block:
        CFI_RESTORE(x30)
        ret_spec_stop;
        CFI_RESTORE_STATE()
-.ltorg
 
 .Lenc_256:
        enc_fls(24);
@@ -254,12 +254,12 @@ _gcry_camellia_arm_encrypt_block:
        CFI_RESTORE(x30)
        ret_spec_stop;
        CFI_ENDPROC()
-.ltorg
 ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;)
 
 .globl _gcry_camellia_arm_decrypt_block
 ELF(.type   _gcry_camellia_arm_decrypt_block,@function;)
 
+.align 4
 _gcry_camellia_arm_decrypt_block:
        CFI_STARTPROC()
        stp x19, x30, [sp, #-16]!
@@ -274,7 +274,7 @@ _gcry_camellia_arm_decrypt_block:
         *      w3: keybitlen
         */
 
-       adr RTAB1,  _gcry_camellia_arm_tables;
+       GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables);
        mov RMASK, #(0xff<<4); /* byte mask */
        add RTAB2, RTAB1, #(1 * 4);
        add RTAB3, RTAB1, #(2 * 4);
@@ -301,7 +301,6 @@ _gcry_camellia_arm_decrypt_block:
        CFI_RESTORE(x30)
        ret_spec_stop;
        CFI_RESTORE_STATE()
-.ltorg
 
 .Ldec_256:
        inpack(32);
@@ -310,11 +309,11 @@ _gcry_camellia_arm_decrypt_block:
 
        b .Ldec_128;
        CFI_ENDPROC()
-.ltorg
 ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;)
 
 /* Encryption/Decryption tables */
-ELF(.type  _gcry_camellia_arm_tables,@object;)
+SECTION_RODATA
+ELF(.type _gcry_camellia_arm_tables,%object;)
 .balign 32
 _gcry_camellia_arm_tables:
 .Lcamellia_sp1110:
index 5c304e574019d734b7fbc45da19ccba258112141..76e62ea895ec7e215b15843ec8d5c5678a744d16 100644 (file)
@@ -1,6 +1,6 @@
 /* camellia-avx-aesni-amd64.S  -  AES-NI/AVX implementation of Camellia cipher
  *
- * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
-       vpxor t6, t6, t6; \
        vmovq key, t0; \
        \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
        \
-       vpsrldq $5, t0, t5; \
-       vpsrldq $1, t0, t1; \
-       vpsrldq $2, t0, t2; \
-       vpsrldq $3, t0, t3; \
-       vpsrldq $4, t0, t4; \
-       vpshufb t6, t0, t0; \
-       vpshufb t6, t1, t1; \
-       vpshufb t6, t2, t2; \
-       vpshufb t6, t3, t3; \
-       vpshufb t6, t4, t4; \
-       vpsrldq $2, t5, t7; \
-       vpshufb t6, t7, t7; \
+       vpshufb .Lbyte_threes rRIP, t0, t3; \
+       vpshufb .Lbyte_twos rRIP, t0, t2; \
        \
        /* P-function */ \
        vpxor x5, x0, x0; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
+       vpshufb .Lbyte_ones rRIP, t0, t1; \
+       vpshufb .Lbyte_sevens rRIP, t0, t7; \
+       \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
+       vpshufb .Lbyte_sixs rRIP, t0, t6; \
+       vpshufb .Lbyte_fives rRIP, t0, t5; \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
+       vpshufb .Lbyte_fours rRIP, t0, t4; \
+       \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        /* Add key material and result to CD (x becomes new CD) */ \
        \
        vpxor t3, x4, x4; \
+       vpxor t3, t3, t3; \
        vpxor 0 * 16(mem_cd), x4, x4; \
        \
+       vpshufb t3, t0, t0; \
+       \
        vpxor t2, x5, x5; \
        vpxor 1 * 16(mem_cd), x5, x5; \
        \
-       vpsrldq $1, t5, t3; \
-       vpshufb t6, t5, t5; \
-       vpshufb t6, t3, t6; \
-       \
        vpxor t1, x6, x6; \
        vpxor 2 * 16(mem_cd), x6, x6; \
        \
        vpxor tt0, tt0, tt0; \
        vmovd kll, t0; \
        vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        \
        vmovd krr, t0; \
        vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
        \
        vpor 4 * 16(r), t0, t0; \
        vpor 5 * 16(r), t1, t1; \
         */ \
        vmovd krl, t0; \
        vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
        \
        vpand 0 * 16(r), t0, t0; \
        vpand 1 * 16(r), t1, t1; \
        \
        vmovd klr, t0; \
        vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vmovdqu y6, 14 * 16(rio); \
        vmovdqu y7, 15 * 16(rio);
 
-.text
+SECTION_RODATA
+
+ELF(.type _camellia_aesni_avx_data,@object;)
+_camellia_aesni_avx_data:
 .align 16
 
 #define SHUFB_BYTES(idx) \
        .long 0x80808080
        .long 0x80808080
 
+.Lbyte_ones:
+       .quad 1 * 0x0101010101010101
+       .quad 1 * 0x0101010101010101
+.Lbyte_twos:
+       .quad 2 * 0x0101010101010101
+       .quad 2 * 0x0101010101010101
+.Lbyte_threes:
+       .quad 3 * 0x0101010101010101
+       .quad 3 * 0x0101010101010101
+.Lbyte_fours:
+       .quad 4 * 0x0101010101010101
+       .quad 4 * 0x0101010101010101
+.Lbyte_fives:
+       .quad 5 * 0x0101010101010101
+       .quad 5 * 0x0101010101010101
+.Lbyte_sixs:
+       .quad 6 * 0x0101010101010101
+       .quad 6 * 0x0101010101010101
+.Lbyte_sevens:
+       .quad 7 * 0x0101010101010101
+       .quad 7 * 0x0101010101010101
+
 /* For CTR-mode IV byteswap */
 .Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Ltranspose_8x8_shuf:
        .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7
 
+/* CTR byte addition constants */
+.Lbige_addb_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
+.text
 
-.align 8
+.align 16
 ELF(.type   __camellia_enc_blk16,@function;)
 
 __camellia_enc_blk16:
@@ -826,7 +867,7 @@ __camellia_enc_blk16:
        CFI_ENDPROC();
 ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
 
-.align 8
+.align 16
 ELF(.type   __camellia_dec_blk16,@function;)
 
 __camellia_dec_blk16:
@@ -897,7 +938,7 @@ ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
 
-.align 8
+.align 16
 .globl _gcry_camellia_aesni_avx_ctr_enc
 ELF(.type   _gcry_camellia_aesni_avx_ctr_enc,@function;)
 
@@ -926,6 +967,9 @@ _gcry_camellia_aesni_avx_ctr_enc:
        andq $~31, %rsp;
        movq %rsp, %rax;
 
+       cmpb $(0x100 - 16), 15(%rcx);
+       jbe .Lctr_byteadd;
+
        vmovdqa .Lbswap128_mask rRIP, %xmm14;
 
        /* load IV and byteswap */
@@ -974,6 +1018,8 @@ _gcry_camellia_aesni_avx_ctr_enc:
        vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */
        vmovdqu %xmm13, (%rcx);
 
+.align 8
+.Lload_ctr_done:
        /* inpack16_pre: */
        vmovq (key_table)(CTX), %xmm15;
        vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
@@ -1022,10 +1068,143 @@ _gcry_camellia_aesni_avx_ctr_enc:
        leave;
        CFI_LEAVE();
        ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $16, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_xmm;
+.align 8
+.Lctr_byteadd:
+       vmovdqu (%rcx), %xmm15;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $16, 15(%rcx);
+.Lctr_byteadd_xmm:
+       vmovdqa %xmm15, %xmm0;
+       vpaddb .Lbige_addb_1 rRIP, %xmm15, %xmm14;
+       vmovdqu %xmm15, 15 * 16(%rax);
+       vpaddb .Lbige_addb_2 rRIP, %xmm15, %xmm13;
+       vmovdqu %xmm14, 14 * 16(%rax);
+       vpaddb .Lbige_addb_3 rRIP, %xmm15, %xmm12;
+       vmovdqu %xmm13, 13 * 16(%rax);
+       vpaddb .Lbige_addb_4 rRIP, %xmm15, %xmm11;
+       vpaddb .Lbige_addb_5 rRIP, %xmm15, %xmm10;
+       vpaddb .Lbige_addb_6 rRIP, %xmm15, %xmm9;
+       vpaddb .Lbige_addb_7 rRIP, %xmm15, %xmm8;
+       vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm7;
+       vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm6;
+       vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm5;
+       vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm4;
+       vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm3;
+       vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm2;
+       vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm1;
+       vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm0;
+
+       jmp .Lload_ctr_done;
        CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
 
-.align 8
+.align 16
+.globl _gcry_camellia_aesni_avx_ecb_enc
+ELF(.type   _gcry_camellia_aesni_avx_ecb_enc,@function;)
+
+/* ECB-encrypt 16 blocks with AES-NI/AVX Camellia.  */
+_gcry_camellia_aesni_avx_ecb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 */
+	CFI_STARTPROC();
+
+	pushq %rbp;
+	CFI_PUSH(%rbp);
+	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
+
+	vzeroupper;
+
+	/* %r8d = 24 for 128-bit keys, 32 otherwise — key-length-dependent
+	 * round parameter (NOTE(review): presumably consumed by
+	 * __camellia_enc_blk16; confirm there).  */
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	/* Load and pre-whiten the 16 input blocks with the first
+	 * key-table entry.  */
+	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	             %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	             %xmm15, %rdx, (key_table)(CTX));
+
+	/* 32-byte-aligned 16*16-byte stack scratch; pointer in %rax.  */
+	subq $(16 * 16), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	call __camellia_enc_blk16;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+	             %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+	             %xmm8, %rsi);
+
+	/* Clear register state before returning.  */
+	vzeroall;
+
+	leave;
+	CFI_LEAVE();
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ecb_enc,.-_gcry_camellia_aesni_avx_ecb_enc;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ecb_dec
+ELF(.type   _gcry_camellia_aesni_avx_ecb_dec,@function;)
+
+/* ECB-decrypt 16 blocks with AES-NI/AVX Camellia.  */
+_gcry_camellia_aesni_avx_ecb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 */
+	CFI_STARTPROC();
+
+	pushq %rbp;
+	CFI_PUSH(%rbp);
+	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
+
+	vzeroupper;
+
+	/* %r8d = 24 for 128-bit keys, 32 otherwise; used below to index
+	 * the key table from its far end for decryption.  */
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	/* Load and pre-whiten the 16 input blocks with the key-table
+	 * entry at index %r8 (last whitening key first for decrypt).  */
+	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	             %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	             %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	/* 32-byte-aligned 16*16-byte stack scratch; pointer in %rax.  */
+	subq $(16 * 16), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	call __camellia_dec_blk16;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+	             %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+	             %xmm8, %rsi);
+
+	/* Clear register state before returning.  */
+	vzeroall;
+
+	leave;
+	CFI_LEAVE();
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ecb_dec,.-_gcry_camellia_aesni_avx_ecb_dec;)
+
+.align 16
 .globl _gcry_camellia_aesni_avx_cbc_dec
 ELF(.type   _gcry_camellia_aesni_avx_cbc_dec,@function;)
 
@@ -1098,7 +1277,7 @@ _gcry_camellia_aesni_avx_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_camellia_aesni_avx_cfb_dec
 ELF(.type   _gcry_camellia_aesni_avx_cfb_dec,@function;)
 
@@ -1180,7 +1359,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_camellia_aesni_avx_ocb_enc
 ELF(.type   _gcry_camellia_aesni_avx_ocb_enc,@function;)
 
@@ -1332,7 +1511,7 @@ _gcry_camellia_aesni_avx_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_camellia_aesni_avx_ocb_dec
 ELF(.type   _gcry_camellia_aesni_avx_ocb_dec,@function;)
 
@@ -1503,7 +1682,7 @@ _gcry_camellia_aesni_avx_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_camellia_aesni_avx_ocb_auth
 ELF(.type   _gcry_camellia_aesni_avx_ocb_auth,@function;)
 
@@ -1720,6 +1899,10 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;
        vpsllq $(64-(nror)), out, out; \
        vpaddd t0, out, out;
 
+SECTION_RODATA
+
+ELF(.type _camellia_aesni_avx_keysetup_data,@object;)
+_camellia_aesni_avx_keysetup_data:
 
 .align 16
 .Linv_shift_row_and_unpcklbw:
@@ -1752,8 +1935,9 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;
 .Lsigma6:
        .long 0xB3E6C1FD, 0xB05688C2;
 
+.text
 
-.align 8
+.align 16
 ELF(.type  __camellia_avx_setup128,@function;)
 __camellia_avx_setup128:
        /* input:
@@ -2100,7 +2284,7 @@ __camellia_avx_setup128:
        CFI_ENDPROC();
 ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
 
-.align 8
+.align 16
 ELF(.type  __camellia_avx_setup256,@function;)
 
 __camellia_avx_setup256:
@@ -2580,7 +2764,7 @@ __camellia_avx_setup256:
        CFI_ENDPROC();
 ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
 
-.align 8
+.align 16
 .globl _gcry_camellia_aesni_avx_keygen
 ELF(.type  _gcry_camellia_aesni_avx_keygen,@function;)
 
index e93c40b89856186c27ed094f2e0ba4a7043a3286..4c3fb4b26522bf1acfb54978a7629d5523e3fa2e 100644 (file)
@@ -1,6 +1,6 @@
-/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia
+/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia
  *
- * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -36,6 +36,8 @@
 /**********************************************************************
   helper macros
  **********************************************************************/
+
+#ifndef CAMELLIA_GFNI_BUILD
 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
@@ -44,6 +46,7 @@
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;
+#endif
 
 #define ymm0_x xmm0
 #define ymm1_x xmm1
 # define IF_VAES(...)
 #endif
 
+#ifdef CAMELLIA_GFNI_BUILD
+# define IF_GFNI(...) __VA_ARGS__
+# define IF_NOT_GFNI(...)
+#else
+# define IF_GFNI(...)
+# define IF_NOT_GFNI(...) __VA_ARGS__
+#endif
+
+/**********************************************************************
+  GFNI helper macros and constants
+ **********************************************************************/
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \
+       ( (((a0) & 1) << 0) | \
+         (((a1) & 1) << 1) | \
+         (((a2) & 1) << 2) | \
+         (((a3) & 1) << 3) | \
+         (((a4) & 1) << 4) | \
+         (((a5) & 1) << 5) | \
+         (((a6) & 1) << 6) | \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \
+       ( ((l7) << (0 * 8)) | \
+         ((l6) << (1 * 8)) | \
+         ((l5) << (2 * 8)) | \
+         ((l4) << (3 * 8)) | \
+         ((l3) << (4 * 8)) | \
+         ((l2) << (5 * 8)) | \
+         ((l1) << (6 * 8)) | \
+         ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0)
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14  BV8(0, 1, 1, 1, 0, 1, 1, 0)
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2   BV8(0, 0, 1, 1, 1, 0, 1, 1)
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3   BV8(1, 1, 1, 0, 1, 1, 0, 0)
+
+#endif /* CAMELLIA_GFNI_BUILD */
+
 /**********************************************************************
   32-way camellia
  **********************************************************************/
 
-/*
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* roundsm32 (GFNI version)
  * IN:
  *   x0..x7: byte-sliced AB state
  *   mem_cd: register pointer storing CD state
  * OUT:
  *   x0..x7: new byte-sliced CD state
  */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+                 t6, t7, mem_cd, key) \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \
+       vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+       \
+       /* prefilter sboxes */ \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+       \
+       /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+       \
+       /* sbox GF8 inverse + postfilter sbox 3 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+       vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+       \
+       /* sbox GF8 inverse + postfilter sbox 2 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+       vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+       \
+       vpbroadcastb 7+key, t7; \
+       vpbroadcastb 6+key, t6; \
+       \
+       /* P-function */ \
+       vpxor x5, x0, x0; \
+       vpxor x6, x1, x1; \
+       vpxor x7, x2, x2; \
+       vpxor x4, x3, x3; \
+       \
+       vpbroadcastb 5+key, t5; \
+       vpbroadcastb 4+key, t4; \
+       \
+       vpxor x2, x4, x4; \
+       vpxor x3, x5, x5; \
+       vpxor x0, x6, x6; \
+       vpxor x1, x7, x7; \
+       \
+       vpbroadcastb 3+key, t3; \
+       vpbroadcastb 2+key, t2; \
+       \
+       vpxor x7, x0, x0; \
+       vpxor x4, x1, x1; \
+       vpxor x5, x2, x2; \
+       vpxor x6, x3, x3; \
+       \
+       vpbroadcastb 1+key, t1; \
+       vpbroadcastb 0+key, t0; \
+       \
+       vpxor x3, x4, x4; \
+       vpxor x0, x5, x5; \
+       vpxor x1, x6, x6; \
+       vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+       \
+       /* Add key material and result to CD (x becomes new CD) */ \
+       \
+       vpxor t7, x0, x0; \
+       vpxor 4 * 32(mem_cd), x0, x0; \
+       \
+       vpxor t6, x1, x1; \
+       vpxor 5 * 32(mem_cd), x1, x1; \
+       \
+       vpxor t5, x2, x2; \
+       vpxor 6 * 32(mem_cd), x2, x2; \
+       \
+       vpxor t4, x3, x3; \
+       vpxor 7 * 32(mem_cd), x3, x3; \
+       \
+       vpxor t3, x4, x4; \
+       vpxor 0 * 32(mem_cd), x4, x4; \
+       \
+       vpxor t2, x5, x5; \
+       vpxor 1 * 32(mem_cd), x5, x5; \
+       \
+       vpxor t1, x6, x6; \
+       vpxor 2 * 32(mem_cd), x6, x6; \
+       \
+       vpxor t0, x7, x7; \
+       vpxor 3 * 32(mem_cd), x7, x7;
+
+#else /* CAMELLIA_GFNI_BUILD */
 
+/* roundsm32 (AES-NI / VAES version)
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
 #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
                  t6, t7, mem_cd, key) \
        /* \
        filter_8bit(x1, t5, t6, t7, t4); \
        filter_8bit(x4, t5, t6, t7, t4); \
        \
-       vpxor t4##_x, t4##_x, t4##_x; \
+       vpxor t4, t4, t4; \
        \
        /* AES subbytes + AES shift rows */ \
        IF_AESNI(vextracti128 $1, x2, t6##_x; \
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
-       vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
-       \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
-       vpxor t7, t7, t7; \
        \
-       vpsrldq $1, t0, t1; \
-       vpsrldq $2, t0, t2; \
-       vpshufb t7, t1, t1; \
-       vpsrldq $3, t0, t3; \
+       vpbroadcastb 7+key, t7; \
+       vpbroadcastb 6+key, t6; \
        \
        /* P-function */ \
        vpxor x5, x0, x0; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
-       vpshufb t7, t2, t2; \
-       vpsrldq $4, t0, t4; \
-       vpshufb t7, t3, t3; \
-       vpsrldq $5, t0, t5; \
-       vpshufb t7, t4, t4; \
+       vpbroadcastb 5+key, t5; \
+       vpbroadcastb 4+key, t4; \
        \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
-       vpsrldq $6, t0, t6; \
-       vpshufb t7, t5, t5; \
-       vpshufb t7, t6, t6; \
+       vpbroadcastb 3+key, t3; \
+       vpbroadcastb 2+key, t2; \
        \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
+       vpbroadcastb 1+key, t1; \
+       vpbroadcastb 0+key, t0; \
+       \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        \
        /* Add key material and result to CD (x becomes new CD) */ \
        \
-       vpxor t6, x1, x1; \
-       vpxor 5 * 32(mem_cd), x1, x1; \
-       \
-       vpsrldq $7, t0, t6; \
-       vpshufb t7, t0, t0; \
-       vpshufb t7, t6, t7; \
-       \
        vpxor t7, x0, x0; \
        vpxor 4 * 32(mem_cd), x0, x0; \
        \
+       vpxor t6, x1, x1; \
+       vpxor 5 * 32(mem_cd), x1, x1; \
+       \
        vpxor t5, x2, x2; \
        vpxor 6 * 32(mem_cd), x2, x2; \
        \
        vpxor t0, x7, x7; \
        vpxor 3 * 32(mem_cd), x7, x7;
 
+#endif /* CAMELLIA_GFNI_BUILD */
+
 /*
  * IN/OUT:
  *  x0..x7: byte-sliced AB state preloaded
  * OUT:
  *  v0..3: (IN <<< 1)
  */
+#ifdef CAMELLIA_GFNI_BUILD
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, right_shift_by_7) \
+       vgf2p8affineqb $0, right_shift_by_7, v0, t0; \
+       vpaddb v0, v0, v0; \
+       \
+       vgf2p8affineqb $0, right_shift_by_7, v1, t1; \
+       vpaddb v1, v1, v1; \
+       \
+       vgf2p8affineqb $0, right_shift_by_7, v2, t2; \
+       vpaddb v2, v2, v2; \
+       \
+       vpor t0, v1, v1; \
+       \
+       vgf2p8affineqb $0, right_shift_by_7, v3, t0; \
+       vpaddb v3, v3, v3; \
+       \
+       vpor t1, v2, v2; \
+       vpor t2, v3, v3; \
+       vpor t0, v0, v0;
+#else
 #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
        vpcmpgtb v0, zero, t0; \
        vpaddb v0, v0, v0; \
        vpor t1, v2, v2; \
        vpor t2, v3, v3; \
        vpor t0, v0, v0;
+#endif
 
 /*
  * IN:
         * t0 &= ll; \
         * lr ^= rol32(t0, 1); \
         */ \
-       vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
-       vpxor tt0, tt0, tt0; \
-       vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       IF_NOT_GFNI(vpxor tt0, tt0, tt0); \
+       IF_GFNI(vpbroadcastq .Lright_shift_by_7 rRIP, tt0); \
+       vpbroadcastb 0+kll, t3; \
+       vpbroadcastb 1+kll, t2; \
+       vpbroadcastb 2+kll, t1; \
+       vpbroadcastb 3+kll, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor l4, t0, l4; \
-       vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
        vmovdqu l4, 4 * 32(l); \
        vpxor l5, t1, l5; \
        vmovdqu l5, 5 * 32(l); \
         * rl ^= t2; \
         */ \
        \
-       vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpbroadcastb 0+krr, t3; \
+       vpbroadcastb 1+krr, t2; \
+       vpbroadcastb 2+krr, t1; \
+       vpbroadcastb 3+krr, t0; \
        \
        vpor 4 * 32(r), t0, t0; \
        vpor 5 * 32(r), t1, t1; \
        vpxor 2 * 32(r), t2, t2; \
        vpxor 3 * 32(r), t3, t3; \
        vmovdqu t0, 0 * 32(r); \
-       vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 1 * 32(r); \
        vmovdqu t2, 2 * 32(r); \
        vmovdqu t3, 3 * 32(r); \
         * t2 &= rl; \
         * rr ^= rol32(t2, 1); \
         */ \
-       vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpbroadcastb 0+krl, t3; \
+       vpbroadcastb 1+krl, t2; \
+       vpbroadcastb 2+krl, t1; \
+       vpbroadcastb 3+krl, t0; \
        \
        vpand 0 * 32(r), t0, t0; \
        vpand 1 * 32(r), t1, t1; \
        vpxor 6 * 32(r), t2, t2; \
        vpxor 7 * 32(r), t3, t3; \
        vmovdqu t0, 4 * 32(r); \
-       vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 5 * 32(r); \
        vmovdqu t2, 6 * 32(r); \
        vmovdqu t3, 7 * 32(r); \
         * ll ^= t0; \
         */ \
        \
-       vpshufb tt0, t0, t3; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t2; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t1; \
-       vpsrldq $1, t0, t0; \
-       vpshufb tt0, t0, t0; \
+       vpbroadcastb 0+klr, t3; \
+       vpbroadcastb 1+klr, t2; \
+       vpbroadcastb 2+klr, t1; \
+       vpbroadcastb 3+klr, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vmovdqu y6, 14 * 32(rio); \
        vmovdqu y7, 15 * 32(rio);
 
-.text
+SECTION_RODATA
+
 .align 32
 
 #define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
 
-.Lshufb_16x16b:
-       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
-       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+FUNC_NAME(_constants):
+ELF(.type   FUNC_NAME(_constants),@object;)
 
 .Lpack_bswap:
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
 /* For CTR-mode IV byteswap */
 .Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+/* CTR byte addition constants */
+.align 32
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+.align 64
+/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
+ * and s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+       .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(0, 0, 1, 1, 0, 0, 1, 0),
+                   BV8(1, 1, 0, 1, 0, 0, 0, 0),
+                   BV8(1, 0, 1, 1, 0, 0, 1, 1),
+                   BV8(0, 0, 0, 0, 1, 1, 0, 0),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 1, 0, 1, 1, 0, 0),
+                   BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+       .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 0, 0),
+                   BV8(1, 0, 1, 0, 0, 0, 0, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 1, 1),
+                   BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(0, 1, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+       .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 1, 0),
+                   BV8(1, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 0, 1, 1),
+                   BV8(1, 0, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 0, 1, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 1, 1),
+                   BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 1, 0),
+                   BV8(1, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 0, 1, 1),
+                   BV8(1, 0, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 0, 1, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+       .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+                   BV8(1, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 0, 1, 1),
+                   BV8(1, 0, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 0, 1, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 1, 1),
+                   BV8(0, 0, 0, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* Bit-matrix for right shifting uint8_t values in vector by 7. */
+.Lright_shift_by_7:
+       .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0))
+
+#else /* CAMELLIA_GFNI_BUILD */
+
 /*
  * pre-SubByte transform
  *
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
+#endif /* CAMELLIA_GFNI_BUILD */
 
-.align 8
-ELF(.type   __camellia_enc_blk32,@function;)
+ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);)
 
-__camellia_enc_blk32:
+.text
+
+.align 16
+ELF(.type   FUNC_NAME(enc_blk32),@function;)
+
+FUNC_NAME(enc_blk32):
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
@@ -817,19 +1097,19 @@ __camellia_enc_blk32:
 
        ret_spec_stop;
        CFI_ENDPROC();
-ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
+ELF(.size FUNC_NAME(enc_blk32),.-FUNC_NAME(enc_blk32);)
 
-.align 8
-ELF(.type   __camellia_dec_blk32,@function;)
+.align 16
+ELF(.type   FUNC_NAME(dec_blk32),@function;)
 
-__camellia_dec_blk32:
+FUNC_NAME(dec_blk32):
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %r8d: 24 for 16 byte key, 32 for larger
-        *      %ymm0..%ymm15: 16 encrypted blocks
+        *      %ymm0..%ymm15: 32 encrypted blocks
         * output:
-        *      %ymm0..%ymm15: 16 plaintext blocks, order swapped:
+        *      %ymm0..%ymm15: 32 plaintext blocks, order swapped:
         *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        CFI_STARTPROC();
@@ -882,7 +1162,7 @@ __camellia_dec_blk32:
 
        ret_spec_stop;
        CFI_ENDPROC();
-ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
+ELF(.size FUNC_NAME(dec_blk32),.-FUNC_NAME(dec_blk32);)
 
 #define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
@@ -890,7 +1170,7 @@ ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
 
-.align 8
+.align 16
 .globl FUNC_NAME(ctr_enc)
 ELF(.type   FUNC_NAME(ctr_enc),@function;)
 
@@ -908,11 +1188,6 @@ FUNC_NAME(ctr_enc):
        movq %rsp, %rbp;
        CFI_DEF_CFA_REGISTER(%rbp);
 
-       movq 8(%rcx), %r11;
-       bswapq %r11;
-
-       vzeroupper;
-
        cmpl $128, key_bitlength(CTX);
        movl $32, %r8d;
        movl $24, %eax;
@@ -922,6 +1197,12 @@ FUNC_NAME(ctr_enc):
        andq $~63, %rsp;
        movq %rsp, %rax;
 
+       cmpb $(0x100 - 32), 15(%rcx);
+       jbe .Lctr_byteadd;
+
+       movq 8(%rcx), %r11;
+       bswapq %r11;
+
        vpcmpeqd %ymm15, %ymm15, %ymm15;
        vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
 
@@ -1034,9 +1315,9 @@ FUNC_NAME(ctr_enc):
        vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13;
        vmovdqu %xmm13, (%rcx);
 
-.align 4
+.align 8
 .Lload_ctr_done:
-       /* inpack16_pre: */
+       /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm15;
        vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
@@ -1056,7 +1337,7 @@ FUNC_NAME(ctr_enc):
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-       call __camellia_enc_blk32;
+       call FUNC_NAME(enc_blk32);
 
        vpxor 0 * 32(%rdx), %ymm7, %ymm7;
        vpxor 1 * 32(%rdx), %ymm6, %ymm6;
@@ -1074,7 +1355,6 @@ FUNC_NAME(ctr_enc):
        vpxor 13 * 32(%rdx), %ymm10, %ymm10;
        vpxor 14 * 32(%rdx), %ymm9, %ymm9;
        vpxor 15 * 32(%rdx), %ymm8, %ymm8;
-       leaq 32 * 16(%rdx), %rdx;
 
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
@@ -1085,10 +1365,52 @@ FUNC_NAME(ctr_enc):
        leave;
        CFI_LEAVE();
        ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $32, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+       vbroadcasti128 (%rcx), %ymm8;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $32, 15(%rcx);
+.Lctr_byteadd_ymm:
+       vpaddb .Lbige_addb_16_16 rRIP, %ymm8, %ymm0;
+       vpaddb .Lbige_addb_0_1 rRIP, %ymm8, %ymm15;
+       vpaddb .Lbige_addb_2_3 rRIP, %ymm8, %ymm14;
+       vmovdqu %ymm15, 15 * 32(%rax);
+       vpaddb .Lbige_addb_4_5 rRIP, %ymm8, %ymm13;
+       vmovdqu %ymm14, 14 * 32(%rax);
+       vpaddb .Lbige_addb_6_7 rRIP, %ymm8, %ymm12;
+       vmovdqu %ymm13, 13 * 32(%rax);
+       vpaddb .Lbige_addb_8_9 rRIP, %ymm8, %ymm11;
+       vpaddb .Lbige_addb_10_11 rRIP, %ymm8, %ymm10;
+       vpaddb .Lbige_addb_12_13 rRIP, %ymm8, %ymm9;
+       vpaddb .Lbige_addb_14_15 rRIP, %ymm8, %ymm8;
+       vpaddb .Lbige_addb_0_1 rRIP, %ymm0, %ymm7;
+       vpaddb .Lbige_addb_2_3 rRIP, %ymm0, %ymm6;
+       vpaddb .Lbige_addb_4_5 rRIP, %ymm0, %ymm5;
+       vpaddb .Lbige_addb_6_7 rRIP, %ymm0, %ymm4;
+       vpaddb .Lbige_addb_8_9 rRIP, %ymm0, %ymm3;
+       vpaddb .Lbige_addb_10_11 rRIP, %ymm0, %ymm2;
+       vpaddb .Lbige_addb_12_13 rRIP, %ymm0, %ymm1;
+       vpaddb .Lbige_addb_14_15 rRIP, %ymm0, %ymm0;
+
+       jmp .Lload_ctr_done;
        CFI_ENDPROC();
 ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);)
 
-.align 8
+.align 16
 .globl FUNC_NAME(cbc_dec)
 ELF(.type   FUNC_NAME(cbc_dec),@function;)
 
@@ -1106,8 +1428,6 @@ FUNC_NAME(cbc_dec):
        movq %rsp, %rbp;
        CFI_DEF_CFA_REGISTER(%rbp);
 
-       vzeroupper;
-
        movq %rcx, %r9;
 
        cmpl $128, key_bitlength(CTX);
@@ -1123,7 +1443,7 @@ FUNC_NAME(cbc_dec):
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
 
-       call __camellia_dec_blk32;
+       call FUNC_NAME(dec_blk32);
 
        /* XOR output with IV */
        vmovdqu %ymm8, (%rax);
@@ -1165,7 +1485,7 @@ FUNC_NAME(cbc_dec):
        CFI_ENDPROC();
 ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);)
 
-.align 8
+.align 16
 .globl FUNC_NAME(cfb_dec)
 ELF(.type   FUNC_NAME(cfb_dec),@function;)
 
@@ -1183,8 +1503,6 @@ FUNC_NAME(cfb_dec):
        movq %rsp, %rbp;
        CFI_DEF_CFA_REGISTER(%rbp);
 
-       vzeroupper;
-
        cmpl $128, key_bitlength(CTX);
        movl $32, %r8d;
        movl $24, %eax;
@@ -1194,7 +1512,7 @@ FUNC_NAME(cfb_dec):
        andq $~63, %rsp;
        movq %rsp, %rax;
 
-       /* inpack16_pre: */
+       /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm0;
        vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
        vmovdqu (%rcx), %xmm15;
@@ -1218,7 +1536,7 @@ FUNC_NAME(cfb_dec):
        vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1;
        vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0;
 
-       call __camellia_enc_blk32;
+       call FUNC_NAME(enc_blk32);
 
        vpxor 0 * 32(%rdx), %ymm7, %ymm7;
        vpxor 1 * 32(%rdx), %ymm6, %ymm6;
@@ -1249,7 +1567,7 @@ FUNC_NAME(cfb_dec):
        CFI_ENDPROC();
 ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);)
 
-.align 8
+.align 16
 .globl FUNC_NAME(ocb_enc)
 ELF(.type   FUNC_NAME(ocb_enc),@function;)
 
@@ -1269,8 +1587,6 @@ FUNC_NAME(ocb_enc):
        movq %rsp, %rbp;
        CFI_DEF_CFA_REGISTER(%rbp);
 
-       vzeroupper;
-
        subq $(16 * 32 + 4 * 8), %rsp;
        andq $~63, %rsp;
        movq %rsp, %rax;
@@ -1363,7 +1679,7 @@ FUNC_NAME(ocb_enc):
        movl $24, %r10d;
        cmovel %r10d, %r8d; /* max */
 
-       /* inpack16_pre: */
+       /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm15;
        vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
@@ -1383,7 +1699,7 @@ FUNC_NAME(ocb_enc):
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-       call __camellia_enc_blk32;
+       call FUNC_NAME(enc_blk32);
 
        vpxor 0 * 32(%rsi), %ymm7, %ymm7;
        vpxor 1 * 32(%rsi), %ymm6, %ymm6;
@@ -1423,7 +1739,7 @@ FUNC_NAME(ocb_enc):
        CFI_ENDPROC();
 ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);)
 
-.align 8
+.align 16
 .globl FUNC_NAME(ocb_dec)
 ELF(.type   FUNC_NAME(ocb_dec),@function;)
 
@@ -1443,8 +1759,6 @@ FUNC_NAME(ocb_dec):
        movq %rsp, %rbp;
        CFI_DEF_CFA_REGISTER(%rbp);
 
-       vzeroupper;
-
        subq $(16 * 32 + 4 * 8), %rsp;
        andq $~63, %rsp;
        movq %rsp, %rax;
@@ -1532,7 +1846,7 @@ FUNC_NAME(ocb_dec):
        movl $24, %r9d;
        cmovel %r9d, %r8d; /* max */
 
-       /* inpack16_pre: */
+       /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
        vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
@@ -1552,7 +1866,7 @@ FUNC_NAME(ocb_dec):
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-       call __camellia_dec_blk32;
+       call FUNC_NAME(dec_blk32);
 
        vpxor 0 * 32(%rsi), %ymm7, %ymm7;
        vpxor 1 * 32(%rsi), %ymm6, %ymm6;
@@ -1620,7 +1934,7 @@ FUNC_NAME(ocb_dec):
        CFI_ENDPROC();
 ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);)
 
-.align 8
+.align 16
 .globl FUNC_NAME(ocb_auth)
 ELF(.type   FUNC_NAME(ocb_auth),@function;)
 
@@ -1639,8 +1953,6 @@ FUNC_NAME(ocb_auth):
        movq %rsp, %rbp;
        CFI_DEF_CFA_REGISTER(%rbp);
 
-       vzeroupper;
-
        subq $(16 * 32 + 4 * 8), %rsp;
        andq $~63, %rsp;
        movq %rsp, %rax;
@@ -1728,7 +2040,7 @@ FUNC_NAME(ocb_auth):
 
        movq %rcx, %r10;
 
-       /* inpack16_pre: */
+       /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm15;
        vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
@@ -1748,7 +2060,7 @@ FUNC_NAME(ocb_auth):
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-       call __camellia_enc_blk32;
+       call FUNC_NAME(enc_blk32);
 
        vpxor %ymm7, %ymm6, %ymm6;
        vpxor %ymm5, %ymm4, %ymm4;
@@ -1791,4 +2103,225 @@ FUNC_NAME(ocb_auth):
        CFI_ENDPROC();
 ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);)
 
+.align 16
+.globl FUNC_NAME(enc_blk1_32)
+ELF(.type   FUNC_NAME(enc_blk1_32),@function;)
+
+FUNC_NAME(enc_blk1_32):
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %ecx: nblocks (1 to 32)
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       movl %ecx, %r9d;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       subq $(16 * 32), %rsp;
+       andq $~63, %rsp;
+       movq %rsp, %rax;
+
+       cmpl $31, %ecx;
+       vpxor %xmm0, %xmm0, %xmm0;
+       ja .Lenc_blk32;
+       jb 2f;
+         vmovdqu 15 * 32(%rdx), %xmm0;
+       2:
+         vmovdqu %ymm0, (%rax);
+
+       vpbroadcastq (key_table)(CTX), %ymm0;
+       vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+#define LOAD_INPUT(offset, ymm) \
+       cmpl $(1 + 2 * (offset)), %ecx; \
+       jb 2f; \
+       ja 1f; \
+         vmovdqu (offset) * 32(%rdx), %ymm##_x; \
+         vpxor %ymm0, %ymm, %ymm; \
+         jmp 2f; \
+       1: \
+         vpxor (offset) * 32(%rdx), %ymm0, %ymm;
+
+       LOAD_INPUT(0, ymm15);
+       LOAD_INPUT(1, ymm14);
+       LOAD_INPUT(2, ymm13);
+       LOAD_INPUT(3, ymm12);
+       LOAD_INPUT(4, ymm11);
+       LOAD_INPUT(5, ymm10);
+       LOAD_INPUT(6, ymm9);
+       LOAD_INPUT(7, ymm8);
+       LOAD_INPUT(8, ymm7);
+       LOAD_INPUT(9, ymm6);
+       LOAD_INPUT(10, ymm5);
+       LOAD_INPUT(11, ymm4);
+       LOAD_INPUT(12, ymm3);
+       LOAD_INPUT(13, ymm2);
+       LOAD_INPUT(14, ymm1);
+       vpxor (%rax), %ymm0, %ymm0;
+
+2:
+       call FUNC_NAME(enc_blk32);
+
+#define STORE_OUTPUT(ymm, offset) \
+       cmpl $(1 + 2 * (offset)), %r9d; \
+       jb 2f; \
+       ja 1f; \
+         vmovdqu %ymm##_x, (offset) * 32(%rsi); \
+         jmp 2f; \
+       1: \
+         vmovdqu %ymm, (offset) * 32(%rsi);
+
+       STORE_OUTPUT(ymm7, 0);
+       STORE_OUTPUT(ymm6, 1);
+       STORE_OUTPUT(ymm5, 2);
+       STORE_OUTPUT(ymm4, 3);
+       STORE_OUTPUT(ymm3, 4);
+       STORE_OUTPUT(ymm2, 5);
+       STORE_OUTPUT(ymm1, 6);
+       STORE_OUTPUT(ymm0, 7);
+       STORE_OUTPUT(ymm15, 8);
+       STORE_OUTPUT(ymm14, 9);
+       STORE_OUTPUT(ymm13, 10);
+       STORE_OUTPUT(ymm12, 11);
+       STORE_OUTPUT(ymm11, 12);
+       STORE_OUTPUT(ymm10, 13);
+       STORE_OUTPUT(ymm9, 14);
+       STORE_OUTPUT(ymm8, 15);
+       jmp .Lenc_blk32_done;
+
+.align 8
+.Lenc_blk32:
+       inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx, (key_table)(CTX));
+
+       call FUNC_NAME(enc_blk32);
+
+       write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+                    %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+                    %ymm8, %rsi);
+
+.align 8
+2:
+.Lenc_blk32_done:
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)
+
+.align 16
+.globl FUNC_NAME(dec_blk1_32)
+ELF(.type   FUNC_NAME(dec_blk1_32),@function;)
+
+FUNC_NAME(dec_blk1_32):
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %ecx: nblocks (1 to 32)
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       movl %ecx, %r9d;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       subq $(16 * 32), %rsp;
+       andq $~63, %rsp;
+       movq %rsp, %rax;
+
+       cmpl $31, %ecx;
+       vpxor %xmm0, %xmm0, %xmm0;
+       ja .Ldec_blk32;
+       jb 2f;
+         vmovdqu 15 * 32(%rdx), %xmm0;
+       2:
+         vmovdqu %ymm0, (%rax);
+
+       vpbroadcastq (key_table)(CTX, %r8, 8), %ymm0;
+       vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+       LOAD_INPUT(0, ymm15);
+       LOAD_INPUT(1, ymm14);
+       LOAD_INPUT(2, ymm13);
+       LOAD_INPUT(3, ymm12);
+       LOAD_INPUT(4, ymm11);
+       LOAD_INPUT(5, ymm10);
+       LOAD_INPUT(6, ymm9);
+       LOAD_INPUT(7, ymm8);
+       LOAD_INPUT(8, ymm7);
+       LOAD_INPUT(9, ymm6);
+       LOAD_INPUT(10, ymm5);
+       LOAD_INPUT(11, ymm4);
+       LOAD_INPUT(12, ymm3);
+       LOAD_INPUT(13, ymm2);
+       LOAD_INPUT(14, ymm1);
+       vpxor (%rax), %ymm0, %ymm0;
+
+2:
+       call FUNC_NAME(dec_blk32);
+
+       STORE_OUTPUT(ymm7, 0);
+       STORE_OUTPUT(ymm6, 1);
+       STORE_OUTPUT(ymm5, 2);
+       STORE_OUTPUT(ymm4, 3);
+       STORE_OUTPUT(ymm3, 4);
+       STORE_OUTPUT(ymm2, 5);
+       STORE_OUTPUT(ymm1, 6);
+       STORE_OUTPUT(ymm0, 7);
+       STORE_OUTPUT(ymm15, 8);
+       STORE_OUTPUT(ymm14, 9);
+       STORE_OUTPUT(ymm13, 10);
+       STORE_OUTPUT(ymm12, 11);
+       STORE_OUTPUT(ymm11, 12);
+       STORE_OUTPUT(ymm10, 13);
+       STORE_OUTPUT(ymm9, 14);
+       STORE_OUTPUT(ymm8, 15);
+
+.align 8
+2:
+.Ldec_blk32_done:
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+
+.align 8
+.Ldec_blk32:
+       inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+       call FUNC_NAME(dec_blk32);
+
+       write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+                    %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+                    %ymm8, %rsi);
+       jmp .Ldec_blk32_done;
+       CFI_ENDPROC();
+ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)
+
 #endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */
index a3d87d110938d25113408ec601ad56ecbbeae85c..decd40c2b31dce5aad12d2cf60d52597757da814 100644 (file)
 #define key_table 0
 
 /* register macros */
-#define CTX %r0
-#define RTAB1 %ip
-#define RTAB3 %r1
-#define RMASK %lr
+#define CTX r0
+#define RTAB1 ip
+#define RTAB3 r1
+#define RMASK lr
 
-#define IL %r2
-#define IR %r3
+#define IL r2
+#define IR r3
 
-#define XL %r4
-#define XR %r5
-#define YL %r6
-#define YR %r7
+#define XL r4
+#define XR r5
+#define YL r6
+#define YR r7
 
-#define RT0 %r8
-#define RT1 %r9
-#define RT2 %r10
-#define RT3 %r11
+#define RT0 r8
+#define RT1 r9
+#define RT2 r10
+#define RT3 r11
 
 /* helper macros */
 #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
            (n) * 2 + 0, (n) * 2 + 1);
 
 #define inpack(n) \
-       ldr_input_be(%r2, XL, XR, YL, YR, RT0); \
+       ldr_input_be(r2, XL, XR, YL, YR, RT0); \
        ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
        ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
        eor XL, RT0; \
        ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
        eor YL, RT0; \
        eor YR, RT1; \
-       str_output_be(%r1, YL, YR, XL, XR, RT0, RT1);
+       str_output_be(r1, YL, YR, XL, XR, RT0, RT1);
 
 .align 3
 .globl _gcry_camellia_arm_encrypt_block
 
 _gcry_camellia_arm_encrypt_block:
        /* input:
-        *      %r0: keytable
-        *      %r1: dst
-        *      %r2: src
-        *      %r3: keybitlen
+        *      r0: keytable
+        *      r1: dst
+        *      r2: src
+        *      r3: keybitlen
         */
-       push {%r1, %r4-%r11, %ip, %lr};
+       push {r1, r4-r11, ip, lr};
 
        GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3);
        mov RMASK, #0xff;
        add RTAB3, RTAB1, #(2 * 4);
-       push {%r3};
+       push {r3};
        mov RMASK, RMASK, lsl#4 /* byte mask */
 
        inpack(0);
@@ -292,20 +292,20 @@ _gcry_camellia_arm_encrypt_block:
        cmp RT0, #(16 * 8);
        bne .Lenc_256;
 
-       pop {%r1};
+       pop {r1};
        outunpack(24);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 
 .Lenc_256:
        enc_fls(24);
        enc_rounds(24);
 
-       pop {%r1};
+       pop {r1};
        outunpack(32);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;
 
@@ -315,19 +315,19 @@ _gcry_camellia_arm_encrypt_block:
 
 _gcry_camellia_arm_decrypt_block:
        /* input:
-        *      %r0: keytable
-        *      %r1: dst
-        *      %r2: src
-        *      %r3: keybitlen
+        *      r0: keytable
+        *      r1: dst
+        *      r2: src
+        *      r3: keybitlen
         */
-       push {%r1, %r4-%r11, %ip, %lr};
+       push {r1, r4-r11, ip, lr};
 
        GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3);
        mov RMASK, #0xff;
        add RTAB3, RTAB1, #(2 * 4);
        mov RMASK, RMASK, lsl#4 /* byte mask */
 
-       cmp %r3, #(16 * 8);
+       cmp r3, #(16 * 8);
        bne .Ldec_256;
 
        inpack(24);
@@ -339,10 +339,10 @@ _gcry_camellia_arm_decrypt_block:
        dec_fls(8);
        dec_rounds(0);
 
-       pop {%r1};
+       pop {r1};
        outunpack(0);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 
 .Ldec_256:
diff --git a/cipher/camellia-gfni-avx2-amd64.S b/cipher/camellia-gfni-avx2-amd64.S
new file mode 100644 (file)
index 0000000..20c9a43
--- /dev/null
@@ -0,0 +1,34 @@
+/* camellia-gfni-avx2-amd64.S  -  GFNI/AVX2 implementation of Camellia cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#define CAMELLIA_GFNI_BUILD 1
+#define FUNC_NAME(func) _gcry_camellia_gfni_avx2_ ## func
+
+#include "camellia-aesni-avx2-amd64.h"
+
+#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */
+#endif /* __x86_64 */
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
new file mode 100644 (file)
index 0000000..643eed3
--- /dev/null
@@ -0,0 +1,1634 @@
+/* camellia-gfni-avx512-amd64.S - GFNI/AVX512 implementation of Camellia
+ *
+ * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* field byte offsets within struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros (SysV ABI: context pointer arrives in %rdi) */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+#define zmm0_x xmm0 /* _x: 128-bit (xmm) view of the corresponding zmm register */
+#define zmm1_x xmm1
+#define zmm2_x xmm2
+#define zmm3_x xmm3
+#define zmm4_x xmm4
+#define zmm5_x xmm5
+#define zmm6_x xmm6
+#define zmm7_x xmm7
+#define zmm8_x xmm8
+#define zmm9_x xmm9
+#define zmm10_x xmm10
+#define zmm11_x xmm11
+#define zmm12_x xmm12
+#define zmm13_x xmm13
+#define zmm14_x xmm14
+#define zmm15_x xmm15
+
+#define zmm0_y ymm0 /* _y: 256-bit (ymm) view of the corresponding zmm register */
+#define zmm1_y ymm1
+#define zmm2_y ymm2
+#define zmm3_y ymm3
+#define zmm4_y ymm4
+#define zmm5_y ymm5
+#define zmm6_y ymm6
+#define zmm7_y ymm7
+#define zmm8_y ymm8
+#define zmm9_y ymm9
+#define zmm10_y ymm10
+#define zmm11_y ymm11
+#define zmm12_y ymm12
+#define zmm13_y ymm13
+#define zmm14_y ymm14
+#define zmm15_y ymm15
+
+#define mem_ab_0 %zmm16 /* byte-sliced AB/CD state lives in %zmm16..%zmm31 (no stack spills) */
+#define mem_ab_1 %zmm17
+#define mem_ab_2 %zmm31
+#define mem_ab_3 %zmm18
+#define mem_ab_4 %zmm19
+#define mem_ab_5 %zmm20
+#define mem_ab_6 %zmm21
+#define mem_ab_7 %zmm22
+#define mem_cd_0 %zmm23
+#define mem_cd_1 %zmm24
+#define mem_cd_2 %zmm30
+#define mem_cd_3 %zmm25
+#define mem_cd_4 %zmm26
+#define mem_cd_5 %zmm27
+#define mem_cd_6 %zmm28
+#define mem_cd_7 %zmm29
+
+#define clear_vec4(v0,v1,v2,v3) /* zero four vector registers */ \
+       vpxord v0, v0, v0; \
+       vpxord v1, v1, v1; \
+       vpxord v2, v2, v2; \
+       vpxord v3, v3, v3
+
+#define clear_zmm16_zmm31() /* ymm-width clear zero-extends to the full zmm (EVEX semantics) */ \
+       clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
+       clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
+       clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
+       clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31)
+
+#define clear_regs() /* wipe all vector state (key/plaintext material) before returning */ \
+       vzeroall; \
+       clear_zmm16_zmm31()
+
+/**********************************************************************
+  GFNI helper macros and constants
+ **********************************************************************/
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) /* pack 8 bits into a byte, a0 = LSB */ \
+       ( (((a0) & 1) << 0) | \
+         (((a1) & 1) << 1) | \
+         (((a2) & 1) << 2) | \
+         (((a3) & 1) << 3) | \
+         (((a4) & 1) << 4) | \
+         (((a5) & 1) << 5) | \
+         (((a6) & 1) << 6) | \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) /* pack 8 row-bytes into a 64-bit bit-matrix, row l0 in the MSB */ \
+       ( ((l7) << (0 * 8)) | \
+         ((l6) << (1 * 8)) | \
+         ((l5) << (2 * 8)) | \
+         ((l4) << (3 * 8)) | \
+         ((l3) << (4 * 8)) | \
+         ((l2) << (5 * 8)) | \
+         ((l1) << (6 * 8)) | \
+         ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0)
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14  BV8(0, 1, 1, 1, 0, 1, 1, 0)
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2   BV8(0, 0, 1, 1, 1, 0, 1, 1)
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3   BV8(1, 1, 1, 0, 1, 1, 0, 0)
+
+/**********************************************************************
+  64-way parallel camellia
+ **********************************************************************/
+
+/* roundsm64 (GFNI/AVX512 version): one Camellia F-function round on 64 blocks
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+                 t6, t7, mem_cd, key) \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \
+       vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+       \
+       /* prefilter sboxes */ \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+       \
+       /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+       \
+       /* sbox GF8 inverse + postfilter sbox 3 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+       vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+       \
+       /* sbox GF8 inverse + postfilter sbox 2 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+       vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+       \
+       vpbroadcastb 7+key, t7; /* broadcast one round-key byte per byte-slice */ \
+       vpbroadcastb 6+key, t6; \
+       \
+       /* P-function */ \
+       vpxorq x5, x0, x0; \
+       vpxorq x6, x1, x1; \
+       vpxorq x7, x2, x2; \
+       vpxorq x4, x3, x3; \
+       \
+       vpbroadcastb 5+key, t5; \
+       vpbroadcastb 4+key, t4; \
+       \
+       vpxorq x2, x4, x4; \
+       vpxorq x3, x5, x5; \
+       vpxorq x0, x6, x6; \
+       vpxorq x1, x7, x7; \
+       \
+       vpbroadcastb 3+key, t3; \
+       vpbroadcastb 2+key, t2; \
+       \
+       vpxorq x7, x0, x0; \
+       vpxorq x4, x1, x1; \
+       vpxorq x5, x2, x2; \
+       vpxorq x6, x3, x3; \
+       \
+       vpbroadcastb 1+key, t1; \
+       vpbroadcastb 0+key, t0; \
+       \
+       vpxorq x3, x4, x4; \
+       vpxorq x0, x5, x5; \
+       vpxorq x1, x6, x6; \
+       vpxorq x2, x7, x7; /* note: high and low parts swapped */ \
+       \
+       /* Add key material and result to CD (x becomes new CD) */ \
+       \
+       vpternlogq $0x96, mem_cd##_4, t7, x0; /* imm 0x96: x0 = x0 ^ t7 ^ mem_cd_4 (3-way XOR) */ \
+       vpternlogq $0x96, mem_cd##_5, t6, x1; \
+       vpternlogq $0x96, mem_cd##_6, t5, x2; \
+       vpternlogq $0x96, mem_cd##_7, t4, x3; \
+       vpternlogq $0x96, mem_cd##_0, t3, x4; \
+       vpternlogq $0x96, mem_cd##_1, t2, x5; \
+       vpternlogq $0x96, mem_cd##_2, t1, x6; \
+       vpternlogq $0x96, mem_cd##_3, t0, x7;
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+       roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+       \
+       vmovdqu64 x0, mem_cd##_4; \
+       vmovdqu64 x1, mem_cd##_5; \
+       vmovdqu64 x2, mem_cd##_6; \
+       vmovdqu64 x3, mem_cd##_7; \
+       vmovdqu64 x4, mem_cd##_0; \
+       vmovdqu64 x5, mem_cd##_1; \
+       vmovdqu64 x6, mem_cd##_2; \
+       vmovdqu64 x7, mem_cd##_3; \
+       \
+       roundsm64(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
+       \
+       store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+       /* Store new AB state */ \
+       vmovdqu64 x4, mem_ab##_4; \
+       vmovdqu64 x5, mem_ab##_5; \
+       vmovdqu64 x6, mem_ab##_6; \
+       vmovdqu64 x7, mem_ab##_7; \
+       vmovdqu64 x0, mem_ab##_0; \
+       vmovdqu64 x1, mem_ab##_1; \
+       vmovdqu64 x2, mem_ab##_2; \
+       vmovdqu64 x3, mem_ab##_3;
+
+#define enc_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i) /* six forward rounds: keys i+2 .. i+7 */ \
+       two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+       two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+       two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i) /* six reverse rounds: keys i+7 .. i+2 */ \
+       two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+       two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+       two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN << 1)
+ *  t0..t3: (IN >> 7)
+ */
+#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, t3, right_shift_by_7) \
+       vgf2p8affineqb $0, right_shift_by_7, v0, t0; /* per-byte v0 >> 7 via GFNI bit-matrix */ \
+       vpaddb v0, v0, v0; /* per-byte v0 << 1 */ \
+       \
+       vgf2p8affineqb $0, right_shift_by_7, v1, t1; \
+       vpaddb v1, v1, v1; \
+       \
+       vgf2p8affineqb $0, right_shift_by_7, v2, t2; \
+       vpaddb v2, v2, v2; \
+       \
+       vgf2p8affineqb $0, right_shift_by_7, v3, t3; \
+       vpaddb v3, v3, v3;
+
+/*
+ * IN:
+ *   r: byte-sliced AB state in memory
+ *   l: byte-sliced CD state in memory
+ * OUT:
+ *   l, r: FL/FL⁻¹ layer applied in place (reads kll/klr/krl/krr subkey bytes)
+ */
+#define fls64(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+             tt1, tt2, tt3, kll, klr, krl, krr, tmp) \
+       /* \
+        * t0 = kll; \
+        * t0 &= ll; \
+        * lr ^= rol32(t0, 1); \
+        */ \
+       vpbroadcastq .Lright_shift_by_7 rRIP, tmp; \
+       vpbroadcastb 0+kll, t3; \
+       vpbroadcastb 1+kll, t2; \
+       vpbroadcastb 2+kll, t1; \
+       vpbroadcastb 3+kll, t0; \
+       \
+       vpandq l0, t0, t0; \
+       vpandq l1, t1, t1; \
+       vpandq l2, t2, t2; \
+       vpandq l3, t3, t3; \
+       \
+       rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \
+       \
+       vpternlogq $0x96, tt2, t0, l4; /* imm 0x96: three-way XOR */ \
+       vmovdqu64 l4, l##_4; \
+       vpternlogq $0x96, tt1, t1, l5; \
+       vmovdqu64 l5, l##_5; \
+       vpternlogq $0x96, tt0, t2, l6; \
+       vmovdqu64 l6, l##_6; \
+       vpternlogq $0x96, tt3, t3, l7; \
+       vmovdqu64 l7, l##_7; \
+       \
+       /* \
+        * t2 = krr; \
+        * t2 |= rr; \
+        * rl ^= t2; \
+        */ \
+       \
+       vpbroadcastb 0+krr, t3; \
+       vpbroadcastb 1+krr, t2; \
+       vpbroadcastb 2+krr, t1; \
+       vpbroadcastb 3+krr, t0; \
+       \
+       vpternlogq $0x1e, r##_4, t0, r##_0; /* imm 0x1e: r0 ^= (t0 | r4) */ \
+       vpternlogq $0x1e, r##_5, t1, r##_1; \
+       vpternlogq $0x1e, r##_6, t2, r##_2; \
+       vpternlogq $0x1e, r##_7, t3, r##_3; \
+       \
+       /* \
+        * t2 = krl; \
+        * t2 &= rl; \
+        * rr ^= rol32(t2, 1); \
+        */ \
+       vpbroadcastb 0+krl, t3; \
+       vpbroadcastb 1+krl, t2; \
+       vpbroadcastb 2+krl, t1; \
+       vpbroadcastb 3+krl, t0; \
+       \
+       vpandq r##_0, t0, t0; \
+       vpandq r##_1, t1, t1; \
+       vpandq r##_2, t2, t2; \
+       vpandq r##_3, t3, t3; \
+       \
+       rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \
+       \
+       vpternlogq $0x96, tt2, t0, r##_4; \
+       vpternlogq $0x96, tt1, t1, r##_5; \
+       vpternlogq $0x96, tt0, t2, r##_6; \
+       vpternlogq $0x96, tt3, t3, r##_7; \
+       \
+       /* \
+        * t0 = klr; \
+        * t0 |= lr; \
+        * ll ^= t0; \
+        */ \
+       \
+       vpbroadcastb 0+klr, t3; \
+       vpbroadcastb 1+klr, t2; \
+       vpbroadcastb 2+klr, t1; \
+       vpbroadcastb 3+klr, t0; \
+       \
+       vpternlogq $0x1e, l4, t0, l0; \
+       vmovdqu64 l0, l##_0; \
+       vpternlogq $0x1e, l5, t1, l1; \
+       vmovdqu64 l1, l##_1; \
+       vpternlogq $0x1e, l6, t2, l2; \
+       vmovdqu64 l2, l##_2; \
+       vpternlogq $0x1e, l7, t3, l3; \
+       vmovdqu64 l3, l##_3;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) /* 4x4 transpose of 32-bit elements (per 128-bit lane) */ \
+       vpunpckhdq x1, x0, t2; \
+       vpunpckldq x1, x0, x0; \
+       \
+       vpunpckldq x3, x2, t1; \
+       vpunpckhdq x3, x2, x2; \
+       \
+       vpunpckhqdq t1, x0, x1; \
+       vpunpcklqdq t1, x0, x0; \
+       \
+       vpunpckhqdq x2, t2, x3; \
+       vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+                             a3, b3, c3, d3, st0, st1) /* byte-slice 16 vectors; st0/st1 are scratch */ \
+       transpose_4x4(a0, a1, a2, a3, st0, st1); \
+       transpose_4x4(b0, b1, b2, b3, st0, st1); \
+       \
+       transpose_4x4(c0, c1, c2, c3, st0, st1); \
+       transpose_4x4(d0, d1, d2, d3, st0, st1); \
+       \
+       vbroadcasti64x2 .Lshufb_16x16b rRIP, st0; \
+       vpshufb st0, a0, a0; \
+       vpshufb st0, a1, a1; \
+       vpshufb st0, a2, a2; \
+       vpshufb st0, a3, a3; \
+       vpshufb st0, b0, b0; \
+       vpshufb st0, b1, b1; \
+       vpshufb st0, b2, b2; \
+       vpshufb st0, b3, b3; \
+       vpshufb st0, c0, c0; \
+       vpshufb st0, c1, c1; \
+       vpshufb st0, c2, c2; \
+       vpshufb st0, c3, c3; \
+       vpshufb st0, d0, d0; \
+       vpshufb st0, d1, d1; \
+       vpshufb st0, d2, d2; \
+       vpshufb st0, d3, d3; \
+       \
+       transpose_4x4(a0, b0, c0, d0, st0, st1); \
+       transpose_4x4(a1, b1, c1, d1, st0, st1); \
+       \
+       transpose_4x4(a2, b2, c2, d2, st0, st1); \
+       transpose_4x4(a3, b3, c3, d3, st0, st1); \
+       /* does not adjust output bytes inside vectors */
+
+/* load 64 blocks (16 * 64 bytes) from rio and XOR in the pre-whitening key */
+#define inpack64_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                    y6, y7, rio, key) \
+       vpbroadcastq key, x0; /* broadcast 64-bit whitening key to all lanes */ \
+       vpshufb .Lpack_bswap rRIP, x0, x0; \
+       \
+       vpxorq 0 * 64(rio), x0, y7; \
+       vpxorq 1 * 64(rio), x0, y6; \
+       vpxorq 2 * 64(rio), x0, y5; \
+       vpxorq 3 * 64(rio), x0, y4; \
+       vpxorq 4 * 64(rio), x0, y3; \
+       vpxorq 5 * 64(rio), x0, y2; \
+       vpxorq 6 * 64(rio), x0, y1; \
+       vpxorq 7 * 64(rio), x0, y0; \
+       vpxorq 8 * 64(rio), x0, x7; \
+       vpxorq 9 * 64(rio), x0, x6; \
+       vpxorq 10 * 64(rio), x0, x5; \
+       vpxorq 11 * 64(rio), x0, x4; \
+       vpxorq 12 * 64(rio), x0, x3; \
+       vpxorq 13 * 64(rio), x0, x2; \
+       vpxorq 14 * 64(rio), x0, x1; \
+       vpxorq 15 * 64(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store AB/CD state to register-backed storage (%zmm16..%zmm31) */
+#define inpack64_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, tmp0, tmp1) \
+       byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+                             y4, y5, y6, y7, tmp0, tmp1); \
+       \
+       vmovdqu64 x0, mem_ab##_0; \
+       vmovdqu64 x1, mem_ab##_1; \
+       vmovdqu64 x2, mem_ab##_2; \
+       vmovdqu64 x3, mem_ab##_3; \
+       vmovdqu64 x4, mem_ab##_4; \
+       vmovdqu64 x5, mem_ab##_5; \
+       vmovdqu64 x6, mem_ab##_6; \
+       vmovdqu64 x7, mem_ab##_7; \
+       vmovdqu64 y0, mem_cd##_0; \
+       vmovdqu64 y1, mem_cd##_1; \
+       vmovdqu64 y2, mem_cd##_2; \
+       vmovdqu64 y3, mem_cd##_3; \
+       vmovdqu64 y4, mem_cd##_4; \
+       vmovdqu64 y5, mem_cd##_5; \
+       vmovdqu64 y6, mem_cd##_6; \
+       vmovdqu64 y7, mem_cd##_7;
+
+/* de-byteslice and apply post-whitening; results are left in registers (no stores) */
+#define outunpack64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+                   y5, y6, y7, key, tmp0, tmp1) \
+       byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+                             y3, y7, x3, x7, tmp0, tmp1); \
+       \
+       vpbroadcastq key, tmp0; /* broadcast 64-bit whitening key */ \
+       vpshufb .Lpack_bswap rRIP, tmp0, tmp0; \
+       \
+       vpxorq tmp0, y7, y7; \
+       vpxorq tmp0, y6, y6; \
+       vpxorq tmp0, y5, y5; \
+       vpxorq tmp0, y4, y4; \
+       vpxorq tmp0, y3, y3; \
+       vpxorq tmp0, y2, y2; \
+       vpxorq tmp0, y1, y1; \
+       vpxorq tmp0, y0, y0; \
+       vpxorq tmp0, x7, x7; \
+       vpxorq tmp0, x6, x6; \
+       vpxorq tmp0, x5, x5; \
+       vpxorq tmp0, x4, x4; \
+       vpxorq tmp0, x3, x3; \
+       vpxorq tmp0, x2, x2; \
+       vpxorq tmp0, x1, x1; \
+       vpxorq tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                    y6, y7, rio) /* store all 16 zmm result vectors to rio */ \
+       vmovdqu64 x0, 0 * 64(rio); \
+       vmovdqu64 x1, 1 * 64(rio); \
+       vmovdqu64 x2, 2 * 64(rio); \
+       vmovdqu64 x3, 3 * 64(rio); \
+       vmovdqu64 x4, 4 * 64(rio); \
+       vmovdqu64 x5, 5 * 64(rio); \
+       vmovdqu64 x6, 6 * 64(rio); \
+       vmovdqu64 x7, 7 * 64(rio); \
+       vmovdqu64 y0, 8 * 64(rio); \
+       vmovdqu64 y1, 9 * 64(rio); \
+       vmovdqu64 y2, 10 * 64(rio); \
+       vmovdqu64 y3, 11 * 64(rio); \
+       vmovdqu64 y4, 12 * 64(rio); \
+       vmovdqu64 y5, 13 * 64(rio); \
+       vmovdqu64 y6, 14 * 64(rio); \
+       vmovdqu64 y7, 15 * 64(rio); /* 64 blocks = 1024 bytes total */
+
+SECTION_RODATA
+
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+_gcry_camellia_gfni_avx512__constants:
+ELF(.type   _gcry_camellia_gfni_avx512__constants,@object;)
+
+.align 64
+.Lpack_bswap: /* vpshufb mask used on the broadcast whitening key */
+       .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+       .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+       .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+       .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+.Lcounter0123_lo: /* four 128-bit little-endian counters: +0,+1,+2,+3 */
+       .quad 0, 0
+       .quad 1, 0
+       .quad 2, 0
+       .quad 3, 0
+
+.align 16
+.Lcounter4444_lo:
+       .quad 4, 0
+.Lcounter8888_lo:
+       .quad 8, 0
+.Lcounter16161616_lo:
+       .quad 16, 0
+.Lcounter1111_hi: /* carry: +1 in the high 64-bit half */
+       .quad 0, 1
+
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+/* For CTR-mode IV byteswap (big-endian IV <-> little-endian counter) */
+.Lbswap128_mask:
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
+ * and s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+       .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(0, 0, 1, 1, 0, 0, 1, 0),
+                   BV8(1, 1, 0, 1, 0, 0, 0, 0),
+                   BV8(1, 0, 1, 1, 0, 0, 1, 1),
+                   BV8(0, 0, 0, 0, 1, 1, 0, 0),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 1, 0, 1, 1, 0, 0),
+                   BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+       .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 0, 0),
+                   BV8(1, 0, 1, 0, 0, 0, 0, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 1, 1),
+                   BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(0, 1, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+       .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 1, 0),
+                   BV8(1, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 0, 1, 1),
+                   BV8(1, 0, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 0, 1, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 1, 1),
+                   BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1),
+                   BV8(0, 1, 1, 0, 0, 1, 1, 0),
+                   BV8(1, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 0, 1, 1),
+                   BV8(1, 0, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 0, 1, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+       .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+                   BV8(1, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 0, 1, 1),
+                   BV8(1, 0, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 0, 1, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 1, 1),
+                   BV8(0, 0, 0, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* Bit-matrix for right shifting uint8_t values in vector by 7. */
+.Lright_shift_by_7:
+       .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 0))
+
+/* CTR byte addition constants (add 0..16 to the low big-endian IV byte) */
+.align 64
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
+ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;)
+
+.text
+
+.align 16
+ELF(.type   __camellia_gfni_avx512_enc_blk64,@function;)
+
+__camellia_gfni_avx512_enc_blk64:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %r8d: 24 for 16 byte key, 32 for larger
+        *      %zmm0..%zmm15: 64 plaintext blocks
+        * output:
+        *      %zmm0..%zmm15: 64 encrypted blocks, order swapped:
+        *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+        */
+       CFI_STARTPROC();
+
+       leaq (-8 * 8)(CTX, %r8, 8), %r8; /* loop-end sentinel: CTX advances 64 bytes per FL layer until it equals %r8 */
+
+       inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                     %zmm15, mem_ab, mem_cd, %zmm30, %zmm31);
+
+.align 8
+.Lenc_loop:
+       enc_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                    %zmm15, mem_ab, mem_cd, 0);
+
+       cmpq %r8, CTX;
+       je .Lenc_done;
+       leaq (8 * 8)(CTX), CTX; /* advance to next 8-round key block */
+
+       fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+             mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+             %zmm15,
+             ((key_table) + 0)(CTX),
+             ((key_table) + 4)(CTX),
+             ((key_table) + 8)(CTX),
+             ((key_table) + 12)(CTX),
+             %zmm31);
+       jmp .Lenc_loop;
+
+.align 8
+.Lenc_done:
+       /* load CD for output */
+       vmovdqu64 mem_cd_0, %zmm8;
+       vmovdqu64 mem_cd_1, %zmm9;
+       vmovdqu64 mem_cd_2, %zmm10;
+       vmovdqu64 mem_cd_3, %zmm11;
+       vmovdqu64 mem_cd_4, %zmm12;
+       vmovdqu64 mem_cd_5, %zmm13;
+       vmovdqu64 mem_cd_6, %zmm14;
+       vmovdqu64 mem_cd_7, %zmm15;
+
+       outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                   %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                   %zmm15, ((key_table) + 8 * 8)(%r8), %zmm30, %zmm31);
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __camellia_gfni_avx512_enc_blk64,.-__camellia_gfni_avx512_enc_blk64;)
+
+.align 16
+ELF(.type   __camellia_gfni_avx512_dec_blk64,@function;)
+
+__camellia_gfni_avx512_dec_blk64:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %r8d: 24 for 16 byte key, 32 for larger
+        *      %zmm0..%zmm15: 64 encrypted blocks
+        * output:
+        *      %zmm0..%zmm15: 64 plaintext blocks, order swapped:
+        *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+        */
+       CFI_STARTPROC();
+
+       movq %r8, %rcx; /* free %r8; %rcx scales the key-table offset below */
+       movq CTX, %r8;  /* loop-end sentinel: key_table start (keys walked in reverse) */
+       leaq (-8 * 8)(CTX, %rcx, 8), CTX; /* begin at the last 8-round key block */
+
+       inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                     %zmm15, mem_ab, mem_cd, %zmm30, %zmm31);
+
+.align 8
+.Ldec_loop:
+       dec_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                    %zmm15, mem_ab, mem_cd, 0);
+
+       cmpq %r8, CTX;
+       je .Ldec_done;
+
+       fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+             mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+             %zmm15,
+             ((key_table) + 8)(CTX),
+             ((key_table) + 12)(CTX),
+             ((key_table) + 0)(CTX),
+             ((key_table) + 4)(CTX),
+             %zmm31);
+
+       leaq (-8 * 8)(CTX), CTX; /* step back to previous 8-round key block */
+       jmp .Ldec_loop;
+
+.align 8
+.Ldec_done:
+       /* load CD for output */
+       vmovdqu64 mem_cd_0, %zmm8;
+       vmovdqu64 mem_cd_1, %zmm9;
+       vmovdqu64 mem_cd_2, %zmm10;
+       vmovdqu64 mem_cd_3, %zmm11;
+       vmovdqu64 mem_cd_4, %zmm12;
+       vmovdqu64 mem_cd_5, %zmm13;
+       vmovdqu64 mem_cd_6, %zmm14;
+       vmovdqu64 mem_cd_7, %zmm15;
+
+       outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                   %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                   %zmm15, (key_table)(CTX), %zmm30, %zmm31);
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __camellia_gfni_avx512_dec_blk64,.-__camellia_gfni_avx512_dec_blk64;)
+
+#define add_le128(out, in, lo_counter, hi_counter1) /* 128-bit little-endian add with carry, per lane */ \
+       vpaddq lo_counter, in, out; \
+       vpcmpuq $1, lo_counter, out, %k1; /* unsigned-less-than: lanes whose low qword wrapped */ \
+       kaddb %k1, %k1, %k1; /* shift mask left by one bit so carry targets the high qword */ \
+       vpaddq hi_counter1, out, out{%k1};
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_ctr_enc
+ELF(.type   _gcry_camellia_gfni_avx512_ctr_enc,@function;)
+
+_gcry_camellia_gfni_avx512_ctr_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        *      %rcx: iv (big endian, 128bit)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       cmpb $(0x100 - 64), 15(%rcx);
+       jbe .Lctr_byteadd;
+
+       vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19;
+       vmovdqa64 .Lcounter0123_lo rRIP, %zmm21;
+       vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22;
+       vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23;
+       vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24;
+       vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25;
+
+       /* load IV and byteswap */
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       vbroadcasti64x2 (%rcx), %zmm0;
+       vpshufb %zmm19, %zmm0, %zmm0;
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 64), %r11;
+       ja .Lload_ctr_carry;
+
+       /* construct IVs */
+       vpaddq %zmm21, %zmm0, %zmm15;  /* +0:+1:+2:+3 */
+       vpaddq %zmm22, %zmm15, %zmm14; /* +4:+5:+6:+7 */
+       vpaddq %zmm23, %zmm15, %zmm13; /* +8:+9:+10:+11 */
+       vpaddq %zmm23, %zmm14, %zmm12; /* +12:+13:+14:+15 */
+       vpaddq %zmm24, %zmm15, %zmm11; /* +16... */
+       vpaddq %zmm24, %zmm14, %zmm10; /* +20... */
+       vpaddq %zmm24, %zmm13, %zmm9; /* +24... */
+       vpaddq %zmm24, %zmm12, %zmm8; /* +28... */
+       vpaddq %zmm24, %zmm11, %zmm7; /* +32... */
+       vpaddq %zmm24, %zmm10, %zmm6; /* +36... */
+       vpaddq %zmm24, %zmm9, %zmm5; /* +40... */
+       vpaddq %zmm24, %zmm8, %zmm4; /* +44... */
+       vpaddq %zmm24, %zmm7, %zmm3; /* +48... */
+       vpaddq %zmm24, %zmm6, %zmm2; /* +52... */
+       vpaddq %zmm24, %zmm5, %zmm1; /* +56... */
+       vpaddq %zmm24, %zmm4, %zmm0; /* +60... */
+       jmp .Lload_ctr_done;
+
+.align 4
+.Lload_ctr_carry:
+       /* construct IVs */
+       add_le128(%zmm15, %zmm0, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
+       add_le128(%zmm14, %zmm15, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+       add_le128(%zmm13, %zmm15, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+       add_le128(%zmm12, %zmm14, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+       add_le128(%zmm11, %zmm15, %zmm24, %zmm25); /* +16... */
+       add_le128(%zmm10, %zmm14, %zmm24, %zmm25); /* +20... */
+       add_le128(%zmm9, %zmm13, %zmm24, %zmm25); /* +24... */
+       add_le128(%zmm8, %zmm12, %zmm24, %zmm25); /* +28... */
+       add_le128(%zmm7, %zmm11, %zmm24, %zmm25); /* +32... */
+       add_le128(%zmm6, %zmm10, %zmm24, %zmm25); /* +36... */
+       add_le128(%zmm5, %zmm9, %zmm24, %zmm25); /* +40... */
+       add_le128(%zmm4, %zmm8, %zmm24, %zmm25); /* +44... */
+       add_le128(%zmm3, %zmm7, %zmm24, %zmm25); /* +48... */
+       add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */
+       add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */
+       add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */
+       kxorq %k1, %k1, %k1;
+
+.align 4
+.Lload_ctr_done:
+       vbroadcasti64x2 .Lpack_bswap rRIP, %zmm17;
+       vpbroadcastq (key_table)(CTX), %zmm16;
+       vpshufb %zmm17, %zmm16, %zmm16;
+
+       /* Byte-swap IVs and update counter. */
+       addq $64, %r11;
+       adcq $0, %r10;
+       vpshufb %zmm19, %zmm15, %zmm15;
+       vpshufb %zmm19, %zmm14, %zmm14;
+       vpshufb %zmm19, %zmm13, %zmm13;
+       vpshufb %zmm19, %zmm12, %zmm12;
+       vpshufb %zmm19, %zmm11, %zmm11;
+       vpshufb %zmm19, %zmm10, %zmm10;
+       vpshufb %zmm19, %zmm9, %zmm9;
+       vpshufb %zmm19, %zmm8, %zmm8;
+       bswapq %r11;
+       bswapq %r10;
+       vpshufb %zmm19, %zmm7, %zmm7;
+       vpshufb %zmm19, %zmm6, %zmm6;
+       vpshufb %zmm19, %zmm5, %zmm5;
+       vpshufb %zmm19, %zmm4, %zmm4;
+       vpshufb %zmm19, %zmm3, %zmm3;
+       vpshufb %zmm19, %zmm2, %zmm2;
+       vpshufb %zmm19, %zmm1, %zmm1;
+       vpshufb %zmm19, %zmm0, %zmm0;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+
+.align 16
+.Lctr_inpack64_pre:
+       /* inpack64_pre: */
+       vpxorq %zmm0, %zmm16, %zmm0;
+       vpxorq %zmm1, %zmm16, %zmm1;
+       vpxorq %zmm2, %zmm16, %zmm2;
+       vpxorq %zmm3, %zmm16, %zmm3;
+       vpxorq %zmm4, %zmm16, %zmm4;
+       vpxorq %zmm5, %zmm16, %zmm5;
+       vpxorq %zmm6, %zmm16, %zmm6;
+       vpxorq %zmm7, %zmm16, %zmm7;
+       vpxorq %zmm8, %zmm16, %zmm8;
+       vpxorq %zmm9, %zmm16, %zmm9;
+       vpxorq %zmm10, %zmm16, %zmm10;
+       vpxorq %zmm11, %zmm16, %zmm11;
+       vpxorq %zmm12, %zmm16, %zmm12;
+       vpxorq %zmm13, %zmm16, %zmm13;
+       vpxorq %zmm14, %zmm16, %zmm14;
+       vpxorq %zmm15, %zmm16, %zmm15;
+
+       call __camellia_gfni_avx512_enc_blk64;
+
+       vpxorq 0 * 64(%rdx), %zmm7, %zmm7;
+       vpxorq 1 * 64(%rdx), %zmm6, %zmm6;
+       vpxorq 2 * 64(%rdx), %zmm5, %zmm5;
+       vpxorq 3 * 64(%rdx), %zmm4, %zmm4;
+       vpxorq 4 * 64(%rdx), %zmm3, %zmm3;
+       vpxorq 5 * 64(%rdx), %zmm2, %zmm2;
+       vpxorq 6 * 64(%rdx), %zmm1, %zmm1;
+       vpxorq 7 * 64(%rdx), %zmm0, %zmm0;
+       vpxorq 8 * 64(%rdx), %zmm15, %zmm15;
+       vpxorq 9 * 64(%rdx), %zmm14, %zmm14;
+       vpxorq 10 * 64(%rdx), %zmm13, %zmm13;
+       vpxorq 11 * 64(%rdx), %zmm12, %zmm12;
+       vpxorq 12 * 64(%rdx), %zmm11, %zmm11;
+       vpxorq 13 * 64(%rdx), %zmm10, %zmm10;
+       vpxorq 14 * 64(%rdx), %zmm9, %zmm9;
+       vpxorq 15 * 64(%rdx), %zmm8, %zmm8;
+
+       write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+                    %zmm8, %rsi);
+
+       clear_regs();
+
+       ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry:
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $64, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_zmm;
+.align 16
+.Lctr_byteadd:
+       vbroadcasti64x2 (%rcx), %zmm12;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $64, 15(%rcx);
+.Lctr_byteadd_zmm:
+       vbroadcasti64x2 .Lbige_addb_16 rRIP, %zmm16;
+       vmovdqa64 .Lbige_addb_0_1 rRIP, %zmm17;
+       vmovdqa64 .Lbige_addb_4_5 rRIP, %zmm18;
+       vmovdqa64 .Lbige_addb_8_9 rRIP, %zmm19;
+       vmovdqa64 .Lbige_addb_12_13 rRIP, %zmm20;
+       vpaddb %zmm16, %zmm12, %zmm8;
+       vpaddb %zmm17, %zmm12, %zmm15;
+       vpaddb %zmm18, %zmm12, %zmm14;
+       vpaddb %zmm19, %zmm12, %zmm13;
+       vpaddb %zmm20, %zmm12, %zmm12;
+       vpaddb %zmm16, %zmm8, %zmm4;
+       vpaddb %zmm17, %zmm8, %zmm11;
+       vpaddb %zmm18, %zmm8, %zmm10;
+       vpaddb %zmm19, %zmm8, %zmm9;
+       vpaddb %zmm20, %zmm8, %zmm8;
+       vpaddb %zmm16, %zmm4, %zmm0;
+       vpaddb %zmm17, %zmm4, %zmm7;
+       vpaddb %zmm18, %zmm4, %zmm6;
+       vpaddb %zmm19, %zmm4, %zmm5;
+       vpaddb %zmm20, %zmm4, %zmm4;
+       vpaddb %zmm17, %zmm0, %zmm3;
+       vpaddb %zmm18, %zmm0, %zmm2;
+       vpaddb %zmm19, %zmm0, %zmm1;
+       vpaddb %zmm20, %zmm0, %zmm0;
+
+       vbroadcasti64x2 .Lpack_bswap rRIP, %zmm17
+       vpbroadcastq (key_table)(CTX), %zmm16;
+       vpshufb %zmm17, %zmm16, %zmm16;
+
+       jmp .Lctr_inpack64_pre;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_cbc_dec
+ELF(.type   _gcry_camellia_gfni_avx512_cbc_dec,@function;)
+
+_gcry_camellia_gfni_avx512_cbc_dec:
+       /* CBC decryption of 64 blocks: P_i = D_K(C_i) ^ C_{i-1} with
+        * C_{-1} = IV; the last ciphertext block becomes the new IV.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Keep the IV pointer in %r9 across the bulk decryption. */
+       movq %rcx, %r9;
+
+       /* Select end-of-key-table index: 24 for 128-bit keys, 32 for
+        * 192/256-bit keys (decryption starts from the last subkey). */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       inpack64_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+       call __camellia_gfni_avx512_dec_blk64;
+
+       /* XOR output with IV */
+       /* Build IV||C0||C1||C2 in %zmm16 for the first four blocks; all
+        * later blocks XOR with the preceding ciphertext block, i.e. the
+        * source shifted back one block (offsets N * 64 + 48). */
+       vmovdqu64 (%r9), %xmm16;
+       vinserti64x2 $1, (0 * 16)(%rdx), %ymm16, %ymm16;
+       vinserti64x4 $1, (1 * 16)(%rdx), %zmm16, %zmm16;
+       vpxorq %zmm16, %zmm7, %zmm7;
+       vpxorq (0 * 64 + 48)(%rdx), %zmm6, %zmm6;
+       vpxorq (1 * 64 + 48)(%rdx), %zmm5, %zmm5;
+       vpxorq (2 * 64 + 48)(%rdx), %zmm4, %zmm4;
+       vpxorq (3 * 64 + 48)(%rdx), %zmm3, %zmm3;
+       vpxorq (4 * 64 + 48)(%rdx), %zmm2, %zmm2;
+       vpxorq (5 * 64 + 48)(%rdx), %zmm1, %zmm1;
+       vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm0;
+       vpxorq (7 * 64 + 48)(%rdx), %zmm15, %zmm15;
+       vpxorq (8 * 64 + 48)(%rdx), %zmm14, %zmm14;
+       vpxorq (9 * 64 + 48)(%rdx), %zmm13, %zmm13;
+       vpxorq (10 * 64 + 48)(%rdx), %zmm12, %zmm12;
+       vpxorq (11 * 64 + 48)(%rdx), %zmm11, %zmm11;
+       vpxorq (12 * 64 + 48)(%rdx), %zmm10, %zmm10;
+       vpxorq (13 * 64 + 48)(%rdx), %zmm9, %zmm9;
+       vpxorq (14 * 64 + 48)(%rdx), %zmm8, %zmm8;
+       /* Last ciphertext block, loaded before dst may overwrite src. */
+       vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16;
+
+       write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+                    %zmm8, %rsi);
+
+       /* store new IV */
+       vmovdqu64 %xmm16, (0)(%r9);
+
+       clear_regs();
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_cbc_dec,.-_gcry_camellia_gfni_avx512_cbc_dec;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_cfb_dec
+ELF(.type   _gcry_camellia_gfni_avx512_cfb_dec,@function;)
+
+_gcry_camellia_gfni_avx512_cfb_dec:
+       /* CFB decryption of 64 blocks: P_i = E_K(C_{i-1}) ^ C_i with
+        * C_{-1} = IV; uses the block-encrypt path only.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Round-count selector (24 for 128-bit keys, else 32); presumably
+        * consumed by __camellia_gfni_avx512_enc_blk64 — confirm there. */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       /* inpack64_pre: */
+       /* Pre-whitening key, broadcast and byte-packed. */
+       vpbroadcastq (key_table)(CTX), %zmm0;
+       vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0;
+
+       /* First cipher input group is IV||C0||C1||C2; all later groups
+        * take the previous ciphertext block (offsets N * 64 + 48). */
+       vmovdqu64 (%rcx), %xmm15;
+       vinserti64x2 $1, (%rdx), %ymm15, %ymm15;
+       vinserti64x4 $1, 16(%rdx), %zmm15, %zmm15;
+       vpxorq %zmm15, %zmm0, %zmm15;
+       vpxorq (0 * 64 + 48)(%rdx), %zmm0, %zmm14;
+       vpxorq (1 * 64 + 48)(%rdx), %zmm0, %zmm13;
+       vpxorq (2 * 64 + 48)(%rdx), %zmm0, %zmm12;
+       vpxorq (3 * 64 + 48)(%rdx), %zmm0, %zmm11;
+       vpxorq (4 * 64 + 48)(%rdx), %zmm0, %zmm10;
+       vpxorq (5 * 64 + 48)(%rdx), %zmm0, %zmm9;
+       vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm8;
+       vpxorq (7 * 64 + 48)(%rdx), %zmm0, %zmm7;
+       vpxorq (8 * 64 + 48)(%rdx), %zmm0, %zmm6;
+       vpxorq (9 * 64 + 48)(%rdx), %zmm0, %zmm5;
+       vpxorq (10 * 64 + 48)(%rdx), %zmm0, %zmm4;
+       vpxorq (11 * 64 + 48)(%rdx), %zmm0, %zmm3;
+       vpxorq (12 * 64 + 48)(%rdx), %zmm0, %zmm2;
+       vpxorq (13 * 64 + 48)(%rdx), %zmm0, %zmm1;
+       vpxorq (14 * 64 + 48)(%rdx), %zmm0, %zmm0;
+       /* Save last ciphertext block as new IV before dst may clobber src. */
+       vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16;
+       vmovdqu64 %xmm16, (%rcx); /* store new IV */
+
+       call __camellia_gfni_avx512_enc_blk64;
+
+       vpxorq 0 * 64(%rdx), %zmm7, %zmm7;
+       vpxorq 1 * 64(%rdx), %zmm6, %zmm6;
+       vpxorq 2 * 64(%rdx), %zmm5, %zmm5;
+       vpxorq 3 * 64(%rdx), %zmm4, %zmm4;
+       vpxorq 4 * 64(%rdx), %zmm3, %zmm3;
+       vpxorq 5 * 64(%rdx), %zmm2, %zmm2;
+       vpxorq 6 * 64(%rdx), %zmm1, %zmm1;
+       vpxorq 7 * 64(%rdx), %zmm0, %zmm0;
+       vpxorq 8 * 64(%rdx), %zmm15, %zmm15;
+       vpxorq 9 * 64(%rdx), %zmm14, %zmm14;
+       vpxorq 10 * 64(%rdx), %zmm13, %zmm13;
+       vpxorq 11 * 64(%rdx), %zmm12, %zmm12;
+       vpxorq 12 * 64(%rdx), %zmm11, %zmm11;
+       vpxorq 13 * 64(%rdx), %zmm10, %zmm10;
+       vpxorq 14 * 64(%rdx), %zmm9, %zmm9;
+       vpxorq 15 * 64(%rdx), %zmm8, %zmm8;
+
+       write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+                    %zmm8, %rsi);
+
+       clear_regs();
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_cfb_dec,.-_gcry_camellia_gfni_avx512_cfb_dec;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_ocb_enc
+ELF(.type   _gcry_camellia_gfni_avx512_ocb_enc,@function;)
+
+_gcry_camellia_gfni_avx512_ocb_enc:
+       /* OCB encryption of 64 blocks.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[64])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       pushq %r12;
+       CFI_PUSH(%r12);
+       pushq %r13;
+       CFI_PUSH(%r13);
+       pushq %r14;
+       CFI_PUSH(%r14);
+       pushq %r15;
+       CFI_PUSH(%r15);
+       pushq %rbx;
+       CFI_PUSH(%rbx);
+
+       /* %xmm30 carries the running OCB offset across the macros below. */
+       vmovdqu64 (%rcx), %xmm30;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+       /* Chain four consecutive offsets into one zmm lane group, XOR the
+        * plaintext in, and stash the offsets in dst for the final XOR. */
+#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg, zplain) \
+         vmovdqu64 (n * 64)(%rdx), zplain; \
+         vpxorq (l0reg), %xmm30, %xmm16; \
+         vpxorq (l1reg), %xmm16, %xmm30; \
+         vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \
+         vpxorq (l2reg), %xmm30, %xmm30; \
+         vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \
+         vpxorq (l3reg), %xmm30, %xmm30; \
+         vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \
+         vpxorq zplain, %zmm16, zreg; \
+         vmovdqu64 %zmm16, (n * 64)(%rsi);
+
+#define OCB_LOAD_PTRS(n) \
+         movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \
+         movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \
+         movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \
+         movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \
+         movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \
+         movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \
+         movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \
+         movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx;
+
+       OCB_LOAD_PTRS(0);
+       OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15, %zmm20);
+       OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14, %zmm21);
+       OCB_LOAD_PTRS(2);
+       OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13, %zmm22);
+       vpternlogq $0x96, %zmm20, %zmm21, %zmm22;
+       OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12, %zmm23);
+       OCB_LOAD_PTRS(4);
+       OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11, %zmm24);
+       OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10, %zmm25);
+       vpternlogq $0x96, %zmm23, %zmm24, %zmm25;
+       OCB_LOAD_PTRS(6);
+       OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9, %zmm20);
+       OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8, %zmm21);
+       OCB_LOAD_PTRS(8);
+       OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7, %zmm26);
+       vpternlogq $0x96, %zmm20, %zmm21, %zmm26;
+       OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6, %zmm23);
+       OCB_LOAD_PTRS(10);
+       OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5, %zmm24);
+       OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4, %zmm27);
+       vpternlogq $0x96, %zmm23, %zmm24, %zmm27;
+       OCB_LOAD_PTRS(12);
+       OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3, %zmm20);
+       OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2, %zmm21);
+       OCB_LOAD_PTRS(14);
+       OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1, %zmm23);
+       vpternlogq $0x96, %zmm20, %zmm21, %zmm23;
+       OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0, %zmm24);
+#undef OCB_LOAD_PTRS
+#undef OCB_INPUT
+
+       vpbroadcastq (key_table)(CTX), %zmm16;
+       vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16;
+
+       /* Fold the plaintext XOR-tree down to one 128-bit checksum lane,
+        * mix in the old checksum, and store new offset/checksum. */
+       vpternlogq $0x96, %zmm24, %zmm22, %zmm25;
+       vpternlogq $0x96, %zmm26, %zmm27, %zmm23;
+       vpxorq %zmm25, %zmm23, %zmm20;
+       vextracti64x4 $1, %zmm20, %ymm21;
+       vpxorq %ymm21, %ymm20, %ymm20;
+       vextracti64x2 $1, %ymm20, %xmm21;
+       vpternlogq $0x96, (%r8), %xmm21, %xmm20;
+       vmovdqu64 %xmm30, (%rcx);
+       vmovdqu64 %xmm20, (%r8);
+
+       /* Round-count selector: 24 for 128-bit keys, else 32 ("max"). */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       /* inpack64_pre: */
+       vpxorq %zmm0, %zmm16, %zmm0;
+       vpxorq %zmm1, %zmm16, %zmm1;
+       vpxorq %zmm2, %zmm16, %zmm2;
+       vpxorq %zmm3, %zmm16, %zmm3;
+       vpxorq %zmm4, %zmm16, %zmm4;
+       vpxorq %zmm5, %zmm16, %zmm5;
+       vpxorq %zmm6, %zmm16, %zmm6;
+       vpxorq %zmm7, %zmm16, %zmm7;
+       vpxorq %zmm8, %zmm16, %zmm8;
+       vpxorq %zmm9, %zmm16, %zmm9;
+       vpxorq %zmm10, %zmm16, %zmm10;
+       vpxorq %zmm11, %zmm16, %zmm11;
+       vpxorq %zmm12, %zmm16, %zmm12;
+       vpxorq %zmm13, %zmm16, %zmm13;
+       vpxorq %zmm14, %zmm16, %zmm14;
+       vpxorq %zmm15, %zmm16, %zmm15;
+
+       call __camellia_gfni_avx512_enc_blk64;
+
+       /* XOR cipher output with the offsets stashed in dst earlier. */
+       vpxorq 0 * 64(%rsi), %zmm7, %zmm7;
+       vpxorq 1 * 64(%rsi), %zmm6, %zmm6;
+       vpxorq 2 * 64(%rsi), %zmm5, %zmm5;
+       vpxorq 3 * 64(%rsi), %zmm4, %zmm4;
+       vpxorq 4 * 64(%rsi), %zmm3, %zmm3;
+       vpxorq 5 * 64(%rsi), %zmm2, %zmm2;
+       vpxorq 6 * 64(%rsi), %zmm1, %zmm1;
+       vpxorq 7 * 64(%rsi), %zmm0, %zmm0;
+       vpxorq 8 * 64(%rsi), %zmm15, %zmm15;
+       vpxorq 9 * 64(%rsi), %zmm14, %zmm14;
+       vpxorq 10 * 64(%rsi), %zmm13, %zmm13;
+       vpxorq 11 * 64(%rsi), %zmm12, %zmm12;
+       vpxorq 12 * 64(%rsi), %zmm11, %zmm11;
+       vpxorq 13 * 64(%rsi), %zmm10, %zmm10;
+       vpxorq 14 * 64(%rsi), %zmm9, %zmm9;
+       vpxorq 15 * 64(%rsi), %zmm8, %zmm8;
+
+       write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+                    %zmm8, %rsi);
+
+       clear_regs();
+
+       /* Restore callee-saved registers; CFI_RESTORE must name the same
+        * register as the matching pop (was swapped for %r12/%r13). */
+       popq %rbx;
+       CFI_RESTORE(%rbx);
+       popq %r15;
+       CFI_RESTORE(%r15);
+       popq %r14;
+       CFI_RESTORE(%r14);
+       popq %r13;
+       CFI_RESTORE(%r13);
+       popq %r12;
+       CFI_RESTORE(%r12);
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_ocb_enc,.-_gcry_camellia_gfni_avx512_ocb_enc;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_ocb_dec
+ELF(.type   _gcry_camellia_gfni_avx512_ocb_dec,@function;)
+
+_gcry_camellia_gfni_avx512_ocb_dec:
+       /* OCB decryption of 64 blocks.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[64])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       pushq %r12;
+       CFI_PUSH(%r12);
+       pushq %r13;
+       CFI_PUSH(%r13);
+       pushq %r14;
+       CFI_PUSH(%r14);
+       pushq %r15;
+       CFI_PUSH(%r15);
+       pushq %rbx;
+       CFI_PUSH(%rbx);
+       /* Checksum pointer is saved on stack; %r8 is reused below. */
+       pushq %r8;
+       CFI_PUSH(%r8);
+
+       /* %xmm30 carries the running OCB offset across the macros below. */
+       vmovdqu64 (%rcx), %xmm30;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i)  */
+
+       /* Chain four consecutive offsets into one zmm lane group, XOR the
+        * ciphertext in, and stash the offsets in dst for the final XOR. */
+#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg) \
+         vpxorq (l0reg), %xmm30, %xmm16; \
+         vpxorq (l1reg), %xmm16, %xmm30; \
+         vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \
+         vpxorq (l2reg), %xmm30, %xmm30; \
+         vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \
+         vpxorq (l3reg), %xmm30, %xmm30; \
+         vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \
+         vpxorq (n * 64)(%rdx), %zmm16, zreg; \
+         vmovdqu64 %zmm16, (n * 64)(%rsi);
+
+#define OCB_LOAD_PTRS(n) \
+         movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \
+         movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \
+         movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \
+         movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \
+         movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \
+         movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \
+         movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \
+         movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx;
+
+       OCB_LOAD_PTRS(0);
+       OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15);
+       OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14);
+       OCB_LOAD_PTRS(2);
+       OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13);
+       OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12);
+       OCB_LOAD_PTRS(4);
+       OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11);
+       OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10);
+       OCB_LOAD_PTRS(6);
+       OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9);
+       OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8);
+       OCB_LOAD_PTRS(8);
+       OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7);
+       OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6);
+       OCB_LOAD_PTRS(10);
+       OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5);
+       OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4);
+       OCB_LOAD_PTRS(12);
+       OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3);
+       OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2);
+       OCB_LOAD_PTRS(14);
+       OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1);
+       OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0);
+#undef OCB_LOAD_PTRS
+#undef OCB_INPUT
+
+       /* Store the final offset back for the caller. */
+       vmovdqu64 %xmm30, (%rcx);
+
+       /* Round-count selector: 24 for 128-bit keys, else 32 ("max");
+        * decryption starts from the last subkey. */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       vpbroadcastq (key_table)(CTX, %r8, 8), %zmm16;
+       vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16;
+
+       /* inpack64_pre: */
+       vpxorq %zmm0, %zmm16, %zmm0;
+       vpxorq %zmm1, %zmm16, %zmm1;
+       vpxorq %zmm2, %zmm16, %zmm2;
+       vpxorq %zmm3, %zmm16, %zmm3;
+       vpxorq %zmm4, %zmm16, %zmm4;
+       vpxorq %zmm5, %zmm16, %zmm5;
+       vpxorq %zmm6, %zmm16, %zmm6;
+       vpxorq %zmm7, %zmm16, %zmm7;
+       vpxorq %zmm8, %zmm16, %zmm8;
+       vpxorq %zmm9, %zmm16, %zmm9;
+       vpxorq %zmm10, %zmm16, %zmm10;
+       vpxorq %zmm11, %zmm16, %zmm11;
+       vpxorq %zmm12, %zmm16, %zmm12;
+       vpxorq %zmm13, %zmm16, %zmm13;
+       vpxorq %zmm14, %zmm16, %zmm14;
+       vpxorq %zmm15, %zmm16, %zmm15;
+
+       call __camellia_gfni_avx512_dec_blk64;
+
+       /* XOR cipher output with the offsets stashed in dst earlier;
+        * this yields the plaintext blocks. */
+       vpxorq 0 * 64(%rsi), %zmm7, %zmm7;
+       vpxorq 1 * 64(%rsi), %zmm6, %zmm6;
+       vpxorq 2 * 64(%rsi), %zmm5, %zmm5;
+       vpxorq 3 * 64(%rsi), %zmm4, %zmm4;
+       vpxorq 4 * 64(%rsi), %zmm3, %zmm3;
+       vpxorq 5 * 64(%rsi), %zmm2, %zmm2;
+       vpxorq 6 * 64(%rsi), %zmm1, %zmm1;
+       vpxorq 7 * 64(%rsi), %zmm0, %zmm0;
+       vpxorq 8 * 64(%rsi), %zmm15, %zmm15;
+       vpxorq 9 * 64(%rsi), %zmm14, %zmm14;
+       vpxorq 10 * 64(%rsi), %zmm13, %zmm13;
+       vpxorq 11 * 64(%rsi), %zmm12, %zmm12;
+       vpxorq 12 * 64(%rsi), %zmm11, %zmm11;
+       vpxorq 13 * 64(%rsi), %zmm10, %zmm10;
+       vpxorq 14 * 64(%rsi), %zmm9, %zmm9;
+       vpxorq 15 * 64(%rsi), %zmm8, %zmm8;
+
+       write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+                    %zmm8, %rsi);
+
+       popq %r8;
+       CFI_RESTORE(%r8);
+
+       /* Checksum_i = Checksum_{i-1} xor C_i  */
+       vpternlogq $0x96, %zmm7, %zmm6, %zmm5;
+       vpternlogq $0x96, %zmm4, %zmm3, %zmm2;
+       vpternlogq $0x96, %zmm1, %zmm0, %zmm15;
+       vpternlogq $0x96, %zmm14, %zmm13, %zmm12;
+       vpternlogq $0x96, %zmm11, %zmm10, %zmm9;
+       vpternlogq $0x96, %zmm5, %zmm2, %zmm15;
+       vpternlogq $0x96, %zmm12, %zmm9, %zmm8;
+       vpxorq %zmm15, %zmm8, %zmm8;
+
+       vextracti64x4 $1, %zmm8, %ymm0;
+       vpxor %ymm0, %ymm8, %ymm8;
+       vextracti128 $1, %ymm8, %xmm0;
+       vpternlogq $0x96, (%r8), %xmm0, %xmm8;
+       vmovdqu64 %xmm8, (%r8);
+
+       clear_regs();
+
+       /* Restore callee-saved registers; CFI_RESTORE must name the same
+        * register as the matching pop (was swapped for %r12/%r13). */
+       popq %rbx;
+       CFI_RESTORE(%rbx);
+       popq %r15;
+       CFI_RESTORE(%r15);
+       popq %r14;
+       CFI_RESTORE(%r14);
+       popq %r13;
+       CFI_RESTORE(%r13);
+       popq %r12;
+       CFI_RESTORE(%r12);
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_ocb_dec,.-_gcry_camellia_gfni_avx512_ocb_dec;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_enc_blk64
+ELF(.type   _gcry_camellia_gfni_avx512_enc_blk64,@function;)
+
+_gcry_camellia_gfni_avx512_enc_blk64:
+       /* ECB-style encryption of 64 independent blocks.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Round-count selector: 24 for 128-bit keys, else 32 ("max"). */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+       /* Zero %eax as the function's return value (presumably a
+        * stack-burn depth of 0 — confirm against the C caller). */
+       xorl %eax, %eax;
+
+       /* Pre-whitening: XOR every block with the broadcast first
+        * key-table qword, byte-packed via .Lpack_bswap. */
+       vpbroadcastq (key_table)(CTX), %zmm0;
+       vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0;
+
+       vpxorq (0) * 64(%rdx), %zmm0, %zmm15;
+       vpxorq (1) * 64(%rdx), %zmm0, %zmm14;
+       vpxorq (2) * 64(%rdx), %zmm0, %zmm13;
+       vpxorq (3) * 64(%rdx), %zmm0, %zmm12;
+       vpxorq (4) * 64(%rdx), %zmm0, %zmm11;
+       vpxorq (5) * 64(%rdx), %zmm0, %zmm10;
+       vpxorq (6) * 64(%rdx), %zmm0, %zmm9;
+       vpxorq (7) * 64(%rdx), %zmm0, %zmm8;
+       vpxorq (8) * 64(%rdx), %zmm0, %zmm7;
+       vpxorq (9) * 64(%rdx), %zmm0, %zmm6;
+       vpxorq (10) * 64(%rdx), %zmm0, %zmm5;
+       vpxorq (11) * 64(%rdx), %zmm0, %zmm4;
+       vpxorq (12) * 64(%rdx), %zmm0, %zmm3;
+       vpxorq (13) * 64(%rdx), %zmm0, %zmm2;
+       vpxorq (14) * 64(%rdx), %zmm0, %zmm1;
+       vpxorq (15) * 64(%rdx), %zmm0, %zmm0;
+
+       call __camellia_gfni_avx512_enc_blk64;
+
+       /* Core routine leaves results rotated: zmm7..zmm0,zmm15..zmm8
+        * map to output blocks 0..15 (in 4-block groups). */
+       vmovdqu64 %zmm7, (0) * 64(%rsi);
+       vmovdqu64 %zmm6, (1) * 64(%rsi);
+       vmovdqu64 %zmm5, (2) * 64(%rsi);
+       vmovdqu64 %zmm4, (3) * 64(%rsi);
+       vmovdqu64 %zmm3, (4) * 64(%rsi);
+       vmovdqu64 %zmm2, (5) * 64(%rsi);
+       vmovdqu64 %zmm1, (6) * 64(%rsi);
+       vmovdqu64 %zmm0, (7) * 64(%rsi);
+       vmovdqu64 %zmm15, (8) * 64(%rsi);
+       vmovdqu64 %zmm14, (9) * 64(%rsi);
+       vmovdqu64 %zmm13, (10) * 64(%rsi);
+       vmovdqu64 %zmm12, (11) * 64(%rsi);
+       vmovdqu64 %zmm11, (12) * 64(%rsi);
+       vmovdqu64 %zmm10, (13) * 64(%rsi);
+       vmovdqu64 %zmm9, (14) * 64(%rsi);
+       vmovdqu64 %zmm8, (15) * 64(%rsi);
+
+       clear_regs();
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_enc_blk64,.-_gcry_camellia_gfni_avx512_enc_blk64;)
+
+.align 16
+.globl _gcry_camellia_gfni_avx512_dec_blk64
+ELF(.type   _gcry_camellia_gfni_avx512_dec_blk64,@function;)
+
+_gcry_camellia_gfni_avx512_dec_blk64:
+       /* ECB-style decryption of 64 independent blocks.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (64 blocks)
+        *      %rdx: src (64 blocks)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Round-count selector: 24 for 128-bit keys, else 32 ("max"). */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+       /* Zero %eax as the function's return value (presumably a
+        * stack-burn depth of 0 — confirm against the C caller). */
+       xorl %eax, %eax;
+
+       /* Pre-whitening for decryption: XOR every block with the
+        * broadcast LAST key-table qword (index %r8), byte-packed. */
+       vpbroadcastq (key_table)(CTX, %r8, 8), %zmm0;
+       vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0;
+
+       vpxorq (0) * 64(%rdx), %zmm0, %zmm15;
+       vpxorq (1) * 64(%rdx), %zmm0, %zmm14;
+       vpxorq (2) * 64(%rdx), %zmm0, %zmm13;
+       vpxorq (3) * 64(%rdx), %zmm0, %zmm12;
+       vpxorq (4) * 64(%rdx), %zmm0, %zmm11;
+       vpxorq (5) * 64(%rdx), %zmm0, %zmm10;
+       vpxorq (6) * 64(%rdx), %zmm0, %zmm9;
+       vpxorq (7) * 64(%rdx), %zmm0, %zmm8;
+       vpxorq (8) * 64(%rdx), %zmm0, %zmm7;
+       vpxorq (9) * 64(%rdx), %zmm0, %zmm6;
+       vpxorq (10) * 64(%rdx), %zmm0, %zmm5;
+       vpxorq (11) * 64(%rdx), %zmm0, %zmm4;
+       vpxorq (12) * 64(%rdx), %zmm0, %zmm3;
+       vpxorq (13) * 64(%rdx), %zmm0, %zmm2;
+       vpxorq (14) * 64(%rdx), %zmm0, %zmm1;
+       vpxorq (15) * 64(%rdx), %zmm0, %zmm0;
+
+       call __camellia_gfni_avx512_dec_blk64;
+
+       /* Core routine leaves results rotated: zmm7..zmm0,zmm15..zmm8
+        * map to output blocks 0..15 (in 4-block groups). */
+       vmovdqu64 %zmm7, (0) * 64(%rsi);
+       vmovdqu64 %zmm6, (1) * 64(%rsi);
+       vmovdqu64 %zmm5, (2) * 64(%rsi);
+       vmovdqu64 %zmm4, (3) * 64(%rsi);
+       vmovdqu64 %zmm3, (4) * 64(%rsi);
+       vmovdqu64 %zmm2, (5) * 64(%rsi);
+       vmovdqu64 %zmm1, (6) * 64(%rsi);
+       vmovdqu64 %zmm0, (7) * 64(%rsi);
+       vmovdqu64 %zmm15, (8) * 64(%rsi);
+       vmovdqu64 %zmm14, (9) * 64(%rsi);
+       vmovdqu64 %zmm13, (10) * 64(%rsi);
+       vmovdqu64 %zmm12, (11) * 64(%rsi);
+       vmovdqu64 %zmm11, (12) * 64(%rsi);
+       vmovdqu64 %zmm10, (13) * 64(%rsi);
+       vmovdqu64 %zmm9, (14) * 64(%rsi);
+       vmovdqu64 %zmm8, (15) * 64(%rsi);
+
+       clear_regs();
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_dec_blk64,.-_gcry_camellia_gfni_avx512_dec_blk64;)
+
+#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) */
+#endif /* __x86_64 */
index 72c02d7740896a46bc79da52aa089d88a2d00343..5051a305feddc620360d0b801bd8d884ef138b37 100644 (file)
  * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 /* I put all the libgcrypt-specific stuff in this file to keep the
@@ -64,7 +63,7 @@
 #include "camellia.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
 
 /* Helper macro to force alignment to 16 bytes.  */
 #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
 # define USE_VAES_AVX2 1
 #endif
 
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
+/* USE_GFNI_AVX512 indicates whether to compile with Intel GFNI/AVX512 code. */
+#undef USE_GFNI_AVX512
+#if defined(USE_GFNI_AVX2) && defined(ENABLE_AVX512_SUPPORT)
+# define USE_GFNI_AVX512 1
+#endif
+
+/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
+ * accelerated code. */
+#undef USE_PPC_CRYPTO
+#if !defined(WORDS_BIGENDIAN) && defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    (SIZEOF_UNSIGNED_LONG == 8) && (__GNUC__ >= 4)
+# define USE_PPC_CRYPTO 1
+#endif
+
+/* USE_AARCH64_CE indicates whether to enable ARMv8/CE accelerated code. */
+#undef USE_AARCH64_CE
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+    (__GNUC__ >= 4)
+# define USE_AARCH64_CE 1
+#endif
+
 typedef struct
 {
   KEY_TABLE_TYPE keytable;
@@ -105,9 +136,20 @@ typedef struct
   unsigned int use_aesni_avx:1;        /* AES-NI/AVX implementation shall be used.  */
 #endif /*USE_AESNI_AVX*/
 #ifdef USE_AESNI_AVX2
+  unsigned int use_avx2:1; /* If any of AVX2 implementation is enabled.  */
   unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used.  */
   unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used.  */
+  unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used.  */
+  unsigned int use_gfni_avx512:1; /* GFNI/AVX512 implementation shall be used.  */
 #endif /*USE_AESNI_AVX2*/
+#ifdef USE_PPC_CRYPTO
+  unsigned int use_ppc:1;
+  unsigned int use_ppc8:1;
+  unsigned int use_ppc9:1;
+#endif /*USE_PPC_CRYPTO*/
+#ifdef USE_AARCH64_CE
+  unsigned int use_aarch64ce:1;
+#endif /*USE_AARCH64_CE*/
 } CAMELLIA_context;
 
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
@@ -126,7 +168,7 @@ typedef struct
 
 #ifdef USE_AESNI_AVX
 /* Assembler implementations of Camellia using AES-NI and AVX.  Process data
-   in 16 block same time.
+   in 16 blocks same time.
  */
 extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
                                             unsigned char *out,
@@ -158,19 +200,33 @@ extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx,
                                             const u64 Ls[16]) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
-                                            const unsigned char *abuf,
-                                            unsigned char *offset,
-                                            unsigned char *checksum,
-                                            const u64 Ls[16]) ASM_FUNC_ABI;
+                                             const unsigned char *abuf,
+                                             unsigned char *offset,
+                                             unsigned char *checksum,
+                                             const u64 Ls[16]) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
                                            const unsigned char *key,
                                            unsigned int keylen) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ecb_enc(const CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in)
+                                            ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ecb_dec(const CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in)
+                                            ASM_FUNC_ABI;
+
+static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
+
 #endif
 
 #ifdef USE_AESNI_AVX2
 /* Assembler implementations of Camellia using AES-NI and AVX2.  Process data
-   in 32 block same time.
+   in 32 blocks same time.
  */
 extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
                                              unsigned char *out,
@@ -206,11 +262,27 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
                                               unsigned char *offset,
                                               unsigned char *checksum,
                                               const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+                                                  unsigned char *out,
+                                                  const unsigned char *in,
+                                                  unsigned int nblocks)
+                                                  ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+                                                  unsigned char *out,
+                                                  const unsigned char *in,
+                                                  unsigned int nblocks)
+                                                  ASM_FUNC_ABI;
+
+static const int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+                                         2 * sizeof(void *) + ASM_EXTRA_STACK;
+
 #endif
 
 #ifdef USE_VAES_AVX2
 /* Assembler implementations of Camellia using VAES and AVX2.  Process data
-   in 32 block same time.
+   in 32 blocks at a time.
  */
 extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx,
                                             unsigned char *out,
@@ -246,8 +318,202 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
                                              unsigned char *offset,
                                              unsigned char *checksum,
                                              const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_GFNI_AVX2
+/* Assembler implementations of Camellia using GFNI and AVX2.  Process data
+   in 32 blocks at a time.
+ */
+extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in,
+                                            unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in,
+                                            unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in,
+                                            unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in,
+                                            unsigned char *offset,
+                                            unsigned char *checksum,
+                                            const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx,
+                                            unsigned char *out,
+                                            const unsigned char *in,
+                                            unsigned char *offset,
+                                            unsigned char *checksum,
+                                            const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx,
+                                             const unsigned char *abuf,
+                                             unsigned char *offset,
+                                             unsigned char *checksum,
+                                             const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_GFNI_AVX512
+/* Assembler implementations of Camellia using GFNI and AVX512.  Process data
+   in 64 blocks at a time.
+ */
+extern void _gcry_camellia_gfni_avx512_ctr_enc(CAMELLIA_context *ctx,
+                                               unsigned char *out,
+                                               const unsigned char *in,
+                                               unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_cbc_dec(CAMELLIA_context *ctx,
+                                               unsigned char *out,
+                                               const unsigned char *in,
+                                               unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_cfb_dec(CAMELLIA_context *ctx,
+                                               unsigned char *out,
+                                               const unsigned char *in,
+                                               unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_ocb_enc(CAMELLIA_context *ctx,
+                                               unsigned char *out,
+                                               const unsigned char *in,
+                                               unsigned char *offset,
+                                               unsigned char *checksum,
+                                               const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_ocb_dec(CAMELLIA_context *ctx,
+                                               unsigned char *out,
+                                               const unsigned char *in,
+                                               unsigned char *offset,
+                                               unsigned char *checksum,
+                                               const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_enc_blk64(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in)
+                                                 ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_dec_blk64(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in)
+                                                 ASM_FUNC_ABI;
+
+/* Stack not used by AVX512 implementation. */
+static const int avx512_burn_stack_depth = 0;
 #endif
 
+#ifdef USE_PPC_CRYPTO
+extern void _gcry_camellia_ppc8_encrypt_blk16(const void *key_table,
+                                             void *out,
+                                             const void *in,
+                                             int key_length);
+
+extern void _gcry_camellia_ppc8_decrypt_blk16(const void *key_table,
+                                             void *out,
+                                             const void *in,
+                                             int key_length);
+
+extern void _gcry_camellia_ppc9_encrypt_blk16(const void *key_table,
+                                             void *out,
+                                             const void *in,
+                                             int key_length);
+
+extern void _gcry_camellia_ppc9_decrypt_blk16(const void *key_table,
+                                             void *out,
+                                             const void *in,
+                                             int key_length);
+
+extern void _gcry_camellia_ppc8_keygen(void *key_table, const void *vkey,
+                                      unsigned int keylen);
+
+extern void _gcry_camellia_ppc9_keygen(void *key_table, const void *vkey,
+                                      unsigned int keylen);
+
+void camellia_ppc_enc_blk16(const CAMELLIA_context *ctx, unsigned char *out,
+                            const unsigned char *in)
+{
+  if (ctx->use_ppc9)
+    _gcry_camellia_ppc9_encrypt_blk16 (ctx->keytable, out, in,
+                                      ctx->keybitlength / 8);
+  else
+    _gcry_camellia_ppc8_encrypt_blk16 (ctx->keytable, out, in,
+                                      ctx->keybitlength / 8);
+}
+
+void camellia_ppc_dec_blk16(const CAMELLIA_context *ctx, unsigned char *out,
+                            const unsigned char *in)
+{
+  if (ctx->use_ppc9)
+    _gcry_camellia_ppc9_decrypt_blk16 (ctx->keytable, out, in,
+                                      ctx->keybitlength / 8);
+  else
+    _gcry_camellia_ppc8_decrypt_blk16 (ctx->keytable, out, in,
+                                      ctx->keybitlength / 8);
+}
+
+static const int ppc_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+                                        2 * sizeof(void *);
+#endif /*USE_PPC_CRYPTO*/
+
+#ifdef USE_AARCH64_CE
+extern void _gcry_camellia_aarch64ce_encrypt_blk16(const void *key_table,
+                                                  void *out, const void *in,
+                                                  int key_length);
+
+extern void _gcry_camellia_aarch64ce_decrypt_blk16(const void *key_table,
+                                                  void *out, const void *in,
+                                                  int key_length);
+
+extern void _gcry_camellia_aarch64ce_keygen(void *key_table, const void *vkey,
+                                           unsigned int keylen);
+
+void camellia_aarch64ce_enc_blk16(const CAMELLIA_context *ctx,
+                                 unsigned char *out, const unsigned char *in)
+{
+  _gcry_camellia_aarch64ce_encrypt_blk16 (ctx->keytable, out, in,
+                                         ctx->keybitlength / 8);
+}
+
+void camellia_aarch64ce_dec_blk16(const CAMELLIA_context *ctx,
+                                 unsigned char *out, const unsigned char *in)
+{
+  _gcry_camellia_aarch64ce_decrypt_blk16 (ctx->keytable, out, in,
+                                         ctx->keybitlength / 8);
+}
+
+static const int aarch64ce_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+                                             2 * sizeof(void *);
+#endif /*USE_AARCH64_CE*/
+
 static const char *selftest(void);
 
 static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
@@ -259,6 +525,15 @@ static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
 static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
                                    void *outbuf_arg, const void *inbuf_arg,
                                    size_t nblocks);
+static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+                                     void *outbuf_arg, const void *inbuf_arg,
+                                     size_t nblocks, int encrypt);
+static void _gcry_camellia_ecb_crypt (void *context, void *outbuf_arg,
+                                     const void *inbuf_arg, size_t nblocks,
+                                     int encrypt);
+static void _gcry_camellia_ctr32le_enc (void *context, unsigned char *ctr,
+                                       void *outbuf_arg, const void *inbuf_arg,
+                                       size_t nblocks);
 static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                        const void *inbuf_arg, size_t nblocks,
                                        int encrypt);
@@ -272,9 +547,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
   CAMELLIA_context *ctx=c;
   static int initialized=0;
   static const char *selftest_failed=NULL;
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2)
   unsigned int hwf = _gcry_get_hw_features ();
-#endif
+
+  (void)hwf;
 
   if(keylen!=16 && keylen!=24 && keylen!=32)
     return GPG_ERR_INV_KEYLEN;
@@ -296,9 +571,28 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
 #ifdef USE_AESNI_AVX2
   ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
   ctx->use_vaes_avx2 = 0;
+  ctx->use_gfni_avx2 = 0;
+  ctx->use_gfni_avx512 = 0;
+  ctx->use_avx2 = ctx->use_aesni_avx2;
 #endif
 #ifdef USE_VAES_AVX2
   ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
+  ctx->use_avx2 |= ctx->use_vaes_avx2;
+#endif
+#ifdef USE_GFNI_AVX2
+  ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+  ctx->use_avx2 |= ctx->use_gfni_avx2;
+#endif
+#ifdef USE_GFNI_AVX512
+  ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
+#endif
+#ifdef USE_PPC_CRYPTO
+  ctx->use_ppc8 = (hwf & HWF_PPC_VCRYPTO) != 0;
+  ctx->use_ppc9 = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00);
+  ctx->use_ppc = ctx->use_ppc8 || ctx->use_ppc9;
+#endif
+#ifdef USE_AARCH64_CE
+  ctx->use_aarch64ce = (hwf & HWF_ARM_AES) != 0;
 #endif
 
   ctx->keybitlength=keylen*8;
@@ -310,14 +604,27 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
   bulk_ops->ctr_enc = _gcry_camellia_ctr_enc;
   bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt;
   bulk_ops->ocb_auth  = _gcry_camellia_ocb_auth;
+  bulk_ops->xts_crypt = _gcry_camellia_xts_crypt;
+  bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt;
+  bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc;
 
   if (0)
     { }
 #ifdef USE_AESNI_AVX
   else if (ctx->use_aesni_avx)
     _gcry_camellia_aesni_avx_keygen(ctx, key, keylen);
-  else
 #endif
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc9)
+    _gcry_camellia_ppc9_keygen(ctx->keytable, key, keylen);
+  else if (ctx->use_ppc8)
+    _gcry_camellia_ppc8_keygen(ctx->keytable, key, keylen);
+#endif
+#ifdef USE_AARCH64_CE
+  else if (ctx->use_aarch64ce)
+    _gcry_camellia_aarch64ce_keygen(ctx->keytable, key, keylen);
+#endif
+  else
     {
       Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable);
       _gcry_burn_stack
@@ -328,6 +635,23 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
          );
     }
 
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      /* Disable AESNI & VAES implementations when GFNI implementation is
+       * enabled. */
+#ifdef USE_AESNI_AVX
+      ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+      ctx->use_aesni_avx2 = 0;
+#endif
+#ifdef USE_VAES_AVX2
+      ctx->use_vaes_avx2 = 0;
+#endif
+    }
+#endif
+
   return 0;
 }
 
@@ -422,6 +746,230 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
 
 #endif /*!USE_ARM_ASM*/
 
+
+static unsigned int
+camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
+                         size_t num_blks)
+{
+  const CAMELLIA_context *ctx = priv;
+  unsigned int stack_burn_size = 0;
+
+  gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2 && num_blks >= 2)
+    {
+      /* GFNI processing of 2 or more blocks in parallel is faster
+       * than the generic C implementation.  */
+      _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_VAES_AVX2
+  if (ctx->use_vaes_avx2 && num_blks >= 4)
+    {
+      /* VAES processing of 4 or more blocks in parallel is faster
+       * than the generic C implementation.  */
+      _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 && num_blks >= 5)
+    {
+      /* AES-NI processing of 5 or more blocks in parallel is faster
+       * than the generic C implementation.  */
+      _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_AESNI_AVX
+  while (ctx->use_aesni_avx && num_blks >= 16)
+    {
+      _gcry_camellia_aesni_avx_ecb_enc (ctx, outbuf, inbuf);
+      stack_burn_size = avx_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
+#ifdef USE_PPC_CRYPTO
+  while (ctx->use_ppc && num_blks >= 16)
+    {
+      camellia_ppc_enc_blk16 (ctx, outbuf, inbuf);
+      stack_burn_size = ppc_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
+#ifdef USE_AARCH64_CE
+  while (ctx->use_aarch64ce && num_blks >= 16)
+    {
+      camellia_aarch64ce_enc_blk16 (ctx, outbuf, inbuf);
+      stack_burn_size = aarch64ce_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
+
+  while (num_blks)
+    {
+      unsigned int nburn = camellia_encrypt((void *)ctx, outbuf, inbuf);
+      stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+      outbuf += CAMELLIA_BLOCK_SIZE;
+      inbuf += CAMELLIA_BLOCK_SIZE;
+      num_blks--;
+    }
+
+  return stack_burn_size;
+}
+
+static unsigned int
+camellia_encrypt_blk1_64 (void *priv, byte *outbuf, const byte *inbuf,
+                         size_t num_blks)
+{
+  CAMELLIA_context *ctx = priv;
+  unsigned int stack_burn_size = 0;
+  unsigned int nburn;
+
+  gcry_assert (num_blks <= 64);
+
+#ifdef USE_GFNI_AVX512
+  if (num_blks == 64 && ctx->use_gfni_avx512)
+    {
+      _gcry_camellia_gfni_avx512_enc_blk64 (ctx, outbuf, inbuf);
+      return avx512_burn_stack_depth;
+    }
+#endif
+
+  do
+    {
+      unsigned int curr_blks = num_blks > 32 ? 32 : num_blks;
+      nburn = camellia_encrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks);
+      stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+      outbuf += curr_blks * 16;
+      inbuf += curr_blks * 16;
+      num_blks -= curr_blks;
+    }
+  while (num_blks > 0);
+
+  return stack_burn_size;
+}
+
+static unsigned int
+camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
+                         size_t num_blks)
+{
+  const CAMELLIA_context *ctx = priv;
+  unsigned int stack_burn_size = 0;
+
+  gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2 && num_blks >= 2)
+    {
+      /* GFNI processing of 2 or more blocks in parallel is faster
+       * than the generic C implementation.  */
+      _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_VAES_AVX2
+  if (ctx->use_vaes_avx2 && num_blks >= 4)
+    {
+      /* VAES processing of 4 or more blocks in parallel is faster
+       * than the generic C implementation.  */
+      _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 && num_blks >= 5)
+    {
+      /* AES-NI processing of 5 or more blocks in parallel is faster
+       * than the generic C implementation.  */
+      _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_AESNI_AVX
+  while (ctx->use_aesni_avx && num_blks >= 16)
+    {
+      _gcry_camellia_aesni_avx_ecb_dec (ctx, outbuf, inbuf);
+      stack_burn_size = avx_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
+#ifdef USE_PPC_CRYPTO
+  while (ctx->use_ppc && num_blks >= 16)
+    {
+      camellia_ppc_dec_blk16 (ctx, outbuf, inbuf);
+      stack_burn_size = ppc_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
+#ifdef USE_AARCH64_CE
+  while (ctx->use_aarch64ce && num_blks >= 16)
+    {
+      camellia_aarch64ce_dec_blk16 (ctx, outbuf, inbuf);
+      stack_burn_size = aarch64ce_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
+
+  while (num_blks)
+    {
+      unsigned int nburn = camellia_decrypt((void *)ctx, outbuf, inbuf);
+      stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+      outbuf += CAMELLIA_BLOCK_SIZE;
+      inbuf += CAMELLIA_BLOCK_SIZE;
+      num_blks--;
+    }
+
+  return stack_burn_size;
+}
+
+static unsigned int
+camellia_decrypt_blk1_64 (void *priv, byte *outbuf, const byte *inbuf,
+                         size_t num_blks)
+{
+  CAMELLIA_context *ctx = priv;
+  unsigned int stack_burn_size = 0;
+  unsigned int nburn;
+
+  gcry_assert (num_blks <= 64);
+
+#ifdef USE_GFNI_AVX512
+  if (num_blks == 64 && ctx->use_gfni_avx512)
+    {
+      _gcry_camellia_gfni_avx512_dec_blk64 (ctx, outbuf, inbuf);
+      return avx512_burn_stack_depth;
+    }
+#endif
+
+  do
+    {
+      unsigned int curr_blks = num_blks > 32 ? 32 : num_blks;
+      nburn = camellia_decrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks);
+      stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+      outbuf += curr_blks * 16;
+      inbuf += curr_blks * 16;
+      num_blks -= curr_blks;
+    }
+  while (num_blks > 0);
+
+  return stack_burn_size;
+}
+
+
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
    of size CAMELLIA_BLOCK_SIZE. */
@@ -433,27 +981,53 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
-  int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+  int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      int did_use_gfni_avx512 = 0;
+
+      /* Process data in 64 block chunks. */
+      while (nblocks >= 64)
+        {
+          _gcry_camellia_gfni_avx512_ctr_enc (ctx, outbuf, inbuf, ctr);
+          nblocks -= 64;
+          outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 64 * CAMELLIA_BLOCK_SIZE;
+          did_use_gfni_avx512 = 1;
+        }
+
+      if (did_use_gfni_avx512)
+        {
+          if (burn_stack_depth < avx512_burn_stack_depth)
+            burn_stack_depth = avx512_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
 
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+  if (ctx->use_avx2)
     {
       int did_use_aesni_avx2 = 0;
+      typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn =
+         _gcry_camellia_aesni_avx2_ctr_enc;
+
 #ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
+      if (ctx->use_vaes_avx2)
+       bulk_ctr_fn =_gcry_camellia_vaes_avx2_ctr_enc;
+#endif
+#ifdef USE_GFNI_AVX2
+      if (ctx->use_gfni_avx2)
+       bulk_ctr_fn =_gcry_camellia_gfni_avx2_ctr_enc;
 #endif
 
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
         {
-#ifdef USE_VAES_AVX2
-          if (use_vaes)
-            _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
-          else
-#endif
-            _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
-
+         bulk_ctr_fn (ctx, outbuf, inbuf, ctr);
           nblocks -= 32;
           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
@@ -462,15 +1036,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx2)
         {
-          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
-      /* TODO: use caching instead? */
     }
 #endif
 
@@ -492,32 +1062,31 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx)
         {
-          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx_burn_stack_depth)
             burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
-      /* TODO: use caching instead? */
     }
 #endif
 
-  for ( ;nblocks; nblocks-- )
+  /* Process remaining blocks. */
+  if (nblocks)
     {
-      /* Encrypt the counter. */
-      Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
-      /* XOR the input with the encrypted counter and store in output.  */
-      cipher_block_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
-      outbuf += CAMELLIA_BLOCK_SIZE;
-      inbuf  += CAMELLIA_BLOCK_SIZE;
-      /* Increment the counter.  */
-      cipher_block_add(ctr, 1, CAMELLIA_BLOCK_SIZE);
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ctr_enc_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+                               nblocks, ctr, tmpbuf,
+                               sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
     }
 
-  wipememory(tmpbuf, sizeof(tmpbuf));
-  _gcry_burn_stack(burn_stack_depth);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Bulk decryption of complete blocks in CBC mode.  This function is only
@@ -530,27 +1099,53 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
-  int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+  int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      int did_use_gfni_avx512 = 0;
+
+      /* Process data in 64 block chunks. */
+      while (nblocks >= 64)
+        {
+          _gcry_camellia_gfni_avx512_cbc_dec (ctx, outbuf, inbuf, iv);
+          nblocks -= 64;
+          outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 64 * CAMELLIA_BLOCK_SIZE;
+          did_use_gfni_avx512 = 1;
+        }
+
+      if (did_use_gfni_avx512)
+        {
+          if (burn_stack_depth < avx512_burn_stack_depth)
+            burn_stack_depth = avx512_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
 
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+  if (ctx->use_avx2)
     {
       int did_use_aesni_avx2 = 0;
+      typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn =
+         _gcry_camellia_aesni_avx2_cbc_dec;
+
 #ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
+      if (ctx->use_vaes_avx2)
+       bulk_cbc_fn =_gcry_camellia_vaes_avx2_cbc_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+      if (ctx->use_gfni_avx2)
+       bulk_cbc_fn =_gcry_camellia_gfni_avx2_cbc_dec;
 #endif
 
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
         {
-#ifdef USE_VAES_AVX2
-          if (use_vaes)
-            _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
-          else
-#endif
-            _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
-
+         bulk_cbc_fn (ctx, outbuf, inbuf, iv);
           nblocks -= 32;
           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
@@ -559,9 +1154,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx2)
         {
-          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *) + ASM_EXTRA_STACK;;
-
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
         }
@@ -588,9 +1180,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx_burn_stack_depth)
             burn_stack_depth = avx_burn_stack_depth;
         }
@@ -599,20 +1188,23 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
     }
 #endif
 
-  for ( ;nblocks; nblocks-- )
+  /* Process remaining blocks. */
+  if (nblocks)
     {
-      /* INBUF is needed later and it may be identical to OUTBUF, so store
-         the intermediate result to SAVEBUF.  */
-      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
 
-      cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf,
-                                CAMELLIA_BLOCK_SIZE);
-      inbuf += CAMELLIA_BLOCK_SIZE;
-      outbuf += CAMELLIA_BLOCK_SIZE;
+      nburn = bulk_cbc_dec_128(ctx, camellia_decrypt_blk1_32, outbuf, inbuf,
+                               nblocks, iv, tmpbuf,
+                               sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
     }
 
-  wipememory(savebuf, sizeof(savebuf));
-  _gcry_burn_stack(burn_stack_depth);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Bulk decryption of complete blocks in CFB mode.  This function is only
@@ -625,26 +1217,53 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+  int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      int did_use_gfni_avx512 = 0;
+
+      /* Process data in 64 block chunks. */
+      while (nblocks >= 64)
+        {
+          _gcry_camellia_gfni_avx512_cfb_dec (ctx, outbuf, inbuf, iv);
+          nblocks -= 64;
+          outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 64 * CAMELLIA_BLOCK_SIZE;
+          did_use_gfni_avx512 = 1;
+        }
+
+      if (did_use_gfni_avx512)
+        {
+          if (burn_stack_depth < avx512_burn_stack_depth)
+            burn_stack_depth = avx512_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
 
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+  if (ctx->use_avx2)
     {
       int did_use_aesni_avx2 = 0;
+      typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn =
+         _gcry_camellia_aesni_avx2_cfb_dec;
+
 #ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
+      if (ctx->use_vaes_avx2)
+       bulk_cfb_fn =_gcry_camellia_vaes_avx2_cfb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+      if (ctx->use_gfni_avx2)
+       bulk_cfb_fn =_gcry_camellia_gfni_avx2_cfb_dec;
 #endif
 
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
         {
-#ifdef USE_VAES_AVX2
-          if (use_vaes)
-            _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
-          else
-#endif
-            _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
-
+         bulk_cfb_fn (ctx, outbuf, inbuf, iv);
           nblocks -= 32;
           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
@@ -653,9 +1272,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx2)
         {
-          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
         }
@@ -682,9 +1298,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx_burn_stack_depth)
             burn_stack_depth = avx_burn_stack_depth;
         }
@@ -693,15 +1306,111 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
     }
 #endif
 
-  for ( ;nblocks; nblocks-- )
+  /* Process remaining blocks. */
+  if (nblocks)
     {
-      Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
-      cipher_block_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
-      outbuf += CAMELLIA_BLOCK_SIZE;
-      inbuf  += CAMELLIA_BLOCK_SIZE;
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_cfb_dec_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+                               nblocks, iv, tmpbuf,
+                               sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_camellia_ecb_crypt (void *context, void *outbuf_arg,
+                         const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  CAMELLIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      size_t nburn;
+
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64
+                                              : camellia_decrypt_blk1_64,
+                                 outbuf, inbuf, nblocks, 64);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
     }
 
-  _gcry_burn_stack(burn_stack_depth);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+                          void *outbuf_arg, const void *inbuf_arg,
+                          size_t nblocks, int encrypt)
+{
+  CAMELLIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 64];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64
+                                              : camellia_decrypt_blk1_64,
+                                 outbuf, inbuf, nblocks, tweak, tmpbuf,
+                                 sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV). */
+static void
+_gcry_camellia_ctr32le_enc(void *context, unsigned char *ctr,
+                           void *outbuf_arg, const void *inbuf_arg,
+                           size_t nblocks)
+{
+  CAMELLIA_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[64 * CAMELLIA_BLOCK_SIZE];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ctr32le_enc_128 (ctx, camellia_encrypt_blk1_64, outbuf,
+                                    inbuf, nblocks, ctr, tmpbuf,
+                                    sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                    &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
 }
 
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
@@ -709,15 +1418,13 @@ static size_t
 _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                          const void *inbuf_arg, size_t nblocks, int encrypt)
 {
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   CAMELLIA_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  int burn_stack_depth;
+  int burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.data_nblocks;
 
-  burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
-                             CAMELLIA_decrypt_stack_burn_size;
 #else
   (void)c;
   (void)outbuf_arg;
@@ -725,38 +1432,69 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   (void)encrypt;
 #endif
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      int did_use_gfni_avx512 = 0;
+      u64 Ls[64];
+      u64 *l;
+
+      if (nblocks >= 64)
+       {
+         typeof (&_gcry_camellia_gfni_avx512_ocb_dec) bulk_ocb_fn =
+             encrypt ? _gcry_camellia_gfni_avx512_ocb_enc
+                     : _gcry_camellia_gfni_avx512_ocb_dec;
+          l = bulk_ocb_prepare_L_pointers_array_blk64 (c, Ls, blkn);
+
+         /* Process data in 64 block chunks. */
+         while (nblocks >= 64)
+           {
+             blkn += 64;
+             *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 64);
+
+             bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+             nblocks -= 64;
+             outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+             inbuf  += 64 * CAMELLIA_BLOCK_SIZE;
+             did_use_gfni_avx512 = 1;
+           }
+       }
+
+      if (did_use_gfni_avx512)
+       {
+         if (burn_stack_depth < avx2_burn_stack_depth)
+           burn_stack_depth = avx2_burn_stack_depth;
+       }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+  if (ctx->use_avx2)
     {
       int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
-      int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2;
-      int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2;
-#endif
       u64 Ls[32];
-      unsigned int n = 32 - (blkn % 32);
       u64 *l;
-      int i;
 
       if (nblocks >= 32)
        {
-         for (i = 0; i < 32; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
+         typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn =
+             encrypt ? _gcry_camellia_aesni_avx2_ocb_enc
+                     : _gcry_camellia_aesni_avx2_ocb_dec;
 
-         Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
-         Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(31 + n) % 32];
+#ifdef USE_VAES_AVX2
+         if (ctx->use_vaes_avx2)
+           bulk_ocb_fn = encrypt ? _gcry_camellia_vaes_avx2_ocb_enc
+                                 : _gcry_camellia_vaes_avx2_ocb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+         if (ctx->use_gfni_avx2)
+           bulk_ocb_fn = encrypt ? _gcry_camellia_gfni_avx2_ocb_enc
+                                 : _gcry_camellia_gfni_avx2_ocb_dec;
+#endif
+          l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
 
          /* Process data in 32 block chunks. */
          while (nblocks >= 32)
@@ -764,21 +1502,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
              blkn += 32;
              *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
 
-             if (0) {}
-#ifdef USE_VAES_AVX2
-             else if (encrypt_use_vaes)
-               _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-                                                 c->u_ctr.ctr, Ls);
-             else if (decrypt_use_vaes)
-               _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-                                                 c->u_ctr.ctr, Ls);
-#endif
-             else if (encrypt)
-               _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-                                                 c->u_ctr.ctr, Ls);
-             else
-               _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-                                                 c->u_ctr.ctr, Ls);
+             bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls);
 
              nblocks -= 32;
              outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -789,9 +1513,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
       if (did_use_aesni_avx2)
        {
-         int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
-                                     2 * sizeof(void *) + ASM_EXTRA_STACK;
-
          if (burn_stack_depth < avx2_burn_stack_depth)
            burn_stack_depth = avx2_burn_stack_depth;
        }
@@ -805,27 +1526,11 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_aesni_avx = 0;
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
-
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -849,9 +1554,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
       if (did_use_aesni_avx)
        {
-         int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                     2 * sizeof(void *) + ASM_EXTRA_STACK;
-
          if (burn_stack_depth < avx_burn_stack_depth)
            burn_stack_depth = avx_burn_stack_depth;
        }
@@ -860,7 +1562,25 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     }
 #endif
 
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ocb_crypt_128 (c, ctx, encrypt ? camellia_encrypt_blk1_32
+                                                  : camellia_decrypt_blk1_32,
+                                  outbuf, inbuf, nblocks, &blkn, encrypt,
+                                  tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                  &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+      nblocks = 0;
+    }
+
   c->u_mode.ocb.data_nblocks = blkn;
 
   if (burn_stack_depth)
@@ -875,49 +1595,38 @@ static size_t
 _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                         size_t nblocks)
 {
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   CAMELLIA_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
-  int burn_stack_depth;
+  int burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
-
-  burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
 #else
   (void)c;
   (void)abuf_arg;
 #endif
 
 #ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+  if (ctx->use_avx2)
     {
       int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
-#endif
       u64 Ls[32];
-      unsigned int n = 32 - (blkn % 32);
       u64 *l;
-      int i;
 
       if (nblocks >= 32)
        {
-         for (i = 0; i < 32; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
+         typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn =
+             _gcry_camellia_aesni_avx2_ocb_auth;
 
-         Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
-         Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(31 + n) % 32];
+#ifdef USE_VAES_AVX2
+         if (ctx->use_vaes_avx2)
+           bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth;
+#endif
+#ifdef USE_GFNI_AVX2
+         if (ctx->use_gfni_avx2)
+           bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth;
+#endif
+
+          l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
 
          /* Process data in 32 block chunks. */
          while (nblocks >= 32)
@@ -925,16 +1634,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
              blkn += 32;
              *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
 
-#ifdef USE_VAES_AVX2
-              if (use_vaes)
-                _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf,
-                                                  c->u_mode.ocb.aad_offset,
-                                                  c->u_mode.ocb.aad_sum, Ls);
-              else
-#endif
-                _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
-                                                   c->u_mode.ocb.aad_offset,
-                                                   c->u_mode.ocb.aad_sum, Ls);
+             bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset,
+                           c->u_mode.ocb.aad_sum, Ls);
 
              nblocks -= 32;
              abuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -944,9 +1645,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
       if (did_use_aesni_avx2)
        {
-         int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
-                                     2 * sizeof(void *) + ASM_EXTRA_STACK;
-
          if (burn_stack_depth < avx2_burn_stack_depth)
            burn_stack_depth = avx2_burn_stack_depth;
        }
@@ -960,27 +1658,11 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_aesni_avx = 0;
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
-
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -1000,9 +1682,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
       if (did_use_aesni_avx)
        {
-         int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                     2 * sizeof(void *) + ASM_EXTRA_STACK;
-
          if (burn_stack_depth < avx_burn_stack_depth)
            burn_stack_depth = avx_burn_stack_depth;
        }
@@ -1011,7 +1690,24 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     }
 #endif
 
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ocb_auth_128 (c, ctx, camellia_encrypt_blk1_32,
+                                 abuf, nblocks, &blkn, tmpbuf,
+                                 sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+      nblocks = 0;
+    }
+
   c->u_mode.ocb.aad_nblocks = blkn;
 
   if (burn_stack_depth)
@@ -1021,44 +1717,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   return nblocks;
 }
 
-/* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
-  const int nblocks = 32+16+1;
-  const int blocksize = CAMELLIA_BLOCK_SIZE;
-  const int context_size = sizeof(CAMELLIA_context);
-
-  return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey,
-           &camellia_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
-  const int nblocks = 32+16+2;
-  const int blocksize = CAMELLIA_BLOCK_SIZE;
-  const int context_size = sizeof(CAMELLIA_context);
-
-  return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey,
-           &camellia_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
-  const int nblocks = 32+16+2;
-  const int blocksize = CAMELLIA_BLOCK_SIZE;
-  const int context_size = sizeof(CAMELLIA_context);
-
-  return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey,
-           &camellia_encrypt, nblocks, blocksize, context_size);
-}
 
 static const char *
 selftest(void)
@@ -1066,7 +1724,6 @@ selftest(void)
   CAMELLIA_context ctx;
   byte scratch[16];
   cipher_bulk_ops_t bulk_ops;
-  const char *r;
 
   /* These test vectors are from RFC-3713 */
   static const byte plaintext[]=
@@ -1130,15 +1787,6 @@ selftest(void)
   if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
     return "CAMELLIA-256 test decryption failed.";
 
-  if ( (r = selftest_ctr_128 ()) )
-    return r;
-
-  if ( (r = selftest_cbc_128 ()) )
-    return r;
-
-  if ( (r = selftest_cfb_128 ()) )
-    return r;
-
   return NULL;
 }
 
diff --git a/cipher/camellia-ppc8le.c b/cipher/camellia-ppc8le.c
new file mode 100644 (file)
index 0000000..3eeb91a
--- /dev/null
@@ -0,0 +1,47 @@
+/* camellia-ppc8le.c - POWER8 Vector Crypto Camellia implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if !defined(WORDS_BIGENDIAN) && defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    (SIZEOF_UNSIGNED_LONG == 8) && (__GNUC__ >= 4)
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define SIMD128_OPT_ATTR __attribute__((target("arch=pwr8"))) FUNC_ATTR_OPT
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define SIMD128_OPT_ATTR __attribute__((target("cpu=power8"))) FUNC_ATTR_OPT
+#else
+# define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+#endif
+
+#define FUNC_ENC_BLK16 _gcry_camellia_ppc8_encrypt_blk16
+#define FUNC_DEC_BLK16 _gcry_camellia_ppc8_decrypt_blk16
+#define FUNC_KEY_SETUP _gcry_camellia_ppc8_keygen
+
+#include "camellia-simd128.h"
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/cipher/camellia-ppc9le.c b/cipher/camellia-ppc9le.c
new file mode 100644 (file)
index 0000000..6d57173
--- /dev/null
@@ -0,0 +1,47 @@
+/* camellia-ppc9le.c - POWER9 Vector Crypto Camellia implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if !defined(WORDS_BIGENDIAN) && defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    (SIZEOF_UNSIGNED_LONG == 8) && (__GNUC__ >= 4)
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define SIMD128_OPT_ATTR __attribute__((target("arch=pwr9"))) FUNC_ATTR_OPT
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define SIMD128_OPT_ATTR __attribute__((target("cpu=power9"))) FUNC_ATTR_OPT
+#else
+# define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+#endif
+
+#define FUNC_ENC_BLK16 _gcry_camellia_ppc9_encrypt_blk16
+#define FUNC_DEC_BLK16 _gcry_camellia_ppc9_decrypt_blk16
+#define FUNC_KEY_SETUP _gcry_camellia_ppc9_keygen
+
+#include "camellia-simd128.h"
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h
new file mode 100644 (file)
index 0000000..ed26afb
--- /dev/null
@@ -0,0 +1,2235 @@
+/* camellia-simd128.h - Camellia cipher SIMD128 intrinsics implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * SSE/AVX/NEON implementation of Camellia cipher, using AES-NI/ARMv8-CE/
+ * PPC-crypto for sbox calculations. This implementation takes 16 input blocks
+ * and process them in parallel. Vectorized key setup is also available at
+ * the end of file. This implementation is from
+ *  - https://github.com/jkivilin/camellia-simd-aesni
+ *
+ * This work was originally presented in Master's Thesis,
+ *   "Block Ciphers: Fast Implementations on x86-64 Architecture" (pages 42-50)
+ *   http://urn.fi/URN:NBN:fi:oulu-201305311409
+ */
+
+#include <config.h>
+#include "types.h"
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE SIMD128_OPT_ATTR
+
+
+#if defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && !defined(WORDS_BIGENDIAN)
+
+/**********************************************************************
+  AT&T x86 asm to intrinsics conversion macros (PowerPC VSX+crypto)
+ **********************************************************************/
+#include <altivec.h>
+
+typedef vector signed char int8x16_t;
+typedef vector unsigned char uint8x16_t;
+typedef vector unsigned short uint16x8_t;
+typedef vector unsigned int uint32x4_t;
+typedef vector unsigned long long uint64x2_t;
+typedef uint64x2_t __m128i;
+
+#ifdef __clang__
+/* clang has mismatching prototype for vec_sbox_be. */
+static ASM_FUNC_ATTR_INLINE uint8x16_t
+asm_sbox_be(uint8x16_t b)
+{
+  uint8x16_t o;
+  __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b));
+  return o;
+}
+#undef vec_sbox_be
+#define vec_sbox_be asm_sbox_be
+#endif
+
+#define vec_bswap(a)            ((__m128i)vec_reve((uint8x16_t)a))
+
+#define vpand128(a, b, o)       (o = vec_and(b, a))
+#define vpandn128(a, b, o)      (o = vec_andc(a, b))
+#define vpxor128(a, b, o)       (o = vec_xor(b, a))
+#define vpor128(a, b, o)        (o = vec_or(b, a))
+
+#define vpsrlb128(s, a, o)      ({ o = (__m128i)((uint8x16_t)a >> s); })
+#define vpsllb128(s, a, o)      ({ o = (__m128i)((uint8x16_t)a << s); })
+#define vpsrlw128(s, a, o)      ({ o = (__m128i)((uint16x8_t)a >> s); })
+#define vpsllw128(s, a, o)      ({ o = (__m128i)((uint16x8_t)a << s); })
+#define vpsrld128(s, a, o)      ({ o = (__m128i)((uint32x4_t)a >> s); })
+#define vpslld128(s, a, o)      ({ o = (__m128i)((uint32x4_t)a << s); })
+#define vpsrlq128(s, a, o)      ({ o = (__m128i)((uint64x2_t)a >> s); })
+#define vpsllq128(s, a, o)      ({ o = (__m128i)((uint64x2_t)a << s); })
+#define vpsrldq128(s, a, o)     ({ uint64x2_t __tmp = { 0, 0 }; \
+                                 o = (__m128i)vec_sld((uint8x16_t)__tmp, \
+                                                      (uint8x16_t)a, (16 - (s)) & 15);})
+#define vpslldq128(s, a, o)     ({ uint64x2_t __tmp = { 0, 0 }; \
+                                 o = (__m128i)vec_sld((uint8x16_t)a, \
+                                                      (uint8x16_t)__tmp, (s) & 15);})
+
+#define if_vpsrlb128(...)       __VA_ARGS__
+#define if_not_vpsrlb128(...)   /*_*/
+#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
+#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
+
+#define vpaddb128(a, b, o)      (o = (__m128i)vec_add((uint8x16_t)b, (uint8x16_t)a))
+
+#define vpcmpgtb128(a, b, o)    (o = (__m128i)vec_cmpgt((int8x16_t)b, (int8x16_t)a))
+#define vpabsb128(a, o)         (o = (__m128i)vec_abs((int8x16_t)a))
+
+#define vpshufd128_0x4e(a, o)   (o = (__m128i)vec_reve((uint64x2_t)a))
+#define vpshufd128_0x1b(a, o)   (o = (__m128i)vec_reve((uint32x4_t)a))
+
+#define vpshufb128(m, a, o) \
+       ({ uint64x2_t __tmpz = { 0, 0 }; \
+          o = (__m128i)vec_perm((uint8x16_t)a, (uint8x16_t)__tmpz, (uint8x16_t)m); })
+
+#define vpunpckhdq128(a, b, o)  (o = (__m128i)vec_mergel((uint32x4_t)b, (uint32x4_t)a))
+#define vpunpckldq128(a, b, o)  (o = (__m128i)vec_mergeh((uint32x4_t)b, (uint32x4_t)a))
+#define vpunpckhqdq128(a, b, o) (o = (__m128i)vec_mergel((uint64x2_t)b, (uint64x2_t)a))
+#define vpunpcklqdq128(a, b, o) (o = (__m128i)vec_mergeh((uint64x2_t)b, (uint64x2_t)a))
+
+#define vmovdqa128(a, o)        (o = a)
+#define vmovd128(a, o)          ({ uint32x4_t __tmp = { (a), 0, 0, 0 }; \
+                                  o = (__m128i)(__tmp); })
+#define vmovq128(a, o)          ({ uint64x2_t __tmp = { (a), 0 }; \
+                                  o = (__m128i)(__tmp); })
+
+#define vmovdqa128_memld(a, o)  (o = *(const __m128i *)(a))
+#define vmovdqa128_memst(a, o)  (*(__m128i *)(o) = (a))
+#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o)
+
+/* Following operations may have unaligned memory input */
+#define vmovdqu128_memld(a, o)  (o = (__m128i)vec_xl(0, (const uint8_t *)(a)))
+#define vpxor128_memld(a, b, o) vpxor128(b, (__m128i)vec_xl(0, (const uint8_t *)(a)), o)
+
+/* Following operations may have unaligned memory output */
+#define vmovdqu128_memst(a, o)  vec_xst((uint8x16_t)(a), 0, (uint8_t *)(o))
+#define vmovq128_memst(a, o)    (((uint64_unaligned_t *)(o))[0] = ((__m128i)(a))[0])
+
+/* PowerPC AES encrypt last round => ShiftRows + SubBytes + XOR round key  */
+static const uint8x16_t shift_row =
+  { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
+#define vaesenclast128(a, b, o) \
+       ({ uint64x2_t __tmp = (__m128i)vec_sbox_be((uint8x16_t)(b)); \
+          vpshufb128(shift_row, __tmp, __tmp); \
+          vpxor128(a, __tmp, o); })
+
+/* Macros for exposing SubBytes from PowerPC crypto instructions. */
+#define aes_subbytes(a, o) \
+       (o = (__m128i)vec_sbox_be((uint8x16_t)(a)))
+#define aes_subbytes_and_shuf_and_xor(zero, a, o) \
+        vaesenclast128((zero), (a), (o))
+/*#define aes_load_inv_shufmask(shufmask_reg) \
+       load_frequent_const(inv_shift_row, (shufmask_reg))*/
+#define aes_inv_shuf(shufmask_reg, a, o) \
+       vpshufb128(shufmask_reg, (a), (o))
+#define if_aes_subbytes(...) __VA_ARGS__
+#define if_not_aes_subbytes(...) /*_*/
+
+#define memory_barrier_with_vec(a) __asm__("" : "+wa"(a) :: "memory")
+
+#endif /* HAVE_GCC_INLINE_ASM_PPC_ALTIVEC && !WORDS_BIGENDIAN */
+
+#ifdef __ARM_NEON
+
+/**********************************************************************
+  AT&T x86 asm to intrinsics conversion macros (ARMv8-CE)
+ **********************************************************************/
+#include <arm_neon.h>
+
+#define __m128i uint64x2_t
+
+#define vpand128(a, b, o)       (o = vandq_u64(b, a))
+#define vpandn128(a, b, o)      (o = vbicq_u64(a, b))
+#define vpxor128(a, b, o)       (o = veorq_u64(b, a))
+#define vpor128(a, b, o)        (o = vorrq_u64(b, a))
+
+#define vpsrlb128(s, a, o)      (o = (__m128i)vshrq_n_u8((uint8x16_t)a, s))
+#define vpsllb128(s, a, o)      (o = (__m128i)vshlq_n_u8((uint8x16_t)a, s))
+#define vpsrlw128(s, a, o)      (o = (__m128i)vshrq_n_u16((uint16x8_t)a, s))
+#define vpsllw128(s, a, o)      (o = (__m128i)vshlq_n_u16((uint16x8_t)a, s))
+#define vpsrld128(s, a, o)      (o = (__m128i)vshrq_n_u32((uint32x4_t)a, s))
+#define vpslld128(s, a, o)      (o = (__m128i)vshlq_n_u32((uint32x4_t)a, s))
+#define vpsrlq128(s, a, o)      (o = (__m128i)vshrq_n_u64(a, s))
+#define vpsllq128(s, a, o)      (o = (__m128i)vshlq_n_u64(a, s))
+#define vpsrldq128(s, a, o)     ({ uint64x2_t __tmp = { 0, 0 }; \
+                               o = (__m128i)vextq_u8((uint8x16_t)a, \
+                                                     (uint8x16_t)__tmp, (s) & 15);})
+#define vpslldq128(s, a, o)     ({ uint64x2_t __tmp = { 0, 0 }; \
+                               o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
+                                                     (uint8x16_t)a, (16 - (s)) & 15);})
+
+#define if_vpsrlb128(...)       __VA_ARGS__
+#define if_not_vpsrlb128(...)   /*_*/
+#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
+#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
+
+#define vpaddb128(a, b, o)      (o = (__m128i)vaddq_u8((uint8x16_t)b, (uint8x16_t)a))
+
+#define vpcmpgtb128(a, b, o)    (o = (__m128i)vcgtq_s8((int8x16_t)b, (int8x16_t)a))
+#define vpabsb128(a, o)         (o = (__m128i)vabsq_s8((int8x16_t)a))
+
+#define vpshufd128_0x4e(a, o)   (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8))
+#define vpshufd128_0x1b(a, o)   (o = (__m128i)vrev64q_u32((uint32x4_t)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8)))
+#define vpshufb128(m, a, o)     (o = (__m128i)vqtbl1q_u8((uint8x16_t)a, (uint8x16_t)m))
+
+#define vpunpckhdq128(a, b, o)  (o = (__m128i)vzip2q_u32((uint32x4_t)b, (uint32x4_t)a))
+#define vpunpckldq128(a, b, o)  (o = (__m128i)vzip1q_u32((uint32x4_t)b, (uint32x4_t)a))
+#define vpunpckhqdq128(a, b, o) (o = (__m128i)vzip2q_u64(b, a))
+#define vpunpcklqdq128(a, b, o) (o = (__m128i)vzip1q_u64(b, a))
+
+/* CE AES encrypt last round => ShiftRows + SubBytes + XOR round key  */
+#define vaesenclast128(a, b, o) (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+
+#define vmovdqa128(a, o)        (o = a)
+#define vmovd128(a, o)          ({ uint32x4_t __tmp = { a, 0, 0, 0 }; o = (__m128i)__tmp; })
+#define vmovq128(a, o)          ({ uint64x2_t __tmp = { a, 0 }; o = (__m128i)__tmp; })
+
+#define vmovdqa128_memld(a, o)  (o = (*(const __m128i *)(a)))
+#define vmovdqa128_memst(a, o)  (*(__m128i *)(o) = (a))
+#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o)
+
+/* Following operations may have unaligned memory input */
+#define vmovdqu128_memld(a, o)  (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
+#define vpxor128_memld(a, b, o) vpxor128(b, (__m128i)vld1q_u8((const uint8_t *)(a)), o)
+
+/* Following operations may have unaligned memory output */
+#define vmovdqu128_memst(a, o)  vst1q_u8((uint8_t *)(o), (uint8x16_t)a)
+#define vmovq128_memst(a, o)    (((uint64_unaligned_t *)(o))[0] = (a)[0])
+
+/* Macros for exposing SubBytes from Crypto-Extension instruction set. */
+#define aes_subbytes_and_shuf_and_xor(zero, a, o) \
+        vaesenclast128(zero, a, o)
+#define aes_load_inv_shufmask(shufmask_reg) \
+       load_frequent_const(inv_shift_row, shufmask_reg)
+#define aes_inv_shuf(shufmask_reg, a, o) \
+       vpshufb128(shufmask_reg, a, o)
+#define if_aes_subbytes(...) /*_*/
+#define if_not_aes_subbytes(...) __VA_ARGS__
+
+#define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+
+#endif /* __ARM_NEON */
+
+#if defined(__x86_64__) || defined(__i386__)
+
+/**********************************************************************
+  AT&T x86 asm to intrinsics conversion macros
+ **********************************************************************/
+#include <x86intrin.h>
+
+#define vpand128(a, b, o)       (o = _mm_and_si128(b, a))
+#define vpandn128(a, b, o)      (o = _mm_andnot_si128(b, a))
+#define vpxor128(a, b, o)       (o = _mm_xor_si128(b, a))
+#define vpor128(a, b, o)        (o = _mm_or_si128(b, a))
+
+#define vpsrlw128(s, a, o)      (o = _mm_srli_epi16(a, s))
+#define vpsllw128(s, a, o)      (o = _mm_slli_epi16(a, s))
+#define vpsrld128(s, a, o)      (o = _mm_srli_epi32(a, s))
+#define vpslld128(s, a, o)      (o = _mm_slli_epi32(a, s))
+#define vpsrlq128(s, a, o)      (o = _mm_srli_epi64(a, s))
+#define vpsllq128(s, a, o)      (o = _mm_slli_epi64(a, s))
+#define vpsrldq128(s, a, o)     (o = _mm_srli_si128(a, s))
+#define vpslldq128(s, a, o)     (o = _mm_slli_si128(a, s))
+
+#define if_vpsrlb128(...)       /*_*/
+#define if_not_vpsrlb128(...)   __VA_ARGS__
+#define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o)
+#define vpsll_byte_128(s, a, o) vpslld128(s, a, o)
+
+#define vpaddb128(a, b, o)      (o = _mm_add_epi8(b, a))
+
+#define vpcmpgtb128(a, b, o)    (o = _mm_cmpgt_epi8(b, a))
+#define vpabsb128(a, o)         (o = _mm_abs_epi8(a))
+
+#define vpshufd128_0x1b(a, o)   (o = _mm_shuffle_epi32(a, 0x1b))
+#define vpshufd128_0x4e(a, o)   (o = _mm_shuffle_epi32(a, 0x4e))
+#define vpshufb128(m, a, o)     (o = _mm_shuffle_epi8(a, m))
+
+#define vpunpckhdq128(a, b, o)  (o = _mm_unpackhi_epi32(b, a))
+#define vpunpckldq128(a, b, o)  (o = _mm_unpacklo_epi32(b, a))
+#define vpunpckhqdq128(a, b, o) (o = _mm_unpackhi_epi64(b, a))
+#define vpunpcklqdq128(a, b, o) (o = _mm_unpacklo_epi64(b, a))
+
+/* AES-NI encrypt last round => ShiftRows + SubBytes + XOR round key  */
+#define vaesenclast128(a, b, o) (o = _mm_aesenclast_si128(b, a))
+
+#define vmovdqa128(a, o)        (o = a)
+#define vmovd128(a, o)          (o = _mm_set_epi32(0, 0, 0, a))
+#define vmovq128(a, o)          (o = _mm_set_epi64x(0, a))
+
+#define vmovdqa128_memld(a, o)  (o = (*(const __m128i *)(a)))
+#define vmovdqa128_memst(a, o)  (*(__m128i *)(o) = (a))
+#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o)
+
+/* Following operations may have unaligned memory input */
+#define vmovdqu128_memld(a, o)  (o = _mm_loadu_si128((const __m128i *)(a)))
+#define vpxor128_memld(a, b, o) \
+       vpxor128(b, _mm_loadu_si128((const __m128i *)(a)), o)
+
+/* Following operations may have unaligned memory output */
+#define vmovdqu128_memst(a, o)  _mm_storeu_si128((__m128i *)(o), a)
+#define vmovq128_memst(a, o)    _mm_storel_epi64((__m128i *)(o), a)
+
+/* Macros for exposing SubBytes from AES-NI instruction set. */
+#define aes_subbytes_and_shuf_and_xor(zero, a, o) \
+       vaesenclast128(zero, a, o)
+#define aes_load_inv_shufmask(shufmask_reg) \
+       load_frequent_const(inv_shift_row, shufmask_reg)
+#define aes_inv_shuf(shufmask_reg, a, o) \
+       vpshufb128(shufmask_reg, a, o)
+#define if_aes_subbytes(...) /*_*/
+#define if_not_aes_subbytes(...) __VA_ARGS__
+
+#define memory_barrier_with_vec(a) __asm__("" : "+x"(a) :: "memory")
+
+#endif /* defined(__x86_64__) || defined(__i386__) */
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+       vpand128(x, mask4bit, tmp0); \
+       if_vpsrlb128(vpsrlb128(4, x, x)); \
+       if_not_vpsrlb128(vpandn128(x, mask4bit, x)); \
+       if_not_vpsrlb128(vpsrld128(4, x, x)); \
+       \
+       vpshufb128(tmp0, lo_t, tmp0); \
+       vpshufb128(x, hi_t, x); \
+       vpxor128(tmp0, x, x);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+       vpunpckhdq128(x1, x0, t2); \
+       vpunpckldq128(x1, x0, x0); \
+       \
+       vpunpckldq128(x3, x2, t1); \
+       vpunpckhdq128(x3, x2, x2); \
+       \
+       vpunpckhqdq128(t1, x0, x1); \
+       vpunpcklqdq128(t1, x0, x0); \
+       \
+       vpunpckhqdq128(x2, t2, x3); \
+       vpunpcklqdq128(x2, t2, x2);
+
+#define load_zero(o) vmovq128(0, o)
+
+#define load_frequent_const(constant, o) vmovdqa128(constant ## _stack, o)
+
+#define prepare_frequent_const(constant) \
+       vmovdqa128_memld(&(constant), constant ## _stack); \
+       memory_barrier_with_vec(constant ## _stack)
+
+#define prepare_frequent_constants() \
+       prepare_frequent_const(inv_shift_row); \
+       prepare_frequent_const(pack_bswap); \
+       prepare_frequent_const(shufb_16x16b); \
+       prepare_frequent_const(mask_0f); \
+       prepare_frequent_const(pre_tf_lo_s1); \
+       prepare_frequent_const(pre_tf_hi_s1); \
+       prepare_frequent_const(pre_tf_lo_s4); \
+       prepare_frequent_const(pre_tf_hi_s4); \
+       prepare_frequent_const(post_tf_lo_s1); \
+       prepare_frequent_const(post_tf_hi_s1); \
+       prepare_frequent_const(post_tf_lo_s3); \
+       prepare_frequent_const(post_tf_hi_s3); \
+       prepare_frequent_const(post_tf_lo_s2); \
+       prepare_frequent_const(post_tf_hi_s2)
+
+#define frequent_constants_declare \
+       __m128i inv_shift_row_stack; \
+       __m128i pack_bswap_stack; \
+       __m128i shufb_16x16b_stack; \
+       __m128i mask_0f_stack; \
+       __m128i pre_tf_lo_s1_stack; \
+       __m128i pre_tf_hi_s1_stack; \
+       __m128i pre_tf_lo_s4_stack; \
+       __m128i pre_tf_hi_s4_stack; \
+       __m128i post_tf_lo_s1_stack; \
+       __m128i post_tf_hi_s1_stack; \
+       __m128i post_tf_lo_s3_stack; \
+       __m128i post_tf_hi_s3_stack; \
+       __m128i post_tf_lo_s2_stack; \
+       __m128i post_tf_hi_s2_stack
+
+/**********************************************************************
+  16-way camellia macros
+ **********************************************************************/
+
+/*
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+                 t7, mem_cd, key) \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       if_not_aes_subbytes(aes_load_inv_shufmask(t4);) \
+       load_frequent_const(mask_0f, t7); \
+       load_frequent_const(pre_tf_lo_s1, t0); \
+       load_frequent_const(pre_tf_hi_s1, t1); \
+       \
+       /* AES inverse shift rows */ \
+       if_not_aes_subbytes( \
+         aes_inv_shuf(t4, x0, x0); \
+         aes_inv_shuf(t4, x7, x7); \
+         aes_inv_shuf(t4, x1, x1); \
+         aes_inv_shuf(t4, x4, x4); \
+         aes_inv_shuf(t4, x2, x2); \
+         aes_inv_shuf(t4, x5, x5); \
+         aes_inv_shuf(t4, x3, x3); \
+         aes_inv_shuf(t4, x6, x6); \
+       ) \
+       \
+       /* prefilter sboxes 1, 2 and 3 */ \
+       load_frequent_const(pre_tf_lo_s4, t2); \
+       load_frequent_const(pre_tf_hi_s4, t3); \
+       filter_8bit(x0, t0, t1, t7, t6); \
+       filter_8bit(x7, t0, t1, t7, t6); \
+       filter_8bit(x1, t0, t1, t7, t6); \
+       filter_8bit(x4, t0, t1, t7, t6); \
+       filter_8bit(x2, t0, t1, t7, t6); \
+       filter_8bit(x5, t0, t1, t7, t6); \
+       \
+       /* prefilter sbox 4 */ \
+       if_not_aes_subbytes(load_zero(t4);) \
+       filter_8bit(x3, t2, t3, t7, t6); \
+       filter_8bit(x6, t2, t3, t7, t6); \
+       \
+       /* AES subbytes + AES shift rows */ \
+       load_frequent_const(post_tf_lo_s1, t0); \
+       load_frequent_const(post_tf_hi_s1, t1); \
+       if_not_aes_subbytes( \
+         aes_subbytes_and_shuf_and_xor(t4, x0, x0); \
+         aes_subbytes_and_shuf_and_xor(t4, x7, x7); \
+         aes_subbytes_and_shuf_and_xor(t4, x1, x1); \
+         aes_subbytes_and_shuf_and_xor(t4, x4, x4); \
+         aes_subbytes_and_shuf_and_xor(t4, x2, x2); \
+         aes_subbytes_and_shuf_and_xor(t4, x5, x5); \
+         aes_subbytes_and_shuf_and_xor(t4, x3, x3); \
+         aes_subbytes_and_shuf_and_xor(t4, x6, x6); \
+       ) \
+       if_aes_subbytes( \
+         aes_subbytes(x0, x0); \
+         aes_subbytes(x7, x7); \
+         aes_subbytes(x1, x1); \
+         aes_subbytes(x4, x4); \
+         aes_subbytes(x2, x2); \
+         aes_subbytes(x5, x5); \
+         aes_subbytes(x3, x3); \
+         aes_subbytes(x6, x6); \
+       ) \
+       \
+       /* postfilter sboxes 1 and 4 */ \
+       load_frequent_const(post_tf_lo_s3, t2); \
+       load_frequent_const(post_tf_hi_s3, t3); \
+       filter_8bit(x0, t0, t1, t7, t6); \
+       filter_8bit(x7, t0, t1, t7, t6); \
+       filter_8bit(x3, t0, t1, t7, t6); \
+       filter_8bit(x6, t0, t1, t7, t6); \
+       \
+       /* postfilter sbox 3 */ \
+       load_frequent_const(post_tf_lo_s2, t4); \
+       load_frequent_const(post_tf_hi_s2, t5); \
+       filter_8bit(x2, t2, t3, t7, t6); \
+       filter_8bit(x5, t2, t3, t7, t6); \
+       \
+       vmovq128((key), t0); \
+       \
+       /* postfilter sbox 2 */ \
+       filter_8bit(x1, t4, t5, t7, t2); \
+       filter_8bit(x4, t4, t5, t7, t2); \
+       \
+       /* P-function */ \
+       vpxor128(x5, x0, x0); \
+       vpxor128(x6, x1, x1); \
+       vpxor128(x7, x2, x2); \
+       vpxor128(x4, x3, x3); \
+       \
+       vpxor128(x2, x4, x4); \
+       vpxor128(x3, x5, x5); \
+       vpxor128(x0, x6, x6); \
+       vpxor128(x1, x7, x7); \
+       \
+       vpxor128(x7, x0, x0); \
+       vpxor128(x4, x1, x1); \
+       vpxor128(x5, x2, x2); \
+       vpxor128(x6, x3, x3); \
+       \
+       vpxor128(x3, x4, x4); \
+       vpxor128(x0, x5, x5); \
+       vpxor128(x1, x6, x6); \
+       vpxor128(x2, x7, x7); /* note: high and low parts swapped */ \
+       \
+       /* Add key material and result to CD (x becomes new CD) */ \
+       \
+       vpshufb128(bcast[7], t0, t7); \
+       vpshufb128(bcast[6], t0, t6); \
+       vpshufb128(bcast[5], t0, t5); \
+       vpshufb128(bcast[4], t0, t4); \
+       vpshufb128(bcast[3], t0, t3); \
+       vpshufb128(bcast[2], t0, t2); \
+       vpshufb128(bcast[1], t0, t1); \
+       \
+       vpxor128(t3, x4, x4); \
+       vpxor128(mem_cd[0], x4, x4); \
+       \
+       load_zero(t3); \
+       vpshufb128(t3, t0, t0); \
+       \
+       vpxor128(t2, x5, x5); \
+       vpxor128(mem_cd[1], x5, x5); \
+       \
+       vpxor128(t1, x6, x6); \
+       vpxor128(mem_cd[2], x6, x6); \
+       \
+       vpxor128(t0, x7, x7); \
+       vpxor128(mem_cd[3], x7, x7); \
+       \
+       vpxor128(t7, x0, x0); \
+       vpxor128(mem_cd[4], x0, x0); \
+       \
+       vpxor128(t6, x1, x1); \
+       vpxor128(mem_cd[5], x1, x1); \
+       \
+       vpxor128(t5, x2, x2); \
+       vpxor128(mem_cd[6], x2, x2); \
+       \
+       vpxor128(t4, x3, x3); \
+       vpxor128(mem_cd[7], x3, x3);
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+       roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_cd, ctx->key_table[(i)]); \
+       \
+       vmovdqa128(x4, mem_cd[0]); \
+       vmovdqa128(x5, mem_cd[1]); \
+       vmovdqa128(x6, mem_cd[2]); \
+       vmovdqa128(x7, mem_cd[3]); \
+       vmovdqa128(x0, mem_cd[4]); \
+       vmovdqa128(x1, mem_cd[5]); \
+       vmovdqa128(x2, mem_cd[6]); \
+       vmovdqa128(x3, mem_cd[7]); \
+       \
+       roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_ab, ctx->key_table[(i) + (dir)]); \
+       \
+       store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+       /* Store new AB state */ \
+       vmovdqa128(x0, mem_ab[0]); \
+       vmovdqa128(x1, mem_ab[1]); \
+       vmovdqa128(x2, mem_ab[2]); \
+       vmovdqa128(x3, mem_ab[3]); \
+       vmovdqa128(x4, mem_ab[4]); \
+       vmovdqa128(x5, mem_ab[5]); \
+       vmovdqa128(x6, mem_ab[6]); \
+       vmovdqa128(x7, mem_ab[7]);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i) \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i) \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+#define LE64_LO32(x) ((x) & 0xffffffffU)
+#define LE64_HI32(x) ((x >> 32) & 0xffffffffU)
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+       if_vpsrlb128(vpsrlb128(7, v0, t0)); \
+       if_not_vpsrlb128(vpcmpgtb128(v0, zero, t0)); \
+       vpaddb128(v0, v0, v0); \
+       if_not_vpsrlb128(vpabsb128(t0, t0)); \
+       \
+       if_vpsrlb128(vpsrlb128(7, v1, t1)); \
+       if_not_vpsrlb128(vpcmpgtb128(v1, zero, t1)); \
+       vpaddb128(v1, v1, v1); \
+       if_not_vpsrlb128(vpabsb128(t1, t1)); \
+       \
+       if_vpsrlb128(vpsrlb128(7, v2, t2)); \
+       if_not_vpsrlb128(vpcmpgtb128(v2, zero, t2)); \
+       vpaddb128(v2, v2, v2); \
+       if_not_vpsrlb128(vpabsb128(t2, t2)); \
+       \
+       vpor128(t0, v1, v1); \
+       \
+       if_vpsrlb128(vpsrlb128(7, v3, t0)); \
+       if_not_vpsrlb128(vpcmpgtb128(v3, zero, t0)); \
+       vpaddb128(v3, v3, v3); \
+       if_not_vpsrlb128(vpabsb128(t0, t0)); \
+       \
+       vpor128(t1, v2, v2); \
+       vpor128(t2, v3, v3); \
+       vpor128(t0, v0, v0);
+
+/*
+ * IN:
+ *   r: byte-sliced AB state in memory
+ *   l: byte-sliced CD state in memory
+ * OUT:
+ *   l0..l7, l[], r[]: new byte-sliced LR state
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+             tt1, tt2, tt3, kl, kr) \
+       /* \
+        * t0 = kll; \
+        * t0 &= ll; \
+        * lr ^= rol32(t0, 1); \
+        */ \
+       load_zero(tt0); \
+       vmovd128(LE64_LO32(*(kl)), t0); \
+       vpshufb128(tt0, t0, t3); \
+       vpshufb128(bcast[1], t0, t2); \
+       vpshufb128(bcast[2], t0, t1); \
+       vpshufb128(bcast[3], t0, t0); \
+       \
+       vpand128(l0, t0, t0); \
+       vpand128(l1, t1, t1); \
+       vpand128(l2, t2, t2); \
+       vpand128(l3, t3, t3); \
+       \
+       rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+       \
+       vpxor128(l4, t0, l4); \
+       vmovdqa128(l4, l[4]); \
+       vpxor128(l5, t1, l5); \
+       vmovdqa128(l5, l[5]); \
+       vpxor128(l6, t2, l6); \
+       vmovdqa128(l6, l[6]); \
+       vpxor128(l7, t3, l7); \
+       vmovdqa128(l7, l[7]); \
+       \
+       /* \
+        * t2 = krr; \
+        * t2 |= rr; \
+        * rl ^= t2; \
+        */ \
+       \
+       vmovd128(LE64_HI32(*(kr)), t0); \
+       vpshufb128(tt0, t0, t3); \
+       vpshufb128(bcast[1], t0, t2); \
+       vpshufb128(bcast[2], t0, t1); \
+       vpshufb128(bcast[3], t0, t0); \
+       \
+       vpor128(r[4], t0, t0); \
+       vpor128(r[5], t1, t1); \
+       vpor128(r[6], t2, t2); \
+       vpor128(r[7], t3, t3); \
+       \
+       vpxor128(r[0], t0, t0); \
+       vpxor128(r[1], t1, t1); \
+       vpxor128(r[2], t2, t2); \
+       vpxor128(r[3], t3, t3); \
+       vmovdqa128(t0, r[0]); \
+       vmovdqa128(t1, r[1]); \
+       vmovdqa128(t2, r[2]); \
+       vmovdqa128(t3, r[3]); \
+       \
+       /* \
+        * t2 = krl; \
+        * t2 &= rl; \
+        * rr ^= rol32(t2, 1); \
+        */ \
+       vmovd128(LE64_LO32(*(kr)), t0); \
+       vpshufb128(tt0, t0, t3); \
+       vpshufb128(bcast[1], t0, t2); \
+       vpshufb128(bcast[2], t0, t1); \
+       vpshufb128(bcast[3], t0, t0); \
+       \
+       vpand128(r[0], t0, t0); \
+       vpand128(r[1], t1, t1); \
+       vpand128(r[2], t2, t2); \
+       vpand128(r[3], t3, t3); \
+       \
+       rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+       \
+       vpxor128(r[4], t0, t0); \
+       vpxor128(r[5], t1, t1); \
+       vpxor128(r[6], t2, t2); \
+       vpxor128(r[7], t3, t3); \
+       vmovdqa128(t0, r[4]); \
+       vmovdqa128(t1, r[5]); \
+       vmovdqa128(t2, r[6]); \
+       vmovdqa128(t3, r[7]); \
+       \
+       /* \
+        * t0 = klr; \
+        * t0 |= lr; \
+        * ll ^= t0; \
+        */ \
+       \
+       vmovd128(LE64_HI32(*(kl)), t0); \
+       vpshufb128(tt0, t0, t3); \
+       vpshufb128(bcast[1], t0, t2); \
+       vpshufb128(bcast[2], t0, t1); \
+       vpshufb128(bcast[3], t0, t0); \
+       \
+       vpor128(l4, t0, t0); \
+       vpor128(l5, t1, t1); \
+       vpor128(l6, t2, t2); \
+       vpor128(l7, t3, t3); \
+       \
+       vpxor128(l0, t0, l0); \
+       vmovdqa128(l0, l[0]); \
+       vpxor128(l1, t1, l1); \
+       vmovdqa128(l1, l[1]); \
+       vpxor128(l2, t2, l2); \
+       vmovdqa128(l2, l[2]); \
+       vpxor128(l3, t3, l3); \
+       vmovdqa128(l3, l[3]);
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+                             a3, b3, c3, d3, st0, st1) \
+       vmovdqa128(d2, st0); \
+       vmovdqa128(d3, st1); \
+       transpose_4x4(a0, a1, a2, a3, d2, d3); \
+       transpose_4x4(b0, b1, b2, b3, d2, d3); \
+       vmovdqa128(st0, d2); \
+       vmovdqa128(st1, d3); \
+       \
+       vmovdqa128(a0, st0); \
+       vmovdqa128(a1, st1); \
+       transpose_4x4(c0, c1, c2, c3, a0, a1); \
+       transpose_4x4(d0, d1, d2, d3, a0, a1); \
+       \
+       vmovdqa128(shufb_16x16b_stack, a0); \
+       vmovdqa128(st1, a1); \
+       vpshufb128(a0, a2, a2); \
+       vpshufb128(a0, a3, a3); \
+       vpshufb128(a0, b0, b0); \
+       vpshufb128(a0, b1, b1); \
+       vpshufb128(a0, b2, b2); \
+       vpshufb128(a0, b3, b3); \
+       vpshufb128(a0, a1, a1); \
+       vpshufb128(a0, c0, c0); \
+       vpshufb128(a0, c1, c1); \
+       vpshufb128(a0, c2, c2); \
+       vpshufb128(a0, c3, c3); \
+       vpshufb128(a0, d0, d0); \
+       vpshufb128(a0, d1, d1); \
+       vpshufb128(a0, d2, d2); \
+       vpshufb128(a0, d3, d3); \
+       vmovdqa128(d3, st1); \
+       vmovdqa128(st0, d3); \
+       vpshufb128(a0, d3, a0); \
+       vmovdqa128(d2, st0); \
+       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3); \
+       transpose_4x4(a1, b1, c1, d1, d2, d3); \
+       vmovdqa128(st0, d2); \
+       vmovdqa128(st1, d3); \
+       \
+       vmovdqa128(b0, st0); \
+       vmovdqa128(b1, st1); \
+       transpose_4x4(a2, b2, c2, d2, b0, b1); \
+       transpose_4x4(a3, b3, c3, d3, b0, b1); \
+       vmovdqa128(st0, b0); \
+       vmovdqa128(st1, b1); \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                    y6, y7, rio, key) \
+       vmovq128((key), x0); \
+       vpshufb128(pack_bswap_stack, x0, x0); \
+       \
+       vpxor128_memld((rio) + 0 * 16, x0, y7); \
+       vpxor128_memld((rio) + 1 * 16, x0, y6); \
+       vpxor128_memld((rio) + 2 * 16, x0, y5); \
+       vpxor128_memld((rio) + 3 * 16, x0, y4); \
+       vpxor128_memld((rio) + 4 * 16, x0, y3); \
+       vpxor128_memld((rio) + 5 * 16, x0, y2); \
+       vpxor128_memld((rio) + 6 * 16, x0, y1); \
+       vpxor128_memld((rio) + 7 * 16, x0, y0); \
+       vpxor128_memld((rio) + 8 * 16, x0, x7); \
+       vpxor128_memld((rio) + 9 * 16, x0, x6); \
+       vpxor128_memld((rio) + 10 * 16, x0, x5); \
+       vpxor128_memld((rio) + 11 * 16, x0, x4); \
+       vpxor128_memld((rio) + 12 * 16, x0, x3); \
+       vpxor128_memld((rio) + 13 * 16, x0, x2); \
+       vpxor128_memld((rio) + 14 * 16, x0, x1); \
+       vpxor128_memld((rio) + 15 * 16, x0, x0);
+
+/* Byteslice the 16 pre-whitened blocks (16x16-byte transpose done by
+ * byteslice_16x16b_fast) and store the result to temporary memory: the
+ * x-half of the state goes to mem_ab[0..7], the y-half to mem_cd[0..7],
+ * ready for the bytesliced round functions. */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd) \
+       byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+                             y4, y5, y6, y7, mem_ab[0], mem_cd[0]); \
+       \
+       vmovdqa128(x0, mem_ab[0]); \
+       vmovdqa128(x1, mem_ab[1]); \
+       vmovdqa128(x2, mem_ab[2]); \
+       vmovdqa128(x3, mem_ab[3]); \
+       vmovdqa128(x4, mem_ab[4]); \
+       vmovdqa128(x5, mem_ab[5]); \
+       vmovdqa128(x6, mem_ab[6]); \
+       vmovdqa128(x7, mem_ab[7]); \
+       vmovdqa128(y0, mem_cd[0]); \
+       vmovdqa128(y1, mem_cd[1]); \
+       vmovdqa128(y2, mem_cd[2]); \
+       vmovdqa128(y3, mem_cd[3]); \
+       vmovdqa128(y4, mem_cd[4]); \
+       vmovdqa128(y5, mem_cd[5]); \
+       vmovdqa128(y6, mem_cd[6]); \
+       vmovdqa128(y7, mem_cd[7]);
+
+/* De-byteslice the state and XOR in the 64-bit post-whitening KEY.
+ * x0 is spilled to stack_tmp0 while the whitening key is materialized
+ * (byte-shuffled with pack_bswap_stack) in x0; the final XOR folds the
+ * saved value back into x0. */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+                   y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+       byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+                             y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+       \
+       vmovdqa128(x0, stack_tmp0); \
+       \
+       vmovq128((key), x0); \
+       vpshufb128(pack_bswap_stack, x0, x0); \
+       \
+       vpxor128(x0, y7, y7); \
+       vpxor128(x0, y6, y6); \
+       vpxor128(x0, y5, y5); \
+       vpxor128(x0, y4, y4); \
+       vpxor128(x0, y3, y3); \
+       vpxor128(x0, y2, y2); \
+       vpxor128(x0, y1, y1); \
+       vpxor128(x0, y0, y0); \
+       vpxor128(x0, x7, x7); \
+       vpxor128(x0, x6, x6); \
+       vpxor128(x0, x5, x5); \
+       vpxor128(x0, x4, x4); \
+       vpxor128(x0, x3, x3); \
+       vpxor128(x0, x2, x2); \
+       vpxor128(x0, x1, x1); \
+       vpxor128(stack_tmp0, x0, x0);
+
+/* Store the 16 result vectors to memory at RIO (unaligned stores, so
+ * RIO may be an unaligned pointer). */
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                    y6, y7, rio) \
+       vmovdqu128_memst(x0, (rio) + 0 * 16); \
+       vmovdqu128_memst(x1, (rio) + 1 * 16); \
+       vmovdqu128_memst(x2, (rio) + 2 * 16); \
+       vmovdqu128_memst(x3, (rio) + 3 * 16); \
+       vmovdqu128_memst(x4, (rio) + 4 * 16); \
+       vmovdqu128_memst(x5, (rio) + 5 * 16); \
+       vmovdqu128_memst(x6, (rio) + 6 * 16); \
+       vmovdqu128_memst(x7, (rio) + 7 * 16); \
+       vmovdqu128_memst(y0, (rio) + 8 * 16); \
+       vmovdqu128_memst(y1, (rio) + 9 * 16); \
+       vmovdqu128_memst(y2, (rio) + 10 * 16); \
+       vmovdqu128_memst(y3, (rio) + 11 * 16); \
+       vmovdqu128_memst(y4, (rio) + 12 * 16); \
+       vmovdqu128_memst(y5, (rio) + 13 * 16); \
+       vmovdqu128_memst(y6, (rio) + 14 * 16); \
+       vmovdqu128_memst(y7, (rio) + 15 * 16);
+
+/**********************************************************************
+  macros for defining constant vectors
+ **********************************************************************/
+/* Identity here; NOTE(review): presumably a byte-swap on big-endian
+ * builds elsewhere — confirm against the surrounding file. */
+#define SWAP_LE64(x) (x)
+
+/* Build a __m128i initializer from 16 byte values; a0 is the lowest
+ * byte of the low quadword, b7 the highest byte of the high quadword. */
+#define M128I_BYTE(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7) \
+       { \
+         SWAP_LE64((((a0) & 0xffULL) << 0) | \
+                   (((a1) & 0xffULL) << 8) | \
+                   (((a2) & 0xffULL) << 16) | \
+                   (((a3) & 0xffULL) << 24) | \
+                   (((a4) & 0xffULL) << 32) | \
+                   (((a5) & 0xffULL) << 40) | \
+                   (((a6) & 0xffULL) << 48) | \
+                   (((a7) & 0xffULL) << 56)), \
+         SWAP_LE64((((b0) & 0xffULL) << 0) | \
+                   (((b1) & 0xffULL) << 8) | \
+                   (((b2) & 0xffULL) << 16) | \
+                   (((b3) & 0xffULL) << 24) | \
+                   (((b4) & 0xffULL) << 32) | \
+                   (((b5) & 0xffULL) << 40) | \
+                   (((b6) & 0xffULL) << 48) | \
+                   (((b7) & 0xffULL) << 56)) \
+       }
+
+/* Build a __m128i initializer from four 32-bit words; a0 is the low
+ * word of the low quadword. */
+#define M128I_U32(a0, a1, b0, b1) \
+       { \
+         SWAP_LE64((((a0) & 0xffffffffULL) << 0) | \
+                   (((a1) & 0xffffffffULL) << 32)), \
+         SWAP_LE64((((b0) & 0xffffffffULL) << 0) | \
+                   (((b1) & 0xffffffffULL) << 32)) \
+       }
+
+/* __m128i initializer with byte value X replicated into all 16 lanes. */
+#define M128I_REP16(x) { (0x0101010101010101ULL * (x)), (0x0101010101010101ULL * (x)) }
+
+/* One 32-bit word of the 4x4 byte-transpose pshufb mask below. */
+#define SHUFB_BYTES(idx) \
+       (((0 + (idx)) << 0)  | ((4 + (idx)) << 8) | \
+        ((8 + (idx)) << 16) | ((12 + (idx)) << 24))
+
+/* u64 usable for unaligned loads without violating strict aliasing. */
+typedef u64 uint64_unaligned_t __attribute__((aligned(1), may_alias));
+
+/* pshufb mask: 4x4 byte transpose within a 128-bit vector. */
+static const __m128i shufb_16x16b =
+  M128I_U32(SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3));
+
+/* Byte-shuffle mask applied to the whitening key by inpack16_pre /
+ * outunpack16 (byte-swaps the low quadword). */
+static const __m128i pack_bswap =
+  M128I_U32(0x00010203, 0x04050607, 0x0f0f0f0f, 0x0f0f0f0f);
+
+/* bcast[i]: byte value i replicated into every lane. */
+static const __m128i bcast[8] =
+{
+  M128I_REP16(0), M128I_REP16(1), M128I_REP16(2), M128I_REP16(3),
+  M128I_REP16(4), M128I_REP16(5), M128I_REP16(6), M128I_REP16(7)
+};
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+static const __m128i pre_tf_lo_s1 =
+  M128I_BYTE(0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86,
+            0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88);
+
+static const __m128i pre_tf_hi_s1 =
+  M128I_BYTE(0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a,
+            0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23);
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+static const __m128i pre_tf_lo_s4 =
+  M128I_BYTE(0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25,
+            0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74);
+
+static const __m128i pre_tf_hi_s4 =
+  M128I_BYTE(0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72,
+            0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf);
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+static const __m128i post_tf_lo_s1 =
+  M128I_BYTE(0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31,
+            0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1);
+
+static const __m128i post_tf_hi_s1 =
+  M128I_BYTE(0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8,
+            0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c);
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+static const __m128i post_tf_lo_s2 =
+  M128I_BYTE(0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62,
+            0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3);
+
+static const __m128i post_tf_hi_s2 =
+  M128I_BYTE(0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51,
+            0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18);
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+static const __m128i post_tf_lo_s3 =
+  M128I_BYTE(0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98,
+            0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8);
+
+static const __m128i post_tf_hi_s3 =
+  M128I_BYTE(0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54,
+            0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06);
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+static const __m128i inv_shift_row =
+  M128I_BYTE(0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
+            0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03);
+
+/* 4-bit mask */
+static const __m128i mask_0f =
+  M128I_U32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
+
+/* Encrypts 16 input blocks from IN and writes the result to OUT.  IN and
+ * OUT may be unaligned pointers. */
+void ASM_FUNC_ATTR_NOINLINE
+FUNC_ENC_BLK16(const void *key_table, void *vout, const void *vin,
+              int key_length)
+{
+  /* Wrap the arguments in a local context struct so subkeys are reached
+   * through ctx->key_table below. */
+  const struct enc_ctx_s
+  {
+    const u64 *key_table;
+    int key_length;
+  } sctx =
+    {
+      .key_table = (const u64 *)key_table,
+      .key_length = key_length
+    };
+  const struct enc_ctx_s *ctx = &sctx;
+  char *out = vout;
+  const char *in = vin;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  __m128i ab[8];
+  __m128i cd[8];
+  __m128i tmp0, tmp1;
+  unsigned int lastk, k;
+  frequent_constants_declare;
+
+  prepare_frequent_constants();
+
+  /* Key-table index of the final whitening key: 24 for short keys
+   * (key_length <= 16), 32 for longer keys. */
+  if (ctx->key_length > 16)
+    lastk = 32;
+  else
+    lastk = 24;
+
+  inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+              x15, in, ctx->key_table[0]);
+
+  inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+               x15, ab, cd);
+
+  /* Run the rounds in groups of 8, applying fls16 (FL/FL^-1 layer)
+   * between groups; the final group has no fls16 after it. */
+  k = 0;
+  while (1)
+    {
+      enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+                 x15, ab, cd, k);
+
+      if (k == lastk - 8)
+       break;
+
+      fls16(ab, x0, x1, x2, x3, x4, x5, x6, x7, cd, x8, x9, x10, x11, x12, x13, x14,
+           x15, &ctx->key_table[k + 8], &ctx->key_table[k + 9]);
+
+      k += 8;
+    }
+
+  /* load CD for output */
+  vmovdqa128(cd[0], x8);
+  vmovdqa128(cd[1], x9);
+  vmovdqa128(cd[2], x10);
+  vmovdqa128(cd[3], x11);
+  vmovdqa128(cd[4], x12);
+  vmovdqa128(cd[5], x13);
+  vmovdqa128(cd[6], x14);
+  vmovdqa128(cd[7], x15);
+
+  outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+             x15, ctx->key_table[lastk], tmp0, tmp1);
+
+  write_output(x7, x6, x5, x4, x3, x2, x1, x0, x15, x14, x13, x12, x11, x10, x9,
+              x8, out);
+}
+
+/* Decrypts 16 input blocks from IN and writes the result to OUT.  IN and
+ * OUT may be unaligned pointers. */
+void ASM_FUNC_ATTR_NOINLINE
+FUNC_DEC_BLK16(const void *key_table, void *vout, const void *vin,
+              int key_length)
+{
+  /* Wrap the arguments in a local context struct so subkeys are reached
+   * through ctx->key_table below. */
+  const struct dec_ctx_s
+  {
+    const u64 *key_table;
+    int key_length;
+  } sctx =
+    {
+      .key_table = (const u64 *)key_table,
+      .key_length = key_length
+    };
+  const struct dec_ctx_s *ctx = &sctx;
+  char *out = vout;
+  const char *in = vin;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  __m128i ab[8];
+  __m128i cd[8];
+  __m128i tmp0, tmp1;
+  unsigned int firstk, k;
+  frequent_constants_declare;
+
+  prepare_frequent_constants();
+
+  /* Decryption starts from the end of the key table: index 24 for short
+   * keys (key_length <= 16), 32 for longer keys. */
+  if (ctx->key_length > 16)
+    firstk = 32;
+  else
+    firstk = 24;
+
+  inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+              x15, in, ctx->key_table[firstk]);
+
+  inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+               x15, ab, cd);
+
+  /* Walk the subkeys backwards in groups of 8 rounds, applying fls16
+   * between groups (note the swapped key order versus encryption). */
+  k = firstk - 8;
+  while (1)
+    {
+      dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13,
+                 x14, x15, ab, cd, k);
+
+      if (k == 0)
+       break;
+
+      fls16(ab, x0, x1, x2, x3, x4, x5, x6, x7, cd, x8, x9, x10, x11, x12, x13,
+           x14, x15, &ctx->key_table[k + 1], &ctx->key_table[k]);
+
+      k -= 8;
+    }
+
+  /* load CD for output */
+  vmovdqa128(cd[0], x8);
+  vmovdqa128(cd[1], x9);
+  vmovdqa128(cd[2], x10);
+  vmovdqa128(cd[3], x11);
+  vmovdqa128(cd[4], x12);
+  vmovdqa128(cd[5], x13);
+  vmovdqa128(cd[6], x14);
+  vmovdqa128(cd[7], x15);
+
+  outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
+             x15, ctx->key_table[0], tmp0, tmp1);
+
+  write_output(x7, x6, x5, x4, x3, x2, x1, x0, x15, x14, x13, x12, x11, x10, x9,
+              x8, out);
+}
+
+/********* Key setup **********************************************************/
+
+/*
+ * Camellia F-function, 1-way SIMD/AESNI.
+ *
+ * IN:
+ *  ab:  64-bit input state (low quadword)
+ *  key: 64-bit round subkey
+ * OUT:
+ *  x:   F(ab ^ key) in the low quadword
+ *  t0..t4 are clobbered temporaries; the mask arguments are the
+ *  prefilter/inv-shift-row/0x0f constant registers set up by the caller.
+ */
+#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \
+                  _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
+       vmovq128((key), t0); \
+       load_zero(t3); \
+       \
+       vpxor128(ab, t0, x); \
+       \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       \
+       /* input rotation for sbox4 (<<< 1) */ \
+       vpand128(x, sbox4mask, t0); \
+       vpandn128(x, sbox4mask, x); \
+       vpaddb128(t0, t0, t1); \
+       vpsrl_byte_128(7, t0, t0); \
+       vpor128(t0, t1, t0); \
+       vpand128(sbox4mask, t0, t0); \
+       vpor128(t0, x, x); \
+       \
+       vmovdqa128_memld(&post_tf_lo_s1, t0); \
+       vmovdqa128_memld(&post_tf_hi_s1, t1); \
+       \
+       /* prefilter sboxes */ \
+       filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
+       \
+       /* AES subbytes + AES shift rows + AES inv shift rows */ \
+       aes_subbytes_and_shuf_and_xor(t3, x, x); \
+       \
+       /* postfilter sboxes */ \
+       filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \
+       \
+       /* output rotation for sbox2 (<<< 1) */ \
+       /* output rotation for sbox3 (>>> 1) */ \
+       aes_inv_shuf(inv_shift_row, x, t1); \
+       vpshufb128_amemld(&sp0044440444044404mask, x, t4); \
+       vpshufb128_amemld(&sp1110111010011110mask, x, x); \
+       vpaddb128(t1, t1, t2); \
+       vpsrl_byte_128(7, t1, t0); \
+       vpsll_byte_128(7, t1, t3); \
+       vpor128(t0, t2, t0); \
+       vpsrl_byte_128(1, t1, t1); \
+       vpshufb128_amemld(&sp0222022222000222mask, t0, t0); \
+       vpor128(t1, t3, t1); \
+       \
+       /* P-function: combine the shuffled S-box outputs and fold the \
+        * high quadword into the low one */ \
+       vpxor128(x, t4, t4); \
+       vpshufb128_amemld(&sp3033303303303033mask, t1, t1); \
+       vpxor128(t4, t0, t0); \
+       vpxor128(t1, t0, t0); \
+       vpsrldq128(8, t0, x); \
+       vpxor128(t0, x, x); \
+
+/* Rotate the 128-bit value IN left by NROL bits (NROL < 64) into OUT;
+ * T0 is clobbered.  Swaps the quadwords (vpshufd 0x4e) to pick up the
+ * bits crossing the 64-bit lane boundary; vpaddb serves as OR because
+ * the shifted parts never overlap. */
+#define vec_rol128(in, out, nrol, t0) \
+       vpshufd128_0x4e(in, out); \
+       vpsllq128((nrol), in, t0); \
+       vpsrlq128((64-(nrol)), out, out); \
+       vpaddb128(t0, out, out);
+
+/* Rotate the 128-bit value IN right by NROR bits (NROR < 64) into OUT;
+ * same technique as vec_rol128. */
+#define vec_ror128(in, out, nror, t0) \
+       vpshufd128_0x4e(in, out); \
+       vpsrlq128((nror), in, t0); \
+       vpsllq128((64-(nror)), out, out); \
+       vpaddb128(t0, out, out);
+
+/* Build a u64 constant from 8 byte values (a0 = lowest byte). */
+#define U64_BYTE(a0, a1, a2, a3, b0, b1, b2, b3) \
+       ( \
+         SWAP_LE64((((a0) & 0xffULL) << 0) | \
+                   (((a1) & 0xffULL) << 8) | \
+                   (((a2) & 0xffULL) << 16) | \
+                   (((a3) & 0xffULL) << 24) | \
+                   (((b0) & 0xffULL) << 32) | \
+                   (((b1) & 0xffULL) << 40) | \
+                   (((b2) & 0xffULL) << 48) | \
+                   (((b3) & 0xffULL) << 56)) \
+       )
+
+/* Build a u64 constant from two 32-bit words (a0 = low word). */
+#define U64_U32(a0, b0) \
+       ( \
+         SWAP_LE64((((a0) & 0xffffffffULL) << 0) | \
+                   (((b0) & 0xffffffffULL) << 32)) \
+       )
+
+/* pshufb mask reversing all 16 bytes of a vector (128-bit byte swap). */
+static const __m128i bswap128_mask =
+  M128I_BYTE(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
+/* AES inverse ShiftRows combined with byte interleave (0xff lanes zero
+ * out the odd bytes), used by the key-setup F-function. */
+static const __m128i inv_shift_row_and_unpcklbw =
+  M128I_BYTE(0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff,
+            0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff);
+
+/* Output-distribution shuffle masks for the P-function inside
+ * camellia_f (one per S-box grouping). */
+static const __m128i sp0044440444044404mask =
+  M128I_U32(0xffff0404, 0x0404ff04, 0x0d0dff0d, 0x0d0dff0d);
+
+static const __m128i sp1110111010011110mask =
+  M128I_U32(0x000000ff, 0x000000ff, 0x0bffff0b, 0x0b0b0bff);
+
+static const __m128i sp0222022222000222mask =
+  M128I_U32(0xff060606, 0xff060606, 0x0c0cffff, 0xff0c0c0c);
+
+static const __m128i sp3033303303303033mask =
+  M128I_U32(0x04ff0404, 0x04ff0404, 0xff0a0aff, 0x0aff0a0a);
+
+/* Byte positions that get the sbox4 input rotation in camellia_f. */
+static const u64 sbox4_input_mask =
+  U64_BYTE(0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00);
+
+/* Camellia key-schedule constants Sigma1..Sigma6 (RFC 3713), stored as
+ * (low32, high32) pairs. */
+static const u64 sigma1 =
+  U64_U32(0x3BCC908B, 0xA09E667F);
+
+static const u64 sigma2 =
+  U64_U32(0x4CAA73B2, 0xB67AE858);
+
+static const u64 sigma3 =
+  U64_U32(0xE94F82BE, 0xC6EF372F);
+
+static const u64 sigma4 =
+  U64_U32(0xF1D36F1C, 0x54FF53A5);
+
+static const u64 sigma5 =
+  U64_U32(0xDE682D1D, 0x10E527FA);
+
+static const u64 sigma6 =
+  U64_U32(0xB3E6C1FD, 0xB05688C2);
+
+/* Address of 64-bit subkey slot N in the key table. */
+#define cmll_sub(n, ctx) &ctx->key_table[n]
+
+/* Expand a 128-bit key into the subkey table at KEY_TABLE.  Follows the
+ * Camellia key schedule: derive KA from KL via four F-function rounds,
+ * produce the scheduled rotations of KL/KA, then absorb the whitening
+ * keys kw2/kw4 into neighbouring subkeys (the usual Camellia subkey
+ * pre-combination).  x0 holds the raw key, byte-swapped below. */
+static ASM_FUNC_ATTR_INLINE void
+camellia_setup128(void *key_table, __m128i x0)
+{
+  struct setup128_ctx_s
+  {
+    u64 *key_table;
+  } sctx = { .key_table = (u64 *)key_table };
+  struct setup128_ctx_s *ctx = &sctx;
+
+  /* input:
+   *   ctx: subkey storage at key_table(CTX)
+   *   x0: key
+   */
+
+  __m128i x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  __m128i tmp0;
+
+/* Register aliases for readability. */
+#define KL128 x0
+#define KA128 x2
+
+  vpshufb128_amemld(&bswap128_mask, KL128, KL128);
+
+  /* Constants used by camellia_f below: inverse shift-row, sbox4 input
+   * mask, 0x0f nibble mask and the s1 prefilter pair. */
+  vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11);
+  vmovq128(sbox4_input_mask, x12);
+  vmovdqa128_memld(&mask_0f, x13);
+  vmovdqa128_memld(&pre_tf_lo_s1, x14);
+  vmovdqa128_memld(&pre_tf_hi_s1, x15);
+
+  /*
+   * Generate KA
+   */
+  vpsrldq128(8, KL128, x2);
+  vmovdqa128(KL128, x3);
+  vpslldq128(8, x3, x3);
+  vpsrldq128(8, x3, x3);
+
+  camellia_f(x2, x4, x1,
+            x5, x6, x7, x8,
+            x11, x12, x13, x14, x15, sigma1);
+  vpxor128(x4, x3, x3);
+  camellia_f(x3, x2, x1,
+            x5, x6, x7, x8,
+            x11, x12, x13, x14, x15, sigma2);
+  camellia_f(x2, x3, x1,
+            x5, x6, x7, x8,
+            x11, x12, x13, x14, x15, sigma3);
+  vpxor128(x4, x3, x3);
+  camellia_f(x3, x4, x1,
+            x5, x6, x7, x8,
+            x11, x12, x13, x14, x15, sigma4);
+
+  vpslldq128(8, x3, x3);
+  vpxor128(x4, x2, x2);
+  vpsrldq128(8, x3, x3);
+  vpslldq128(8, x2, KA128);
+  vpor128(x3, KA128, KA128);
+
+  /*
+   * Generate subkeys
+   */
+  vmovdqu128_memst(KA128, cmll_sub(24, ctx));
+  vec_rol128(KL128, x3, 15, x15);
+  vec_rol128(KA128, x4, 15, x15);
+  vec_rol128(KA128, x5, 30, x15);
+  vec_rol128(KL128, x6, 45, x15);
+  vec_rol128(KA128, x7, 45, x15);
+  vec_rol128(KL128, x8, 60, x15);
+  vec_rol128(KA128, x9, 60, x15);
+  vec_ror128(KL128, x10, 128-77, x15);
+
+  /* absorb kw2 to other subkeys */
+  vpslldq128(8, KL128, x15);
+  vpsrldq128(8, x15, x15);
+  vpxor128(x15, KA128, KA128);
+  vpxor128(x15, x3, x3);
+  vpxor128(x15, x4, x4);
+
+  /* subl(1) ^= subr(1) & ~subr(9); */
+  vpandn128(x15, x5, x13);
+  vpslldq128(12, x13, x13);
+  vpsrldq128(8, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x5, x14);
+  vpslld128(1, x14, x11);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x11, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpxor128(x15, x6, x6);
+  vpxor128(x15, x8, x8);
+  vpxor128(x15, x9, x9);
+
+  /* subl(1) ^= subr(1) & ~subr(17); */
+  vpandn128(x15, x10, x13);
+  vpslldq128(12, x13, x13);
+  vpsrldq128(8, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x10, x14);
+  vpslld128(1, x14, x11);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x11, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  /* Reverse the 32-bit word order before storing (subkeys are kept in
+   * big-endian word layout). */
+  vpshufd128_0x1b(KL128, KL128);
+  vpshufd128_0x1b(KA128, KA128);
+  vpshufd128_0x1b(x3, x3);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x5, x5);
+  vpshufd128_0x1b(x6, x6);
+  vpshufd128_0x1b(x7, x7);
+  vpshufd128_0x1b(x8, x8);
+  vpshufd128_0x1b(x9, x9);
+  vpshufd128_0x1b(x10, x10);
+
+  vmovdqu128_memst(KL128, cmll_sub(0, ctx));
+  vpshufd128_0x1b(KL128, KL128);
+  vmovdqu128_memst(KA128, cmll_sub(2, ctx));
+  vmovdqu128_memst(x3, cmll_sub(4, ctx));
+  vmovdqu128_memst(x4, cmll_sub(6, ctx));
+  vmovdqu128_memst(x5, cmll_sub(8, ctx));
+  vmovdqu128_memst(x6, cmll_sub(10, ctx));
+  vpsrldq128(8, x8, x8);
+  vmovq128_memst(x7, cmll_sub(12, ctx));
+  vmovq128_memst(x8, cmll_sub(13, ctx));
+  vmovdqu128_memst(x9, cmll_sub(14, ctx));
+  vmovdqu128_memst(x10, cmll_sub(16, ctx));
+
+  vmovdqu128_memld(cmll_sub(24, ctx), KA128);
+
+  vec_ror128(KL128, x3, 128 - 94, x7);
+  vec_ror128(KA128, x4, 128 - 94, x7);
+  vec_ror128(KL128, x5, 128 - 111, x7);
+  vec_ror128(KA128, x6, 128 - 111, x7);
+
+  vpxor128(x15, x3, x3);
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x5, x5);
+  vpslldq128(8, x15, x15);
+  vpxor128(x15, x6, x6);
+
+  /* absorb kw4 to other subkeys */
+  vpslldq128(8, x6, x15);
+  vpxor128(x15, x5, x5);
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x3, x3);
+
+  /* subl(25) ^= subr(25) & ~subr(16); */
+  vmovdqu128_memld(cmll_sub(16, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x10);
+  vpandn128(x15, x10, x13);
+  vpslldq128(4, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x10, x14);
+  vpslld128(1, x14, x11);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x11, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpshufd128_0x1b(x3, x3);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x5, x5);
+  vpshufd128_0x1b(x6, x6);
+
+  vmovdqu128_memst(x3, cmll_sub(18, ctx));
+  vmovdqu128_memst(x4, cmll_sub(20, ctx));
+  vmovdqu128_memst(x5, cmll_sub(22, ctx));
+  vmovdqu128_memst(x6, cmll_sub(24, ctx));
+
+  /* Reload previously stored subkeys (word-reversed) for the kw4
+   * absorption pass over the lower half of the table. */
+  vmovdqu128_memld(cmll_sub(14, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x3);
+  vmovdqu128_memld(cmll_sub(12, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x4);
+  vmovdqu128_memld(cmll_sub(10, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x5);
+  vmovdqu128_memld(cmll_sub(8, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x6);
+
+  vpxor128(x15, x3, x3);
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x5, x5);
+
+  /* subl(25) ^= subr(25) & ~subr(8); */
+  vpandn128(x15, x6, x13);
+  vpslldq128(4, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(25) & subl(8), subr(25) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x6, x14);
+  vpslld128(1, x14, x11);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x11, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpshufd128_0x1b(x3, x3);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x5, x5);
+
+  vmovdqu128_memst(x3, cmll_sub(14, ctx));
+  vmovdqu128_memst(x4, cmll_sub(12, ctx));
+  vmovdqu128_memst(x5, cmll_sub(10, ctx));
+
+  vmovdqu128_memld(cmll_sub(6, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x6);
+  vmovdqu128_memld(cmll_sub(4, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x4);
+  vmovdqu128_memld(cmll_sub(2, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x2);
+  vmovdqu128_memld(cmll_sub(0, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x0);
+
+  vpxor128(x15, x6, x6);
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x2, x2);
+  vpxor128(x15, x0, x0);
+
+  vpshufd128_0x1b(x6, x6);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x2, x2);
+  vpshufd128_0x1b(x0, x0);
+
+  vpsrldq128(8, x2, x3);
+  vpsrldq128(8, x4, x5);
+  vpsrldq128(8, x6, x7);
+
+  /*
+   * key XOR is end of F-function.
+   */
+  vpxor128(x2, x0, x0);
+  vpxor128(x4, x2, x2);
+
+  vmovq128_memst(x0, cmll_sub(0, ctx));
+  vmovq128_memst(x3, cmll_sub(2, ctx));
+  vpxor128(x5, x3, x3);
+  vpxor128(x6, x4, x4);
+  vpxor128(x7, x5, x5);
+  vmovq128_memst(x2, cmll_sub(3, ctx));
+  vmovq128_memst(x3, cmll_sub(4, ctx));
+  vmovq128_memst(x4, cmll_sub(5, ctx));
+  vmovq128_memst(x5, cmll_sub(6, ctx));
+
+  vmovq128(*cmll_sub(7, ctx), x7);
+  vmovq128(*cmll_sub(8, ctx), x8);
+  vmovq128(*cmll_sub(9, ctx), x9);
+  vmovq128(*cmll_sub(10, ctx), x10);
+  /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
+  vpandn128(x10, x8, x15);
+  vpsrldq128(4, x15, x15);
+  vpxor128(x15, x10, x0);
+  /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
+  vpand128(x8, x0, x15);
+  vpslld128(1, x15, x14);
+  vpsrld128(31, x15, x15);
+  vpaddb128(x14, x15, x15);
+  vpslldq128(12, x15, x15);
+  vpsrldq128(8, x15, x15);
+  vpxor128(x15, x0, x0);
+
+  vpxor128(x0, x6, x6);
+  vmovq128_memst(x6, cmll_sub(7, ctx));
+
+  vmovq128(*cmll_sub(11, ctx), x11);
+  vmovq128(*cmll_sub(12, ctx), x12);
+  vmovq128(*cmll_sub(13, ctx), x13);
+  vmovq128(*cmll_sub(14, ctx), x14);
+  vmovq128(*cmll_sub(15, ctx), x15);
+  /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
+  vpandn128(x7, x9, x1);
+  vpsrldq128(4, x1, x1);
+  vpxor128(x1, x7, x0);
+  /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
+  vpand128(x9, x0, x1);
+  vpslld128(1, x1, x2);
+  vpsrld128(31, x1, x1);
+  vpaddb128(x2, x1, x1);
+  vpslldq128(12, x1, x1);
+  vpsrldq128(8, x1, x1);
+  vpxor128(x1, x0, x0);
+
+  vpxor128(x11, x0, x0);
+  vpxor128(x12, x10, x10);
+  vpxor128(x13, x11, x11);
+  vpxor128(x14, x12, x12);
+  vpxor128(x15, x13, x13);
+  vmovq128_memst(x0, cmll_sub(10, ctx));
+  vmovq128_memst(x10, cmll_sub(11, ctx));
+  vmovq128_memst(x11, cmll_sub(12, ctx));
+  vmovq128_memst(x12, cmll_sub(13, ctx));
+  vmovq128_memst(x13, cmll_sub(14, ctx));
+
+  vmovq128(*cmll_sub(16, ctx), x6);
+  vmovq128(*cmll_sub(17, ctx), x7);
+  vmovq128(*cmll_sub(18, ctx), x8);
+  vmovq128(*cmll_sub(19, ctx), x9);
+  vmovq128(*cmll_sub(20, ctx), x10);
+  /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
+  vpandn128(x8, x6, x1);
+  vpsrldq128(4, x1, x1);
+  vpxor128(x1, x8, x0);
+  /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
+  vpand128(x6, x0, x1);
+  vpslld128(1, x1, x2);
+  vpsrld128(31, x1, x1);
+  vpaddb128(x2, x1, x1);
+  vpslldq128(12, x1, x1);
+  vpsrldq128(8, x1, x1);
+  vpxor128(x1, x0, x0);
+
+  vpxor128(x14, x0, x0);
+  vmovq128_memst(x0, cmll_sub(15, ctx));
+
+  /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
+  vpandn128(x15, x7, x1);
+  vpsrldq128(4, x1, x1);
+  vpxor128(x1, x15, x0);
+  /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
+  vpand128(x7, x0, x1);
+  vpslld128(1, x1, x2);
+  vpsrld128(31, x1, x1);
+  vpaddb128(x2, x1, x1);
+  vpslldq128(12, x1, x1);
+  vpsrldq128(8, x1, x1);
+  vpxor128(x1, x0, x0);
+
+  vmovq128(*cmll_sub(21, ctx), x1);
+  vmovq128(*cmll_sub(22, ctx), x2);
+  vmovq128(*cmll_sub(23, ctx), x3);
+  vmovq128(*cmll_sub(24, ctx), x4);
+
+  vpxor128(x9, x0, x0);
+  vpxor128(x10, x8, x8);
+  vpxor128(x1, x9, x9);
+  vpxor128(x2, x10, x10);
+  vpxor128(x3, x1, x1);
+  vpxor128(x4, x3, x3);
+
+  vmovq128_memst(x0, cmll_sub(18, ctx));
+  vmovq128_memst(x8, cmll_sub(19, ctx));
+  vmovq128_memst(x9, cmll_sub(20, ctx));
+  vmovq128_memst(x10, cmll_sub(21, ctx));
+  vmovq128_memst(x1, cmll_sub(22, ctx));
+  vmovq128_memst(x2, cmll_sub(23, ctx));
+  vmovq128_memst(x3, cmll_sub(24, ctx));
+
+#undef KL128
+#undef KA128
+
+  /* kw2 and kw4 are unused now. */
+  load_zero(tmp0);
+  vmovq128_memst(tmp0, cmll_sub(1, ctx));
+  vmovq128_memst(tmp0, cmll_sub(25, ctx));
+}
+
+static ASM_FUNC_ATTR_INLINE void
+camellia_setup256(void *key_table, __m128i x0, __m128i x1)
+{
+  struct setup256_ctx_s
+  {
+    u64 *key_table;
+  } sctx = { .key_table = (u64 *)key_table };
+  struct setup256_ctx_s *ctx = &sctx;
+
+  /* input:
+   *   ctx: subkey storage at key_table(CTX)
+   *   x0, x1: key
+   */
+
+  __m128i x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  __m128i tmp0;
+
+#define KL128 x0
+#define KR128 x1
+#define KA128 x2
+#define KB128 x3
+
+  vpshufb128_amemld(&bswap128_mask, KL128, KL128);
+  vpshufb128_amemld(&bswap128_mask, KR128, KR128);
+
+  vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11);
+  vmovq128(*&sbox4_input_mask, x12);
+  vmovdqa128_memld(&mask_0f, x13);
+  vmovdqa128_memld(&pre_tf_lo_s1, x14);
+  vmovdqa128_memld(&pre_tf_hi_s1, x15);
+
+  /*
+   * Generate KA
+   */
+  vpxor128(KL128, KR128, x3);
+  vpsrldq128(8, KR128, x6);
+  vpsrldq128(8, x3, x2);
+  vpslldq128(8, x3, x3);
+  vpsrldq128(8, x3, x3);
+
+  camellia_f(x2, x4, x5,
+            x7, x8, x9, x10,
+            x11, x12, x13, x14, x15, sigma1);
+  vpxor128(x4, x3, x3);
+  camellia_f(x3, x2, x5,
+            x7, x8, x9, x10,
+            x11, x12, x13, x14, x15, sigma2);
+  vpxor128(x6, x2, x2);
+  camellia_f(x2, x3, x5,
+            x7, x8, x9, x10,
+            x11, x12, x13, x14, x15, sigma3);
+  vpxor128(x4, x3, x3);
+  vpxor128(KR128, x3, x3);
+  camellia_f(x3, x4, x5,
+            x7, x8, x9, x10,
+            x11, x12, x13, x14, x15, sigma4);
+
+  vpslldq128(8, x3, x3);
+  vpxor128(x4, x2, x2);
+  vpsrldq128(8, x3, x3);
+  vpslldq128(8, x2, KA128);
+  vpor128(x3, KA128, KA128);
+
+  /*
+   * Generate KB
+   */
+  vpxor128(KA128, KR128, x3);
+  vpsrldq128(8, x3, x4);
+  vpslldq128(8, x3, x3);
+  vpsrldq128(8, x3, x3);
+
+  camellia_f(x4, x5, x6,
+            x7, x8, x9, x10,
+            x11, x12, x13, x14, x15, sigma5);
+  vpxor128(x5, x3, x3);
+
+  camellia_f(x3, x5, x6,
+            x7, x8, x9, x10,
+            x11, x12, x13, x14, x15, sigma6);
+  vpslldq128(8, x3, x3);
+  vpxor128(x5, x4, x4);
+  vpsrldq128(8, x3, x3);
+  vpslldq128(8, x4, x4);
+  vpor128(x3, x4, KB128);
+
+  /*
+   * Generate subkeys
+   */
+  vmovdqu128_memst(KB128, cmll_sub(32, ctx));
+  vec_rol128(KR128, x4, 15, x15);
+  vec_rol128(KA128, x5, 15, x15);
+  vec_rol128(KR128, x6, 30, x15);
+  vec_rol128(KB128, x7, 30, x15);
+  vec_rol128(KL128, x8, 45, x15);
+  vec_rol128(KA128, x9, 45, x15);
+  vec_rol128(KL128, x10, 60, x15);
+  vec_rol128(KR128, x11, 60, x15);
+  vec_rol128(KB128, x12, 60, x15);
+
+  /* absorb kw2 to other subkeys */
+  vpslldq128(8, KL128, x15);
+  vpsrldq128(8, x15, x15);
+  vpxor128(x15, KB128, KB128);
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x5, x5);
+
+  /* subl(1) ^= subr(1) & ~subr(9); */
+  vpandn128(x15, x6, x13);
+  vpslldq128(12, x13, x13);
+  vpsrldq128(8, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x6, x14);
+  vpslld128(1, x14, x13);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x13, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpxor128(x15, x7, x7);
+  vpxor128(x15, x8, x8);
+  vpxor128(x15, x9, x9);
+
+  vpshufd128_0x1b(KL128, KL128);
+  vpshufd128_0x1b(KB128, KB128);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x5, x5);
+  vpshufd128_0x1b(x6, x6);
+  vpshufd128_0x1b(x7, x7);
+  vpshufd128_0x1b(x8, x8);
+  vpshufd128_0x1b(x9, x9);
+
+  vmovdqu128_memst(KL128, cmll_sub(0, ctx));
+  vpshufd128_0x1b(KL128, KL128);
+  vmovdqu128_memst(KB128, cmll_sub(2, ctx));
+  vmovdqu128_memst(x4, cmll_sub(4, ctx));
+  vmovdqu128_memst(x5, cmll_sub(6, ctx));
+  vmovdqu128_memst(x6, cmll_sub(8, ctx));
+  vmovdqu128_memst(x7, cmll_sub(10, ctx));
+  vmovdqu128_memst(x8, cmll_sub(12, ctx));
+  vmovdqu128_memst(x9, cmll_sub(14, ctx));
+
+  vmovdqu128_memld(cmll_sub(32, ctx), KB128);
+
+  /* subl(1) ^= subr(1) & ~subr(17); */
+  vpandn128(x15, x10, x13);
+  vpslldq128(12, x13, x13);
+  vpsrldq128(8, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x10, x14);
+  vpslld128(1, x14, x13);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x13, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpxor128(x15, x11, x11);
+  vpxor128(x15, x12, x12);
+
+  vec_ror128(KL128, x4, 128-77, x14);
+  vec_ror128(KA128, x5, 128-77, x14);
+  vec_ror128(KR128, x6, 128-94, x14);
+  vec_ror128(KA128, x7, 128-94, x14);
+  vec_ror128(KL128, x8, 128-111, x14);
+  vec_ror128(KB128, x9, 128-111, x14);
+
+  vpxor128(x15, x4, x4);
+
+  vpshufd128_0x1b(x10, x10);
+  vpshufd128_0x1b(x11, x11);
+  vpshufd128_0x1b(x12, x12);
+  vpshufd128_0x1b(x4, x4);
+
+  vmovdqu128_memst(x10, cmll_sub(16, ctx));
+  vmovdqu128_memst(x11, cmll_sub(18, ctx));
+  vmovdqu128_memst(x12, cmll_sub(20, ctx));
+  vmovdqu128_memst(x4, cmll_sub(22, ctx));
+
+  /* subl(1) ^= subr(1) & ~subr(25); */
+  vpandn128(x15, x5, x13);
+  vpslldq128(12, x13, x13);
+  vpsrldq128(8, x13, x13);
+  vpxor128(x13, x15, x15);
+  /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x5, x14);
+  vpslld128(1, x14, x13);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x13, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpxor128(x15, x6, x6);
+  vpxor128(x15, x7, x7);
+  vpxor128(x15, x8, x8);
+  vpslldq128(8, x15, x15);
+  vpxor128(x15, x9, x9);
+
+  /* absorb kw4 to other subkeys */
+  vpslldq128(8, x9, x15);
+  vpxor128(x15, x8, x8);
+  vpxor128(x15, x7, x7);
+  vpxor128(x15, x6, x6);
+
+  /* subl(33) ^= subr(33) & ~subr(24); */
+  vpandn128(x15, x5, x14);
+  vpslldq128(4, x14, x14);
+  vpxor128(x14, x15, x15);
+  /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x5, x14);
+  vpslld128(1, x14, x13);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x13, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpshufd128_0x1b(x5, x5);
+  vpshufd128_0x1b(x6, x6);
+  vpshufd128_0x1b(x7, x7);
+  vpshufd128_0x1b(x8, x8);
+  vpshufd128_0x1b(x9, x9);
+
+  vmovdqu128_memst(x5, cmll_sub(24, ctx));
+  vmovdqu128_memst(x6, cmll_sub(26, ctx));
+  vmovdqu128_memst(x7, cmll_sub(28, ctx));
+  vmovdqu128_memst(x8, cmll_sub(30, ctx));
+  vmovdqu128_memst(x9, cmll_sub(32, ctx));
+
+  vmovdqu128_memld(cmll_sub(22, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x0);
+  vmovdqu128_memld(cmll_sub(20, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x1);
+  vmovdqu128_memld(cmll_sub(18, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x2);
+  vmovdqu128_memld(cmll_sub(16, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x3);
+  vmovdqu128_memld(cmll_sub(14, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x4);
+  vmovdqu128_memld(cmll_sub(12, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x5);
+  vmovdqu128_memld(cmll_sub(10, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x6);
+  vmovdqu128_memld(cmll_sub(8, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x7);
+
+  vpxor128(x15, x0, x0);
+  vpxor128(x15, x1, x1);
+  vpxor128(x15, x2, x2);
+
+  /* subl(33) ^= subr(33) & ~subr(24); */
+  vpandn128(x15, x3, x14);
+  vpslldq128(4, x14, x14);
+  vpxor128(x14, x15, x15);
+  /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x3, x14);
+  vpslld128(1, x14, x13);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x13, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x5, x5);
+  vpxor128(x15, x6, x6);
+
+  vpshufd128_0x1b(x0, x0);
+  vpshufd128_0x1b(x1, x1);
+  vpshufd128_0x1b(x2, x2);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x5, x5);
+  vpshufd128_0x1b(x6, x6);
+
+  vmovdqu128_memst(x0, cmll_sub(22, ctx));
+  vmovdqu128_memst(x1, cmll_sub(20, ctx));
+  vmovdqu128_memst(x2, cmll_sub(18, ctx));
+  vmovdqu128_memst(x4, cmll_sub(14, ctx));
+  vmovdqu128_memst(x5, cmll_sub(12, ctx));
+  vmovdqu128_memst(x6, cmll_sub(10, ctx));
+
+  vmovdqu128_memld(cmll_sub(6, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x6);
+  vmovdqu128_memld(cmll_sub(4, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x4);
+  vmovdqu128_memld(cmll_sub(2, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x2);
+  vmovdqu128_memld(cmll_sub(0, ctx), tmp0);
+  vpshufd128_0x1b(tmp0, x0);
+
+  /* subl(33) ^= subr(33) & ~subr(24); */
+  vpandn128(x15, x7, x14);
+  vpslldq128(4, x14, x14);
+  vpxor128(x14, x15, x15);
+  /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+  vpand128(x15, x7, x14);
+  vpslld128(1, x14, x13);
+  vpsrld128(31, x14, x14);
+  vpaddb128(x13, x14, x14);
+  vpsrldq128(12, x14, x14);
+  vpslldq128(8, x14, x14);
+  vpxor128(x14, x15, x15);
+
+  vpxor128(x15, x6, x6);
+  vpxor128(x15, x4, x4);
+  vpxor128(x15, x2, x2);
+  vpxor128(x15, x0, x0);
+
+  vpshufd128_0x1b(x6, x6);
+  vpshufd128_0x1b(x4, x4);
+  vpshufd128_0x1b(x2, x2);
+  vpshufd128_0x1b(x0, x0);
+
+  vpsrldq128(8, x2, x3);
+  vpsrldq128(8, x4, x5);
+  vpsrldq128(8, x6, x7);
+
+  /*
+    * key XOR is end of F-function.
+    */
+  vpxor128(x2, x0, x0);
+  vpxor128(x4, x2, x2);
+
+  vmovq128_memst(x0, cmll_sub(0, ctx));
+  vmovq128_memst(x3, cmll_sub(2, ctx));
+  vpxor128(x5, x3, x3);
+  vpxor128(x6, x4, x4);
+  vpxor128(x7, x5, x5);
+  vmovq128_memst(x2, cmll_sub(3, ctx));
+  vmovq128_memst(x3, cmll_sub(4, ctx));
+  vmovq128_memst(x4, cmll_sub(5, ctx));
+  vmovq128_memst(x5, cmll_sub(6, ctx));
+
+  vmovq128(*cmll_sub(7, ctx), x7);
+  vmovq128(*cmll_sub(8, ctx), x8);
+  vmovq128(*cmll_sub(9, ctx), x9);
+  vmovq128(*cmll_sub(10, ctx), x10);
+  /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
+  vpandn128(x10, x8, x15);
+  vpsrldq128(4, x15, x15);
+  vpxor128(x15, x10, x0);
+  /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
+  vpand128(x8, x0, x15);
+  vpslld128(1, x15, x14);
+  vpsrld128(31, x15, x15);
+  vpaddb128(x14, x15, x15);
+  vpslldq128(12, x15, x15);
+  vpsrldq128(8, x15, x15);
+  vpxor128(x15, x0, x0);
+
+  vpxor128(x0, x6, x6);
+  vmovq128_memst(x6, cmll_sub(7, ctx));
+
+  vmovq128(*cmll_sub(11, ctx), x11);
+  vmovq128(*cmll_sub(12, ctx), x12);
+  vmovq128(*cmll_sub(13, ctx), x13);
+  vmovq128(*cmll_sub(14, ctx), x14);
+  vmovq128(*cmll_sub(15, ctx), x15);
+  /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
+  vpandn128(x7, x9, x1);
+  vpsrldq128(4, x1, x1);
+  vpxor128(x1, x7, x0);
+  /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
+  vpand128(x9, x0, x1);
+  vpslld128(1, x1, x2);
+  vpsrld128(31, x1, x1);
+  vpaddb128(x2, x1, x1);
+  vpslldq128(12, x1, x1);
+  vpsrldq128(8, x1, x1);
+  vpxor128(x1, x0, x0);
+
+  vpxor128(x11, x0, x0);
+  vpxor128(x12, x10, x10);
+  vpxor128(x13, x11, x11);
+  vpxor128(x14, x12, x12);
+  vpxor128(x15, x13, x13);
+  vmovq128_memst(x0, cmll_sub(10, ctx));
+  vmovq128_memst(x10, cmll_sub(11, ctx));
+  vmovq128_memst(x11, cmll_sub(12, ctx));
+  vmovq128_memst(x12, cmll_sub(13, ctx));
+  vmovq128_memst(x13, cmll_sub(14, ctx));
+
+  vmovq128(*cmll_sub(16, ctx), x6);
+  vmovq128(*cmll_sub(17, ctx), x7);
+  vmovq128(*cmll_sub(18, ctx), x8);
+  vmovq128(*cmll_sub(19, ctx), x9);
+  vmovq128(*cmll_sub(20, ctx), x10);
+  /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
+  vpandn128(x8, x6, x1);
+  vpsrldq128(4, x1, x1);
+  vpxor128(x1, x8, x0);
+  /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
+  vpand128(x6, x0, x1);
+  vpslld128(1, x1, x2);
+  vpsrld128(31, x1, x1);
+  vpaddb128(x2, x1, x1);
+  vpslldq128(12, x1, x1);
+  vpsrldq128(8, x1, x1);
+  vpxor128(x1, x0, x0);
+
+  vpxor128(x14, x0, x0);
+  vmovq128_memst(x0, cmll_sub(15, ctx));
+
+  /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
+  vpandn128(x15, x7, x1);
+  vpsrldq128(4, x1, x1);
+  vpxor128(x1, x15, x0);
+  /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
+  vpand128(x7, x0, x1);
+  vpslld128(1, x1, x2);
+  vpsrld128(31, x1, x1);
+  vpaddb128(x2, x1, x1);
+  vpslldq128(12, x1, x1);
+  vpsrldq128(8, x1, x1);
+  vpxor128(x1, x0, x0);
+
+  vmovq128(*cmll_sub(21, ctx), x1);
+  vmovq128(*cmll_sub(22, ctx), x2);
+  vmovq128(*cmll_sub(23, ctx), x3);
+  vmovq128(*cmll_sub(24, ctx), x4);
+
+  vpxor128(x9, x0, x0);
+  vpxor128(x10, x8, x8);
+  vpxor128(x1, x9, x9);
+  vpxor128(x2, x10, x10);
+  vpxor128(x3, x1, x1);
+
+  vmovq128_memst(x0, cmll_sub(18, ctx));
+  vmovq128_memst(x8, cmll_sub(19, ctx));
+  vmovq128_memst(x9, cmll_sub(20, ctx));
+  vmovq128_memst(x10, cmll_sub(21, ctx));
+  vmovq128_memst(x1, cmll_sub(22, ctx));
+
+  vmovq128(*cmll_sub(25, ctx), x5);
+  vmovq128(*cmll_sub(26, ctx), x6);
+  vmovq128(*cmll_sub(27, ctx), x7);
+  vmovq128(*cmll_sub(28, ctx), x8);
+  vmovq128(*cmll_sub(29, ctx), x9);
+  vmovq128(*cmll_sub(30, ctx), x10);
+  vmovq128(*cmll_sub(31, ctx), x11);
+  vmovq128(*cmll_sub(32, ctx), x12);
+
+  /* tl = subl(26) ^ (subr(26) & ~subr(24)); */
+  vpandn128(x6, x4, x15);
+  vpsrldq128(4, x15, x15);
+  vpxor128(x15, x6, x0);
+  /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */
+  vpand128(x4, x0, x15);
+  vpslld128(1, x15, x14);
+  vpsrld128(31, x15, x15);
+  vpaddb128(x14, x15, x15);
+  vpslldq128(12, x15, x15);
+  vpsrldq128(8, x15, x15);
+  vpxor128(x15, x0, x0);
+
+  vpxor128(x0, x2, x2);
+  vmovq128_memst(x2, cmll_sub(23, ctx));
+
+  /* tl = subl(23) ^ (subr(23) &  ~subr(25)); */
+  vpandn128(x3, x5, x15);
+  vpsrldq128(4, x15, x15);
+  vpxor128(x15, x3, x0);
+  /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */
+  vpand128(x5, x0, x15);
+  vpslld128(1, x15, x14);
+  vpsrld128(31, x15, x15);
+  vpaddb128(x14, x15, x15);
+  vpslldq128(12, x15, x15);
+  vpsrldq128(8, x15, x15);
+  vpxor128(x15, x0, x0);
+
+  vpxor128(x7, x0, x0);
+  vpxor128(x8, x6, x6);
+  vpxor128(x9, x7, x7);
+  vpxor128(x10, x8, x8);
+  vpxor128(x11, x9, x9);
+  vpxor128(x12, x11, x11);
+
+  vmovq128_memst(x0, cmll_sub(26, ctx));
+  vmovq128_memst(x6, cmll_sub(27, ctx));
+  vmovq128_memst(x7, cmll_sub(28, ctx));
+  vmovq128_memst(x8, cmll_sub(29, ctx));
+  vmovq128_memst(x9, cmll_sub(30, ctx));
+  vmovq128_memst(x10, cmll_sub(31, ctx));
+  vmovq128_memst(x11, cmll_sub(32, ctx));
+
+#undef KL128
+#undef KR128
+#undef KA128
+#undef KB128
+
+  /* kw2 and kw4 are unused now. */
+  load_zero(tmp0);
+  vmovq128_memst(tmp0, cmll_sub(1, ctx));
+  vmovq128_memst(tmp0, cmll_sub(33, ctx));
+}
+
+void ASM_FUNC_ATTR_NOINLINE
+FUNC_KEY_SETUP(void *key_table, const void *vkey, unsigned int keylen)
+{
+  const char *key = vkey;
+
+  /* input:
+   *   key_table: subkey storage at key_table(CTX)
+   *   key_length_bits: output key length as number of bits
+   *   key: input key buffer
+   *   keylen: key length in bytes
+   */
+
+  __m128i x0, x1, x2;
+
+  switch (keylen)
+    {
+      default:
+       return; /* Unsupported key length! */
+
+      case 16:
+       vmovdqu128_memld(key, x0);
+       camellia_setup128(key_table, x0);
+       return;
+
+      case 24:
+       vmovdqu128_memld(key, x0);
+       vmovq128(*(uint64_unaligned_t *)(key + 16), x1);
+
+       x2[0] = -1;
+       x2[1] = -1;
+       vpxor128(x1, x2, x2);
+       vpslldq128(8, x2, x2);
+       vpor128(x2, x1, x1);
+       break;
+
+      case 32:
+       vmovdqu128_memld(key, x0);
+       vmovdqu128_memld(key + 16, x1);
+       break;
+    }
+
+  camellia_setup256(key_table, x0, x1);
+}
index d7a1e6f4a099bdfd364c5c094f8fb0a3bdc797b1..0797e6f38bc1aa5515d76e1faaf9b8b52dde78d1 100644 (file)
@@ -3,19 +3,21 @@
  * Copyright (C) 2006,2007
  * NTT (Nippon Telegraph and Telephone Corporation).
  *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
+ * This file is part of Libgcrypt.
  *
- * This library is distributed in the hope that it will be useful,
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifndef HEADER_CAMELLIA_H
index a804654c1b6f10bc2e267d48b7d46a046706dc00..b8ae8ba09ed16e7f8646036aacbb555c100cff0e 100644 (file)
        rorq $32,               RLR0; \
        movq RLR0,              (RIO);
 
-.align 8
+.align 16
 .globl _gcry_cast5_amd64_encrypt_block
 ELF(.type   _gcry_cast5_amd64_encrypt_block,@function;)
 
@@ -223,7 +223,7 @@ _gcry_cast5_amd64_encrypt_block:
        CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
 
-.align 8
+.align 16
 .globl _gcry_cast5_amd64_decrypt_block
 ELF(.type   _gcry_cast5_amd64_decrypt_block,@function;)
 
@@ -373,7 +373,7 @@ ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
        rorq $32,               c; \
        rorq $32,               d;
 
-.align 8
+.align 16
 ELF(.type   __cast5_enc_blk4,@function;)
 
 __cast5_enc_blk4:
@@ -403,7 +403,7 @@ __cast5_enc_blk4:
        CFI_ENDPROC();
 ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
 
-.align 8
+.align 16
 ELF(.type   __cast5_dec_blk4,@function;)
 
 __cast5_dec_blk4:
@@ -435,7 +435,7 @@ __cast5_dec_blk4:
        ret_spec_stop;
 ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
 
-.align 8
+.align 16
 .globl _gcry_cast5_amd64_ctr_enc
 ELF(.type   _gcry_cast5_amd64_ctr_enc,@function;)
 _gcry_cast5_amd64_ctr_enc:
@@ -512,7 +512,7 @@ _gcry_cast5_amd64_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_cast5_amd64_cbc_dec
 ELF(.type   _gcry_cast5_amd64_cbc_dec,@function;)
 _gcry_cast5_amd64_cbc_dec:
@@ -586,7 +586,7 @@ _gcry_cast5_amd64_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_cast5_amd64_cfb_dec
 ELF(.type   _gcry_cast5_amd64_cfb_dec,@function;)
 _gcry_cast5_amd64_cfb_dec:
index 76ddd2e335ddf59939eddab6cb0c371a3b4b6bc8..ae53e6b449189f8732536bd64db93684f487cd3c 100644 (file)
 #define Kr_arm_dec (Kr_arm_enc + (16))
 
 /* register macros */
-#define CTX %r0
-#define Rs1 %r7
-#define Rs2 %r8
-#define Rs3 %r9
-#define Rs4 %r10
-#define RMASK %r11
-#define RKM %r1
-#define RKR %r2
-
-#define RL0 %r3
-#define RR0 %r4
-
-#define RL1 %r9
-#define RR1 %r10
-
-#define RT0 %lr
-#define RT1 %ip
-#define RT2 %r5
-#define RT3 %r6
+#define CTX r0
+#define Rs1 r7
+#define Rs2 r8
+#define Rs3 r9
+#define Rs4 r10
+#define RMASK r11
+#define RKM r1
+#define RKR r2
+
+#define RL0 r3
+#define RR0 r4
+
+#define RL1 r9
+#define RR1 r10
+
+#define RT0 lr
+#define RT1 ip
+#define RT2 r5
+#define RT3 r6
 
 /* helper macros */
 #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
 
 _gcry_cast5_arm_encrypt_block:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst
-        *      %r2: src
+        *      r0: CTX
+        *      r1: dst
+        *      r2: src
         */
-       push {%r1, %r4-%r11, %ip, %lr};
+       push {r1, r4-r11, ip, lr};
 
        GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
        mov RMASK, #(0xff << 2);
@@ -279,7 +279,7 @@ _gcry_cast5_arm_encrypt_block:
        add Rs3, Rs1, #(0x100*4*2);
        add Rs4, Rs1, #(0x100*4*3);
 
-       read_block(%r2, 0, RL0, RR0, RT0);
+       read_block(r2, 0, RL0, RR0, RT0);
 
        load_km(0);
        load_kr(0);
@@ -300,10 +300,10 @@ _gcry_cast5_arm_encrypt_block:
        enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy);
        enc_round(15, F1, RR0, RL0, dummy, dummy, dummy);
 
-       ldr %r1, [%sp], #4;
-       write_block(%r1, 0, RR0, RL0, RT0, RT1);
+       ldr r1, [sp], #4;
+       write_block(r1, 0, RR0, RL0, RT0, RT1);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block;
 
@@ -313,11 +313,11 @@ _gcry_cast5_arm_encrypt_block:
 
 _gcry_cast5_arm_decrypt_block:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst
-        *      %r2: src
+        *      r0: CTX
+        *      r1: dst
+        *      r2: src
         */
-       push {%r1, %r4-%r11, %ip, %lr};
+       push {r1, r4-r11, ip, lr};
 
        GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
        mov RMASK, #(0xff << 2);
@@ -325,7 +325,7 @@ _gcry_cast5_arm_decrypt_block:
        add Rs3, Rs1, #(0x100 * 4 * 2);
        add Rs4, Rs1, #(0x100 * 4 * 3);
 
-       read_block(%r2, 0, RL0, RR0, RT0);
+       read_block(r2, 0, RL0, RR0, RT0);
 
        load_km(15);
        load_dec_kr(15);
@@ -346,10 +346,10 @@ _gcry_cast5_arm_decrypt_block:
        dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy);
        dec_round(0, F1, RR0, RL0, dummy, dummy, dummy);
 
-       ldr %r1, [%sp], #4;
-       write_block(%r1, 0, RR0, RL0, RT0, RT1);
+       ldr r1, [sp], #4;
+       write_block(r1, 0, RR0, RL0, RT0, RT1);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block;
 
@@ -511,7 +511,7 @@ _gcry_cast5_arm_enc_blk2:
         * output:
         *      [RR0, RL0], [RR1, RL1]: dst
         */
-       push {%lr};
+       push {lr};
 
        GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
        mov RMASK, #(0xff << 2);
@@ -541,7 +541,7 @@ _gcry_cast5_arm_enc_blk2:
        host_to_be(RR1, RT0);
        host_to_be(RL1, RT0);
 
-       pop {%pc};
+       pop {pc};
 .ltorg
 .size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2;
 
@@ -551,40 +551,40 @@ _gcry_cast5_arm_enc_blk2:
 
 _gcry_cast5_arm_cfb_dec:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst (2 blocks)
-        *      %r2: src (2 blocks)
-        *      %r3: iv (64bit)
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit)
         */
-       push {%r1, %r2, %r4-%r11, %ip, %lr};
+       push {r1, r2, r4-r11, ip, lr};
 
-       mov %lr, %r3;
+       mov lr, r3;
 
-       /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
-       ldm %r3, {RL0, RR0};
+       /* Load input (iv/r3 is aligned, src/r2 might not be) */
+       ldm r3, {RL0, RR0};
        host_to_be(RL0, RT1);
        host_to_be(RR0, RT1);
-       read_block(%r2, 0, RL1, RR1, %ip);
+       read_block(r2, 0, RL1, RR1, ip);
 
        /* Update IV, load src[1] and save to iv[0] */
-       read_block_host(%r2, 8, %r5, %r6, %r7);
-       stm %lr, {%r5, %r6};
+       read_block_host(r2, 8, r5, r6, r7);
+       stm lr, {r5, r6};
 
        bl _gcry_cast5_arm_enc_blk2;
-       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
 
-       /* %r0: dst, %r1: %src */
-       pop {%r0, %r1};
+       /* r0: dst, r1: src */
+       pop {r0, r1};
 
        /* dst = src ^ result */
-       read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
-       eor %r5, %r4;
-       eor %r6, %r3;
-       eor %r7, %r10;
-       eor %r8, %r9;
-       write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
-
-       pop {%r4-%r11, %ip, %pc};
+       read_block2_host(r1, r5, r6, r7, r8, lr);
+       eor r5, r4;
+       eor r6, r3;
+       eor r7, r10;
+       eor r8, r9;
+       write_block2_host(r0, r5, r6, r7, r8, r1, r2);
+
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec;
 
@@ -594,42 +594,42 @@ _gcry_cast5_arm_cfb_dec:
 
 _gcry_cast5_arm_ctr_enc:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst (2 blocks)
-        *      %r2: src (2 blocks)
-        *      %r3: iv (64bit, big-endian)
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit, big-endian)
         */
-       push {%r1, %r2, %r4-%r11, %ip, %lr};
+       push {r1, r2, r4-r11, ip, lr};
 
-       mov %lr, %r3;
+       mov lr, r3;
 
        /* Load IV (big => host endian) */
-       read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1);
+       read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT1);
 
        /* Construct IVs */
        adds RR1, RR0, #1; /* +1 */
        adc RL1, RL0, #0;
-       adds %r6, RR1, #1; /* +2 */
-       adc %r5, RL1, #0;
+       adds r6, RR1, #1; /* +2 */
+       adc r5, RL1, #0;
 
        /* Store new IV (host => big-endian) */
-       write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1);
+       write_block_aligned(lr, 0, r5, r6, host_to_be, RT1);
 
        bl _gcry_cast5_arm_enc_blk2;
-       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
 
-       /* %r0: dst, %r1: %src */
-       pop {%r0, %r1};
+       /* r0: dst, r1: src */
+       pop {r0, r1};
 
        /* XOR key-stream with plaintext */
-       read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
-       eor %r5, %r4;
-       eor %r6, %r3;
-       eor %r7, %r10;
-       eor %r8, %r9;
-       write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
-
-       pop {%r4-%r11, %ip, %pc};
+       read_block2_host(r1, r5, r6, r7, r8, lr);
+       eor r5, r4;
+       eor r6, r3;
+       eor r7, r10;
+       eor r8, r9;
+       write_block2_host(r0, r5, r6, r7, r8, r1, r2);
+
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc;
 
@@ -682,45 +682,45 @@ _gcry_cast5_arm_dec_blk2:
 
 _gcry_cast5_arm_cbc_dec:
        /* input:
-        *      %r0: CTX
-        *      %r1: dst (2 blocks)
-        *      %r2: src (2 blocks)
-        *      %r3: iv (64bit)
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit)
         */
-       push {%r1-%r11, %ip, %lr};
+       push {r1-r11, ip, lr};
 
-       read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+       read_block2(r2, RL0, RR0, RL1, RR1, RT0);
 
        /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
         * of function call. */
        b _gcry_cast5_arm_dec_blk2;
 .Ldec_cbc_tail:
-       /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
 
-       /* %r0: dst, %r1: %src, %r2: iv */
-       pop {%r0-%r2};
+       /* r0: dst, r1: src, r2: iv */
+       pop {r0-r2};
 
-       /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
-       read_block_host(%r1, 0, %r7, %r8, %r5);
-       /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
-       ldm %r2, {%r5, %r6};
+       /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */
+       read_block_host(r1, 0, r7, r8, r5);
+       /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */
+       ldm r2, {r5, r6};
 
        /* out[1] ^= IV+1 */
-       eor %r10, %r7;
-       eor %r9, %r8;
+       eor r10, r7;
+       eor r9, r8;
        /* out[0] ^= IV */
-       eor %r4, %r5;
-       eor %r3, %r6;
+       eor r4, r5;
+       eor r3, r6;
 
-       /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
-       read_block_host(%r1, 8, %r7, %r8, %r5);
+       /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */
+       read_block_host(r1, 8, r7, r8, r5);
        /* store IV+2 to iv[0] (aligned). */
-       stm %r2, {%r7, %r8};
+       stm r2, {r7, r8};
 
        /* store result to dst[0-3]. Might be unaligned. */
-       write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);
+       write_block2_host(r0, r4, r3, r10, r9, r5, r6);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec;
 
index 837ea0fe57abaf60e472d2ae718baa8b5e2eafd2..34cf2e58db3d29dbfd20a7e41a15d7256fe624e5 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 /* Test vectors:
@@ -45,7 +45,6 @@
 #include "bithelp.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
@@ -991,48 +990,6 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 }
 
 
-/* Run the self-tests for CAST5-CTR, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
-  const int nblocks = 4+1;
-  const int blocksize = CAST5_BLOCKSIZE;
-  const int context_size = sizeof(CAST5_context);
-
-  return _gcry_selftest_helper_ctr("CAST5", &cast_setkey,
-           &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for CAST5-CBC, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
-  const int nblocks = 4+2;
-  const int blocksize = CAST5_BLOCKSIZE;
-  const int context_size = sizeof(CAST5_context);
-
-  return _gcry_selftest_helper_cbc("CAST5", &cast_setkey,
-           &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for CAST5-CFB, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
-  const int nblocks = 4+2;
-  const int blocksize = CAST5_BLOCKSIZE;
-  const int context_size = sizeof(CAST5_context);
-
-  return _gcry_selftest_helper_cfb("CAST5", &cast_setkey,
-           &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
 static const char*
 selftest(void)
 {
@@ -1046,7 +1003,6 @@ selftest(void)
     static const byte cipher[8] =
                     { 0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 };
     byte buffer[8];
-    const char *r;
 
     cast_setkey( &c, key, 16, &bulk_ops );
     encrypt_block( &c, buffer, plain );
@@ -1082,15 +1038,6 @@ selftest(void)
     }
 #endif
 
-    if ( (r = selftest_cbc ()) )
-      return r;
-
-    if ( (r = selftest_cfb ()) )
-      return r;
-
-    if ( (r = selftest_ctr ()) )
-      return r;
-
     return NULL;
 }
 
index 2a980b95ce95e7ae2dbbd7ba7e6646a1ae01dc76..692f0f6aa1eb6ef786411bf7354d3d6bce7bb09b 100644 (file)
@@ -36,8 +36,6 @@
 
 .cpu generic+simd
 
-.text
-
 #include "asm-poly1305-aarch64.h"
 
 /* register macros */
                ROTATE4(b1, b2, b3, b4, 7, tmp1, tmp2, tmp3, tmp4,              \
                        _(iop27), _(iop28), _(iop29));
 
+SECTION_RODATA
+
 .align 4
-.globl _gcry_chacha20_aarch64_blocks4_data_inc_counter
+ELF(.type _gcry_chacha20_aarch64_blocks4_data_inc_counter,%object;)
 _gcry_chacha20_aarch64_blocks4_data_inc_counter:
        .long 0,1,2,3
 
 .align 4
-.globl _gcry_chacha20_aarch64_blocks4_data_rot8
+ELF(.type _gcry_chacha20_aarch64_blocks4_data_rot8,%object;)
 _gcry_chacha20_aarch64_blocks4_data_rot8:
        .byte 3,0,1,2
        .byte 7,4,5,6
        .byte 11,8,9,10
        .byte 15,12,13,14
 
-.align 3
+.text
+
+.align 4
 .globl _gcry_chacha20_aarch64_blocks4
 ELF(.type _gcry_chacha20_aarch64_blocks4,%function;)
 
@@ -364,7 +366,7 @@ ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;)
   4-way stitched chacha20-poly1305
  **********************************************************************/
 
-.align 3
+.align 4
 .globl _gcry_chacha20_poly1305_aarch64_blocks4
 ELF(.type _gcry_chacha20_poly1305_aarch64_blocks4,%function;)
 
index 9f2a036aacb0a1a29a3a71797e531c3912677c24..54e2ffabf90c163d64e2ec17e27bc064c3b3721d 100644 (file)
@@ -33,8 +33,6 @@
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 
-.text
-
 #include "asm-common-amd64.h"
 #include "asm-poly1305-amd64.h"
 
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
            ROTATE2(b1, b2,  7, tmp1);
 
+SECTION_RODATA
+
+ELF(.type _chacha20_avx2_data,@object;)
 .align 32
-chacha20_data:
+_chacha20_avx2_data:
 .Lshuf_rol16:
        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
 .Lshuf_rol8:
@@ -168,7 +169,9 @@ chacha20_data:
 .Lunsigned_cmp:
        .long 0x80000000
 
-.align 8
+.text
+
+.align 16
 .globl _gcry_chacha20_amd64_avx2_blocks8
 ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;)
 
@@ -333,7 +336,7 @@ ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
 
 #define _ /*_*/
 
-.align 8
+.align 16
 .globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
 ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;)
 
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
new file mode 100644 (file)
index 0000000..2d14081
--- /dev/null
@@ -0,0 +1,736 @@
+/* chacha20-amd64-avx512.S  -  AVX512 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+/* function arguments (SysV AMD64 calling convention) */
+#define INPUT %rdi
+#define DST   %rsi
+#define SRC   %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* vector registers: the 4x4 ChaCha20 state matrix, one word per register.
+ * X<n> is the full-width zmm name; X<n>y/X<n>x are the ymm/xmm views used
+ * by the 8-way and 4-way/tail paths. */
+#define X0 %zmm0
+#define X1 %zmm1
+#define X2 %zmm2
+#define X3 %zmm3
+#define X4 %zmm4
+#define X5 %zmm5
+#define X6 %zmm6
+#define X7 %zmm7
+#define X8 %zmm8
+#define X9 %zmm9
+#define X10 %zmm10
+#define X11 %zmm11
+#define X12 %zmm12
+#define X13 %zmm13
+#define X14 %zmm14
+#define X15 %zmm15
+#define X0y %ymm0
+#define X1y %ymm1
+#define X2y %ymm2
+#define X3y %ymm3
+#define X4y %ymm4
+#define X5y %ymm5
+#define X6y %ymm6
+#define X7y %ymm7
+#define X8y %ymm8
+#define X9y %ymm9
+#define X10y %ymm10
+#define X11y %ymm11
+#define X12y %ymm12
+#define X13y %ymm13
+#define X14y %ymm14
+#define X15y %ymm15
+#define X0x %xmm0
+#define X1x %xmm1
+#define X2x %xmm2
+#define X3x %xmm3
+#define X4x %xmm4
+#define X5x %xmm5
+#define X6x %xmm6
+#define X7x %xmm7
+#define X8x %xmm8
+#define X9x %xmm9
+#define X10x %xmm10
+#define X11x %xmm11
+#define X12x %xmm12
+#define X13x %xmm13
+#define X14x %xmm14
+#define X15x %xmm15
+
+/* scratch registers for the matrix transposes */
+#define TMP0 %zmm16
+#define TMP1 %zmm17
+#define TMP0y %ymm16
+#define TMP1y %ymm17
+#define TMP0x %xmm16
+#define TMP1x %xmm17
+
+/* per-lane block-counter increments 0..15, loaded from .Linc_counter */
+#define COUNTER_ADD %zmm18
+#define COUNTER_ADDy %ymm18
+#define COUNTER_ADDx %xmm18
+
+/* copies of the pre-round counter words, added back after the rounds */
+#define X12_SAVE %zmm19
+#define X12_SAVEy %ymm19
+#define X12_SAVEx %xmm19
+#define X13_SAVE %zmm20
+#define X13_SAVEy %ymm20
+#define X13_SAVEx %xmm20
+
+/* broadcasted copies of input state words; words 9..13 have no S register
+ * and are (re)broadcast from memory / saved in X12_SAVE, X13_SAVE instead */
+#define S0 %zmm21
+#define S1 %zmm22
+#define S2 %zmm23
+#define S3 %zmm24
+#define S4 %zmm25
+#define S5 %zmm26
+#define S6 %zmm27
+#define S7 %zmm28
+#define S8 %zmm29
+#define S14 %zmm30
+#define S15 %zmm31
+#define S0y %ymm21
+#define S1y %ymm22
+#define S2y %ymm23
+#define S3y %ymm24
+#define S4y %ymm25
+#define S5y %ymm26
+#define S6y %ymm27
+#define S7y %ymm28
+#define S8y %ymm29
+#define S14y %ymm30
+#define S15y %ymm31
+#define S0x %xmm21
+#define S1x %xmm22
+#define S2x %xmm23
+#define S3x %xmm24
+#define S4x %xmm25
+#define S5x %xmm26
+#define S6x %xmm27
+#define S7x %xmm28
+#define S8x %xmm29
+#define S14x %xmm30
+#define S15x %xmm31
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
+
+/* 4x4 128-bit matrix transpose */
+#define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \
+	vshufi32x4 $0xee, x1, x0, t2; \
+	vshufi32x4 $0x44, x1, x0, x0; \
+	\
+	vshufi32x4 $0x44, x3, x2, t1; \
+	vshufi32x4 $0xee, x3, x2, x2; \
+	\
+	vshufi32x4 $0xdd, t1, x0, x1; \
+	vshufi32x4 $0x88, t1, x0, x0; \
+	\
+	vshufi32x4 $0xdd, x2, t2, x3; \
+	vshufi32x4 $0x88, x2, t2, x2;
+
+/* 2x2 128-bit matrix transpose */
+#define transpose_16byte_2x2(x0,x1,t1) \
+	vmovdqa32  x0, t1; \
+	vshufi32x4 $0x0, x1, x0, x0; \
+	vshufi32x4 $0x3, x1, t1, x1;
+
+/* XOR keystream regs x0,x4,x8,x12 with src and store the result to dst;
+ * consecutive registers map to memory locations `add` bytes apart,
+ * starting at `offset`. */
+#define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \
+	vpxord (offset + 0 * (add))(src), x0, x0; \
+	vpxord (offset + 1 * (add))(src), x4, x4; \
+	vpxord (offset + 2 * (add))(src), x8, x8; \
+	vpxord (offset + 3 * (add))(src), x12, x12; \
+	vmovdqu32 x0, (offset + 0 * (add))(dst); \
+	vmovdqu32 x4, (offset + 1 * (add))(dst); \
+	vmovdqu32 x8, (offset + 2 * (add))(dst); \
+	vmovdqu32 x12, (offset + 3 * (add))(dst);
+
+/* XOR one keystream register with src and store to dst */
+#define xor_src_dst(dst, src, offset, xreg) \
+	vpxord offset(src), xreg, xreg; \
+	vmovdqu32 xreg, offset(dst);
+
+/* zeroize four vector registers */
+#define clear_vec4(v0,v1,v2,v3) \
+	vpxord v0, v0, v0; \
+	vpxord v1, v1, v1; \
+	vpxord v2, v2, v2; \
+	vpxord v3, v3, v3;
+
+/* clear key material from %zmm16..%zmm31; an EVEX write to the ymm view
+ * zero-extends through the full zmm register */
+#define clear_zmm16_zmm31() \
+	clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
+	clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
+	clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
+	clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31);
+
+/**********************************************************************
+  16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20
+ **********************************************************************/
+
+/* rotate both registers left by c bits (AVX-512 vprold) */
+#define ROTATE2(v1,v2,c)	\
+	vprold $(c), v1, v1;	\
+	vprold $(c), v2, v2;
+
+#define XOR(ds,s) \
+	vpxord s, ds, ds;
+
+#define PLUS(ds,s) \
+	vpaddd s, ds, ds;
+
+/* two interleaved ChaCha20 quarter-rounds, each on a full vector of lanes:
+ * a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
+ * a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7; */
+#define QUARTERROUND2V(a1,b1,c1,d1,a2,b2,c2,d2)			\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE2(d1, d2, 16);				\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2, 12);				\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE2(d1, d2, 8);					\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2, 7);
+
+/**********************************************************************
+  1-way/2-way (xmm) chacha20
+ **********************************************************************/
+
+#define ROTATE(v1,c)			\
+	vprold $(c), v1, v1;		\
+
+/* permute the four 32-bit words of a row; the 0x39/0x4e/0x93 shuffles
+ * below realign the rows between column and diagonal rounds */
+#define WORD_SHUF(v1,shuf)		\
+	vpshufd $shuf, v1, v1;
+
+/* one quarter-round over a whole 4x32 state row held in x0..x3,
+ * ending with the word shuffles that set up the next round */
+#define QUARTERROUND1H(x0,x1,x2,x3,shuf_x1,shuf_x2,shuf_x3) \
+	PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 16); \
+	PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12); \
+	PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 8); \
+	PLUS(x2, x3); \
+	  WORD_SHUF(x3, shuf_x3); \
+		      XOR(x1, x2); \
+	  WORD_SHUF(x2, shuf_x2); \
+				   ROTATE(x1, 7); \
+	  WORD_SHUF(x1, shuf_x1);
+
+/* as QUARTERROUND1H but processing two independent blocks (x*, y*) */
+#define QUARTERROUND2H(x0,x1,x2,x3,y0,y1,y2,y3,shuf_x1,shuf_x2,shuf_x3) \
+	PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \
+	  ROTATE(x3, 16); ROTATE(y3, 16); \
+	PLUS(x2, x3); PLUS(y2, y3); XOR(x1, x2); XOR(y1, y2); \
+	  ROTATE(x1, 12); ROTATE(y1, 12); \
+	PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \
+	  ROTATE(x3, 8); ROTATE(y3, 8); \
+	PLUS(x2, x3); PLUS(y2, y3); \
+	  WORD_SHUF(x3, shuf_x3); WORD_SHUF(y3, shuf_x3); \
+		      XOR(x1, x2); XOR(y1, y2); \
+	  WORD_SHUF(x2, shuf_x2); WORD_SHUF(y2, shuf_x2); \
+				   ROTATE(x1, 7); ROTATE(y1, 7); \
+	  WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1);
+
+SECTION_RODATA
+
+.align 64
+ELF(.type _gcry_chacha20_amd64_avx512_data,@object;)
+_gcry_chacha20_amd64_avx512_data:
+/* small counter-increment vectors for the 1-way/2-way XMM tail paths */
+.Lcounter_0_1_2_3:
+.Lcounter_0_1:
+	.long 0,0,0,0
+.Lone:
+	.long 1,0,0,0
+.Lcounter_2_3:
+.Ltwo:
+	.long 2,0,0,0
+.Lthree:
+	.long 3,0,0,0
+/* bytes 0..15; zero-extended to dwords by vpmovzxbd into COUNTER_ADD */
+.Linc_counter:
+	.byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data)
+
+.text
+
+.align 16
+.globl _gcry_chacha20_amd64_avx512_blocks
+ELF(.type _gcry_chacha20_amd64_avx512_blocks,@function;)
+_gcry_chacha20_amd64_avx512_blocks:
+	/* input:
+	 *	%rdi: input
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks
+	 *
+	 * Encrypts nblks 64-byte ChaCha20 blocks: dst = src XOR keystream,
+	 * advancing the block counter held in the state (input).  Processes
+	 * 16 blocks per iteration with zmm, then 8 with ymm, then 4 with
+	 * xmm, then a 2-block/1-block xmm tail.  Returns 0 in %eax.
+	 */
+	CFI_STARTPROC();
+
+	spec_stop_avx512;
+
+	cmpq $4, NBLKS;
+	jb .Lskip_vertical_handling;
+
+	/* Load constants */
+	vpmovzxbd .Linc_counter rRIP, COUNTER_ADD;
+	kxnorq %k1, %k1, %k1; /* %k1 = all-ones mask */
+
+	cmpq $16, NBLKS;
+	jae .Lprocess_16v;
+
+	/* Preload state to YMM registers */
+	vpbroadcastd (0 * 4)(INPUT), S0y;
+	vpbroadcastd (1 * 4)(INPUT), S1y;
+	vpbroadcastd (2 * 4)(INPUT), S2y;
+	vpbroadcastd (3 * 4)(INPUT), S3y;
+	vpbroadcastd (4 * 4)(INPUT), S4y;
+	vpbroadcastd (5 * 4)(INPUT), S5y;
+	vpbroadcastd (6 * 4)(INPUT), S6y;
+	vpbroadcastd (7 * 4)(INPUT), S7y;
+	vpbroadcastd (8 * 4)(INPUT), S8y;
+	vpbroadcastd (14 * 4)(INPUT), S14y;
+	vpbroadcastd (15 * 4)(INPUT), S15y;
+	jmp .Lskip16v;
+
+.align 16
+.Lprocess_16v:
+	/* Process 16 ChaCha20 blocks */
+
+	/* Preload state to ZMM registers */
+	vpbroadcastd (0 * 4)(INPUT), S0;
+	vpbroadcastd (1 * 4)(INPUT), S1;
+	vpbroadcastd (2 * 4)(INPUT), S2;
+	vpbroadcastd (3 * 4)(INPUT), S3;
+	vpbroadcastd (4 * 4)(INPUT), S4;
+	vpbroadcastd (5 * 4)(INPUT), S5;
+	vpbroadcastd (6 * 4)(INPUT), S6;
+	vpbroadcastd (7 * 4)(INPUT), S7;
+	vpbroadcastd (8 * 4)(INPUT), S8;
+	vpbroadcastd (14 * 4)(INPUT), S14;
+	vpbroadcastd (15 * 4)(INPUT), S15;
+
+	movl $20, ROUND;
+	subq $16, NBLKS;
+
+	/* Construct counter vectors X12 and X13: X12 = counter_lo + 0..15;
+	 * %k2 marks lanes where the 32-bit add wrapped, and subtracting
+	 * all-ones (X9) under %k2 adds the carry into counter_hi (X13). */
+	vpmovm2d %k1, X9;
+	vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12;
+	vpbroadcastd (13 * 4)(INPUT), X13;
+	vpcmpud $6, X12, COUNTER_ADD, %k2;
+	vpsubd X9, X13, X13{%k2};
+	vmovdqa32 X12, X12_SAVE;
+	vmovdqa32 X13, X13_SAVE;
+
+	/* Load vectors; the first quarter-round is interleaved with the
+	 * remaining state loads */
+	vmovdqa32 S0, X0;
+	vmovdqa32 S4, X4;
+	vmovdqa32 S8, X8;
+	vmovdqa32 S1, X1;
+	vmovdqa32 S5, X5;
+	vpbroadcastd (9 * 4)(INPUT), X9;
+	QUARTERROUND2V(X0, X4,  X8, X12,   X1, X5,  X9, X13)
+	vmovdqa32 S2, X2;
+	vmovdqa32 S6, X6;
+	vpbroadcastd (10 * 4)(INPUT), X10;
+	vmovdqa32 S14, X14;
+	vmovdqa32 S3, X3;
+	vmovdqa32 S7, X7;
+	vpbroadcastd (11 * 4)(INPUT), X11;
+	vmovdqa32 S15, X15;
+
+	/* Update counter (64-bit add spanning state words 12..13) */
+	addq $16, (12 * 4)(INPUT);
+	jmp .Lround2_entry_16v;
+
+.align 16
+.Loop16v:
+	/* Steady-state iteration: storing the remaining output of the
+	 * previous 16 blocks is interleaved with setting up the next 16. */
+	movl $20, ROUND;
+	subq $16, NBLKS;
+
+	vmovdqa32 S0, X0;
+	vmovdqa32 S4, X4;
+	vmovdqa32 S8, X8;
+	transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 1), 256, X1, X5, X9, X13);
+	/* Construct counter vectors with carry, as in .Lprocess_16v */
+	vpmovm2d %k1, X9;
+	vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12;
+	vpbroadcastd (13 * 4)(INPUT), X13;
+	vpcmpud $6, X12, COUNTER_ADD, %k2;
+	vpsubd X9, X13, X13{%k2};
+	vmovdqa32 S1, X1;
+	vmovdqa32 S5, X5;
+	vpbroadcastd (9 * 4)(INPUT), X9;
+	vmovdqa32 X12, X12_SAVE;
+	vmovdqa32 X13, X13_SAVE;
+	QUARTERROUND2V(X0, X4,  X8, X12,   X1, X5,  X9, X13)
+	transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 2), 256, X2, X6, X10, X14);
+	vmovdqa32 S2, X2;
+	vmovdqa32 S6, X6;
+	vpbroadcastd (10 * 4)(INPUT), X10;
+	vmovdqa32 S14, X14;
+	transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 3), 256, X3, X7, X11, X15);
+	leaq (16 * 64)(SRC), SRC;
+	leaq (16 * 64)(DST), DST;
+	vmovdqa32 S3, X3;
+	vmovdqa32 S7, X7;
+	vpbroadcastd (11 * 4)(INPUT), X11;
+	vmovdqa32 S15, X15;
+
+	/* Update counter */
+	addq $16, (12 * 4)(INPUT);
+	jmp .Lround2_entry_16v;
+
+.align 16
+.Lround2_16v:
+	/* Software-pipelined double-round loop: each pass runs two rounds,
+	 * rotated so the first quarter-rounds were issued before entry. */
+	QUARTERROUND2V(X2, X7,  X8, X13,   X3, X4,  X9, X14)
+	QUARTERROUND2V(X0, X4,  X8, X12,   X1, X5,  X9, X13)
+.align 16
+.Lround2_entry_16v:
+	QUARTERROUND2V(X2, X6, X10, X14,   X3, X7, X11, X15)
+	QUARTERROUND2V(X0, X5, X10, X15,   X1, X6, X11, X12)
+	subl $2, ROUND;
+	jnz .Lround2_16v;
+
+	/* Add the input state back; the tail quarter-round left over from
+	 * the rotated loop is interleaved with the additions/transposes. */
+	PLUS(X0, S0);
+	PLUS(X1, S1);
+	QUARTERROUND2V(X2, X7,  X8, X13,   X3, X4,  X9, X14)
+	PLUS(X2, S2);
+	PLUS(X3, S3);
+	transpose_4x4(X0, X1, X2, X3, TMP0, TMP1);
+	PLUS(X4, S4);
+	PLUS(X5, S5);
+	PLUS(X6, S6);
+	PLUS(X7, S7);
+	transpose_4x4(X4, X5, X6, X7, TMP0, TMP1);
+	PLUS(X8, S8);
+	PLUS(X9, (9 * 4)(INPUT){1to16});
+	PLUS(X10, (10 * 4)(INPUT){1to16});
+	PLUS(X11, (11 * 4)(INPUT){1to16});
+	transpose_4x4(X8, X9, X10, X11, TMP0, TMP1);
+	PLUS(X12, X12_SAVE);
+	PLUS(X13, X13_SAVE);
+	PLUS(X14, S14);
+	PLUS(X15, S15);
+	transpose_4x4(X12, X13, X14, X15, TMP0, TMP1);
+
+	transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 0), 256, X0, X4, X8, X12);
+
+	cmpq $16, NBLKS;
+	jae .Loop16v;
+
+	/* Last 16-block iteration: flush the remaining three output rows */
+	transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 1), 256, X1, X5, X9, X13);
+	transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 2), 256, X2, X6, X10, X14);
+	transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 3), 256, X3, X7, X11, X15);
+
+	leaq (16 * 64)(SRC), SRC;
+	leaq (16 * 64)(DST), DST;
+
+.align 16
+.Lskip16v:
+	cmpq $8, NBLKS;
+	jb .Lskip8v;
+
+	/* Process 8 ChaCha20 blocks (ymm) */
+
+	/* Construct counter vectors X12 and X13 (with carry, see above) */
+	vpmovm2d %k1, X9y;
+	vpaddd (12 * 4)(INPUT){1to8}, COUNTER_ADDy, X12y;
+	vpbroadcastd (13 * 4)(INPUT), X13y;
+	vpcmpud $6, X12y, COUNTER_ADDy, %k2;
+	vpsubd X9y, X13y, X13y{%k2};
+	vmovdqa32 X12y, X12_SAVEy;
+	vmovdqa32 X13y, X13_SAVEy;
+
+	/* Load vectors */
+	vmovdqa32 S0y, X0y;
+	vmovdqa32 S4y, X4y;
+	vmovdqa32 S8y, X8y;
+	vmovdqa32 S1y, X1y;
+	vmovdqa32 S5y, X5y;
+	vpbroadcastd (9 * 4)(INPUT), X9y;
+	vmovdqa32 S2y, X2y;
+	vmovdqa32 S6y, X6y;
+	vpbroadcastd (10 * 4)(INPUT), X10y;
+	vmovdqa32 S14y, X14y;
+	vmovdqa32 S3y, X3y;
+	vmovdqa32 S7y, X7y;
+	vpbroadcastd (11 * 4)(INPUT), X11y;
+	vmovdqa32 S15y, X15y;
+
+	/* Update counter */
+	addq $8, (12 * 4)(INPUT);
+
+	movl $20, ROUND;
+	subq $8, NBLKS;
+.align 16
+.Lround2_8v:
+	QUARTERROUND2V(X0y, X4y,  X8y, X12y,   X1y, X5y,  X9y, X13y)
+	QUARTERROUND2V(X2y, X6y, X10y, X14y,   X3y, X7y, X11y, X15y)
+	QUARTERROUND2V(X0y, X5y, X10y, X15y,   X1y, X6y, X11y, X12y)
+	QUARTERROUND2V(X2y, X7y,  X8y, X13y,   X3y, X4y,  X9y, X14y)
+	subl $2, ROUND;
+	jnz .Lround2_8v;
+
+	/* Add input state back, transpose, and write out the 8 blocks */
+	PLUS(X0y, S0y);
+	PLUS(X1y, S1y);
+	PLUS(X2y, S2y);
+	PLUS(X3y, S3y);
+	transpose_4x4(X0y, X1y, X2y, X3y, TMP0y, TMP1y);
+	PLUS(X4y, S4y);
+	PLUS(X5y, S5y);
+	PLUS(X6y, S6y);
+	PLUS(X7y, S7y);
+	transpose_4x4(X4y, X5y, X6y, X7y, TMP0y, TMP1y);
+	PLUS(X8y, S8y);
+	transpose_16byte_2x2(X0y, X4y, TMP0y);
+	PLUS(X9y, (9 * 4)(INPUT){1to8});
+	transpose_16byte_2x2(X1y, X5y, TMP0y);
+	PLUS(X10y, (10 * 4)(INPUT){1to8});
+	transpose_16byte_2x2(X2y, X6y, TMP0y);
+	PLUS(X11y, (11 * 4)(INPUT){1to8});
+	transpose_16byte_2x2(X3y, X7y, TMP0y);
+	xor_src_dst_4x4(DST, SRC, (16 * 0),  64, X0y, X1y, X2y, X3y);
+	transpose_4x4(X8y, X9y, X10y, X11y, TMP0y, TMP1y);
+	PLUS(X12y, X12_SAVEy);
+	PLUS(X13y, X13_SAVEy);
+	PLUS(X14y, S14y);
+	PLUS(X15y, S15y);
+	xor_src_dst_4x4(DST, SRC, (16 * 16), 64, X4y, X5y, X6y, X7y);
+	transpose_4x4(X12y, X13y, X14y, X15y, TMP0y, TMP1y);
+	transpose_16byte_2x2(X8y, X12y, TMP0y);
+	transpose_16byte_2x2(X9y, X13y, TMP0y);
+	transpose_16byte_2x2(X10y, X14y, TMP0y);
+	transpose_16byte_2x2(X11y, X15y, TMP0y);
+	xor_src_dst_4x4(DST, SRC, (16 * 2),  64, X8y, X9y, X10y, X11y);
+	xor_src_dst_4x4(DST, SRC, (16 * 18), 64, X12y, X13y, X14y, X15y);
+
+	leaq (8 * 64)(SRC), SRC;
+	leaq (8 * 64)(DST), DST;
+
+.align 16
+.Lskip8v:
+	cmpq $4, NBLKS;
+	jb .Lskip4v;
+
+	/* Process 4 ChaCha20 blocks (xmm) */
+
+	/* Construct counter vectors X12 and X13 (with carry, see above) */
+	vpmovm2d %k1, X9x;
+	vpaddd (12 * 4)(INPUT){1to4}, COUNTER_ADDx, X12x;
+	vpbroadcastd (13 * 4)(INPUT), X13x;
+	vpcmpud $6, X12x, COUNTER_ADDx, %k2;
+	vpsubd X9x, X13x, X13x{%k2};
+	vmovdqa32 X12x, X12_SAVEx;
+	vmovdqa32 X13x, X13_SAVEx;
+
+	/* Load vectors */
+	vmovdqa32 S0x, X0x;
+	vmovdqa32 S4x, X4x;
+	vmovdqa32 S8x, X8x;
+	vmovdqa32 S1x, X1x;
+	vmovdqa32 S5x, X5x;
+	vpbroadcastd (9 * 4)(INPUT), X9x;
+	vmovdqa32 S2x, X2x;
+	vmovdqa32 S6x, X6x;
+	vpbroadcastd (10 * 4)(INPUT), X10x;
+	vmovdqa32 S14x, X14x;
+	vmovdqa32 S3x, X3x;
+	vmovdqa32 S7x, X7x;
+	vpbroadcastd (11 * 4)(INPUT), X11x;
+	vmovdqa32 S15x, X15x;
+
+	/* Update counter */
+	addq $4, (12 * 4)(INPUT);
+
+	movl $20, ROUND;
+	subq $4, NBLKS;
+.align 16
+.Lround2_4v:
+	QUARTERROUND2V(X0x, X4x,  X8x, X12x,   X1x, X5x,  X9x, X13x)
+	QUARTERROUND2V(X2x, X6x, X10x, X14x,   X3x, X7x, X11x, X15x)
+	QUARTERROUND2V(X0x, X5x, X10x, X15x,   X1x, X6x, X11x, X12x)
+	QUARTERROUND2V(X2x, X7x,  X8x, X13x,   X3x, X4x,  X9x, X14x)
+	subl $2, ROUND;
+	jnz .Lround2_4v;
+
+	/* Add input state back, transpose, and write out the 4 blocks */
+	PLUS(X0x, S0x);
+	PLUS(X1x, S1x);
+	PLUS(X2x, S2x);
+	PLUS(X3x, S3x);
+	transpose_4x4(X0x, X1x, X2x, X3x, TMP0x, TMP1x);
+	PLUS(X4x, S4x);
+	PLUS(X5x, S5x);
+	PLUS(X6x, S6x);
+	PLUS(X7x, S7x);
+	xor_src_dst_4x4(DST, SRC, (16 * 0), 64, X0x, X1x, X2x, X3x);
+	transpose_4x4(X4x, X5x, X6x, X7x, TMP0x, TMP1x);
+	PLUS(X8x, S8x);
+	PLUS(X9x, (9 * 4)(INPUT){1to4});
+	PLUS(X10x, (10 * 4)(INPUT){1to4});
+	PLUS(X11x, (11 * 4)(INPUT){1to4});
+	xor_src_dst_4x4(DST, SRC, (16 * 1), 64, X4x, X5x, X6x, X7x);
+	transpose_4x4(X8x, X9x, X10x, X11x, TMP0x, TMP1x);
+	PLUS(X12x, X12_SAVEx);
+	PLUS(X13x, X13_SAVEx);
+	PLUS(X14x, S14x);
+	PLUS(X15x, S15x);
+	xor_src_dst_4x4(DST, SRC, (16 * 2), 64, X8x, X9x, X10x, X11x);
+	transpose_4x4(X12x, X13x, X14x, X15x, TMP0x, TMP1x);
+	xor_src_dst_4x4(DST, SRC, (16 * 3), 64, X12x, X13x, X14x, X15x);
+
+	leaq (4 * 64)(SRC), SRC;
+	leaq (4 * 64)(DST), DST;
+
+.align 16
+.Lskip4v:
+	/* clear AVX512 registers */
+	kxorq %k2, %k2, %k2;
+	vzeroupper;
+	clear_zmm16_zmm31();
+
+.align 16
+.Lskip_vertical_handling:
+	/* Tail: 0..3 blocks left, handled 2-way/1-way in xmm registers */
+	cmpq $0, NBLKS;
+	je .Ldone;
+
+	/* Load state */
+	vmovdqu (0 * 4)(INPUT), X10x;
+	vmovdqu (4 * 4)(INPUT), X11x;
+	vmovdqu (8 * 4)(INPUT), X12x;
+	vmovdqu (12 * 4)(INPUT), X13x;
+
+	/* Load constant */
+	vmovdqa .Lone rRIP, X4x;
+
+	cmpq $1, NBLKS;
+	je .Lhandle1;
+
+	/* Process two ChaCha20 blocks (XMM) */
+	movl $20, ROUND;
+	subq $2, NBLKS;
+
+	vmovdqa X10x, X0x;
+	vmovdqa X11x, X1x;
+	vmovdqa X12x, X2x;
+	vmovdqa X13x, X3x;
+
+	/* second block uses counter row X13+1 (X15x, kept in X7x) */
+	vmovdqa X10x, X8x;
+	vmovdqa X11x, X9x;
+	vmovdqa X12x, X14x;
+	vpaddq X4x, X13x, X15x;
+	vmovdqa X15x, X7x;
+
+.align 16
+.Lround2_2:
+	QUARTERROUND2H(X0x, X1x, X2x,  X3x,  X8x, X9x, X14x, X15x,
+		       0x39, 0x4e, 0x93);
+	QUARTERROUND2H(X0x, X1x, X2x,  X3x,  X8x, X9x, X14x, X15x,
+		       0x93, 0x4e, 0x39);
+	subl $2, ROUND;
+	jnz .Lround2_2;
+
+	PLUS(X0x, X10x);
+	PLUS(X1x, X11x);
+	PLUS(X2x, X12x);
+	PLUS(X3x, X13x);
+
+	vpaddq .Ltwo rRIP, X13x, X13x; /* Update counter */
+
+	xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x);
+
+	PLUS(X8x, X10x);
+	PLUS(X9x, X11x);
+	PLUS(X14x, X12x);
+	PLUS(X15x, X7x);
+
+	xor_src_dst_4x4(DST, SRC, 16 * 4, 4 * 4, X8x, X9x, X14x, X15x);
+	lea (2 * 64)(DST), DST;
+	lea (2 * 64)(SRC), SRC;
+
+	cmpq $0, NBLKS;
+	je .Lskip1;
+
+.align 16
+.Lhandle1:
+	/* Process one ChaCha20 block (XMM) */
+	movl $20, ROUND;
+	subq $1, NBLKS;
+
+	vmovdqa X10x, X0x;
+	vmovdqa X11x, X1x;
+	vmovdqa X12x, X2x;
+	vmovdqa X13x, X3x;
+
+.align 16
+.Lround2_1:
+	QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x39, 0x4e, 0x93);
+	QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x93, 0x4e, 0x39);
+	subl $2, ROUND;
+	jnz .Lround2_1;
+
+	PLUS(X0x, X10x);
+	PLUS(X1x, X11x);
+	PLUS(X2x, X12x);
+	PLUS(X3x, X13x);
+
+	vpaddq X4x, X13x, X13x; /* Update counter */
+
+	xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x);
+
+.align 16
+.Lskip1:
+	/* Store counter (writes state words 12..15 back) */
+	vmovdqu X13x, (12 * 4)(INPUT);
+
+.align 16
+.Ldone:
+	vzeroall; /* clears ZMM0-ZMM15 */
+
+	xorl %eax, %eax;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_chacha20_amd64_avx512_blocks,
+	  .-_gcry_chacha20_amd64_avx512_blocks;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
index 6c7379787d1e2181b0044688c9736f71fffbc057..1ce5a8e6b7ad996186af061333d85c318865f768 100644 (file)
@@ -33,8 +33,6 @@
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 
-.text
-
 #include "asm-common-amd64.h"
 #include "asm-poly1305-amd64.h"
 
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
            ROTATE2(b1, b2,  7, tmp1, tmp2);
 
-chacha20_data:
+SECTION_RODATA
+
+ELF(.type _chacha20_ssse3_data,@object;)
+_chacha20_ssse3_data:
 .align 16
 .Lshuf_rol16:
        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
@@ -164,7 +165,9 @@ chacha20_data:
 .Lunsigned_cmp:
        .long 0x80000000,0x80000000,0x80000000,0x80000000
 
-.align 8
+.text
+
+.align 16
 .globl _gcry_chacha20_amd64_ssse3_blocks4
 ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;)
 
@@ -366,7 +369,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
                                   ROTATE(x1, 7, tmp1); \
          WORD_SHUF(x1, shuf_x1);
 
-.align 8
+.align 16
 .globl _gcry_chacha20_amd64_ssse3_blocks1
 ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)
 
@@ -513,7 +516,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
 
 #define _ /*_*/
 
-.align 8
+.align 16
 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;)
 
@@ -781,7 +784,7 @@ ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
   2-way && 1-way stitched chacha20-poly1305
  **********************************************************************/
 
-.align 8
+.align 16
 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1
 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;)
 
diff --git a/cipher/chacha20-p10le-8x.s b/cipher/chacha20-p10le-8x.s
new file mode 100644 (file)
index 0000000..ff68c9e
--- /dev/null
@@ -0,0 +1,864 @@
+# Copyright 2021- IBM Inc. All rights reserved
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# This function handles multiple 64-byte block data length
+#   and the length should be more than 512 bytes.
+#
+# unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len);
+#
+# r1 - top of the stack
+# r3 to r10 input parameters
+# r3 - out
+# r4 - inp
+# r5 - len
+# r6 - key[8]
+# r7 - counter[4]
+#
+# do rounds,  8 quarter rounds
+# 1.  a += b; d ^= a; d <<<= 16;
+# 2.  c += d; b ^= c; b <<<= 12;
+# 3.  a += b; d ^= a; d <<<= 8;
+# 4.  c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
+# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+#
+.text
+
+.macro QT_loop_8x
+	# One double-round (column + diagonal) for 8 blocks held in
+	# vr0-vr15 and vr16-vr31.  The rotate/permute constants live in
+	# vsr20-vsr23; since all 32 VRs are in use, vr25/vr28 are spilled
+	# to vsr0 (xxlor; VSR 32+N aliases vector register N) while they
+	# temporarily hold a constant.
+	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+	xxlor	0, 32+25, 32+25		# save vr25
+	xxlor	32+25, 20, 20		# vr25 = rot16 permute const
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vadduwm 16, 16, 20
+	  vadduwm 17, 17, 21
+	  vadduwm 18, 18, 22
+	  vadduwm 19, 19, 23
+
+	  vpermxor 12, 12, 0, 25
+	  vpermxor 13, 13, 1, 25
+	  vpermxor 14, 14, 2, 25
+	  vpermxor 15, 15, 3, 25
+	  vpermxor 28, 28, 16, 25
+	  vpermxor 29, 29, 17, 25
+	  vpermxor 30, 30, 18, 25
+	  vpermxor 31, 31, 19, 25
+	xxlor	32+25, 0, 0		# restore vr25
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	  vadduwm 24, 24, 28
+	  vadduwm 25, 25, 29
+	  vadduwm 26, 26, 30
+	  vadduwm 27, 27, 31
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	  vxor 20, 20, 24
+	  vxor 21, 21, 25
+	  vxor 22, 22, 26
+	  vxor 23, 23, 27
+
+	xxlor	0, 32+25, 32+25		# save vr25
+	xxlor	32+25, 21, 21		# vr25 = rot12 amounts
+	vrlw 4, 4, 25  # <<<= 12
+	vrlw 5, 5, 25
+	vrlw 6, 6, 25
+	vrlw 7, 7, 25
+	  vrlw 20, 20, 25  # <<<= 12
+	  vrlw 21, 21, 25
+	  vrlw 22, 22, 25
+	  vrlw 23, 23, 25
+	xxlor	32+25, 0, 0		# restore vr25
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vadduwm 16, 16, 20
+	  vadduwm 17, 17, 21
+	  vadduwm 18, 18, 22
+	  vadduwm 19, 19, 23
+
+	xxlor	0, 32+25, 32+25		# save vr25
+	xxlor	32+25, 22, 22		# vr25 = rot8 permute const
+	  vpermxor 12, 12, 0, 25
+	  vpermxor 13, 13, 1, 25
+	  vpermxor 14, 14, 2, 25
+	  vpermxor 15, 15, 3, 25
+	  vpermxor 28, 28, 16, 25
+	  vpermxor 29, 29, 17, 25
+	  vpermxor 30, 30, 18, 25
+	  vpermxor 31, 31, 19, 25
+	xxlor	32+25, 0, 0		# restore vr25
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	  vadduwm 24, 24, 28
+	  vadduwm 25, 25, 29
+	  vadduwm 26, 26, 30
+	  vadduwm 27, 27, 31
+	xxlor	0, 32+28, 32+28		# save vr28
+	xxlor	32+28, 23, 23		# vr28 = rot7 amounts
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	  vxor 20, 20, 24
+	  vxor 21, 21, 25
+	  vxor 22, 22, 26
+	  vxor 23, 23, 27
+	vrlw 4, 4, 28  # <<<= 7
+	vrlw 5, 5, 28
+	vrlw 6, 6, 28
+	vrlw 7, 7, 28
+	  vrlw 20, 20, 28  # <<<= 7
+	  vrlw 21, 21, 28
+	  vrlw 22, 22, 28
+	  vrlw 23, 23, 28
+	xxlor	32+28, 0, 0		# restore vr28
+
+	# Diagonal round for both 4-block groups
+	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+	xxlor	0, 32+25, 32+25		# save vr25
+	xxlor	32+25, 20, 20		# vr25 = rot16 permute const
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vadduwm 16, 16, 21
+	  vadduwm 17, 17, 22
+	  vadduwm 18, 18, 23
+	  vadduwm 19, 19, 20
+
+	  vpermxor 15, 15, 0, 25
+	  vpermxor 12, 12, 1, 25
+	  vpermxor 13, 13, 2, 25
+	  vpermxor 14, 14, 3, 25
+	  vpermxor 31, 31, 16, 25
+	  vpermxor 28, 28, 17, 25
+	  vpermxor 29, 29, 18, 25
+	  vpermxor 30, 30, 19, 25
+
+	xxlor	32+25, 0, 0		# restore vr25
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	  vadduwm 26, 26, 31
+	  vadduwm 27, 27, 28
+	  vadduwm 24, 24, 29
+	  vadduwm 25, 25, 30
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	  vxor 21, 21, 26
+	  vxor 22, 22, 27
+	  vxor 23, 23, 24
+	  vxor 20, 20, 25
+
+	xxlor	0, 32+25, 32+25		# save vr25
+	xxlor	32+25, 21, 21		# vr25 = rot12 amounts
+	vrlw 5, 5, 25
+	vrlw 6, 6, 25
+	vrlw 7, 7, 25
+	vrlw 4, 4, 25
+	  vrlw 21, 21, 25
+	  vrlw 22, 22, 25
+	  vrlw 23, 23, 25
+	  vrlw 20, 20, 25
+	xxlor	32+25, 0, 0		# restore vr25
+
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vadduwm 16, 16, 21
+	  vadduwm 17, 17, 22
+	  vadduwm 18, 18, 23
+	  vadduwm 19, 19, 20
+
+	xxlor	0, 32+25, 32+25		# save vr25
+	xxlor	32+25, 22, 22		# vr25 = rot8 permute const
+	  vpermxor 15, 15, 0, 25
+	  vpermxor 12, 12, 1, 25
+	  vpermxor 13, 13, 2, 25
+	  vpermxor 14, 14, 3, 25
+	  vpermxor 31, 31, 16, 25
+	  vpermxor 28, 28, 17, 25
+	  vpermxor 29, 29, 18, 25
+	  vpermxor 30, 30, 19, 25
+	xxlor	32+25, 0, 0		# restore vr25
+
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	  vadduwm 26, 26, 31
+	  vadduwm 27, 27, 28
+	  vadduwm 24, 24, 29
+	  vadduwm 25, 25, 30
+
+	xxlor	0, 32+28, 32+28		# save vr28
+	xxlor	32+28, 23, 23		# vr28 = rot7 amounts
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	  vxor 21, 21, 26
+	  vxor 22, 22, 27
+	  vxor 23, 23, 24
+	  vxor 20, 20, 25
+	vrlw 5, 5, 28
+	vrlw 6, 6, 28
+	vrlw 7, 7, 28
+	vrlw 4, 4, 28
+	  vrlw 21, 21, 28
+	  vrlw 22, 22, 28
+	  vrlw 23, 23, 28
+	  vrlw 20, 20, 28
+	xxlor	32+28, 0, 0		# restore vr28
+.endm
+
+.macro QT_loop_4x
+	# One double-round (column + diagonal) for 4 blocks in vr0-vr15.
+	# Constants stay resident: vr20 = rot16 permute, vr21 = rot12,
+	# vr22 = rot8 permute, vr23 = rot7 — no spills needed here.
+	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vpermxor 12, 12, 0, 20
+	  vpermxor 13, 13, 1, 20
+	  vpermxor 14, 14, 2, 20
+	  vpermxor 15, 15, 3, 20
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	vrlw 4, 4, 21
+	vrlw 5, 5, 21
+	vrlw 6, 6, 21
+	vrlw 7, 7, 21
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vpermxor 12, 12, 0, 22
+	  vpermxor 13, 13, 1, 22
+	  vpermxor 14, 14, 2, 22
+	  vpermxor 15, 15, 3, 22
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	vrlw 4, 4, 23
+	vrlw 5, 5, 23
+	vrlw 6, 6, 23
+	vrlw 7, 7, 23
+
+	# Diagonal round
+	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vpermxor 15, 15, 0, 20
+	  vpermxor 12, 12, 1, 20
+	  vpermxor 13, 13, 2, 20
+	  vpermxor 14, 14, 3, 20
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	vrlw 5, 5, 21
+	vrlw 6, 6, 21
+	vrlw 7, 7, 21
+	vrlw 4, 4, 21
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vpermxor 15, 15, 0, 22
+	  vpermxor 12, 12, 1, 22
+	  vpermxor 13, 13, 2, 22
+	  vpermxor 14, 14, 3, 22
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	vrlw 5, 5, 23
+	vrlw 6, 6, 23
+	vrlw 7, 7, 23
+	vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+       xxmrghw  10, 32+\a0, 32+\a1     # a0, a1, b0, b1
+       xxmrghw  11, 32+\a2, 32+\a3     # a2, a3, b2, b3
+       xxmrglw  12, 32+\a0, 32+\a1     # c0, c1, d0, d1
+       xxmrglw  13, 32+\a2, 32+\a3     # c2, c3, d2, d3
+       xxpermdi        32+\a0, 10, 11, 0       # a0, a1, a2, a3
+       xxpermdi        32+\a1, 10, 11, 3       # b0, b1, b2, b3
+       xxpermdi        32+\a2, 12, 13, 0       # c0, c1, c2, c3
+       xxpermdi        32+\a3, 12, 13, 3       # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+       vadduwm \S+0, \S+0, 16-\S
+       vadduwm \S+4, \S+4, 17-\S
+       vadduwm \S+8, \S+8, 18-\S
+       vadduwm \S+12, \S+12, 19-\S
+
+       vadduwm \S+1, \S+1, 16-\S
+       vadduwm \S+5, \S+5, 17-\S
+       vadduwm \S+9, \S+9, 18-\S
+       vadduwm \S+13, \S+13, 19-\S
+
+       vadduwm \S+2, \S+2, 16-\S
+       vadduwm \S+6, \S+6, 17-\S
+       vadduwm \S+10, \S+10, 18-\S
+       vadduwm \S+14, \S+14, 19-\S
+
+       vadduwm \S+3, \S+3, 16-\S
+       vadduwm \S+7, \S+7, 17-\S
+       vadduwm \S+11, \S+11, 18-\S
+       vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
+.macro Write_256 S
+       add 9, 14, 5
+       add 16, 14, 4
+       lxvw4x 0, 0, 9
+       lxvw4x 1, 17, 9
+       lxvw4x 2, 18, 9
+       lxvw4x 3, 19, 9
+       lxvw4x 4, 20, 9
+       lxvw4x 5, 21, 9
+       lxvw4x 6, 22, 9
+       lxvw4x 7, 23, 9
+       lxvw4x 8, 24, 9
+       lxvw4x 9, 25, 9
+       lxvw4x 10, 26, 9
+       lxvw4x 11, 27, 9
+       lxvw4x 12, 28, 9
+       lxvw4x 13, 29, 9
+       lxvw4x 14, 30, 9
+       lxvw4x 15, 31, 9
+
+       xxlxor \S+32, \S+32, 0
+       xxlxor \S+36, \S+36, 1
+       xxlxor \S+40, \S+40, 2
+       xxlxor \S+44, \S+44, 3
+       xxlxor \S+33, \S+33, 4
+       xxlxor \S+37, \S+37, 5
+       xxlxor \S+41, \S+41, 6
+       xxlxor \S+45, \S+45, 7
+       xxlxor \S+34, \S+34, 8
+       xxlxor \S+38, \S+38, 9
+       xxlxor \S+42, \S+42, 10
+       xxlxor \S+46, \S+46, 11
+       xxlxor \S+35, \S+35, 12
+       xxlxor \S+39, \S+39, 13
+       xxlxor \S+43, \S+43, 14
+       xxlxor \S+47, \S+47, 15
+
+       stxvw4x \S+32, 0, 16
+       stxvw4x \S+36, 17, 16
+       stxvw4x \S+40, 18, 16
+       stxvw4x \S+44, 19, 16
+
+       stxvw4x \S+33, 20, 16
+       stxvw4x \S+37, 21, 16
+       stxvw4x \S+41, 22, 16
+       stxvw4x \S+45, 23, 16
+
+       stxvw4x \S+34, 24, 16
+       stxvw4x \S+38, 25, 16
+       stxvw4x \S+42, 26, 16
+       stxvw4x \S+46, 27, 16
+
+       stxvw4x \S+35, 28, 16
+       stxvw4x \S+39, 29, 16
+       stxvw4x \S+43, 30, 16
+       stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len);
+#
+.global _gcry_chacha20_p10le_8x
+.align 5
+_gcry_chacha20_p10le_8x:
+       cmpdi   6, 512
+       blt     Out_no_chacha
+
+       stdu 1,-1024(1)
+       mflr 0
+
+       std     14,112(1)
+       std     15,120(1)
+       std     16,128(1)
+       std     17,136(1)
+       std     18,144(1)
+       std     19,152(1)
+       std     20,160(1)
+       std     21,168(1)
+       std     22,176(1)
+       std     23,184(1)
+       std     24,192(1)
+       std     25,200(1)
+       std     26,208(1)
+       std     27,216(1)
+       std     28,224(1)
+       std     29,232(1)
+       std     30,240(1)
+       std     31,248(1)
+       std     0, 1040(1)
+
+       li      17, 16
+       li      18, 32
+       li      19, 48
+       li      20, 64
+       li      21, 80
+       li      22, 96
+       li      23, 112
+       li      24, 128
+       li      25, 144
+       li      26, 160
+       li      27, 176
+       li      28, 192
+       li      29, 208
+       li      30, 224
+       li      31, 240
+       addi    9, 1, 256
+       stvx    20, 0, 9
+       stvx    21, 17, 9
+       stvx    22, 18, 9
+       stvx    23, 19, 9
+       stvx    24, 20, 9
+       stvx    25, 21, 9
+       stvx    26, 22, 9
+       stvx    27, 23, 9
+       stvx    28, 24, 9
+       stvx    29, 25, 9
+       stvx    30, 26, 9
+       stvx    31, 27, 9
+
+       add     9, 9, 27
+       addi    14, 17, 16
+       stxvx   14, 14, 9
+       addi    14, 14, 16
+       stxvx   15, 14, 9
+       addi    14, 14, 16
+       stxvx   16, 14, 9
+       addi    14, 14, 16
+       stxvx   17, 14, 9
+       addi    14, 14, 16
+       stxvx   18, 14, 9
+       addi    14, 14, 16
+       stxvx   19, 14, 9
+       addi    14, 14, 16
+       stxvx   20, 14, 9
+       addi    14, 14, 16
+       stxvx   21, 14, 9
+       addi    14, 14, 16
+       stxvx   22, 14, 9
+       addi    14, 14, 16
+       stxvx   23, 14, 9
+       addi    14, 14, 16
+       stxvx   24, 14, 9
+       addi    14, 14, 16
+       stxvx   25, 14, 9
+       addi    14, 14, 16
+       stxvx   26, 14, 9
+       addi    14, 14, 16
+       stxvx   27, 14, 9
+       addi    14, 14, 16
+       stxvx   28, 14, 9
+       addi    14, 14, 16
+       stxvx   29, 14, 9
+       addi    14, 14, 16
+       stxvx   30, 14, 9
+       addi    14, 14, 16
+       stxvx   31, 14, 9
+
+       mr 15, 6                        # len
+       li 14, 0                        # offset to inp and outp
+
+       ld      10, sigma@got(2)
+
+        lxvw4x 48, 0, 3                #  vr16, constants
+       lxvw4x  49, 17, 3               #  vr17, key 1
+       lxvw4x  50, 18, 3               #  vr18, key 2
+       lxvw4x  51, 19, 3               #  vr19, counter, nonce
+
+       lxvw4x  62, 19, 10              # vr30, 4
+
+       vspltisw 21, 12
+       vspltisw 23, 7
+
+       ld      11, permx@got(2)
+       lxvw4x  32+20, 0, 11
+       lxvw4x  32+22, 17, 11
+
+       li 8, 10
+       mtctr 8
+
+       xxlor   16, 48, 48
+       xxlor   17, 49, 49
+       xxlor   18, 50, 50
+       xxlor   19, 51, 51
+
+       vspltisw 25, 4
+       vspltisw 26, 8
+
+       xxlor   16, 48, 48
+       xxlor   17, 49, 49
+       xxlor   18, 50, 50
+       xxlor   19, 51, 51
+
+       xxlor   25, 32+26, 32+26
+       xxlor   24, 32+25, 32+25
+
+       vadduwm 31, 30, 25              # (0, 1, 2, 3) + (4, 4, 4, 4)
+       xxlor   30, 32+30, 32+30
+       xxlor   31, 32+31, 32+31
+
+       xxlor   20, 32+20, 32+20
+       xxlor   21, 32+21, 32+21
+       xxlor   22, 32+22, 32+22
+       xxlor   23, 32+23, 32+23
+
+Loop_8x:
+       lvx     0, 20, 10
+       lvx     1, 21, 10
+       lvx     2, 22, 10
+       lvx     3, 23, 10
+       xxspltw  32+4, 17, 0
+       xxspltw  32+5, 17, 1
+       xxspltw  32+6, 17, 2
+       xxspltw  32+7, 17, 3
+       xxspltw  32+8, 18, 0
+       xxspltw  32+9, 18, 1
+       xxspltw  32+10, 18, 2
+       xxspltw  32+11, 18, 3
+       xxspltw  32+12, 19, 0
+       xxspltw  32+13, 19, 1
+       xxspltw  32+14, 19, 2
+       xxspltw  32+15, 19, 3
+       vadduwm 12, 12, 30      # increase counter
+
+       lvx     16, 20, 10
+       lvx     17, 21, 10
+       lvx     18, 22, 10
+       lvx     19, 23, 10
+       xxspltw  32+20, 17, 0
+       xxspltw  32+21, 17, 1
+       xxspltw  32+22, 17, 2
+       xxspltw  32+23, 17, 3
+       xxspltw  32+24, 18, 0
+       xxspltw  32+25, 18, 1
+       xxspltw  32+26, 18, 2
+       xxspltw  32+27, 18, 3
+       xxspltw  32+28, 19, 0
+       xxspltw  32+29, 19, 1
+       vadduwm 28, 28, 31      # increase counter
+       xxspltw  32+30, 19, 2
+       xxspltw  32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+       QT_loop_8x
+
+       bdnz    quarter_loop_8x
+
+       xxlor   0, 32+30, 32+30
+       xxlor   32+30, 30, 30
+       vadduwm 12, 12, 30
+       xxlor   32+30, 0, 0
+       TP_4x 0, 1, 2, 3
+       TP_4x 4, 5, 6, 7
+       TP_4x 8, 9, 10, 11
+       TP_4x 12, 13, 14, 15
+
+       xxlor   0, 48, 48
+       xxlor   1, 49, 49
+       xxlor   2, 50, 50
+       xxlor   3, 51, 51
+       xxlor   48, 16, 16
+       xxlor   49, 17, 17
+       xxlor   50, 18, 18
+       xxlor   51, 19, 19
+       Add_state 0
+       xxlor   48, 0, 0
+       xxlor   49, 1, 1
+       xxlor   50, 2, 2
+       xxlor   51, 3, 3
+       Write_256 0
+       addi    14, 14, 256
+       addi    15, 15, -256
+
+       xxlor   5, 32+31, 32+31
+       xxlor   32+31, 31, 31
+       vadduwm 28, 28, 31
+       xxlor   32+31, 5, 5
+       TP_4x 16+0, 16+1, 16+2, 16+3
+       TP_4x 16+4, 16+5, 16+6, 16+7
+       TP_4x 16+8, 16+9, 16+10, 16+11
+       TP_4x 16+12, 16+13, 16+14, 16+15
+
+       xxlor   32, 16, 16
+       xxlor   33, 17, 17
+       xxlor   34, 18, 18
+       xxlor   35, 19, 19
+       Add_state 16
+       Write_256 16
+       addi    14, 14, 256
+       addi    15, 15, -256
+
+       # should update counter before out?
+       xxlor   32+24, 24, 24
+       xxlor   32+25, 25, 25
+       xxlor   32+30, 30, 30
+       vadduwm 30, 30, 25
+       vadduwm 31, 30, 24
+       xxlor   30, 32+30, 32+30
+       xxlor   31, 32+31, 32+31
+
+       cmpdi   15, 0
+       beq     Out_loop
+
+       cmpdi   15, 512
+       blt     Loop_last
+
+       mtctr 8
+       b Loop_8x
+
+Loop_last:
+        lxvw4x 48, 0, 3                #  vr16, constants
+       lxvw4x  49, 17, 3               #  vr17, key 1
+       lxvw4x  50, 18, 3               #  vr18, key 2
+       lxvw4x  51, 19, 3               #  vr19, counter, nonce
+
+       vspltisw 21, 12
+       vspltisw 23, 7
+       lxvw4x  32+20, 0, 11
+       lxvw4x  32+22, 17, 11
+
+       li 8, 10
+       mtctr 8
+
+Loop_4x:
+       lvx     0, 20, 10
+       lvx     1, 21, 10
+       lvx     2, 22, 10
+       lvx     3, 23, 10
+       vspltw  4, 17, 0
+       vspltw  5, 17, 1
+       vspltw  6, 17, 2
+       vspltw  7, 17, 3
+       vspltw  8, 18, 0
+       vspltw  9, 18, 1
+       vspltw  10, 18, 2
+       vspltw  11, 18, 3
+       vspltw  12, 19, 0
+       vadduwm 12, 12, 30      # increase counter
+       vspltw  13, 19, 1
+       vspltw  14, 19, 2
+       vspltw  15, 19, 3
+
+.align 5
+quarter_loop:
+       QT_loop_4x
+
+       bdnz    quarter_loop
+
+       vadduwm 12, 12, 30
+       TP_4x 0, 1, 2, 3
+       TP_4x 4, 5, 6, 7
+       TP_4x 8, 9, 10, 11
+       TP_4x 12, 13, 14, 15
+
+       Add_state 0
+       Write_256 0
+       addi    14, 14, 256
+       addi    15, 15, -256
+
+       # Update state counter
+       vspltisw 25, 4
+       vadduwm 30, 30, 25
+
+       cmpdi   15, 0
+       beq     Out_loop
+
+       mtctr 8
+       b Loop_4x
+
+Out_loop:
+       #
+       # Update state counter
+       #
+       vspltisb        16, -1          # first 16 bytes - 0xffff...ff
+       vspltisb        17, 0           # second 16 bytes - 0x0000...00
+       vsldoi          18, 16, 17, 12
+       vand            18, 18, 30
+       xxlor           32+19, 19, 19
+       vadduwm         18, 19, 18
+       stxvw4x         32+18, 19, 3
+       li      3, 0
+
+       addi    9, 1, 256
+       lvx     20, 0, 9
+       lvx     21, 17, 9
+       lvx     22, 18, 9
+       lvx     23, 19, 9
+       lvx     24, 20, 9
+       lvx     25, 21, 9
+       lvx     26, 22, 9
+       lvx     27, 23, 9
+       lvx     28, 24, 9
+       lvx     29, 25, 9
+       lvx     30, 26, 9
+       lvx     31, 27, 9
+
+       add     9, 9, 27
+       addi    14, 17, 16
+       lxvx    14, 14, 9
+       addi    14, 14, 16
+       lxvx    15, 14, 9
+       addi    14, 14, 16
+       lxvx    16, 14, 9
+       addi    14, 14, 16
+       lxvx    17, 14, 9
+       addi    14, 14, 16
+       lxvx    18, 14, 9
+       addi    14, 14, 16
+       lxvx    19, 14, 9
+       addi    14, 14, 16
+       lxvx    20, 14, 9
+       addi    14, 14, 16
+       lxvx    21, 14, 9
+       addi    14, 14, 16
+       lxvx    22, 14, 9
+       addi    14, 14, 16
+       lxvx    23, 14, 9
+       addi    14, 14, 16
+       lxvx    24, 14, 9
+       addi    14, 14, 16
+       lxvx    25, 14, 9
+       addi    14, 14, 16
+       lxvx    26, 14, 9
+       addi    14, 14, 16
+       lxvx    27, 14, 9
+       addi    14, 14, 16
+       lxvx    28, 14, 9
+       addi    14, 14, 16
+       lxvx    29, 14, 9
+       addi    14, 14, 16
+       lxvx    30, 14, 9
+       addi    14, 14, 16
+       lxvx    31, 14, 9
+
+       ld      0, 1040(1)
+       ld      14,112(1)
+       ld      15,120(1)
+       ld      16,128(1)
+       ld      17,136(1)
+       ld      18,144(1)
+       ld      19,152(1)
+       ld      20,160(1)
+       ld      21,168(1)
+       ld      22,176(1)
+       ld      23,184(1)
+       ld      24,192(1)
+       ld      25,200(1)
+       ld      26,208(1)
+       ld      27,216(1)
+       ld      28,224(1)
+       ld      29,232(1)
+       ld      30,240(1)
+       ld      31,248(1)
+
+       mtlr    0
+       addi    1, 1, 1024
+       blr
+
+Out_no_chacha:
+       li      3, 0
+       blr
+
+.section .rodata
+.align 4
+sigma:
+.long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+.long 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203
+.long 1, 0, 0, 0
+.long 0, 1, 2, 3
+.long 0x61707865, 0x61707865, 0x61707865, 0x61707865
+.long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e
+.long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32
+.long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
index 4a21b837d166bab419d95439bf179893cc73a89e..994b6a0150c7f9673030ecbf37a117175aa48762 100644 (file)
@@ -136,9 +136,8 @@ vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a)
 #define ADD_U64(v,a) \
        (v = vec_add_ctr_u64(v, a))
 
-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
-                           size_t nblks)
+static ASM_FUNC_ATTR_INLINE unsigned int
+chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks)
 {
   vector4x_u32 counter_1 = { 1, 0, 0, 0 };
   vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
@@ -283,9 +282,8 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
            ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
 
-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
-                           size_t nblks)
+static ASM_FUNC_ATTR_INLINE unsigned int
+chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks)
 {
   vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
   vector4x_u32 counter_4 = { 4, 0, 0, 0 };
@@ -470,10 +468,10 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
     MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \
   } while (0)
 
-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
-                                    size_t nblks, POLY1305_STATE *st,
-                                    const byte *poly1305_src)
+static ASM_FUNC_ATTR_INLINE unsigned int
+chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src,
+                             size_t nblks, POLY1305_STATE *st,
+                             const byte *poly1305_src)
 {
   vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
   vector4x_u32 counter_4 = { 4, 0, 0, 0 };
@@ -641,6 +639,112 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
   return 0;
 }
 
+#else
+
+static ASM_FUNC_ATTR_INLINE unsigned int
+chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src,
+                             size_t nblks, POLY1305_STATE *st,
+                             const byte *poly1305_src)
+{ /* never-called stub (SIZEOF_UNSIGNED_LONG != 8): silence unused-param warnings and return a value — original fell off the end of a non-void function */
+  (void)state; (void)dst; (void)src; (void)nblks; (void)st; (void)poly1305_src; return 0; }
+
 #endif /* SIZEOF_UNSIGNED_LONG == 8 */
 
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+# undef HAVE_FUNC_ATTR_TARGET
+#endif
+
+
+/* Functions targeting POWER8. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
+                           size_t nblks)
+{
+  return chacha20_ppc_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+                           size_t nblks)
+{
+  return chacha20_ppc_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+                                    size_t nblks, POLY1305_STATE *st,
+                                    const byte *poly1305_src)
+{
+  return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st,
+                                      poly1305_src);
+}
+
+#ifdef HAVE_FUNC_ATTR_TARGET
+/* Functions targeting POWER9. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src,
+                           size_t nblks)
+{
+  return chacha20_ppc_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+                           size_t nblks)
+{
+  return chacha20_ppc_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+                                    size_t nblks, POLY1305_STATE *st,
+                                    const byte *poly1305_src)
+{
+  return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st,
+                                      poly1305_src);
+}
+#else
+/* Compiler does not support target attribute, use same functions for POWER9
+ * as for POWER8. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src,
+                           size_t nblks)
+{
+  return _gcry_chacha20_ppc8_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+                           size_t nblks)
+{
+  return _gcry_chacha20_ppc8_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+                                    size_t nblks, POLY1305_STATE *st,
+                                    const byte *poly1305_src)
+{
+  return _gcry_chacha20_poly1305_ppc8_blocks4(state, dst, src, nblks, st,
+                                             poly1305_src);
+}
+#endif /* HAVE_FUNC_ATTR_TARGET */
+
 #endif /* ENABLE_PPC_CRYPTO_SUPPORT */
index 9b1d59c6ad19d00c236e972c085110c1cc6d3844..5a9319983e61341e0b8aada3bbbee3af7a07df32 100644 (file)
 #include "asm-poly1305-s390x.h"
 
 .machine "z13+vx"
-.text
 
+.section .rodata
+
+ELF(.type _gcry_chacha20_s390x_vx_constants,@function;)
 .balign 16
+_gcry_chacha20_s390x_vx_constants:
 .Lconsts:
 .Lwordswap:
        .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
   4-way && 2-way && 1-way chacha20 ("horizontal")
  **********************************************************************/
 
-.balign 8
+.text
+
+.balign 16
 .globl _gcry_chacha20_s390x_vx_blocks4_2_1
 ELF(.type _gcry_chacha20_s390x_vx_blocks4_2_1,@function;)
 
@@ -578,7 +583,7 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1,
   4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal")
  **********************************************************************/
 
-.balign 8
+.balign 16
 .globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1
 ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,@function;)
 
@@ -1058,7 +1063,7 @@ ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,
          vpdi vc, tmpc,   vd, 0;                               \
          vpdi vd, tmpc,   vd, 5;
 
-.balign 8
+.balign 16
 .globl _gcry_chacha20_s390x_vx_blocks8
 ELF(.type _gcry_chacha20_s390x_vx_blocks8,@function;)
 
@@ -1276,7 +1281,7 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks8,
   8-way stitched chacha20-poly1305 ("vertical")
  **********************************************************************/
 
-.balign 8
+.balign 16
 .globl _gcry_chacha20_poly1305_s390x_vx_blocks8
 ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8,@function;)
 
index 497594a0bb9d0d34fe16a3b7f5d1204c7ff50503..ca8176f46eb8b7f4f89246251f13d7fa361f4b40 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 # define USE_AVX2 1
 #endif
 
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
 /* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
 #undef USE_ARMV7_NEON
 #ifdef ENABLE_NEON_SUPPORT
@@ -123,8 +131,11 @@ typedef struct CHACHA20_context_s
   unsigned int unused; /* bytes in the pad.  */
   unsigned int use_ssse3:1;
   unsigned int use_avx2:1;
+  unsigned int use_avx512:1;
   unsigned int use_neon:1;
   unsigned int use_ppc:1;
+  unsigned int use_p9:1;
+  unsigned int use_p10:1;
   unsigned int use_s390x:1;
 } CHACHA20_context_t;
 
@@ -161,8 +172,22 @@ unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
 
 #endif /* USE_AVX2 */
 
+#ifdef USE_AVX512
+
+unsigned int _gcry_chacha20_amd64_avx512_blocks(u32 *state, byte *dst,
+                                                const byte *src,
+                                                size_t nblks) ASM_FUNC_ABI;
+
+#endif /* USE_AVX512 */
+
 #ifdef USE_PPC_VEC
 
+#ifndef WORDS_BIGENDIAN
+unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst,
+                                    const byte *src,
+                                    size_t len);
+#endif
+
 unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
                                         const byte *src,
                                         size_t nblks);
@@ -171,12 +196,24 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
                                         const byte *src,
                                         size_t nblks);
 
+unsigned int _gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst,
+                                        const byte *src,
+                                        size_t nblks);
+
+unsigned int _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst,
+                                        const byte *src,
+                                        size_t nblks);
+
 #undef USE_PPC_VEC_POLY1305
 #if SIZEOF_UNSIGNED_LONG == 8
 #define USE_PPC_VEC_POLY1305 1
 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
                u32 *state, byte *dst, const byte *src, size_t nblks,
                POLY1305_STATE *st, const byte *poly1305_src);
+
+unsigned int _gcry_chacha20_poly1305_ppc9_blocks4(
+               u32 *state, byte *dst, const byte *src, size_t nblks,
+               POLY1305_STATE *st, const byte *poly1305_src);
 #endif /* SIZEOF_UNSIGNED_LONG == 8 */
 
 #endif /* USE_PPC_VEC */
@@ -328,6 +365,13 @@ static unsigned int
 chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
                 size_t nblks)
 {
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks);
+    }
+#endif
+
 #ifdef USE_SSSE3
   if (ctx->use_ssse3)
     {
@@ -338,7 +382,10 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
 #ifdef USE_PPC_VEC
   if (ctx->use_ppc)
     {
-      return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
+      if (ctx->use_p9)
+       return _gcry_chacha20_ppc9_blocks1(ctx->input, dst, src, nblks);
+      else
+       return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
     }
 #endif
 
@@ -464,6 +511,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #ifdef USE_SSSE3
   ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
 #endif
+#ifdef USE_AVX512
+  ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
+#endif
 #ifdef USE_AVX2
   ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
 #endif
@@ -475,6 +525,15 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #endif
 #ifdef USE_PPC_VEC
   ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
+  ctx->use_p9  = (features & HWF_PPC_ARCH_3_00) != 0;
+# ifndef WORDS_BIGENDIAN
+  ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
+#  ifdef ENABLE_FORCE_SOFT_HWFEATURES
+  /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10.
+   * Actual implementation works with HWF_PPC_ARCH_3_00 also. */
+  ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0;
+#  endif
+# endif
 #endif
 #ifdef USE_S390X_VX
   ctx->use_s390x = (features & HWF_S390X_VX) != 0;
@@ -510,6 +569,19 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
   static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
   unsigned int nburn, burn = 0;
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE)
+    {
+      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+      nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf,
+                                                 nblocks);
+      burn = nburn > burn ? nburn : burn;
+      length %= CHACHA20_BLOCK_SIZE;
+      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+#endif
+
 #ifdef USE_AVX2
   if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
     {
@@ -571,7 +643,29 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
-      nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks);
+      if (0)
+        {}
+#ifndef WORDS_BIGENDIAN
+      /*
+       * A workaround to skip counter overflow. This is rare.
+       */
+      else if (ctx->use_p10 && nblocks >= 8
+               && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
+        {
+          size_t len = nblocks * CHACHA20_BLOCK_SIZE;
+          nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len);
+        }
+#endif
+      else if (ctx->use_p9)
+        {
+          nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf,
+                                              nblocks);
+        }
+      else
+        {
+          nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf,
+                                              nblocks);
+        }
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
@@ -598,7 +692,7 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
       burn = nburn > burn ? nburn : burn;
-      length -= nblocks * CHACHA20_BLOCK_SIZE;
+      length %= CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
       inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
     }
@@ -703,6 +797,13 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
 
   if (0)
     { }
+#ifdef USE_AVX512
+  else if (ctx->use_avx512)
+    {
+      /* Skip stitched chacha20-poly1305 for AVX512. */
+      authptr = NULL;
+    }
+#endif
 #ifdef USE_AVX2
   else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
     {
@@ -760,9 +861,17 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
     }
 #endif
 #ifdef USE_PPC_VEC_POLY1305
+  else if (ctx->use_ppc && ctx->use_p10)
+    {
+      /* Skip stitched chacha20-poly1305 for P10. */
+      authptr = NULL;
+    }
   else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
     {
-      nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
+      if (ctx->use_p9)
+        nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, 4);
+      else
+       nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
       burn = nburn > burn ? nburn : burn;
 
       authptr = outbuf;
@@ -904,7 +1013,12 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
          size_t nblocks = length / CHACHA20_BLOCK_SIZE;
          nblocks -= nblocks % 4;
 
-         nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+         if (ctx->use_p9)
+           nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
+                     ctx->input, outbuf, inbuf, nblocks,
+                     &c->u_mode.poly1305.ctx.state, authptr);
+         else
+           nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
                      ctx->input, outbuf, inbuf, nblocks,
                      &c->u_mode.poly1305.ctx.state, authptr);
          burn = nburn > burn ? nburn : burn;
@@ -969,8 +1083,10 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
       size_t currlen = length;
 
       /* Since checksumming is done after encryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for checksumming. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for checksumming.  However
+       * only do splitting if input is large enough so that last chunks does
+       * not end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
@@ -998,6 +1114,11 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 {
   CHACHA20_context_t *ctx = (void *) &c->context.c;
   unsigned int nburn, burn = 0;
+#if defined(USE_AVX512) || defined(USE_PPC_VEC_POLY1305)                  \
+  || defined(USE_AVX2) || defined(USE_SSSE3) || defined(USE_AARCH64_SIMD) \
+  || defined(USE_S390X_VX_POLY1305)
+  int skip_stitched = 0;
+#endif
 
   if (!length)
     return 0;
@@ -1033,8 +1154,23 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 
   gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      /* Skip stitched chacha20-poly1305 for AVX512. */
+      skip_stitched = 1;
+    }
+#endif
+#ifdef USE_PPC_VEC_POLY1305
+  if (ctx->use_ppc && ctx->use_p10)
+    {
+      /* Skip stitched chacha20-poly1305 for P10. */
+      skip_stitched = 1;
+    }
+#endif
+
 #ifdef USE_AVX2
-  if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
+  if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 8;
@@ -1051,7 +1187,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_SSSE3
-  if (ctx->use_ssse3)
+  if (!skip_stitched && ctx->use_ssse3)
     {
       if (length >= 4 * CHACHA20_BLOCK_SIZE)
        {
@@ -1085,7 +1221,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_AARCH64_SIMD
-  if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
+  if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
@@ -1102,14 +1238,20 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_PPC_VEC_POLY1305
-  if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
+  /* skip stitch for p10 */
+  if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
 
-      nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
-                       ctx->input, outbuf, inbuf, nblocks,
-                       &c->u_mode.poly1305.ctx.state, inbuf);
+      if (ctx->use_p9)
+       nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
+                         ctx->input, outbuf, inbuf, nblocks,
+                         &c->u_mode.poly1305.ctx.state, inbuf);
+      else
+       nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+                         ctx->input, outbuf, inbuf, nblocks,
+                         &c->u_mode.poly1305.ctx.state, inbuf);
       burn = nburn > burn ? nburn : burn;
 
       length -= nblocks * CHACHA20_BLOCK_SIZE;
@@ -1119,7 +1261,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_S390X_VX_POLY1305
-  if (ctx->use_s390x)
+  if (!skip_stitched && ctx->use_s390x)
     {
       if (length >= 8 * CHACHA20_BLOCK_SIZE)
        {
@@ -1157,8 +1299,10 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
       size_t currlen = length;
 
       /* Since checksumming is done before decryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for decryption. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for decryption.  However only
+       * do splitting if input is large enough so that last chunks does not
+       * end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
index 76c64fb57db20409b8410c8d5d0590810f56648a..0d75be86e47e07ff9aaedd5a0e0e416d7e132339 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -300,7 +300,7 @@ _gcry_cipher_keywrap_decrypt_auto (gcry_cipher_hd_t c,
           unsigned int plen = (t[4]<<24) | (t[5]<<16) | (t[6]<<8) | t[7];
 
           err = 0;
-          if (plen > 8)
+          if (plen >= 8)
             err = GPG_ERR_CHECKSUM;
           else if (plen)
             {
index d4df1e72aa9bbec1bf289197bd0279c489cad5ed..495c6287832735d68ddd3fa651dd15f42672a836 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index dcb268d084b581f834e7283ad06e92b3dd4ae63c..1243038659634cb85d323261091127f187f720ee 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -38,9 +38,9 @@ static unsigned int
 do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen,
             int do_padding)
 {
-  const unsigned int blocksize = 16;
   gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
-  unsigned char tmp[blocksize];
+  unsigned char tmp[16];
+  const unsigned int blocksize = DIM(tmp);
   unsigned int burn = 0;
   unsigned int unused = c->u_mode.ccm.mac_unused;
   size_t nblocks;
@@ -260,7 +260,7 @@ _gcry_cipher_ccm_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
 }
 
 
-gcry_err_code_t
+static gcry_err_code_t
 _gcry_cipher_ccm_tag (gcry_cipher_hd_t c, unsigned char *outbuf,
                      size_t outbuflen, int check)
 {
@@ -345,8 +345,10 @@ _gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf,
       size_t currlen = inbuflen;
 
       /* Since checksumming is done before encryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for encryption. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for encryption.  However only
+       * do splitting if input is large enough so that last chunks does not
+       * end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       c->u_mode.ccm.encryptlen -= currlen;
@@ -391,8 +393,10 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf,
       size_t currlen = inbuflen;
 
       /* Since checksumming is done after decryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for checksumming. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for checksumming.  However
+       * only do splitting if input is large enough so that last chunks
+       * does not end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen);
index 012c6c13c381a7070e4489d10cf9381447812f78..d51ca4d4a115792713477b8d461ae47fae4af603 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index d66c568770aa26800cad6324628ff2d8ea6c4a52..98363334495bc921640eb65acebd7cdbdd122b1e 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 08f815a9e490f19e90b77762838111d139133c77..9f67bed772d0b7a0c92891f4c832b85b6a9bf0f9 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -53,8 +53,10 @@ _gcry_cipher_eax_encrypt (gcry_cipher_hd_t c,
       size_t currlen = inbuflen;
 
       /* Since checksumming is done after encryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for checksumming. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for checksumming. However
+       * only do splitting if input is large enough so that last chunks does
+       * not end up being short.*/
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen);
@@ -100,8 +102,10 @@ _gcry_cipher_eax_decrypt (gcry_cipher_hd_t c,
       size_t currlen = inbuflen;
 
       /* Since checksumming is done before decryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for decryption. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for decryption.  However only
+       * do splitting if input is large enough so that last chunks does not
+       * end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf,
index 16502b4ad2d03ead2b9c83ed0df393c4ff3e0147..c7027af3152f8ef0a0c31542a44ae9840633e6e6 100644 (file)
@@ -121,21 +121,21 @@ gcry_gcm_reduction_constant:
  * Engineering — MoCrySEn, 2013". */
 
 #define vmull_p64(rq, rl, rh, ad, bd) \
-       vext.8 t0l, ad, ad, $1; \
+       vext.8 t0l, ad, ad, #1; \
        vmull.p8 t0q, t0l, bd; \
-       vext.8 rl, bd, bd, $1; \
+       vext.8 rl, bd, bd, #1; \
        vmull.p8 rq, ad, rl; \
-       vext.8 t1l, ad, ad, $2; \
+       vext.8 t1l, ad, ad, #2; \
        vmull.p8 t1q, t1l, bd; \
-       vext.8 t3l, bd, bd, $2; \
+       vext.8 t3l, bd, bd, #2; \
        vmull.p8 t3q, ad, t3l; \
-       vext.8 t2l, ad, ad, $3; \
+       vext.8 t2l, ad, ad, #3; \
        vmull.p8 t2q, t2l, bd; \
        veor t0q, t0q, rq; \
-       vext.8 rl, bd, bd, $3; \
+       vext.8 rl, bd, bd, #3; \
        vmull.p8 rq, ad, rl; \
        veor t1q, t1q, t3q; \
-       vext.8 t3l, bd, bd, $4; \
+       vext.8 t3l, bd, bd, #4; \
        vmull.p8 t3q, ad, t3l; \
        veor t0l, t0l, t0h; \
        vand t0h, t0h, k48; \
@@ -147,13 +147,13 @@ gcry_gcm_reduction_constant:
        veor t2l, t2l, t2h; \
        vand t2h, t2h, k16; \
        veor t3l, t3l, t3h; \
-       vmov.i64 t3h, $0; \
-       vext.8 t0q, t0q, t0q, $15; \
+       vmov.i64 t3h, #0; \
+       vext.8 t0q, t0q, t0q, #15; \
        veor t2l, t2l, t2h; \
-       vext.8 t1q, t1q, t1q, $14; \
+       vext.8 t1q, t1q, t1q, #14; \
        vmull.p8 rq, ad, bd; \
-       vext.8 t2q, t2q, t2q, $13; \
-       vext.8 t3q, t3q, t3q, $12; \
+       vext.8 t2q, t2q, t2q, #13; \
+       vext.8 t3q, t3q, t3q, #12; \
        veor t0q, t0q, t1q; \
        veor t2q, t2q, t3q; \
        veor rq, rq, t0q; \
index e6714249f0cc179788c39170eeb5c42718e79680..0c31a56346e43edd242e6fe38e71205d0fc53219 100644 (file)
 
 .cpu generic+simd+crypto
 
-.text
-
 
 /* Constants */
 
+SECTION_RODATA
+
 .align 4
+ELF(.type gcry_gcm_reduction_constant,%object;)
 gcry_gcm_reduction_constant:
 .Lrconst:
   .quad 0x87
@@ -149,34 +150,14 @@ gcry_gcm_reduction_constant:
 #define _(...) __VA_ARGS__
 #define __ _()
 
-#define CLEAR_REG(reg) movi reg.16b, #0;
-
-#define VPUSH_ABI \
-        stp d8, d9, [sp, #-16]!; \
-        CFI_ADJUST_CFA_OFFSET(16); \
-        stp d10, d11, [sp, #-16]!; \
-        CFI_ADJUST_CFA_OFFSET(16); \
-        stp d12, d13, [sp, #-16]!; \
-        CFI_ADJUST_CFA_OFFSET(16); \
-        stp d14, d15, [sp, #-16]!; \
-        CFI_ADJUST_CFA_OFFSET(16);
-
-#define VPOP_ABI \
-        ldp d14, d15, [sp], #16; \
-        CFI_ADJUST_CFA_OFFSET(-16); \
-        ldp d12, d13, [sp], #16; \
-        CFI_ADJUST_CFA_OFFSET(-16); \
-        ldp d10, d11, [sp], #16; \
-        CFI_ADJUST_CFA_OFFSET(-16); \
-        ldp d8, d9, [sp], #16; \
-        CFI_ADJUST_CFA_OFFSET(-16);
+.text
 
 /*
  * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
  *                                          const byte *buf, size_t nblocks,
  *                                          void *gcm_table);
  */
-.align 3
+.align 4
 .globl _gcry_ghash_armv8_ce_pmull
 ELF(.type  _gcry_ghash_armv8_ce_pmull,%function;)
 _gcry_ghash_armv8_ce_pmull:
@@ -375,7 +356,7 @@ ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;)
  *                                            const byte *buf, size_t nblocks,
  *                                            void *gcm_table);
  */
-.align 3
+.align 4
 .globl _gcry_polyval_armv8_ce_pmull
 ELF(.type  _gcry_polyval_armv8_ce_pmull,%function;)
 _gcry_polyval_armv8_ce_pmull:
@@ -601,7 +582,7 @@ ELF(.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull;)
 /*
  * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
  */
-.align 3
+.align 4
 .globl _gcry_ghash_setup_armv8_ce_pmull
 ELF(.type  _gcry_ghash_setup_armv8_ce_pmull,%function;)
 _gcry_ghash_setup_armv8_ce_pmull:
index daf807d0a99c4f043f4bdf4948d854b8d65a730e..4368ba16396cb5cbe02182cb8167ee3113e7d9bd 100644 (file)
@@ -1,11 +1,11 @@
 /* cipher-gcm-intel-pclmul.c  -  Intel PCLMUL accelerated Galois Counter Mode
  *                               implementation
- * Copyright (C) 2013-2014,2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2014,2019,2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
 
 
+#define GCM_INTEL_USE_VPCLMUL_AVX2         (1 << 0)
+#define GCM_INTEL_AGGR8_TABLE_INITIALIZED  (1 << 1)
+#define GCM_INTEL_AGGR16_TABLE_INITIALIZED (1 << 2)
+#define GCM_INTEL_USE_VPCLMUL_AVX512       (1 << 3)
+#define GCM_INTEL_AGGR32_TABLE_INITIALIZED (1 << 4)
+
+
 /*
  Intel PCLMUL ghash based on white paper:
   "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
    GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
  */
-static ASM_FUNC_ATTR_INLINE void reduction(void)
+static ASM_FUNC_ATTR_INLINE
+void reduction(void)
 {
   /* input: <xmm1:xmm3> */
 
@@ -83,7 +91,8 @@ static ASM_FUNC_ATTR_INLINE void reduction(void)
                 ::: "memory" );
 }
 
-static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void)
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul(void)
 {
   /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
      Input must be converted to little-endian.
@@ -358,12 +367,12 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
                                                                                \
     "pshufd $78, %%xmm8, %%xmm11\n\t"                                          \
     "pshufd $78, %%xmm5, %%xmm7\n\t"                                           \
-    "pxor %%xmm8, %%xmm11\n\t"  /* xmm11 holds 4:a0+a1 */                      \
-    "pxor %%xmm5, %%xmm7\n\t"   /* xmm7 holds 4:b0+b1 */                       \
+    "pxor %%xmm8, %%xmm11\n\t"  /* xmm11 holds 2:a0+a1 */                      \
+    "pxor %%xmm5, %%xmm7\n\t"   /* xmm7 holds 2:b0+b1 */                       \
     "movdqa %%xmm8, %%xmm6\n\t"                                                \
-    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"   /* xmm6 holds 4:a0*b0 */              \
-    "pclmulqdq $17, %%xmm8, %%xmm5\n\t"  /* xmm5 holds 4:a1*b1 */              \
-    "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 4:(a0+a1)*(b0+b1) */    \
+    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"   /* xmm6 holds 2:a0*b0 */              \
+    "pclmulqdq $17, %%xmm8, %%xmm5\n\t"  /* xmm5 holds 2:a1*b1 */              \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 2:(a0+a1)*(b0+b1) */    \
                                                                                \
     "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */             \
     "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */             \
@@ -371,16 +380,16 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
                                                                                \
     "pshufd $78, %%xmm0, %%xmm11\n\t"                                          \
     "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
-    "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */                       \
-    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 3:b0+b1 */                        \
+    "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */                       \
+    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 1:b0+b1 */                        \
     "movdqa %%xmm0, %%xmm6\n\t"                                                \
-    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */               \
-    "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */               \
-    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */     \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 1:a0*b0 */               \
+    "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */     \
                                                                                \
-    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */         \
-    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */         \
-    "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4+5+6+7+8:a0*b0 */           \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4+5+6+7+8:a1*b1 */           \
+    "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */  \
                                                                                \
     /* aggregated reduction... */                                              \
     "movdqa %%xmm3, %%xmm5\n\t"                                                \
@@ -432,14 +441,804 @@ gfmul_pclmul_aggr8_le(const void *buf, const void *h_table)
 
   reduction();
 }
-#endif
 
-static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX2
+
+#define GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le)                                          \
+    /* perform clmul and merge results... */                                            \
+    "vmovdqu 0*16(%[buf]), %%ymm5\n\t"                                                  \
+    "vmovdqu 2*16(%[buf]), %%ymm2\n\t"                                                  \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t"                                                  \
+                                                                                        \
+    "vpshufd $78, %%ymm0, %%ymm5\n\t"                                                   \
+    "vpshufd $78, %%ymm1, %%ymm4\n\t"                                                   \
+    "vpxor %%ymm0, %%ymm5, %%ymm5\n\t" /* ymm5 holds 15|16:a0+a1 */                     \
+    "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t"  /* ymm3 holds 15|16:a0*b0 */           \
+    "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds 15|16:a1*b1 */           \
+    "vpclmulqdq $0, %%ymm5, %%ymm4, %%ymm4\n\t"  /* ymm4 holds 15|16:(a0+a1)*(b0+b1) */ \
+                                                                                        \
+    "vmovdqu %[h1_h2], %%ymm0\n\t"                                                      \
+                                                                                        \
+    "vpshufd $78, %%ymm13, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm13, %%ymm14, %%ymm14\n\t" /* ymm14 holds 13|14:a0+a1 */                 \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 13|14:b0+b1 */                  \
+    "vpclmulqdq $0, %%ymm2, %%ymm13, %%ymm6\n\t"  /* ymm6 holds 13|14:a0*b0 */          \
+    "vpclmulqdq $17, %%ymm13, %%ymm2, %%ymm2\n\t" /* ymm2 holds 13|14:a1*b1 */          \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t"  /* ymm7 holds 13|14:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 13+15|14+16:a0*b0 */               \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 13+15|14+16:a1*b1 */               \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 13+15|14+16:(a0+a1)*(b0+b1) */     \
+                                                                                        \
+    "vmovdqu 4*16(%[buf]), %%ymm5\n\t"                                                  \
+    "vmovdqu 6*16(%[buf]), %%ymm2\n\t"                                                  \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+                                                                                        \
+    "vpshufd $78, %%ymm12, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm5, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm12, %%ymm14, %%ymm14\n\t" /* ymm14 holds 11|12:a0+a1 */                 \
+    "vpxor %%ymm5, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 11|12:b0+b1 */                  \
+    "vpclmulqdq $0, %%ymm5, %%ymm12, %%ymm6\n\t"  /* ymm6 holds 11|12:a0*b0 */          \
+    "vpclmulqdq $17, %%ymm12, %%ymm5, %%ymm5\n\t" /* ymm5 holds 11|12:a1*b1 */          \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t"  /* ymm7 holds 11|12:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 11+13+15|12+14+16:a0*b0 */         \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 11+13+15|12+14+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 11+13+15|12+14+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpshufd $78, %%ymm11, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm11, %%ymm14, %%ymm14\n\t" /* ymm14 holds 9|10:a0+a1 */                  \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 9|10:b0+b1 */                   \
+    "vpclmulqdq $0, %%ymm2, %%ymm11, %%ymm6\n\t"  /* ymm6 holds 9|10:a0*b0 */           \
+    "vpclmulqdq $17, %%ymm11, %%ymm2, %%ymm2\n\t" /* ymm2 holds 9|10:a1*b1 */           \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:(a0+a1)*(b0+b1) */  \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 9+11+…+15|10+12+…+16:a0*b0 */      \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 9+11+…+15|10+12+…+16:a1*b1 */      \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 9+11+…+15|10+12+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vmovdqu 8*16(%[buf]), %%ymm5\n\t"                                                  \
+    "vmovdqu 10*16(%[buf]), %%ymm2\n\t"                                                 \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+                                                                                        \
+    "vpshufd $78, %%ymm10, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm5, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm10, %%ymm14, %%ymm14\n\t" /* ymm14 holds 7|8:a0+a1 */                   \
+    "vpxor %%ymm5, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 7|8:b0+b1 */                    \
+    "vpclmulqdq $0, %%ymm5, %%ymm10, %%ymm6\n\t"  /* ymm6 holds 7|8:a0*b0 */            \
+    "vpclmulqdq $17, %%ymm10, %%ymm5, %%ymm5\n\t" /* ymm5 holds 7|8:a1*b1 */            \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 7+9+…+15|8+10+…+16:a0*b0 */        \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 7+9+…+15|8+10+…+16:a1*b1 */        \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 7+9+…+15|8+10+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpshufd $78, %%ymm9, %%ymm14\n\t"                                                  \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm9, %%ymm14, %%ymm14\n\t" /* ymm14 holds 5|6:a0+a1 */                    \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"   /* ymm7 holds 5|6:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm2, %%ymm9, %%ymm6\n\t"  /* ymm6 holds 5|6:a0*b0 */             \
+    "vpclmulqdq $17, %%ymm9, %%ymm2, %%ymm2\n\t" /* ymm2 holds 5|6:a1*b1 */             \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 5+7+…+15|6+8+…+16:a0*b0 */         \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 5+7+…+15|6+8+…+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 5+7+…+15|6+8+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vmovdqu 12*16(%[buf]), %%ymm5\n\t"                                                 \
+    "vmovdqu 14*16(%[buf]), %%ymm2\n\t"                                                 \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+                                                                                        \
+    "vpshufd $78, %%ymm8, %%ymm14\n\t"                                                  \
+    "vpshufd $78, %%ymm5, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm8, %%ymm14, %%ymm14\n\t" /* ymm14 holds 3|4:a0+a1 */                    \
+    "vpxor %%ymm5, %%ymm7, %%ymm7\n\t"   /* ymm7 holds 3|4:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm5, %%ymm8, %%ymm6\n\t"  /* ymm6 holds 3|4:a0*b0 */             \
+    "vpclmulqdq $17, %%ymm8, %%ymm5, %%ymm5\n\t" /* ymm5 holds 3|4:a1*b1 */             \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 3+5+…+15|4+6+…+16:a0*b0 */         \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 3+5+…+15|4+6+…+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 3+5+…+15|4+6+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpshufd $78, %%ymm0, %%ymm14\n\t"                                                  \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm0, %%ymm14, %%ymm14\n\t" /* ymm14 holds 1|2:a0+a1 */                    \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"   /* ymm7 holds 1|2:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm2, %%ymm0, %%ymm6\n\t"  /* ymm6 holds 1|2:a0*b0 */             \
+    "vpclmulqdq $17, %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds 1|2:a1*b1 */             \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vmovdqu %[h15_h16], %%ymm0\n\t"                                                    \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 1+3+…+15|2+4+…+16:a0*b0 */         \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 1+3+…+15|2+4+…+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    /* aggregated reduction... */                                                       \
+    "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */                     \
+    "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */     \
+    "vpslldq $8, %%ymm4, %%ymm5\n\t"                                                    \
+    "vpsrldq $8, %%ymm4, %%ymm4\n\t"                                                    \
+    "vpxor %%ymm5, %%ymm3, %%ymm3\n\t"                                                  \
+    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:ymm3> holds the result of the           \
+                                          carry-less multiplication of ymm0             \
+                                          by ymm1 */                                    \
+                                                                                        \
+    /* first phase of the reduction */                                                  \
+    "vpsllq $1, %%ymm3, %%ymm6\n\t"  /* packed left shifting << 63 */                   \
+    "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"                                                  \
+    "vpsllq $57, %%ymm3, %%ymm5\n\t"  /* packed left shifting << 57 */                  \
+    "vpsllq $62, %%ymm6, %%ymm6\n\t"  /* packed left shifting << 62 */                  \
+    "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */                   \
+    "vpshufd $0x6a, %%ymm6, %%ymm5\n\t"                                                 \
+    "vpshufd $0xae, %%ymm6, %%ymm6\n\t"                                                 \
+    "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */      \
+                                                                                        \
+    /* second phase of the reduction */                                                 \
+    "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */                   \
+    "vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed right shifting >> 1 */                 \
+    "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"                                                  \
+    "vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed right shifting >> 2 */                 \
+    "vpxor %%ymm3, %%ymm1, %%ymm1\n\t"                                                  \
+    "vpsrlq $5, %%ymm3, %%ymm3\n\t"    /* packed right shifting >> 7 */                 \
+    "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"                                                  \
+    "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */                      \
+                                                                                        \
+    /* merge 128-bit halves */                                                          \
+    "vextracti128 $1, %%ymm1, %%xmm2\n\t"                                               \
+    "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx2_aggr16(const void *buf, const void *h_table,
+                         const u64 *h1_h2_h15_h16)
+{
+  /* GHASH 16 blocks (256 bytes) of big-endian input at BUF.  Input:
+      Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13
+      bemask: YMM15
+      Hash: XMM1
+    Output:
+      Hash: XMM1
+    Inputs YMM0, YMM8..YMM13 and YMM15 stay unmodified.  NOTE(review):
+    %[h_table] is not referenced by the asm template (kept for uniformity).
+  */
+  asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le)
+               :
+               : [buf] "r" (buf),
+                 [h_table] "r" (h_table),
+                 [h1_h2] "m" (h1_h2_h15_h16[0]),
+                 [h15_h16] "m" (h1_h2_h15_h16[4])
+               : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx2_aggr16_le(const void *buf, const void *h_table,
+                            const u64 *h1_h2_h15_h16)
+{
+  /* GHASH 16 blocks (256 bytes) of little-endian input at BUF (no
+    byte-swap performed).  Input:
+      Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13
+      bemask: YMM15
+      Hash: XMM1
+    Output: Hash: XMM1
+    Inputs YMM0, YMM8..YMM13 and YMM15 stay unmodified.  NOTE(review):
+    %[h_table] is not referenced by the asm template (kept for uniformity).
+  */
+  asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(le_to_le)
+               :
+               : [buf] "r" (buf),
+                 [h_table] "r" (h_table),
+                 [h1_h2] "m" (h1_h2_h15_h16[0]),
+                 [h15_h16] "m" (h1_h2_h15_h16[4])
+               : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul_avx2(void)
+{
+  /* Input: YMM0 and YMM1, Output: YMM1. Input YMM0 stays unmodified.
+     Input must be converted to little-endian.
+   */
+  asm volatile (/* gfmul, ymm0 has operator a and ymm1 has operator b. */
+               "vpshufd $78, %%ymm0, %%ymm2\n\t"
+               "vpshufd $78, %%ymm1, %%ymm4\n\t"
+               "vpxor %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds a0+a1 */
+               "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds b0+b1 */
+
+               "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t"  /* ymm3 holds a0*b0 */
+               "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds a1*b1 */
+               "vpclmulqdq $0, %%ymm2, %%ymm4, %%ymm4\n\t"  /* ymm4 holds (a0+a1)*(b0+b1) */
+
+               "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */
+               "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+               "vpslldq $8, %%ymm4, %%ymm5\n\t"
+               "vpsrldq $8, %%ymm4, %%ymm4\n\t"
+               "vpxor %%ymm5, %%ymm3, %%ymm3\n\t"
+               "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:ymm3> holds the result of the
+                                                     carry-less multiplication of ymm0
+                                                     by ymm1 */
+
+               /* first phase of the reduction */
+               "vpsllq $1, %%ymm3, %%ymm6\n\t"  /* packed left shifting << 63 */
+               "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+               "vpsllq $57, %%ymm3, %%ymm5\n\t"  /* packed left shifting << 57 */
+               "vpsllq $62, %%ymm6, %%ymm6\n\t"  /* packed left shifting << 62 */
+               "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */
+               "vpshufd $0x6a, %%ymm6, %%ymm5\n\t"
+               "vpshufd $0xae, %%ymm6, %%ymm6\n\t"
+               "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */
+
+               /* second phase of the reduction */
+               "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */
+               "vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed right shifting >> 1 */
+               "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+               "vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed right shifting >> 2 */
+               "vpxor %%ymm3, %%ymm1, %%ymm1\n\t"
+               "vpsrlq $5, %%ymm3, %%ymm3\n\t"    /* packed right shifting >> 7 */
+               "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+               "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */
+                ::: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh_avx2(void *h, unsigned int hoffs)
+{ /* Shift two 128-bit hash keys at h+hoffs left by one bit each (AVX2 gcm_lsh) */
+  static const u64 pconst[4] __attribute__ ((aligned (32))) =
+    {
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000),
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000)
+    };
+
+  asm volatile ("vmovdqu %[h], %%ymm2\n\t"
+                "vpshufd $0xff, %%ymm2, %%ymm3\n\t" /* broadcast top dword of each half */
+                "vpsrad $31, %%ymm3, %%ymm3\n\t"    /* sign-extend: mask from top bit */
+                "vpslldq $8, %%ymm2, %%ymm4\n\t"
+                "vpand %[pconst], %%ymm3, %%ymm3\n\t" /* pconst applied if top bit set */
+                "vpaddq %%ymm2, %%ymm2, %%ymm2\n\t"   /* qword-wise shift left by 1 */
+                "vpsrlq $63, %%ymm4, %%ymm4\n\t"      /* carry low-qword MSB upward */
+                "vpxor %%ymm3, %%ymm2, %%ymm2\n\t"
+                "vpxor %%ymm4, %%ymm2, %%ymm2\n\t"
+                "vmovdqu %%ymm2, %[h]\n\t"
+                : [h] "+m" (*((byte *)h + hoffs))
+                : [pconst] "m" (*pconst)
+                : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+load_h1h2_to_ymm1(gcry_cipher_hd_t c)
+{ /* Load the ghash key and gcm_table[0] as one 256-bit value into YMM1. */
+  unsigned int key_pos =
+    offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key);
+  unsigned int table_pos =
+    offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table);
+
+  if (key_pos + 16 == table_pos)
+    {
+      /* Optimization: Table follows immediately after key, so both
+         halves can be fetched with a single 256-bit load. */
+      asm volatile ("vmovdqu %[key], %%ymm1\n\t"
+                   :
+                   : [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                   : "memory");
+    }
+  else
+    {
+      asm volatile ("vmovdqa %[key], %%xmm1\n\t" /* low half: key */
+                   "vinserti128 $1, 0*16(%[h_table]), %%ymm1, %%ymm1\n\t" /* high half: table[0] */
+                   :
+                   : [h_table] "r" (c->u_mode.gcm.gcm_table),
+                     [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                   : "memory");
+    }
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8_avx2(gcry_cipher_hd_t c)
+{ /* Populate gcm_table with H⁵..H⁸ for 8-block aggregated GHASH (AVX2). */
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+  asm volatile (/* load H⁴ */
+               "vbroadcasti128 3*16(%[h_table]), %%ymm0\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+  /* load H <<< 1, H² <<< 1 */
+  load_h1h2_to_ymm1 (c);
+
+  gfmul_pclmul_avx2 (); /* H<<<1•H⁴ => H⁵, H²<<<1•H⁴ => H⁶ */
+
+  asm volatile ("vmovdqu %%ymm1, 3*16(%[h_table])\n\t"
+               /* load H³ <<< 1, H⁴ <<< 1 */
+               "vmovdqu 1*16(%[h_table]), %%ymm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx2 (); /* H³<<<1•H⁴ => H⁷, H⁴<<<1•H⁴ => H⁸ */
+
+  asm volatile ("vmovdqu %%ymm1, 6*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+               "vmovdqu %%ymm1, 5*16(%[h_table])\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, H⁶ <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1, H⁸ <<< 1 */
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr16_avx2(gcry_cipher_hd_t c)
+{ /* Populate gcm_table with H⁹..H¹⁶ for 16-block aggregated GHASH (AVX2). */
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED;
+
+  asm volatile (/* load H⁸ */
+               "vbroadcasti128 7*16(%[h_table]), %%ymm0\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+  /* load H <<< 1, H² <<< 1 */
+  load_h1h2_to_ymm1 (c);
+
+  gfmul_pclmul_avx2 (); /* H<<<1•H⁸ => H⁹, H²<<<1•H⁸ => H¹⁰ */
+
+  asm volatile ("vmovdqu %%ymm1, 7*16(%[h_table])\n\t"
+               /* load H³ <<< 1, H⁴ <<< 1 */
+               "vmovdqu 1*16(%[h_table]), %%ymm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx2 (); /* H³<<<1•H⁸ => H¹¹, H⁴<<<1•H⁸ => H¹² */
+
+  asm volatile ("vmovdqu %%ymm1, 9*16(%[h_table])\n\t"
+               /* load H⁵ <<< 1, H⁶ <<< 1 */
+               "vmovdqu 3*16(%[h_table]), %%ymm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx2 (); /* H⁵<<<1•H⁸ => H¹³, H⁶<<<1•H⁸ => H¹⁴ */
+
+  asm volatile ("vmovdqu %%ymm1, 11*16(%[h_table])\n\t"
+               /* load H⁷ <<< 1, H⁸ <<< 1 */
+               "vmovdqu 5*16(%[h_table]), %%ymm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx2 (); /* H⁷<<<1•H⁸ => H¹⁵, H⁸<<<1•H⁸ => H¹⁶ */
+
+  asm volatile ("vmovdqu %%ymm1, 14*16(%[h_table])\n\t" /* store H¹⁶ for aggr32 setup */
+                "vmovdqu %%ymm1, 13*16(%[h_table])\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, H¹⁰ <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 9 * 16); /* H¹¹ <<< 1, H¹² <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, H¹⁴ <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 13 * 16); /* H¹⁵ <<< 1, H¹⁶ <<< 1 */
+}
+
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX512
+
+#define GFMUL_AGGR32_ASM_VPCMUL_AVX512(be_to_le)                                          \
+    /* perform clmul and merge results... */                                              \
+    "vmovdqu64 0*16(%[buf]), %%zmm5\n\t"                                                  \
+    "vmovdqu64 4*16(%[buf]), %%zmm2\n\t"                                                  \
+    be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */                        \
+    be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */                        \
+    "vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"                                                   \
+                                                                                          \
+    "vpshufd $78, %%zmm0, %%zmm5\n\t"                                                     \
+    "vpshufd $78, %%zmm1, %%zmm4\n\t"                                                     \
+    "vpxorq %%zmm0, %%zmm5, %%zmm5\n\t" /* zmm5 holds 29|…|32:a0+a1 */                    \
+    "vpxorq %%zmm1, %%zmm4, %%zmm4\n\t" /* zmm4 holds 29|…|32:b0+b1 */                    \
+    "vpclmulqdq $0, %%zmm1, %%zmm0, %%zmm3\n\t"  /* zmm3 holds 29|…|32:a0*b0 */           \
+    "vpclmulqdq $17, %%zmm0, %%zmm1, %%zmm1\n\t" /* zmm1 holds 29|…|32:a1*b1 */           \
+    "vpclmulqdq $0, %%zmm5, %%zmm4, %%zmm4\n\t"  /* zmm4 holds 29|…|32:(a0+a1)*(b0+b1) */ \
+                                                                                          \
+    "vpshufd $78, %%zmm13, %%zmm14\n\t"                                                   \
+    "vpshufd $78, %%zmm2, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm13, %%zmm14, %%zmm14\n\t" /* zmm14 holds 25|…|28:a0+a1 */                \
+    "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t"    /* zmm7 holds 25|…|28:b0+b1 */                 \
+    "vpclmulqdq $0, %%zmm2, %%zmm13, %%zmm17\n\t"  /* zmm17 holds 25|…|28:a0*b0 */        \
+    "vpclmulqdq $17, %%zmm13, %%zmm2, %%zmm18\n\t" /* zmm18 holds 25|…|28:a1*b1 */        \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm19\n\t"  /* zmm19 holds 25|…|28:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vmovdqu64 8*16(%[buf]), %%zmm5\n\t"                                                  \
+    "vmovdqu64 12*16(%[buf]), %%zmm2\n\t"                                                 \
+    be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */                        \
+    be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */                        \
+                                                                                          \
+    "vpshufd $78, %%zmm12, %%zmm14\n\t"                                                   \
+    "vpshufd $78, %%zmm5, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm12, %%zmm14, %%zmm14\n\t" /* zmm14 holds 21|…|24:a0+a1 */                \
+    "vpxorq %%zmm5, %%zmm7, %%zmm7\n\t"    /* zmm7 holds 21|…|24:b0+b1 */                 \
+    "vpclmulqdq $0, %%zmm5, %%zmm12, %%zmm6\n\t"  /* zmm6 holds 21|…|24:a0*b0 */          \
+    "vpclmulqdq $17, %%zmm12, %%zmm5, %%zmm5\n\t" /* zmm5 holds 21|…|24:a1*b1 */          \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t"  /* zmm7 holds 21|…|24:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vpternlogq $0x96, %%zmm6, %%zmm17, %%zmm3\n\t" /* zmm3 holds 21+…|…|…+32:a0*b0 */    \
+    "vpternlogq $0x96, %%zmm5, %%zmm18, %%zmm1\n\t" /* zmm1 holds 21+…|…|…+32:a1*b1 */    \
+    "vpternlogq $0x96, %%zmm7, %%zmm19, %%zmm4\n\t" /* zmm4 holds 21+…|…|…+32:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vpshufd $78, %%zmm11, %%zmm14\n\t"                                                   \
+    "vpshufd $78, %%zmm2, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm11, %%zmm14, %%zmm14\n\t" /* zmm14 holds 17|…|20:a0+a1 */                \
+    "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t"    /* zmm7 holds 17|…|20:b0+b1 */                 \
+    "vpclmulqdq $0, %%zmm2, %%zmm11, %%zmm17\n\t"  /* zmm17 holds 17|…|20:a0*b0 */        \
+    "vpclmulqdq $17, %%zmm11, %%zmm2, %%zmm18\n\t" /* zmm18 holds 17|…|20:a1*b1 */        \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm19\n\t" /* zmm19 holds 17|…|20:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vmovdqu64 16*16(%[buf]), %%zmm5\n\t"                                                 \
+    "vmovdqu64 20*16(%[buf]), %%zmm2\n\t"                                                 \
+    be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */                        \
+    be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */                        \
+                                                                                          \
+    "vpshufd $78, %%zmm10, %%zmm14\n\t"                                                   \
+    "vpshufd $78, %%zmm5, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm10, %%zmm14, %%zmm14\n\t" /* zmm14 holds 13|…|16:a0+a1 */                \
+    "vpxorq %%zmm5, %%zmm7, %%zmm7\n\t"    /* zmm7 holds 13|…|16:b0+b1 */                 \
+    "vpclmulqdq $0, %%zmm5, %%zmm10, %%zmm6\n\t"  /* zmm6 holds 13|…|16:a0*b0 */          \
+    "vpclmulqdq $17, %%zmm10, %%zmm5, %%zmm5\n\t" /* zmm5 holds 13|…|16:a1*b1 */          \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 13|…|16:(a0+a1)*(b0+b1) */ \
+                                                                                          \
+    "vpternlogq $0x96, %%zmm6, %%zmm17, %%zmm3\n\t" /* zmm3 holds 13+…|…|…+32:a0*b0 */    \
+    "vpternlogq $0x96, %%zmm5, %%zmm18, %%zmm1\n\t" /* zmm1 holds 13+…|…|…+32:a1*b1 */    \
+    "vpternlogq $0x96, %%zmm7, %%zmm19, %%zmm4\n\t" /* zmm4 holds 13+…|…|…+32:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vpshufd $78, %%zmm9, %%zmm14\n\t"                                                    \
+    "vpshufd $78, %%zmm2, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm9, %%zmm14, %%zmm14\n\t" /* zmm14 holds 9|…|12:a0+a1 */                  \
+    "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t"   /* zmm7 holds 9|…|12:b0+b1 */                   \
+    "vpclmulqdq $0, %%zmm2, %%zmm9, %%zmm17\n\t"  /* zmm17 holds 9|…|12:a0*b0 */          \
+    "vpclmulqdq $17, %%zmm9, %%zmm2, %%zmm18\n\t" /* zmm18 holds 9|…|12:a1*b1 */          \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm19\n\t" /* zmm19 holds 9|…|12:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vmovdqu64 24*16(%[buf]), %%zmm5\n\t"                                                 \
+    "vmovdqu64 28*16(%[buf]), %%zmm2\n\t"                                                 \
+    be_to_le("vpshufb %%zmm15, %%zmm5, %%zmm5\n\t") /* be => le */                        \
+    be_to_le("vpshufb %%zmm15, %%zmm2, %%zmm2\n\t") /* be => le */                        \
+                                                                                          \
+    "vpshufd $78, %%zmm8, %%zmm14\n\t"                                                    \
+    "vpshufd $78, %%zmm5, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm8, %%zmm14, %%zmm14\n\t" /* zmm14 holds 5|…|8:a0+a1 */                   \
+    "vpxorq %%zmm5, %%zmm7, %%zmm7\n\t"   /* zmm7 holds 5|…|8:b0+b1 */                    \
+    "vpclmulqdq $0, %%zmm5, %%zmm8, %%zmm6\n\t"  /* zmm6 holds 5|…|8:a0*b0 */             \
+    "vpclmulqdq $17, %%zmm8, %%zmm5, %%zmm5\n\t" /* zmm5 holds 5|…|8:a1*b1 */             \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 5|…|8:(a0+a1)*(b0+b1) */   \
+                                                                                          \
+    "vpternlogq $0x96, %%zmm6, %%zmm17, %%zmm3\n\t" /* zmm3 holds 5+…|…|…+32:a0*b0 */     \
+    "vpternlogq $0x96, %%zmm5, %%zmm18, %%zmm1\n\t" /* zmm1 holds 5+…|…|…+32:a1*b1 */     \
+    "vpternlogq $0x96, %%zmm7, %%zmm19, %%zmm4\n\t" /* zmm4 holds 5+…|…|…+32:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    "vpshufd $78, %%zmm16, %%zmm14\n\t"                                                   \
+    "vpshufd $78, %%zmm2, %%zmm7\n\t"                                                     \
+    "vpxorq %%zmm16, %%zmm14, %%zmm14\n\t" /* zmm14 holds 1|…|4:a0+a1 */                  \
+    "vpxorq %%zmm2, %%zmm7, %%zmm7\n\t"   /* zmm7 holds 1|…|4:b0+b1 */                   \
+    "vpclmulqdq $0, %%zmm2, %%zmm16, %%zmm6\n\t"  /* zmm6 holds 1|…|4:a0*b0 */           \
+    "vpclmulqdq $17, %%zmm16, %%zmm2, %%zmm2\n\t" /* zmm2 holds 1|…|4:a1*b1 */           \
+    "vpclmulqdq $0, %%zmm14, %%zmm7, %%zmm7\n\t" /* zmm7 holds 1|…|4:(a0+a1)*(b0+b1) */  \
+                                                                                          \
+    "vpxorq %%zmm6, %%zmm3, %%zmm3\n\t" /* zmm3 holds 1+3+…+15|2+4+…+16:a0*b0 */          \
+    "vpxorq %%zmm2, %%zmm1, %%zmm1\n\t" /* zmm1 holds 1+3+…+15|2+4+…+16:a1*b1 */          \
+    "vpxorq %%zmm7, %%zmm4, %%zmm4\n\t" /* zmm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\
+                                                                                          \
+    /* aggregated reduction... */                                                         \
+    "vpternlogq $0x96, %%zmm1, %%zmm3, %%zmm4\n\t" /* zmm4 holds                          \
+                                                    * a0*b0+a1*b1+(a0+a1)*(b0+b1) */      \
+    "vpslldq $8, %%zmm4, %%zmm5\n\t"                                                      \
+    "vpsrldq $8, %%zmm4, %%zmm4\n\t"                                                      \
+    "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t"                                                   \
+    "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" /* <zmm1:zmm3> holds the result of the            \
+                                          carry-less multiplication of zmm0               \
+                                          by zmm1 */                                      \
+                                                                                          \
+    /* first phase of the reduction */                                                    \
+    "vpsllq $1, %%zmm3, %%zmm6\n\t"  /* packed left shifting << 1 */                      \
+    "vpxorq %%zmm3, %%zmm6, %%zmm6\n\t"                                                   \
+    "vpsllq $57, %%zmm3, %%zmm5\n\t"  /* packed left shifting << 57 */                    \
+    "vpsllq $62, %%zmm6, %%zmm6\n\t"  /* packed left shifting << 62 */                    \
+    "vpxorq %%zmm5, %%zmm6, %%zmm6\n\t" /* xor the shifted versions */                    \
+    "vpshufd $0x6a, %%zmm6, %%zmm5\n\t"                                                   \
+    "vpshufd $0xae, %%zmm6, %%zmm6\n\t"                                                   \
+    "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t" /* first phase of the reduction complete */       \
+                                                                                          \
+    /* second phase of the reduction */                                                   \
+    "vpsrlq $1, %%zmm3, %%zmm2\n\t"    /* packed right shifting >> 1 */                   \
+    "vpsrlq $2, %%zmm3, %%zmm4\n\t"    /* packed right shifting >> 2 */                   \
+    "vpsrlq $7, %%zmm3, %%zmm5\n\t"    /* packed right shifting >> 7 */                   \
+    "vpternlogq $0x96, %%zmm3, %%zmm2, %%zmm1\n\t" /* xor the shifted versions */         \
+    "vpternlogq $0x96, %%zmm4, %%zmm5, %%zmm6\n\t"                                        \
+    "vpxorq %%zmm6, %%zmm1, %%zmm1\n\t" /* the result is in zmm1 */                       \
+                                                                                          \
+    /* merge 256-bit halves */                                                            \
+    "vextracti64x4 $1, %%zmm1, %%ymm2\n\t"                                                \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t"                                                    \
+    /* merge 128-bit halves */                                                            \
+    "vextracti128 $1, %%ymm1, %%xmm2\n\t"                                                 \
+    "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx512_aggr32(const void *buf, const void *h_table)
+{
+  /* Aggregated GHASH over 32 input blocks; big-endian input blocks are
+     byte-swapped to little-endian (via the ZMM15 mask) before use.
+     Input:
+      Hx: ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16
+      bemask: ZMM15
+      Hash: XMM1
+    Output:
+      Hash: XMM1
+    Inputs ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16 and ZMM15 stay
+    unmodified (ZMM15 is only read, as the byte-swap mask).
+  */
+  asm volatile (GFMUL_AGGR32_ASM_VPCMUL_AVX512(be_to_le)
+               :
+               : [buf] "r" (buf),
+                 [h_table] "r" (h_table)
+               : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx512_aggr32_le(const void *buf, const void *h_table)
+{
+  /* Aggregated hashing over 32 input blocks that are already in
+     little-endian order (no byte-swap; le_to_le variant, used by the
+     polyval path — see _gcry_polyval_intel_pclmul below).
+     Input:
+      Hx: ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16
+      bemask: ZMM15
+      Hash: XMM1
+    Output:
+      Hash: XMM1
+    Inputs ZMM0, ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM16 and ZMM15 stay
+    unmodified.
+  */
+  asm volatile (GFMUL_AGGR32_ASM_VPCMUL_AVX512(le_to_le)
+               :
+               : [buf] "r" (buf),
+                 [h_table] "r" (h_table)
+               : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul_avx512(void)
+{
+  /* Input: ZMM0 and ZMM1, Output: ZMM1. Input ZMM0 stays unmodified.
+     Input must be converted to little-endian.
+     Performs four independent carry-less GF(2^128) multiplications
+     with reduction, one per 128-bit lane, using the Karatsuba split
+     (a0*b0, a1*b1, (a0+a1)*(b0+b1)).
+   */
+  asm volatile (/* gfmul, zmm0 has operator a and zmm1 has operator b. */
+               "vpshufd $78, %%zmm0, %%zmm2\n\t"
+               "vpshufd $78, %%zmm1, %%zmm4\n\t"
+               "vpxorq %%zmm0, %%zmm2, %%zmm2\n\t" /* zmm2 holds a0+a1 */
+               "vpxorq %%zmm1, %%zmm4, %%zmm4\n\t" /* zmm4 holds b0+b1 */
+
+               "vpclmulqdq $0, %%zmm1, %%zmm0, %%zmm3\n\t"  /* zmm3 holds a0*b0 */
+               "vpclmulqdq $17, %%zmm0, %%zmm1, %%zmm1\n\t" /* zmm1 holds a1*b1 */
+               "vpclmulqdq $0, %%zmm2, %%zmm4, %%zmm4\n\t"  /* zmm4 holds (a0+a1)*(b0+b1) */
+
+               "vpternlogq $0x96, %%zmm1, %%zmm3, %%zmm4\n\t" /* zmm4 holds
+                                                               * a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+               "vpslldq $8, %%zmm4, %%zmm5\n\t"
+               "vpsrldq $8, %%zmm4, %%zmm4\n\t"
+               "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t"
+               "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" /* <zmm1:zmm3> holds the result of the
+                                                     carry-less multiplication of zmm0
+                                                     by zmm1 */
+
+               /* first phase of the reduction */
+               "vpsllq $1, %%zmm3, %%zmm6\n\t"  /* packed left shifting << 1 */
+               "vpxorq %%zmm3, %%zmm6, %%zmm6\n\t"
+               "vpsllq $57, %%zmm3, %%zmm5\n\t"  /* packed left shifting << 57 */
+               "vpsllq $62, %%zmm6, %%zmm6\n\t"  /* packed left shifting << 62 */
+               "vpxorq %%zmm5, %%zmm6, %%zmm6\n\t" /* xor the shifted versions */
+               "vpshufd $0x6a, %%zmm6, %%zmm5\n\t"
+               "vpshufd $0xae, %%zmm6, %%zmm6\n\t"
+               "vpxorq %%zmm5, %%zmm3, %%zmm3\n\t" /* first phase of the reduction complete */
+
+               /* second phase of the reduction */
+               "vpsrlq $1, %%zmm3, %%zmm2\n\t"    /* packed right shifting >> 1 */
+               "vpsrlq $2, %%zmm3, %%zmm4\n\t"    /* packed right shifting >> 2 */
+               "vpsrlq $7, %%zmm3, %%zmm5\n\t"    /* packed right shifting >> 7 */
+               "vpternlogq $0x96, %%zmm3, %%zmm2, %%zmm1\n\t" /* xor the shifted versions */
+               "vpternlogq $0x96, %%zmm4, %%zmm5, %%zmm6\n\t"
+               "vpxorq %%zmm6, %%zmm1, %%zmm1\n\t" /* the result is in zmm1 */
+                ::: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh_avx512(void *h, unsigned int hoffs)
+{
+  /* Left-shift the four 128-bit values at h+hoffs by one bit each,
+     xoring in the reduction constant (0xc2…||…01) whenever the
+     shifted-out top bit was set (vpshufd/vpsrad builds the top-bit
+     mask, vpslldq+vpsrlq carries the low-qword top bit into the high
+     qword).  AVX-512 variant of gcm_lsh(): pre-computes "H <<< 1"
+     for four H powers at once.  */
+  static const u64 pconst[8] __attribute__ ((aligned (64))) =
+    {
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000),
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000),
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000),
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000)
+    };
+
+  asm volatile ("vmovdqu64 %[h], %%zmm2\n\t"
+                "vpshufd $0xff, %%zmm2, %%zmm3\n\t"
+                "vpsrad $31, %%zmm3, %%zmm3\n\t"
+                "vpslldq $8, %%zmm2, %%zmm4\n\t"
+                "vpandq %[pconst], %%zmm3, %%zmm3\n\t"
+                "vpaddq %%zmm2, %%zmm2, %%zmm2\n\t"
+                "vpsrlq $63, %%zmm4, %%zmm4\n\t"
+                "vpternlogq $0x96, %%zmm4, %%zmm3, %%zmm2\n\t"
+                "vmovdqu64 %%zmm2, %[h]\n\t"
+                : [h] "+m" (*((byte *)h + hoffs))
+                : [pconst] "m" (*pconst)
+                : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+load_h1h4_to_zmm1(gcry_cipher_hd_t c)
+{
+  /* Load H <<< 1 (the ghash key) together with H² <<< 1 … H⁴ <<< 1
+     (the first three gcm_table entries) into ZMM1, one power per
+     128-bit lane.  If gcm_table directly follows the key inside the
+     handle, one 64-byte load covers all four values; otherwise the
+     key is inserted into lane 0 separately.  */
+  unsigned int key_pos =
+    offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key);
+  unsigned int table_pos =
+    offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table);
+
+  if (key_pos + 16 == table_pos)
+    {
+      /* Optimization: Table follows immediately after key. */
+      asm volatile ("vmovdqu64 %[key], %%zmm1\n\t"
+                   :
+                   : [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                   : "memory");
+    }
+  else
+    {
+      /* Load table[-1..2], then overwrite lane 0 with the key. */
+      asm volatile ("vmovdqu64 -1*16(%[h_table]), %%zmm1\n\t"
+                   "vinserti64x2 $0, %[key], %%zmm1, %%zmm1\n\t"
+                   :
+                   : [h_table] "r" (c->u_mode.gcm.gcm_table),
+                     [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                   : "memory");
+    }
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8_avx512(gcry_cipher_hd_t c)
+{
+  /* One-time table setup for 8-block aggregation: computes H⁵ … H⁸
+     into gcm_table slots 3-6 (left-shifted by one bit) and keeps an
+     unshifted copy at slots 4-7 so that aggr16 setup finds H⁸ in
+     slot 7.  Clobbers registers XMM0-XMM7 (see callers).  */
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+  asm volatile (/* load H⁴ */
+               "vbroadcasti64x2 3*16(%[h_table]), %%zmm0\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+  /* load H <<< 1, H² <<< 1, H³ <<< 1, H⁴ <<< 1 */
+  load_h1h4_to_zmm1 (c);
+
+  gfmul_pclmul_avx512 (); /* H<<<1•H⁴ => H⁵, …, H⁴<<<1•H⁴ => H⁸ */
+
+  asm volatile ("vmovdqu64 %%zmm1, 4*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+               "vmovdqu64 %%zmm1, 3*16(%[h_table])\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, …, H⁸ <<< 1 */
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr16_avx512(gcry_cipher_hd_t c)
+{
+  /* One-time table setup for 16-block aggregation: computes H⁹ … H¹⁶
+     into gcm_table slots 7-14 (left-shifted by one bit), leaving an
+     unshifted H¹⁶ in slot 15 for the aggr32 setup.
+     Clobbers registers XMM0-XMM7 (see callers).  */
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED;
+
+  asm volatile (/* load H⁸ */
+               "vbroadcasti64x2 7*16(%[h_table]), %%zmm0\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+  /* load H <<< 1, H² <<< 1, H³ <<< 1, H⁴ <<< 1 */
+  load_h1h4_to_zmm1 (c);
+
+  gfmul_pclmul_avx512 (); /* H<<<1•H⁸ => H⁹, … , H⁴<<<1•H⁸ => H¹² */
+
+  asm volatile ("vmovdqu64 %%zmm1, 7*16(%[h_table])\n\t"
+               /* load H⁵ <<< 1, …, H⁸ <<< 1 */
+               "vmovdqu64 3*16(%[h_table]), %%zmm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx512 (); /* H⁵<<<1•H⁸ => H¹³, … , H⁸<<<1•H⁸ => H¹⁶ */
+
+  asm volatile ("vmovdqu64 %%zmm1, 12*16(%[h_table])\n\t" /* store H¹⁶ for aggr32 setup */
+                "vmovdqu64 %%zmm1, 11*16(%[h_table])\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, …, H¹² <<< 1 */
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, …, H¹⁶ <<< 1 */
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr32_avx512(gcry_cipher_hd_t c)
+{
+  /* One-time table setup for 32-block aggregation: computes
+     H¹⁷ … H³² into gcm_table slots 15-30 (left-shifted by one bit),
+     four powers per multiplication round.
+     Clobbers registers XMM0-XMM7 (see callers).  */
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR32_TABLE_INITIALIZED;
+
+  asm volatile (/* load H¹⁶ */
+               "vbroadcasti64x2 15*16(%[h_table]), %%zmm0\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+  /* load H <<< 1, H² <<< 1, H³ <<< 1, H⁴ <<< 1 */
+  load_h1h4_to_zmm1 (c);
+
+  gfmul_pclmul_avx512 (); /* H<<<1•H¹⁶ => H¹⁷, …, H⁴<<<1•H¹⁶ => H²⁰ */
+
+  asm volatile ("vmovdqu64 %%zmm1, 15*16(%[h_table])\n\t"
+               /* load H⁵ <<< 1, …, H⁸ <<< 1 */
+               "vmovdqu64 3*16(%[h_table]), %%zmm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx512 (); /* H⁵<<<1•H¹⁶ => H²¹, …, H⁸<<<1•H¹⁶ => H²⁴ */
+
+  asm volatile ("vmovdqu64 %%zmm1, 19*16(%[h_table])\n\t"
+               /* load H⁹ <<< 1, …, H¹² <<< 1 */
+               "vmovdqu64 7*16(%[h_table]), %%zmm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx512 (); /* H⁹<<<1•H¹⁶ => H²⁵, …, H¹²<<<1•H¹⁶ => H²⁸ */
+
+  asm volatile ("vmovdqu64 %%zmm1, 23*16(%[h_table])\n\t"
+               /* load H¹³ <<< 1, …, H¹⁶ <<< 1 */
+               "vmovdqu64 11*16(%[h_table]), %%zmm1\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul_avx512 (); /* H¹³<<<1•H¹⁶ => H²⁹, …, H¹⁶<<<1•H¹⁶ => H³² */
+
+  asm volatile ("vmovdqu64 %%zmm1, 27*16(%[h_table])\n\t"
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 15 * 16); /* H¹⁷ <<< 1, …, H²⁰ <<< 1 */
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 19 * 16); /* H²¹ <<< 1, …, H²⁴ <<< 1 */
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 23 * 16); /* H²⁵ <<< 1, …, H²⁸ <<< 1 */
+  gcm_lsh_avx512 (c->u_mode.gcm.gcm_table, 27 * 16); /* H²⁹ <<< 1, …, H³² <<< 1 */
+}
+
+static const u64 swap128b_perm[8] __attribute__ ((aligned (64))) =
+  {
+    /* vpermq qword-index pattern that reverses the order of the four
+     * 128-bit lanes of a 512-bit register (lanes 3,2,1,0 <= 0,1,2,3). */
+    6, 7, 4, 5, 2, 3, 0, 1
+  };
+
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */
+#endif /* __x86_64__ */
+
+static unsigned int ASM_FUNC_ATTR
+_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+                         size_t nblocks);
+
+static unsigned int ASM_FUNC_ATTR
+_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+                           size_t nblocks);
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh(void *h, unsigned int hoffs)
 {
   static const u64 pconst[2] __attribute__ ((aligned (16))) =
     { U64_C(0x0000000000000001), U64_C(0xc200000000000000) };
 
-  asm volatile ("movdqu (%[h]), %%xmm2\n\t"
+  asm volatile ("movdqu %[h], %%xmm2\n\t"
                 "pshufd $0xff, %%xmm2, %%xmm3\n\t"
                 "movdqa %%xmm2, %%xmm4\n\t"
                 "psrad $31, %%xmm3\n\t"
@@ -449,15 +1248,14 @@ static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
                 "psrlq $63, %%xmm4\n\t"
                 "pxor %%xmm3, %%xmm2\n\t"
                 "pxor %%xmm4, %%xmm2\n\t"
-                "movdqu %%xmm2, (%[h])\n\t"
-                :
-                : [pconst] "m" (*pconst),
-                  [h] "r" ((byte *)h + hoffs)
+                "movdqu %%xmm2, %[h]\n\t"
+                : [h] "+m" (*((byte *)h + hoffs))
+                : [pconst] "m" (*pconst)
                 : "memory" );
 }
 
 void ASM_FUNC_ATTR
-_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
+_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, unsigned int hw_features)
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -480,6 +1278,12 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                 : "memory" );
 #endif
 
+  (void)hw_features;
+
+  c->u_mode.gcm.hw_impl_flags = 0;
+  c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+  c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
+
   /* Swap endianness of hsub. */
   asm volatile ("movdqu (%[key]), %%xmm0\n\t"
                 "pshufb %[be_mask], %%xmm0\n\t"
@@ -489,7 +1293,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                   [be_mask] "m" (*be_mask)
                 : "memory");
 
-  gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
+  gcm_lsh (c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
 
   asm volatile ("movdqa %%xmm0, %%xmm1\n\t"
                 "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */
@@ -500,80 +1304,86 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
   gfmul_pclmul (); /* H<<<1•H => H² */
 
   asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t"
-                "movdqa %%xmm1, %%xmm7\n\t"
                 :
                 : [h_table] "r" (c->u_mode.gcm.gcm_table)
                 : "memory");
 
-  gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */
-  gfmul_pclmul (); /* H<<<1•H² => H³ */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */
 
-  asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
-                "movdqu %%xmm1, 1*16(%[h_table])\n\t"
-                "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
-
-  gfmul_pclmul (); /* H²<<<1•H² => H⁴ */
+  if (0)
+    { }
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX2
+  else if ((hw_features & HWF_INTEL_VAES_VPCLMUL)
+           && (hw_features & HWF_INTEL_AVX2))
+    {
+      c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX2;
 
-  asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t"
-                "movdqa %%xmm1, %%xmm0\n\t"
-                "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table),
-                  [key] "r" (c->u_mode.gcm.u_ghash_key.key)
-                : "memory");
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX512
+      if (hw_features & HWF_INTEL_AVX512)
+       c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX512;
+#endif
 
-  gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */
+      asm volatile (/* H² */
+                   "vinserti128 $1, %%xmm1, %%ymm1, %%ymm1\n\t"
+                   /* load H <<< 1, H² <<< 1 */
+                   "vinserti128 $1, 0*16(%[h_table]), %%ymm0, %%ymm0\n\t"
+                   :
+                   : [h_table] "r" (c->u_mode.gcm.gcm_table)
+                   : "memory");
 
-#ifdef __x86_64__
-  gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */
+      gfmul_pclmul_avx2 (); /* H<<<1•H² => H³, H²<<<1•H² => H⁴ */
 
-  asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
-                "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("vmovdqu %%ymm1, 2*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */
+                   "vmovdqu %%ymm1, 1*16(%[h_table])\n\t"
+                   :
+                   : [h_table] "r" (c->u_mode.gcm.gcm_table)
+                   : "memory");
 
-  gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */
+      gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1, H⁴ <<< 1 */
 
-  asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
-                "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("vzeroupper\n\t"
+                   ::: "memory" );
+    }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+  else
+    {
+      asm volatile ("movdqa %%xmm1, %%xmm7\n\t"
+                   ::: "memory");
 
-  gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */
+      gfmul_pclmul (); /* H<<<1•H² => H³ */
 
-  asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
-                "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
+                   "movdqu %%xmm1, 1*16(%[h_table])\n\t"
+                   "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+                   :
+                   : [h_table] "r" (c->u_mode.gcm.gcm_table)
+                   : "memory");
 
-  gfmul_pclmul (); /* H³<<<1•H⁴ => H⁸ */
+      gfmul_pclmul (); /* H²<<<1•H² => H⁴ */
 
-  asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */
+                   "movdqu %%xmm1, 2*16(%[h_table])\n\t"
+                   :
+                   : [h_table] "r" (c->u_mode.gcm.gcm_table)
+                   : "memory");
 
-  gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */
+      gcm_lsh (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
+      gcm_lsh (c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */
+    }
 
-#ifdef __WIN64__
   /* Clear/restore used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "movdqu 0*16(%0), %%xmm6\n\t"
+  asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+               "pxor %%xmm1, %%xmm1\n\t"
+               "pxor %%xmm2, %%xmm2\n\t"
+               "pxor %%xmm3, %%xmm3\n\t"
+               "pxor %%xmm4, %%xmm4\n\t"
+               "pxor %%xmm5, %%xmm5\n\t"
+               "pxor %%xmm6, %%xmm6\n\t"
+               "pxor %%xmm7, %%xmm7\n\t"
+               ::: "memory" );
+#ifdef __x86_64__
+#ifdef __WIN64__
+  asm volatile ("movdqu 0*16(%0), %%xmm6\n\t"
                 "movdqu 1*16(%0), %%xmm7\n\t"
                 "movdqu 2*16(%0), %%xmm8\n\t"
                 "movdqu 3*16(%0), %%xmm9\n\t"
@@ -587,16 +1397,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                 : "r" (win64tmp)
                 : "memory" );
 #else
-  /* Clear used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "pxor %%xmm6, %%xmm6\n\t"
-                "pxor %%xmm7, %%xmm7\n\t"
-                "pxor %%xmm8, %%xmm8\n\t"
+  asm volatile ("pxor %%xmm8, %%xmm8\n\t"
                 "pxor %%xmm9, %%xmm9\n\t"
                 "pxor %%xmm10, %%xmm10\n\t"
                 "pxor %%xmm11, %%xmm11\n\t"
@@ -605,14 +1406,67 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                 "pxor %%xmm14, %%xmm14\n\t"
                 "pxor %%xmm15, %%xmm15\n\t"
                 ::: "memory" );
-#endif
-#endif
+#endif /* __WIN64__ */
+#endif /* __x86_64__ */
+}
+
+
+#ifdef __x86_64__
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8(gcry_cipher_hd_t c)
+{
+  /* One-time table setup for 8-block aggregation (SSE/PCLMUL path):
+     computes H⁵ … H⁸ into gcm_table slots 3-6 (left-shifted by one
+     bit) and stores an unshifted H⁸ in slot 7 for the aggr16 setup.
+     Clobbers registers XMM0-XMM7 (see callers).  */
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+  asm volatile ("movdqa 3*16(%[h_table]), %%xmm0\n\t" /* load H⁴ */
+               "movdqu %[key], %%xmm1\n\t" /* load H <<< 1 */
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table),
+                 [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+               : "memory");
+
+  gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */
+
+  asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
+               "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */
+
+  asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
+               "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */
+
+  asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
+               "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gfmul_pclmul (); /* H⁴<<<1•H⁴ => H⁸ */
+
+  asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
+               "movdqu %%xmm1, 7*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+               :
+               : [h_table] "r" (c->u_mode.gcm.gcm_table)
+               : "memory");
+
+  gcm_lsh (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */
+}
+#endif /* __x86_64__ */
 
 
 unsigned int ASM_FUNC_ATTR
 _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
-                          size_t nblocks)
+                         size_t nblocks)
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -650,15 +1504,147 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                   [be_mask] "m" (*be_mask)
                 : "memory" );
 
+#if defined(GCM_USE_INTEL_VPCLMUL_AVX2)
+  if (nblocks >= 16
+      && ((c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2)
+          || (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)))
+    {
+#if defined(GCM_USE_INTEL_VPCLMUL_AVX512)
+      if (nblocks >= 32
+         && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))
+       {
+         asm volatile ("vpopcntb %%xmm7, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */
+                       "vshufi64x2 $0, %%zmm7, %%zmm7, %%zmm15\n\t"
+                       "vmovdqa %%xmm1, %%xmm8\n\t"
+                       "vmovdqu64 %[swapperm], %%zmm14\n\t"
+                       :
+                       : [swapperm] "m" (swap128b_perm),
+                         [h_table] "r" (c->u_mode.gcm.gcm_table)
+                       : "memory" );
+
+         if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR32_TABLE_INITIALIZED))
+           {
+             if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+               {
+                 if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+                   ghash_setup_aggr8_avx512 (c); /* Clobbers registers XMM0-XMM7. */
+
+                 ghash_setup_aggr16_avx512 (c); /* Clobbers registers XMM0-XMM7. */
+               }
+
+             ghash_setup_aggr32_avx512 (c); /* Clobbers registers XMM0-XMM7. */
+           }
+
+         /* Preload H1-H32. */
+         load_h1h4_to_zmm1 (c);
+         asm volatile ("vpermq %%zmm1, %%zmm14, %%zmm16\n\t" /* H1|H2|H3|H4 */
+                       "vmovdqa %%xmm8, %%xmm1\n\t"
+                       "vpermq 27*16(%[h_table]), %%zmm14, %%zmm0\n\t"  /* H29|H30|H31|H32 */
+                       "vpermq 23*16(%[h_table]), %%zmm14, %%zmm13\n\t" /* H25|H26|H27|H28 */
+                       "vpermq 19*16(%[h_table]), %%zmm14, %%zmm12\n\t" /* H21|H22|H23|H24 */
+                       "vpermq 15*16(%[h_table]), %%zmm14, %%zmm11\n\t" /* H17|H18|H19|H20 */
+                       "vpermq 11*16(%[h_table]), %%zmm14, %%zmm10\n\t" /* H13|H14|H15|H16 */
+                       "vpermq 7*16(%[h_table]), %%zmm14, %%zmm9\n\t"   /* H9|H10|H11|H12 */
+                       "vpermq 3*16(%[h_table]), %%zmm14, %%zmm8\n\t"   /* H5|H6|H7|H8 */
+                       :
+                       : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+                         [h_table] "r" (c->u_mode.gcm.gcm_table)
+                       : "memory" );
+
+         while (nblocks >= 32)
+           {
+             gfmul_vpclmul_avx512_aggr32 (buf, c->u_mode.gcm.gcm_table);
+
+             buf += 32 * blocksize;
+             nblocks -= 32;
+           }
+
+         asm volatile ("vmovdqa %%xmm15, %%xmm7\n\t"
+                       "vpxorq %%ymm16, %%ymm16, %%ymm16\n\t"
+                       "vpxorq %%ymm17, %%ymm17, %%ymm17\n\t"
+                       "vpxorq %%ymm18, %%ymm18, %%ymm18\n\t"
+                       "vpxorq %%ymm19, %%ymm19, %%ymm19\n\t"
+                       :
+                       :
+                       : "memory" );
+       }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */
+
+      if (nblocks >= 16)
+       {
+         u64 h1_h2_h15_h16[4*2];
+
+         asm volatile ("vinserti128 $1, %%xmm7, %%ymm7, %%ymm15\n\t"
+                       "vmovdqa %%xmm1, %%xmm8\n\t"
+                       ::: "memory" );
+
+         if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+           {
+             if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+               ghash_setup_aggr8_avx2 (c); /* Clobbers registers XMM0-XMM7. */
+
+             ghash_setup_aggr16_avx2 (c); /* Clobbers registers XMM0-XMM7. */
+           }
+
+         /* Preload H1-H16. */
+         load_h1h2_to_ymm1 (c);
+         asm volatile ("vperm2i128 $0x23, %%ymm1, %%ymm1, %%ymm7\n\t" /* H1|H2 */
+                       "vmovdqa %%xmm8, %%xmm1\n\t"
+                       "vpxor %%xmm8, %%xmm8, %%xmm8\n\t"
+                       "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t"  /* H15|H16 */
+                       "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */
+                       "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t"  /* H11|H12 */
+                       "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t"  /* H9|H10 */
+                       "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t"  /* H7|H8 */
+                       "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t"   /* H5|H6 */
+                       "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t"   /* H3|H4 */
+                       "vmovdqu %%ymm0, %[h15_h16]\n\t"
+                       "vmovdqu %%ymm7, %[h1_h2]\n\t"
+                       : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+                         [h15_h16] "=m" (h1_h2_h15_h16[4])
+                       : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+                         [h_table] "r" (c->u_mode.gcm.gcm_table)
+                       : "memory" );
+
+         while (nblocks >= 16)
+           {
+             gfmul_vpclmul_avx2_aggr16 (buf, c->u_mode.gcm.gcm_table,
+                                       h1_h2_h15_h16);
+
+             buf += 16 * blocksize;
+             nblocks -= 16;
+           }
+
+         asm volatile ("vmovdqu %%ymm15, %[h15_h16]\n\t"
+                       "vmovdqu %%ymm15, %[h1_h2]\n\t"
+                       "vmovdqa %%xmm15, %%xmm7\n\t"
+                       :
+                         [h1_h2] "=m" (h1_h2_h15_h16[0]),
+                         [h15_h16] "=m" (h1_h2_h15_h16[4])
+                       :
+                       : "memory" );
+       }
+
+      asm volatile ("vzeroupper\n\t" ::: "memory" );
+    }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      /* Preload H1. */
       asm volatile ("movdqa %%xmm7, %%xmm15\n\t"
-                    "movdqa %[h_1], %%xmm0\n\t"
-                    :
-                    : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
-                    : "memory" );
+                   "movdqa %%xmm1, %%xmm8\n\t"
+                   ::: "memory" );
+
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+       ghash_setup_aggr8 (c); /* Clobbers registers XMM0-XMM7. */
+
+      /* Preload H1. */
+      asm volatile ("movdqa %%xmm8, %%xmm1\n\t"
+                   "movdqa %[h_1], %%xmm0\n\t"
+                   :
+                   : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                   : "memory" );
 
       while (nblocks >= 8)
         {
@@ -667,20 +1653,8 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
           buf += 8 * blocksize;
           nblocks -= 8;
         }
-#ifndef __WIN64__
-      /* Clear used x86-64/XMM registers. */
-      asm volatile( "pxor %%xmm8, %%xmm8\n\t"
-                    "pxor %%xmm9, %%xmm9\n\t"
-                    "pxor %%xmm10, %%xmm10\n\t"
-                    "pxor %%xmm11, %%xmm11\n\t"
-                    "pxor %%xmm12, %%xmm12\n\t"
-                    "pxor %%xmm13, %%xmm13\n\t"
-                    "pxor %%xmm14, %%xmm14\n\t"
-                    "pxor %%xmm15, %%xmm15\n\t"
-                    ::: "memory" );
-#endif
     }
-#endif
+#endif /* __x86_64__ */
 
   while (nblocks >= 4)
     {
@@ -722,46 +1696,56 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                 : [be_mask] "m" (*be_mask)
                 : "memory" );
 
-#if defined(__x86_64__) && defined(__WIN64__)
   /* Clear/restore used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "movdqu 0*16(%0), %%xmm6\n\t"
-                "movdqu 1*16(%0), %%xmm7\n\t"
-                "movdqu 2*16(%0), %%xmm8\n\t"
-                "movdqu 3*16(%0), %%xmm9\n\t"
-                "movdqu 4*16(%0), %%xmm10\n\t"
-                "movdqu 5*16(%0), %%xmm11\n\t"
-                "movdqu 6*16(%0), %%xmm12\n\t"
-                "movdqu 7*16(%0), %%xmm13\n\t"
-                "movdqu 8*16(%0), %%xmm14\n\t"
-                "movdqu 9*16(%0), %%xmm15\n\t"
-                :
-                : "r" (win64tmp)
-                : "memory" );
+  asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+               "pxor %%xmm1, %%xmm1\n\t"
+               "pxor %%xmm2, %%xmm2\n\t"
+               "pxor %%xmm3, %%xmm3\n\t"
+               "pxor %%xmm4, %%xmm4\n\t"
+               "pxor %%xmm5, %%xmm5\n\t"
+               "pxor %%xmm6, %%xmm6\n\t"
+               "pxor %%xmm7, %%xmm7\n\t"
+               :
+               :
+               : "memory" );
+#ifdef __x86_64__
+#ifdef __WIN64__
+  asm volatile ("movdqu 0*16(%0), %%xmm6\n\t"
+               "movdqu 1*16(%0), %%xmm7\n\t"
+               "movdqu 2*16(%0), %%xmm8\n\t"
+               "movdqu 3*16(%0), %%xmm9\n\t"
+               "movdqu 4*16(%0), %%xmm10\n\t"
+               "movdqu 5*16(%0), %%xmm11\n\t"
+               "movdqu 6*16(%0), %%xmm12\n\t"
+               "movdqu 7*16(%0), %%xmm13\n\t"
+               "movdqu 8*16(%0), %%xmm14\n\t"
+               "movdqu 9*16(%0), %%xmm15\n\t"
+               :
+               : "r" (win64tmp)
+               : "memory" );
 #else
   /* Clear used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "pxor %%xmm6, %%xmm6\n\t"
-                "pxor %%xmm7, %%xmm7\n\t"
-                ::: "memory" );
-#endif
+  asm volatile (
+               "pxor %%xmm8, %%xmm8\n\t"
+               "pxor %%xmm9, %%xmm9\n\t"
+               "pxor %%xmm10, %%xmm10\n\t"
+               "pxor %%xmm11, %%xmm11\n\t"
+               "pxor %%xmm12, %%xmm12\n\t"
+               "pxor %%xmm13, %%xmm13\n\t"
+               "pxor %%xmm14, %%xmm14\n\t"
+               "pxor %%xmm15, %%xmm15\n\t"
+               :
+               :
+               : "memory" );
+#endif /* __WIN64__ */
+#endif /* __x86_64__ */
 
   return 0;
 }
 
 unsigned int ASM_FUNC_ATTR
 _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
-                            size_t nblocks)
+                           size_t nblocks)
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -799,15 +1783,144 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                   [be_mask] "m" (*be_mask)
                 : "memory" );
 
+#if defined(GCM_USE_INTEL_VPCLMUL_AVX2)
+  if (nblocks >= 16
+      && ((c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2)
+          || (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)))
+    {
+#if defined(GCM_USE_INTEL_VPCLMUL_AVX512)
+      if (nblocks >= 32
+         && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))
+       {
+         asm volatile ("vpopcntb %%xmm1, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */
+                       "vmovdqa %%xmm1, %%xmm8\n\t"
+                       "vmovdqu64 %[swapperm], %%zmm14\n\t"
+                       :
+                       : [swapperm] "m" (swap128b_perm),
+                         [h_table] "r" (c->u_mode.gcm.gcm_table)
+                       : "memory" );
+
+         if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR32_TABLE_INITIALIZED))
+           {
+             if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+               {
+                 if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+                   ghash_setup_aggr8_avx512 (c); /* Clobbers registers XMM0-XMM7. */
+
+                 ghash_setup_aggr16_avx512 (c); /* Clobbers registers XMM0-XMM7. */
+               }
+
+             ghash_setup_aggr32_avx512 (c); /* Clobbers registers XMM0-XMM7. */
+           }
+
+         /* Preload H1-H32. */
+         load_h1h4_to_zmm1 (c);
+         asm volatile ("vpermq %%zmm1, %%zmm14, %%zmm16\n\t" /* H1|H2|H3|H4 */
+                       "vmovdqa %%xmm8, %%xmm1\n\t"
+                       "vpermq 27*16(%[h_table]), %%zmm14, %%zmm0\n\t"  /* H28|H29|H31|H32 */
+                       "vpermq 23*16(%[h_table]), %%zmm14, %%zmm13\n\t" /* H25|H26|H27|H28 */
+                       "vpermq 19*16(%[h_table]), %%zmm14, %%zmm12\n\t" /* H21|H22|H23|H24 */
+                       "vpermq 15*16(%[h_table]), %%zmm14, %%zmm11\n\t" /* H17|H18|H19|H20 */
+                       "vpermq 11*16(%[h_table]), %%zmm14, %%zmm10\n\t" /* H13|H14|H15|H16 */
+                       "vpermq 7*16(%[h_table]), %%zmm14, %%zmm9\n\t"   /* H9|H10|H11|H12 */
+                       "vpermq 3*16(%[h_table]), %%zmm14, %%zmm8\n\t"   /* H4|H6|H7|H8 */
+                       :
+                       : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+                         [h_table] "r" (c->u_mode.gcm.gcm_table)
+                       : "memory" );
+
+         while (nblocks >= 32)
+           {
+             gfmul_vpclmul_avx512_aggr32_le (buf, c->u_mode.gcm.gcm_table);
+
+             buf += 32 * blocksize;
+             nblocks -= 32;
+           }
+
+         asm volatile ("vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+                       "vpxorq %%ymm16, %%ymm16, %%ymm16\n\t"
+                       "vpxorq %%ymm17, %%ymm17, %%ymm17\n\t"
+                       "vpxorq %%ymm18, %%ymm18, %%ymm18\n\t"
+                       "vpxorq %%ymm19, %%ymm19, %%ymm19\n\t"
+                       :
+                       :
+                       : "memory" );
+       }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */
+
+      if (nblocks >= 16)
+       {
+         u64 h1_h2_h15_h16[4*2];
+
+         asm volatile ("vmovdqa %%xmm1, %%xmm8\n\t"
+                       ::: "memory" );
+
+         if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+           {
+             if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+               ghash_setup_aggr8_avx2 (c); /* Clobbers registers XMM0-XMM7. */
+
+             ghash_setup_aggr16_avx2 (c); /* Clobbers registers XMM0-XMM7. */
+           }
+
+         /* Preload H1-H16. */
+         load_h1h2_to_ymm1 (c);
+         asm volatile ("vperm2i128 $0x23, %%ymm1, %%ymm1, %%ymm7\n\t" /* H1|H2 */
+                       "vmovdqa %%xmm8, %%xmm1\n\t"
+                       "vpxor %%xmm8, %%xmm8, %%xmm8\n\t"
+                       "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t"  /* H15|H16 */
+                       "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */
+                       "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t"  /* H11|H12 */
+                       "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t"  /* H9|H10 */
+                       "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t"  /* H7|H8 */
+                       "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t"   /* H5|H6 */
+                       "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t"   /* H3|H4 */
+                       "vmovdqu %%ymm0, %[h15_h16]\n\t"
+                       "vmovdqu %%ymm7, %[h1_h2]\n\t"
+                       : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+                         [h15_h16] "=m" (h1_h2_h15_h16[4])
+                       : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+                         [h_table] "r" (c->u_mode.gcm.gcm_table)
+                       : "memory" );
+
+         while (nblocks >= 16)
+           {
+             gfmul_vpclmul_avx2_aggr16_le (buf, c->u_mode.gcm.gcm_table,
+                                           h1_h2_h15_h16);
+
+             buf += 16 * blocksize;
+             nblocks -= 16;
+           }
+
+         asm volatile ("vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+                       "vmovdqu %%ymm7, %[h15_h16]\n\t"
+                       "vmovdqu %%ymm7, %[h1_h2]\n\t"
+                       : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+                         [h15_h16] "=m" (h1_h2_h15_h16[4])
+                       :
+                       : "memory" );
+       }
+
+      asm volatile ("vzeroupper\n\t" ::: "memory" );
+    }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
+      asm volatile ("movdqa %%xmm1, %%xmm8\n\t"
+                   ::: "memory" );
+
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+       ghash_setup_aggr8 (c); /* Clobbers registers XMM0-XMM7. */
+
       /* Preload H1. */
-      asm volatile ("pxor %%xmm15, %%xmm15\n\t"
-                    "movdqa %[h_1], %%xmm0\n\t"
-                    :
-                    : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
-                    : "memory" );
+      asm volatile ("movdqa %%xmm8, %%xmm1\n\t"
+                   "pxor %%xmm15, %%xmm15\n\t"
+                   "movdqa %[h_1], %%xmm0\n\t"
+                   :
+                   : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                   : "memory" );
 
       while (nblocks >= 8)
         {
@@ -816,18 +1929,6 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
           buf += 8 * blocksize;
           nblocks -= 8;
         }
-#ifndef __WIN64__
-      /* Clear used x86-64/XMM registers. */
-      asm volatile( "pxor %%xmm8, %%xmm8\n\t"
-                    "pxor %%xmm9, %%xmm9\n\t"
-                    "pxor %%xmm10, %%xmm10\n\t"
-                    "pxor %%xmm11, %%xmm11\n\t"
-                    "pxor %%xmm12, %%xmm12\n\t"
-                    "pxor %%xmm13, %%xmm13\n\t"
-                    "pxor %%xmm14, %%xmm14\n\t"
-                    "pxor %%xmm15, %%xmm15\n\t"
-                    ::: "memory" );
-#endif
     }
 #endif
 
@@ -870,39 +1971,49 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                 : [be_mask] "m" (*be_mask)
                 : "memory" );
 
-#if defined(__x86_64__) && defined(__WIN64__)
   /* Clear/restore used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "movdqu 0*16(%0), %%xmm6\n\t"
-                "movdqu 1*16(%0), %%xmm7\n\t"
-                "movdqu 2*16(%0), %%xmm8\n\t"
-                "movdqu 3*16(%0), %%xmm9\n\t"
-                "movdqu 4*16(%0), %%xmm10\n\t"
-                "movdqu 5*16(%0), %%xmm11\n\t"
-                "movdqu 6*16(%0), %%xmm12\n\t"
-                "movdqu 7*16(%0), %%xmm13\n\t"
-                "movdqu 8*16(%0), %%xmm14\n\t"
-                "movdqu 9*16(%0), %%xmm15\n\t"
-                :
-                : "r" (win64tmp)
-                : "memory" );
+  asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+               "pxor %%xmm1, %%xmm1\n\t"
+               "pxor %%xmm2, %%xmm2\n\t"
+               "pxor %%xmm3, %%xmm3\n\t"
+               "pxor %%xmm4, %%xmm4\n\t"
+               "pxor %%xmm5, %%xmm5\n\t"
+               "pxor %%xmm6, %%xmm6\n\t"
+               "pxor %%xmm7, %%xmm7\n\t"
+               :
+               :
+               : "memory" );
+#ifdef __x86_64__
+#ifdef __WIN64__
+  asm volatile ("movdqu 0*16(%0), %%xmm6\n\t"
+               "movdqu 1*16(%0), %%xmm7\n\t"
+               "movdqu 2*16(%0), %%xmm8\n\t"
+               "movdqu 3*16(%0), %%xmm9\n\t"
+               "movdqu 4*16(%0), %%xmm10\n\t"
+               "movdqu 5*16(%0), %%xmm11\n\t"
+               "movdqu 6*16(%0), %%xmm12\n\t"
+               "movdqu 7*16(%0), %%xmm13\n\t"
+               "movdqu 8*16(%0), %%xmm14\n\t"
+               "movdqu 9*16(%0), %%xmm15\n\t"
+               :
+               : "r" (win64tmp)
+               : "memory" );
 #else
   /* Clear used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "pxor %%xmm6, %%xmm6\n\t"
-                "pxor %%xmm7, %%xmm7\n\t"
-                ::: "memory" );
-#endif
+  asm volatile (
+               "pxor %%xmm8, %%xmm8\n\t"
+               "pxor %%xmm9, %%xmm9\n\t"
+               "pxor %%xmm10, %%xmm10\n\t"
+               "pxor %%xmm11, %%xmm11\n\t"
+               "pxor %%xmm12, %%xmm12\n\t"
+               "pxor %%xmm13, %%xmm13\n\t"
+               "pxor %%xmm14, %%xmm14\n\t"
+               "pxor %%xmm15, %%xmm15\n\t"
+               :
+               :
+               : "memory" );
+#endif /* __WIN64__ */
+#endif /* __x86_64__ */
 
   return 0;
 }
index 4f75e95cff05192a77d122ce4d400661d6da9809..648d159807207b87bdfe93f2e5021cc580118fec 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -437,6 +437,7 @@ _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table,
       in1 = vec_load_he (16, buf);
       in2 = vec_load_he (32, buf);
       in3 = vec_load_he (48, buf);
+      buf += 64;
       in0 = vec_be_swap(in0, bswap_const);
       in1 = vec_be_swap(in1, bswap_const);
       in2 = vec_be_swap(in2, bswap_const);
@@ -464,17 +465,13 @@ _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table,
       Xh3 = asm_xor (Xh3, Xh1);
 
       /* Gerald Estrin's scheme for parallel multiplication of polynomials */
-      while (1)
+      for (; blocks_remaining > 4; blocks_remaining -= 4)
         {
-         buf += 64;
-         blocks_remaining -= 4;
-         if (!blocks_remaining)
-           break;
-
          in0 = vec_load_he (0, buf);
          in1 = vec_load_he (16, buf);
          in2 = vec_load_he (32, buf);
          in3 = vec_load_he (48, buf);
+         buf += 64;
          in1 = vec_be_swap(in1, bswap_const);
          in2 = vec_be_swap(in2, bswap_const);
          in3 = vec_be_swap(in3, bswap_const);
index 9ebc00366792ae69decd96e7c38736225e159ad2..1f7a52bc682264ab13c68a50d33c8a01f15ef60b 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index fc79986e5ad3dde1cf76eb40cbab5c4288af83fb..d3c04d58f18afef331525cc3dce78ad65a4aa563 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -30,6 +30,8 @@
 #include "./cipher-internal.h"
 
 
+static gcry_err_code_t _gcry_cipher_gcm_setiv_zero (gcry_cipher_hd_t c);
+
 /* Helper macro to force alignment to 16 or 64 bytes.  */
 #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
 # define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
 
 
 #ifdef GCM_USE_INTEL_PCLMUL
-extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c);
-
-extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
-                                              const byte *buf, size_t nblocks);
-
-extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c,
-                                                byte *result,
-                                                const byte *buf,
-                                                size_t nblocks);
+extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c,
+                                           unsigned int hw_features);
 #endif
 
 #ifdef GCM_USE_ARM_PMULL
@@ -594,9 +589,7 @@ setupM (gcry_cipher_hd_t c)
 #ifdef GCM_USE_INTEL_PCLMUL
   else if (features & HWF_INTEL_PCLMUL)
     {
-      c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
-      c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
-      _gcry_ghash_setup_intel_pclmul (c);
+      _gcry_ghash_setup_intel_pclmul (c, features);
     }
 #endif
 #ifdef GCM_USE_ARM_PMULL
@@ -888,8 +881,9 @@ gcm_crypt_inner (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
 
       /* Since checksumming is done after/before encryption/decryption,
        * process input in 24KiB chunks to keep data loaded in L1 cache for
-       * checksumming/decryption. */
-      if (currlen > 24 * 1024)
+       * checksumming/decryption.  However only do splitting if input is
+       * large enough so that last chunks does not end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       if (!encrypt)
@@ -917,8 +911,6 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
                           byte *outbuf, size_t outbuflen,
                           const byte *inbuf, size_t inbuflen)
 {
-  static const unsigned char zerobuf[MAX_BLOCKSIZE];
-
   if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
     return GPG_ERR_CIPHER_ALGO;
   if (outbuflen < inbuflen)
@@ -931,7 +923,7 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
     return GPG_ERR_INV_STATE;
 
   if (!c->marks.iv)
-    _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
+    _gcry_cipher_gcm_setiv_zero (c);
 
   if (c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode)
     return GPG_ERR_INV_STATE;
@@ -959,8 +951,6 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
                           byte *outbuf, size_t outbuflen,
                           const byte *inbuf, size_t inbuflen)
 {
-  static const unsigned char zerobuf[MAX_BLOCKSIZE];
-
   if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
     return GPG_ERR_CIPHER_ALGO;
   if (outbuflen < inbuflen)
@@ -973,7 +963,7 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
     return GPG_ERR_INV_STATE;
 
   if (!c->marks.iv)
-    _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
+    _gcry_cipher_gcm_setiv_zero (c);
 
   if (!c->u_mode.gcm.ghash_aad_finalized)
     {
@@ -997,8 +987,6 @@ gcry_err_code_t
 _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
                                const byte * aadbuf, size_t aadbuflen)
 {
-  static const unsigned char zerobuf[MAX_BLOCKSIZE];
-
   if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
     return GPG_ERR_CIPHER_ALGO;
   if (c->u_mode.gcm.datalen_over_limits)
@@ -1010,7 +998,7 @@ _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
     return GPG_ERR_INV_STATE;
 
   if (!c->marks.iv)
-    _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
+    _gcry_cipher_gcm_setiv_zero (c);
 
   gcm_bytecounter_add(c->u_mode.gcm.aadlen, aadbuflen);
   if (!gcm_check_aadlen_or_ivlen(c->u_mode.gcm.aadlen))
@@ -1113,6 +1101,15 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
 {
   c->marks.iv = 0;
   c->marks.tag = 0;
+
+  return _gcry_cipher_gcm_initiv (c, iv, ivlen);
+}
+
+static gcry_err_code_t
+_gcry_cipher_gcm_setiv_zero (gcry_cipher_hd_t c)
+{
+  static const unsigned char zerobuf[MAX_BLOCKSIZE];
+
   c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0;
 
   if (fips_mode ())
@@ -1121,7 +1118,7 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
       c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 1;
     }
 
-  return _gcry_cipher_gcm_initiv (c, iv, ivlen);
+  return _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
 }
 
 
index c8a1097ad04d69eff81714172e59be6a0efdd7d1..cd8ff78885762c23de9de8f87718683d515a5740 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 # endif
 #endif /* GCM_USE_INTEL_PCLMUL */
 
+/* GCM_USE_INTEL_VPCLMUL_AVX2 indicates whether to compile GCM with Intel
+   VPCLMUL/AVX2 code.  */
+#undef GCM_USE_INTEL_VPCLMUL_AVX2
+#if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) && \
+    defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define GCM_USE_INTEL_VPCLMUL_AVX2 1
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
+/* GCM_USE_INTEL_VPCLMUL_AVX512 indicates whether to compile GCM with Intel
+   VPCLMUL/AVX512 code.  */
+#undef GCM_USE_INTEL_VPCLMUL_AVX512
+#if defined(__x86_64__) && defined(GCM_USE_INTEL_VPCLMUL_AVX2) && \
+    defined(ENABLE_AVX512_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_AVX512)
+# define GCM_USE_INTEL_VPCLMUL_AVX512 1
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */
+
 /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */
 #undef GCM_USE_ARM_PMULL
 #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
@@ -145,6 +161,8 @@ typedef struct cipher_mode_ops
    not NULL.  */
 typedef struct cipher_bulk_ops
 {
+  void (*ecb_crypt)(void *context, void *outbuf_arg, const void *inbuf_arg,
+                   size_t nblocks, int encrypt);
   void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
                  const void *inbuf_arg, size_t nblocks);
   void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg,
@@ -229,6 +247,14 @@ struct gcry_cipher_handle
   int mode;
   unsigned int flags;
 
+  struct {
+    int geniv_method;
+    unsigned char fixed[MAX_BLOCKSIZE];
+    unsigned char dynamic[MAX_BLOCKSIZE];
+    size_t fixed_iv_len;
+    size_t dynamic_iv_len;
+  } aead;
+
   struct {
     unsigned int key:1; /* Set to 1 if a key has been set.  */
     unsigned int iv:1;  /* Set to 1 if a IV has been set.  */
@@ -355,6 +381,9 @@ struct gcry_cipher_handle
 
       /* Key length used for GCM-SIV key generating key. */
       unsigned int siv_keylen;
+
+      /* Flags for accelerated implementations. */
+      unsigned int hw_impl_flags;
     } gcm;
 
     /* Mode specific storage for OCB mode. */
index bfafa4c86b7843d3c2828a1e6d1ecd8c3754d31f..eaddc49559625c283ce5d6da9e893c261da83daf 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -548,9 +548,10 @@ ocb_crypt (gcry_cipher_hd_t c, int encrypt,
       nblks = nblks < nmaxblks ? nblks : nmaxblks;
 
       /* Since checksum xoring is done before/after encryption/decryption,
-       process input in 24KiB chunks to keep data loaded in L1 cache for
-       checksumming. */
-      if (nblks > 24 * 1024 / OCB_BLOCK_LEN)
+        process input in 24KiB chunks to keep data loaded in L1 cache for
+        checksumming.  However only do splitting if input is large enough
+        so that last chunks does not end up being short. */
+      if (nblks > 32 * 1024 / OCB_BLOCK_LEN)
        nblks = 24 * 1024 / OCB_BLOCK_LEN;
 
       /* Use a bulk method if available.  */
index 09db397e65adcd0d0cfe8644c27fa73d0b2de954..8828f024652ea301340b3f693114dd153ad27f98 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 9abd8b0cec54606eff11a01d73b68ac6693b9b33..c76dd9a403f6a3475cb9c8d9802371c03262151b 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -176,8 +176,10 @@ _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c,
       size_t currlen = inbuflen;
 
       /* Since checksumming is done after encryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for checksumming. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for checksumming.  However
+       * only do splitting if input is large enough so that last chunks does
+       * not end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, currlen);
@@ -236,8 +238,10 @@ _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c,
       size_t currlen = inbuflen;
 
       /* Since checksumming is done before decryption, process input in 24KiB
-       * chunks to keep data loaded in L1 cache for decryption. */
-      if (currlen > 24 * 1024)
+       * chunks to keep data loaded in L1 cache for decryption.  However only
+       * do splitting if input is large enough so that last chunks does not
+       * end up being short. */
+      if (currlen > 32 * 1024)
        currlen = 24 * 1024;
 
       _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, currlen);
diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c
deleted file mode 100644 (file)
index d7f38a4..0000000
+++ /dev/null
@@ -1,512 +0,0 @@
-/* cipher-selftest.c - Helper functions for bulk encryption selftests.
- * Copyright (C) 2013,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-#ifdef HAVE_SYSLOG
-# include <syslog.h>
-#endif /*HAVE_SYSLOG*/
-
-#include "types.h"
-#include "g10lib.h"
-#include "cipher.h"
-#include "bufhelp.h"
-#include "cipher-selftest.h"
-#include "cipher-internal.h"
-
-#ifdef HAVE_STDINT_H
-# include <stdint.h> /* uintptr_t */
-#elif defined(HAVE_INTTYPES_H)
-# include <inttypes.h>
-#else
-/* In this case, uintptr_t is provided by config.h. */
-#endif
-
-/* Helper macro to force alignment to 16 bytes.  */
-#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
-# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
-#else
-# define ATTR_ALIGNED_16
-#endif
-
-
-/* Return an allocated buffers of size CONTEXT_SIZE with an alignment
-   of 16.  The caller must free that buffer using the address returned
-   at R_MEM.  Returns NULL and sets ERRNO on failure.  */
-void *
-_gcry_cipher_selftest_alloc_ctx (const int context_size, unsigned char **r_mem)
-{
-  int offs;
-  unsigned int ctx_aligned_size, memsize;
-
-  ctx_aligned_size = context_size + 15;
-  ctx_aligned_size -= ctx_aligned_size & 0xf;
-
-  memsize = ctx_aligned_size + 16;
-
-  *r_mem = xtrycalloc (1, memsize);
-  if (!*r_mem)
-    return NULL;
-
-  offs = (16 - ((uintptr_t)*r_mem & 15)) & 15;
-  return (void*)(*r_mem + offs);
-}
-
-
-/* Run the self-tests for <block cipher>-CBC-<block size>, tests bulk CBC
-   decryption.  Returns NULL on success. */
-const char *
-_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey_func,
-                          gcry_cipher_encrypt_t encrypt_one,
-                          const int nblocks, const int blocksize,
-                          const int context_size)
-{
-  cipher_bulk_ops_t bulk_ops = { 0, };
-  int i, offs;
-  unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
-  unsigned int ctx_aligned_size, memsize;
-
-  static const unsigned char key[16] ATTR_ALIGNED_16 = {
-      0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
-      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22
-    };
-
-  /* Allocate buffers, align first two elements to 16 bytes and latter to
-     block size.  */
-  ctx_aligned_size = context_size + 15;
-  ctx_aligned_size -= ctx_aligned_size & 0xf;
-
-  memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
-
-  mem = xtrycalloc (1, memsize);
-  if (!mem)
-    return "failed to allocate memory";
-
-  offs = (16 - ((uintptr_t)mem & 15)) & 15;
-  ctx = (void*)(mem + offs);
-  iv = ctx + ctx_aligned_size;
-  iv2 = iv + blocksize;
-  plaintext = iv2 + blocksize;
-  plaintext2 = plaintext + nblocks * blocksize;
-  ciphertext = plaintext2 + nblocks * blocksize;
-
-  /* Initialize ctx */
-  if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
-   {
-     xfree(mem);
-     return "setkey failed";
-   }
-
-  /* Test single block code path */
-  memset (iv, 0x4e, blocksize);
-  memset (iv2, 0x4e, blocksize);
-  for (i = 0; i < blocksize; i++)
-    plaintext[i] = i;
-
-  /* CBC manually.  */
-  buf_xor (ciphertext, iv, plaintext, blocksize);
-  encrypt_one (ctx, ciphertext, ciphertext);
-  memcpy (iv, ciphertext, blocksize);
-
-  /* CBC decrypt.  */
-  bulk_ops.cbc_dec (ctx, iv2, plaintext2, ciphertext, 1);
-  if (memcmp (plaintext2, plaintext, blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CBC-%d test failed (plaintext mismatch)", cipher,
-             blocksize * 8);
-#else
-      (void)cipher; /* Not used.  */
-#endif
-      return "selftest for CBC failed - see syslog for details";
-    }
-
-  if (memcmp (iv2, iv, blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CBC-%d test failed (IV mismatch)", cipher, blocksize * 8);
-#endif
-      return "selftest for CBC failed - see syslog for details";
-    }
-
-  /* Test parallelized code paths */
-  memset (iv, 0x5f, blocksize);
-  memset (iv2, 0x5f, blocksize);
-
-  for (i = 0; i < nblocks * blocksize; i++)
-    plaintext[i] = i;
-
-  /* Create CBC ciphertext manually.  */
-  for (i = 0; i < nblocks * blocksize; i+=blocksize)
-    {
-      buf_xor (&ciphertext[i], iv, &plaintext[i], blocksize);
-      encrypt_one (ctx, &ciphertext[i], &ciphertext[i]);
-      memcpy (iv, &ciphertext[i], blocksize);
-    }
-
-  /* Decrypt using bulk CBC and compare result.  */
-  bulk_ops.cbc_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
-
-  if (memcmp (plaintext2, plaintext, nblocks * blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CBC-%d test failed (plaintext mismatch, parallel path)",
-             cipher, blocksize * 8);
-#endif
-      return "selftest for CBC failed - see syslog for details";
-    }
-  if (memcmp (iv2, iv, blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CBC-%d test failed (IV mismatch, parallel path)",
-             cipher, blocksize * 8);
-#endif
-      return "selftest for CBC failed - see syslog for details";
-    }
-
-  xfree (mem);
-  return NULL;
-}
-
-/* Run the self-tests for <block cipher>-CFB-<block size>, tests bulk CFB
-   decryption.  Returns NULL on success. */
-const char *
-_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey_func,
-                          gcry_cipher_encrypt_t encrypt_one,
-                          const int nblocks, const int blocksize,
-                          const int context_size)
-{
-  cipher_bulk_ops_t bulk_ops = { 0, };
-  int i, offs;
-  unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
-  unsigned int ctx_aligned_size, memsize;
-
-  static const unsigned char key[16] ATTR_ALIGNED_16 = {
-      0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
-      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
-    };
-
-  /* Allocate buffers, align first two elements to 16 bytes and latter to
-     block size.  */
-  ctx_aligned_size = context_size + 15;
-  ctx_aligned_size -= ctx_aligned_size & 0xf;
-
-  memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
-
-  mem = xtrycalloc (1, memsize);
-  if (!mem)
-    return "failed to allocate memory";
-
-  offs = (16 - ((uintptr_t)mem & 15)) & 15;
-  ctx = (void*)(mem + offs);
-  iv = ctx + ctx_aligned_size;
-  iv2 = iv + blocksize;
-  plaintext = iv2 + blocksize;
-  plaintext2 = plaintext + nblocks * blocksize;
-  ciphertext = plaintext2 + nblocks * blocksize;
-
-  /* Initialize ctx */
-  if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
-   {
-     xfree(mem);
-     return "setkey failed";
-   }
-
-  /* Test single block code path */
-  memset(iv, 0xd3, blocksize);
-  memset(iv2, 0xd3, blocksize);
-  for (i = 0; i < blocksize; i++)
-    plaintext[i] = i;
-
-  /* CFB manually.  */
-  encrypt_one (ctx, ciphertext, iv);
-  buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
-
-  /* CFB decrypt.  */
-  bulk_ops.cfb_dec (ctx, iv2, plaintext2, ciphertext, 1);
-  if (memcmp(plaintext2, plaintext, blocksize))
-    {
-      xfree(mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CFB-%d test failed (plaintext mismatch)", cipher,
-             blocksize * 8);
-#else
-      (void)cipher; /* Not used.  */
-#endif
-      return "selftest for CFB failed - see syslog for details";
-    }
-
-  if (memcmp(iv2, iv, blocksize))
-    {
-      xfree(mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CFB-%d test failed (IV mismatch)", cipher, blocksize * 8);
-#endif
-      return "selftest for CFB failed - see syslog for details";
-    }
-
-  /* Test parallelized code paths */
-  memset(iv, 0xe6, blocksize);
-  memset(iv2, 0xe6, blocksize);
-
-  for (i = 0; i < nblocks * blocksize; i++)
-    plaintext[i] = i;
-
-  /* Create CFB ciphertext manually.  */
-  for (i = 0; i < nblocks * blocksize; i+=blocksize)
-    {
-      encrypt_one (ctx, &ciphertext[i], iv);
-      buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
-    }
-
-  /* Decrypt using bulk CBC and compare result.  */
-  bulk_ops.cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
-
-  if (memcmp(plaintext2, plaintext, nblocks * blocksize))
-    {
-      xfree(mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CFB-%d test failed (plaintext mismatch, parallel path)",
-              cipher, blocksize * 8);
-#endif
-      return "selftest for CFB failed - see syslog for details";
-    }
-  if (memcmp(iv2, iv, blocksize))
-    {
-      xfree(mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CFB-%d test failed (IV mismatch, parallel path)", cipher,
-             blocksize * 8);
-#endif
-      return "selftest for CFB failed - see syslog for details";
-    }
-
-  xfree(mem);
-  return NULL;
-}
-
-/* Run the self-tests for <block cipher>-CTR-<block size>, tests IV increment
-   of bulk CTR encryption.  Returns NULL on success. */
-const char *
-_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey_func,
-                          gcry_cipher_encrypt_t encrypt_one,
-                          const int nblocks, const int blocksize,
-                          const int context_size)
-{
-  cipher_bulk_ops_t bulk_ops = { 0, };
-  int i, j, offs, diff;
-  unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *ciphertext2,
-                *iv, *iv2, *mem;
-  unsigned int ctx_aligned_size, memsize;
-
-  static const unsigned char key[16] ATTR_ALIGNED_16 = {
-      0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
-      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
-    };
-
-  /* Allocate buffers, align first two elements to 16 bytes and latter to
-     block size.  */
-  ctx_aligned_size = context_size + 15;
-  ctx_aligned_size -= ctx_aligned_size & 0xf;
-
-  memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 4) + 16;
-
-  mem = xtrycalloc (1, memsize);
-  if (!mem)
-    return "failed to allocate memory";
-
-  offs = (16 - ((uintptr_t)mem & 15)) & 15;
-  ctx = (void*)(mem + offs);
-  iv = ctx + ctx_aligned_size;
-  iv2 = iv + blocksize;
-  plaintext = iv2 + blocksize;
-  plaintext2 = plaintext + nblocks * blocksize;
-  ciphertext = plaintext2 + nblocks * blocksize;
-  ciphertext2 = ciphertext + nblocks * blocksize;
-
-  /* Initialize ctx */
-  if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
-   {
-     xfree(mem);
-     return "setkey failed";
-   }
-
-  /* Test single block code path */
-  memset (iv, 0xff, blocksize);
-  for (i = 0; i < blocksize; i++)
-    plaintext[i] = i;
-
-  /* CTR manually.  */
-  encrypt_one (ctx, ciphertext, iv);
-  for (i = 0; i < blocksize; i++)
-    ciphertext[i] ^= plaintext[i];
-  for (i = blocksize; i > 0; i--)
-    {
-      iv[i-1]++;
-      if (iv[i-1])
-        break;
-    }
-
-  memset (iv2, 0xff, blocksize);
-  bulk_ops.ctr_enc (ctx, iv2, plaintext2, ciphertext, 1);
-
-  if (memcmp (plaintext2, plaintext, blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CTR-%d test failed (plaintext mismatch)", cipher,
-             blocksize * 8);
-#else
-      (void)cipher; /* Not used.  */
-#endif
-      return "selftest for CTR failed - see syslog for details";
-    }
-
-  if (memcmp (iv2, iv, blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CTR-%d test failed (IV mismatch)", cipher,
-             blocksize * 8);
-#endif
-      return "selftest for CTR failed - see syslog for details";
-    }
-
-  /* Test bulk encryption with typical IV. */
-  memset(iv, 0x57, blocksize-4);
-  iv[blocksize-1] = 1;
-  iv[blocksize-2] = 0;
-  iv[blocksize-3] = 0;
-  iv[blocksize-4] = 0;
-  memset(iv2, 0x57, blocksize-4);
-  iv2[blocksize-1] = 1;
-  iv2[blocksize-2] = 0;
-  iv2[blocksize-3] = 0;
-  iv2[blocksize-4] = 0;
-
-  for (i = 0; i < blocksize * nblocks; i++)
-    plaintext2[i] = plaintext[i] = i;
-
-  /* Create CTR ciphertext manually.  */
-  for (i = 0; i < blocksize * nblocks; i+=blocksize)
-    {
-      encrypt_one (ctx, &ciphertext[i], iv);
-      for (j = 0; j < blocksize; j++)
-        ciphertext[i+j] ^= plaintext[i+j];
-      for (j = blocksize; j > 0; j--)
-        {
-          iv[j-1]++;
-          if (iv[j-1])
-            break;
-        }
-    }
-
-  bulk_ops.ctr_enc (ctx, iv2, ciphertext2, plaintext2, nblocks);
-
-  if (memcmp (ciphertext2, ciphertext, blocksize * nblocks))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CTR-%d test failed (ciphertext mismatch, bulk)", cipher,
-              blocksize * 8);
-#endif
-      return "selftest for CTR failed - see syslog for details";
-    }
-  if (memcmp(iv2, iv, blocksize))
-    {
-      xfree (mem);
-#ifdef HAVE_SYSLOG
-      syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-              "%s-CTR-%d test failed (IV mismatch, bulk)", cipher,
-              blocksize * 8);
-#endif
-      return "selftest for CTR failed - see syslog for details";
-    }
-
-  /* Test parallelized code paths (check counter overflow handling) */
-  for (diff = 0; diff < nblocks; diff++) {
-    memset(iv, 0xff, blocksize);
-    iv[blocksize-1] -= diff;
-    iv[0] = iv[1] = 0;
-    iv[2] = 0x07;
-
-    for (i = 0; i < blocksize * nblocks; i++)
-      plaintext[i] = i;
-
-    /* Create CTR ciphertext manually.  */
-    for (i = 0; i < blocksize * nblocks; i+=blocksize)
-      {
-        encrypt_one (ctx, &ciphertext[i], iv);
-        for (j = 0; j < blocksize; j++)
-          ciphertext[i+j] ^= plaintext[i+j];
-        for (j = blocksize; j > 0; j--)
-          {
-            iv[j-1]++;
-            if (iv[j-1])
-              break;
-          }
-      }
-
-    /* Decrypt using bulk CTR and compare result.  */
-    memset(iv2, 0xff, blocksize);
-    iv2[blocksize-1] -= diff;
-    iv2[0] = iv2[1] = 0;
-    iv2[2] = 0x07;
-
-    bulk_ops.ctr_enc (ctx, iv2, plaintext2, ciphertext, nblocks);
-
-    if (memcmp (plaintext2, plaintext, blocksize * nblocks))
-      {
-        xfree (mem);
-#ifdef HAVE_SYSLOG
-        syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-                "%s-CTR-%d test failed (plaintext mismatch, diff: %d)", cipher,
-               blocksize * 8, diff);
-#endif
-        return "selftest for CTR failed - see syslog for details";
-      }
-    if (memcmp(iv2, iv, blocksize))
-      {
-        xfree (mem);
-#ifdef HAVE_SYSLOG
-        syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
-                "%s-CTR-%d test failed (IV mismatch, diff: %d)", cipher,
-               blocksize * 8, diff);
-#endif
-        return "selftest for CTR failed - see syslog for details";
-      }
-  }
-
-  xfree (mem);
-  return NULL;
-}
diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h
deleted file mode 100644 (file)
index c3090ad..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-/* cipher-selftest.h - Helper functions for bulk encryption selftests.
- * Copyright (C) 2013,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef G10_SELFTEST_HELP_H
-#define G10_SELFTEST_HELP_H
-
-#include <config.h>
-#include "types.h"
-#include "g10lib.h"
-#include "cipher.h"
-
-typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv,
-                                          void *outbuf_arg,
-                                          const void *inbuf_arg,
-                                          size_t nblocks);
-
-typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv,
-                                          void *outbuf_arg,
-                                          const void *inbuf_arg,
-                                          size_t nblocks);
-
-typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv,
-                                          void *outbuf_arg,
-                                          const void *inbuf_arg,
-                                          size_t nblocks);
-
-/* Helper function to allocate an aligned context for selftests.  */
-void *_gcry_cipher_selftest_alloc_ctx (const int context_size,
-                                       unsigned char **r_mem);
-
-
-/* Helper function for bulk CBC decryption selftest */
-const char *
-_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey,
-                          gcry_cipher_encrypt_t encrypt_one,
-                          const int nblocks, const int blocksize,
-                          const int context_size);
-
-/* Helper function for bulk CFB decryption selftest */
-const char *
-_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey,
-                          gcry_cipher_encrypt_t encrypt_one,
-                          const int nblocks, const int blocksize,
-                          const int context_size);
-
-/* Helper function for bulk CTR encryption selftest */
-const char *
-_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey,
-                          gcry_cipher_encrypt_t encrypt_one,
-                          const int nblocks, const int blocksize,
-                          const int context_size);
-
-#endif /*G10_SELFTEST_HELP_H*/
index 160beb4851b9ced37fe04225f0a7b41c85ddd52f..769cdd685da41889a6916020047442ce6451620e 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 0522a271a11771d3aa778d0a2782f18096de9f48..754f289c8957eb95ac0d85d0832c0c2d25e30434 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index d1443a62118a65e2ca6745da5779e98a8ede33a4..898bb58fbecc38e8135850278ef5c2b9409a1405 100644 (file)
@@ -6,7 +6,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -91,7 +91,12 @@ static gcry_cipher_spec_t * const cipher_list[] =
 #if USE_SM4
      &_gcry_cipher_spec_sm4,
 #endif
-    NULL
+#if USE_ARIA
+     &_gcry_cipher_spec_aria128,
+     &_gcry_cipher_spec_aria192,
+     &_gcry_cipher_spec_aria256,
+#endif
+     NULL
   };
 
 /* Cipher implementations starting with index 0 (enum gcry_cipher_algos) */
@@ -207,9 +212,18 @@ static gcry_cipher_spec_t * const cipher_list_algo301[] =
     NULL,
 #endif
 #if USE_SM4
-     &_gcry_cipher_spec_sm4,
+    &_gcry_cipher_spec_sm4,
+#else
+    NULL,
+#endif
+#if USE_ARIA
+    &_gcry_cipher_spec_aria128,
+    &_gcry_cipher_spec_aria192,
+    &_gcry_cipher_spec_aria256
 #else
     NULL,
+    NULL,
+    NULL
 #endif
   };
 
@@ -765,6 +779,8 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen)
   rc = c->spec->setkey (&c->context.c, key, keylen, &c->bulk);
   if (!rc || (c->marks.allow_weak_key && rc == GPG_ERR_WEAK_KEY))
     {
+      int is_weak_key = (rc == GPG_ERR_WEAK_KEY);
+
       /* Duplicate initial context.  */
       memcpy ((void *) ((char *) &c->context.c + c->spec->contextsize),
               (void *) &c->context.c,
@@ -787,7 +803,7 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen)
 
         case GCRY_CIPHER_MODE_GCM_SIV:
           rc = _gcry_cipher_gcm_siv_setkey (c, keylen);
-          if (rc)
+          if (rc && !(c->marks.allow_weak_key && rc == GPG_ERR_WEAK_KEY))
            c->marks.key = 0;
           break;
 
@@ -829,6 +845,11 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen)
         default:
           break;
         }
+
+      /* Restore "weak key" error-code in case mode specific setkey
+       * returned success. */
+      if (!rc && is_weak_key)
+       rc = GPG_ERR_WEAK_KEY;
     }
   else
     c->marks.key = 0;
@@ -983,14 +1004,11 @@ cipher_reset (gcry_cipher_hd_t c)
 
 \f
 static gcry_err_code_t
-do_ecb_crypt (gcry_cipher_hd_t c,
-              unsigned char *outbuf, size_t outbuflen,
-              const unsigned char *inbuf, size_t inbuflen,
-              gcry_cipher_encrypt_t crypt_fn)
+do_ecb_crypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
+             const unsigned char *inbuf, size_t inbuflen, int encrypt)
 {
   unsigned int blocksize = c->spec->blocksize;
   size_t n, nblocks;
-  unsigned int burn, nburn;
 
   if (outbuflen < inbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
@@ -998,18 +1016,32 @@ do_ecb_crypt (gcry_cipher_hd_t c,
     return GPG_ERR_INV_LENGTH;
 
   nblocks = inbuflen / blocksize;
-  burn = 0;
 
-  for (n=0; n < nblocks; n++ )
+  if (nblocks == 0)
+    return 0;
+
+  if (c->bulk.ecb_crypt)
     {
-      nburn = crypt_fn (&c->context.c, outbuf, inbuf);
-      burn = nburn > burn ? nburn : burn;
-      inbuf  += blocksize;
-      outbuf += blocksize;
+      c->bulk.ecb_crypt (&c->context.c, outbuf, inbuf, nblocks, encrypt);
     }
+  else
+    {
+      gcry_cipher_encrypt_t crypt_fn =
+          encrypt ? c->spec->encrypt : c->spec->decrypt;
+      unsigned int burn = 0;
+      unsigned int nburn;
 
-  if (burn > 0)
-    _gcry_burn_stack (burn + 4 * sizeof(void *));
+      for (n = 0; n < nblocks; n++)
+       {
+         nburn = crypt_fn (&c->context.c, outbuf, inbuf);
+         burn = nburn > burn ? nburn : burn;
+         inbuf  += blocksize;
+         outbuf += blocksize;
+       }
+
+      if (burn > 0)
+       _gcry_burn_stack (burn + 4 * sizeof(void *));
+    }
 
   return 0;
 }
@@ -1019,7 +1051,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
                 unsigned char *outbuf, size_t outbuflen,
                 const unsigned char *inbuf, size_t inbuflen)
 {
-  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt);
+  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1);
 }
 
 static gcry_err_code_t
@@ -1027,7 +1059,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c,
                 unsigned char *outbuf, size_t outbuflen,
                 const unsigned char *inbuf, size_t inbuflen)
 {
-  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt);
+  return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0);
 }
 
 
@@ -1210,9 +1242,20 @@ _gcry_cipher_setkey (gcry_cipher_hd_t hd, const void *key, size_t keylen)
 
 
 gcry_err_code_t
-_gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen)
+_gcry_cipher_setiv (gcry_cipher_hd_t c, const void *iv, size_t ivlen)
 {
-  return hd->mode_ops.setiv (hd, iv, ivlen);
+  if (c->mode == GCRY_CIPHER_MODE_GCM)
+    {
+      c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0;
+
+      if (fips_mode ())
+        {
+          /* Direct invocation of GCM setiv in FIPS mode disables encryption. */
+          c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 1;
+        }
+    }
+
+  return c->mode_ops.setiv (c, iv, ivlen);
 }
 
 
@@ -1250,6 +1293,56 @@ _gcry_cipher_getctr (gcry_cipher_hd_t hd, void *ctr, size_t ctrlen)
 }
 
 
+gcry_err_code_t
+_gcry_cipher_setup_geniv (gcry_cipher_hd_t hd, int method,
+                          const void *fixed_iv, size_t fixed_iv_len,
+                          const void *dyn_iv, size_t dyn_iv_len)
+{
+  gcry_err_code_t rc = 0;
+
+  if (method != GCRY_CIPHER_GENIV_METHOD_CONCAT)
+    return GPG_ERR_INV_ARG;
+
+  if (fixed_iv_len + dyn_iv_len > MAX_BLOCKSIZE)
+    return GPG_ERR_INV_ARG;
+
+  hd->aead.geniv_method = GCRY_CIPHER_GENIV_METHOD_CONCAT;
+  hd->aead.fixed_iv_len = fixed_iv_len;
+  hd->aead.dynamic_iv_len = dyn_iv_len;
+  memset (hd->aead.fixed, 0, MAX_BLOCKSIZE);
+  memset (hd->aead.dynamic, 0, MAX_BLOCKSIZE);
+  memcpy (hd->aead.fixed, fixed_iv, fixed_iv_len);
+  memcpy (hd->aead.dynamic, dyn_iv, dyn_iv_len);
+
+  return rc;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_geniv (gcry_cipher_hd_t hd, void *iv, size_t iv_len)
+{
+  gcry_err_code_t rc = 0;
+  int i;
+
+  if (hd->aead.geniv_method != GCRY_CIPHER_GENIV_METHOD_CONCAT)
+    return GPG_ERR_INV_ARG;
+
+  if (iv_len != hd->aead.fixed_iv_len + hd->aead.dynamic_iv_len)
+    return GPG_ERR_INV_ARG;
+
+  memcpy (iv, hd->aead.fixed, hd->aead.fixed_iv_len);
+  memcpy ((byte *)iv+hd->aead.fixed_iv_len,
+          hd->aead.dynamic, hd->aead.dynamic_iv_len);
+  rc = hd->mode_ops.setiv (hd, iv, iv_len);
+
+  for (i = hd->aead.dynamic_iv_len; i > 0; i--)
+    if (++hd->aead.dynamic[i - 1] != 0)
+      break;
+
+  return rc;
+}
+
+
 gcry_err_code_t
 _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf,
                            size_t abuflen)
@@ -1651,6 +1744,30 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen)
         }
       break;
 
+    case PRIV_CIPHERCTL_GET_COUNTER: /* (private)  */
+      /* This is the input block as used in CTR mode which has
+         initially been set as IV.  The returned format is:
+           1 byte  Actual length of the block in bytes.
+           n byte  The block.
+         If the provided buffer is too short, an error is returned. */
+      if (buflen < (1 + h->spec->blocksize))
+        rc = GPG_ERR_TOO_SHORT;
+      else
+        {
+          unsigned char *ctrp;
+          unsigned char *dst = buffer;
+          int n = h->unused;
+
+          if (!n)
+            n = h->spec->blocksize;
+          gcry_assert (n <= h->spec->blocksize);
+          *dst++ = n;
+          ctrp = h->u_ctr.ctr + h->spec->blocksize - n;
+          while (n--)
+            *dst++ = *ctrp++;
+        }
+      break;
+
     case GCRYCTL_SET_SBOX:
       if (h->spec->set_extra_info)
         rc = h->spec->set_extra_info
index 7ac884af354f7d4fd0ec0b117a5e6fd2edab1794..2d8d216831f5998dd88e092d08cd082377fe420b 100644 (file)
@@ -25,8 +25,6 @@
 
 .cpu generic+simd+crypto
 
-.text
-
 
 /* Structure of crc32_consts_s */
 
 
 /* Constants */
 
+SECTION_RODATA
+
 .align 6
+ELF(.type _crc32_aarch64_ce_constants,%object;)
+_crc32_aarch64_ce_constants:
 .Lcrc32_constants:
 .Lcrc32_partial_fold_input_mask:
   .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
   .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
   .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 
+.text
 
 /*
  * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
  *                                  const struct crc32_consts_s *consts);
  */
-.align 3
+.align 4
 .globl _gcry_crc32r_armv8_ce_bulk
 ELF(.type  _gcry_crc32r_armv8_ce_bulk,%function;)
 _gcry_crc32r_armv8_ce_bulk:
@@ -235,7 +238,7 @@ ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;)
  * void _gcry_crc32r_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc,
  *                                         const struct crc32_consts_s *consts);
  */
-.align 3
+.align 4
 .globl _gcry_crc32r_armv8_ce_reduction_4
 ELF(.type  _gcry_crc32r_armv8_ce_reduction_4,%function;)
 _gcry_crc32r_armv8_ce_reduction_4:
@@ -268,7 +271,7 @@ ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;
  * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
  *                                 const struct crc32_consts_s *consts);
  */
-.align 3
+.align 4
 .globl _gcry_crc32_armv8_ce_bulk
 ELF(.type  _gcry_crc32_armv8_ce_bulk,%function;)
 _gcry_crc32_armv8_ce_bulk:
@@ -465,7 +468,7 @@ ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;)
  * void _gcry_crc32_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc,
  *                                        const struct crc32_consts_s *consts);
  */
-.align 3
+.align 4
 .globl _gcry_crc32_armv8_ce_reduction_4
 ELF(.type  _gcry_crc32_armv8_ce_reduction_4,%function;)
 _gcry_crc32_armv8_ce_reduction_4:
index 17e55548215b5434460d83b3fa2105c0d74dfcd1..e8c314d4a0778c98c47170e0f0a3fa80bfaf8997 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  */
 
index 8c8b1915abe3a63837be4ae15087e8bbe339bc00..825dee2adf294f34d2547c7cfc8dee5216b3a1ef 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  */
 
index b9a40130cefdfb637ff0a35e683cc9024e6090b5..6eadccc8fc1e09f326c1e81710adb522b3526870 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  */
 
index b38869ecc9da577fc3f18023b938078cdf43a54b..cdff06484dd923da48e9e1331f90c4f078c0ccc6 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  */
 
index c1bf9f29e8dd21f9ec8ab2ea316fe82971c8e141..e4236a92e1cfc460d75847e452253c38933fc810 100644 (file)
        movl   left##d,   (io); \
        movl   right##d, 4(io);
 
-.align 8
+.align 16
 .globl _gcry_3des_amd64_crypt_block
 ELF(.type  _gcry_3des_amd64_crypt_block,@function;)
 
@@ -473,7 +473,7 @@ ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;)
        movl   left##d,   (io); \
        movl   right##d, 4(io);
 
-.align 8
+.align 16
 ELF(.type  _gcry_3des_amd64_crypt_blk3,@function;)
 _gcry_3des_amd64_crypt_blk3:
        /* input:
@@ -548,7 +548,7 @@ _gcry_3des_amd64_crypt_blk3:
        CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;)
 
-.align 8
+.align 16
 .globl  _gcry_3des_amd64_cbc_dec
 ELF(.type   _gcry_3des_amd64_cbc_dec,@function;)
 _gcry_3des_amd64_cbc_dec:
@@ -646,7 +646,7 @@ _gcry_3des_amd64_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)
 
-.align 8
+.align 16
 .globl  _gcry_3des_amd64_ctr_enc
 ELF(.type   _gcry_3des_amd64_ctr_enc,@function;)
 _gcry_3des_amd64_ctr_enc:
@@ -744,7 +744,7 @@ _gcry_3des_amd64_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)
 
-.align 8
+.align 16
 .globl  _gcry_3des_amd64_cfb_dec
 ELF(.type   _gcry_3des_amd64_cfb_dec,@function;)
 _gcry_3des_amd64_cfb_dec:
@@ -841,7 +841,12 @@ _gcry_3des_amd64_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;)
 
+
+SECTION_RODATA
+ELF(.type _des_amd64_data,@object;)
+
 .align 16
+_des_amd64_data:
 .L_s1:
        .quad 0x0010100001010400, 0x0000000000000000
        .quad 0x0000100000010000, 0x0010100001010404
index 51116fcfc8158c3ea61b722b3c92a0ad33304785..4b3f9a1eb084e0252fd80c423b56e02f0e7954f5 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * For a description of triple encryption, see:
  *   Bruce Schneier: Applied Cryptography. Second Edition.
 #include "cipher.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
 
 
 #define DES_BLOCKSIZE 8
@@ -1047,66 +1046,6 @@ is_weak_key ( const byte *key )
 }
 
 
-/* Alternative setkey for selftests; need larger key than default. */
-static gcry_err_code_t
-bulk_selftest_setkey (void *context, const byte *__key, unsigned __keylen,
-                      cipher_bulk_ops_t *bulk_ops)
-{
-  static const unsigned char key[24] ATTR_ALIGNED_16 = {
-      0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
-      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22,
-      0x18,0x2A,0x39,0x47,0x5E,0x6F,0x75,0x82
-    };
-
-  (void)__key;
-  (void)__keylen;
-
-  return do_tripledes_setkey(context, key, sizeof(key), bulk_ops);
-}
-
-
-/* Run the self-tests for DES-CTR, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
-  const int nblocks = 3+1;
-  const int blocksize = DES_BLOCKSIZE;
-  const int context_size = sizeof(struct _tripledes_ctx);
-
-  return _gcry_selftest_helper_ctr("3DES", &bulk_selftest_setkey,
-           &do_tripledes_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for DES-CBC, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
-  const int nblocks = 3+2;
-  const int blocksize = DES_BLOCKSIZE;
-  const int context_size = sizeof(struct _tripledes_ctx);
-
-  return _gcry_selftest_helper_cbc("3DES", &bulk_selftest_setkey,
-           &do_tripledes_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for DES-CFB, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
-  const int nblocks = 3+2;
-  const int blocksize = DES_BLOCKSIZE;
-  const int context_size = sizeof(struct _tripledes_ctx);
-
-  return _gcry_selftest_helper_cfb("3DES", &bulk_selftest_setkey,
-           &do_tripledes_encrypt, nblocks, blocksize, context_size);
-}
-
-
 /*
  * Performs a selftest of this DES/Triple-DES implementation.
  * Returns an string with the error text on failure.
@@ -1115,8 +1054,6 @@ selftest_cfb (void)
 static const char *
 selftest (void)
 {
-  const char *r;
-
   /*
    * Check if 'u32' is really 32 bits wide. This DES / 3DES implementation
    * need this.
@@ -1296,15 +1233,6 @@ selftest (void)
         return "DES weak key detection failed";
   }
 
-  if ( (r = selftest_cbc ()) )
-    return r;
-
-  if ( (r = selftest_cfb ()) )
-    return r;
-
-  if ( (r = selftest_ctr ()) )
-    return r;
-
   return 0;
 }
 
index 7c86e12cc988db4981ab72d1e388336fe9ba2cd6..17fa550512862fb040071aca2b2cd10a0ab86eaa 100644 (file)
@@ -84,6 +84,9 @@ static const struct
     { "brainpoolP320r1", "1.3.36.3.3.2.8.1.1.9" },
     { "brainpoolP384r1", "1.3.36.3.3.2.8.1.1.11"},
     { "brainpoolP512r1", "1.3.36.3.3.2.8.1.1.13"},
+    { "brainpoolP256r1", "bp256" },         /* Short names from GnuPG.  */
+    { "brainpoolP384r1", "bp384" },
+    { "brainpoolP512r1", "bp512" },
 
     { "GOST2001-test", "1.2.643.2.2.35.0" },
     { "GOST2001-CryptoPro-A", "1.2.643.2.2.35.1" },
@@ -147,7 +150,7 @@ static const ecc_domain_parms_t domain_parms[] =
   {
     {
       /* (-x^2 + y^2 = 1 + dx^2y^2) */
-      "Ed25519", 255, 0,
+      "Ed25519", 255, 1,
       MPI_EC_EDWARDS, ECC_DIALECT_ED25519,
       "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED",
       "-0x01",
@@ -176,7 +179,7 @@ static const ecc_domain_parms_t domain_parms[] =
     },
     {
       /* (x^2 + y^2 = 1 + dx^2y^2) */
-      "Ed448", 448, 0,
+      "Ed448", 448, 1,
       MPI_EC_EDWARDS, ECC_DIALECT_SAFECURVE,
       "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE"
       "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
@@ -778,8 +781,7 @@ _gcry_ecc_update_curve_param (const char *name,
 }
 
 
-/* Return the name matching the parameters in PKEY.  This works only
-   with curves described by the Weierstrass equation. */
+/* Return the name matching the parameters in PKEY.  */
 const char *
 _gcry_ecc_get_curve (gcry_sexp_t keyparms, int iterator, unsigned int *r_nbits)
 {
index d6b8991af6ade7ab91f2aae3e2f04bef714a664c..8be57b72e228db2658615e53d3bca6d4e3517702 100644 (file)
@@ -58,27 +58,163 @@ _gcry_ecc_get_algo_keylen (int curveid)
   return len;
 }
 
-gpg_error_t
-_gcry_ecc_mul_point (int curveid, unsigned char *result,
-                     const unsigned char *scalar, const unsigned char *point)
+/* For Curve25519 and X448, we need to mask the bits and enable the MSB.  */
+static void
+ecc_tweak_bits (unsigned char *seckey, size_t seckey_len)
 {
+  if (seckey_len == 32)
+    {
+      seckey[0] &= 0xf8;
+      seckey[31] &= 0x7f;
+      seckey[31] |= 0x40;
+    }
+  else
+    {
+      seckey[0] &= 0xfc;
+      seckey[55] |= 0x80;
+    }
+}
+
+gpg_err_code_t
+_gcry_ecc_curve_keypair (const char *curve,
+                         unsigned char *pubkey, size_t pubkey_len,
+                         unsigned char *seckey, size_t seckey_len)
+{
+  gpg_err_code_t err;
   unsigned int nbits;
   unsigned int nbytes;
-  const char *curve;
-  gpg_err_code_t err;
-  gcry_mpi_t mpi_k;
-  mpi_ec_t ec;
-  mpi_point_struct Q;
+  gcry_mpi_t mpi_k = NULL;
+  mpi_ec_t ec = NULL;
+  mpi_point_struct Q = { NULL, NULL, NULL };
   gcry_mpi_t x;
   unsigned int len;
   unsigned char *buf;
 
-  if (curveid == GCRY_ECC_CURVE25519)
-    curve = "Curve25519";
-  else if (curveid == GCRY_ECC_CURVE448)
-    curve = "X448";
+  err = prepare_ec (&ec, curve);
+  if (err)
+    return err;
+
+  nbits = ec->nbits;
+  nbytes = (nbits + 7)/8;
+
+  if (seckey_len != nbytes)
+    return GPG_ERR_INV_ARG;
+
+  if (ec->model == MPI_EC_WEIERSTRASS)
+    {
+      if (pubkey_len != 1 + 2*nbytes)
+        return GPG_ERR_INV_ARG;
+
+      do
+        {
+          mpi_free (mpi_k);
+          mpi_k = mpi_new (nbytes*8);
+          _gcry_randomize (seckey, nbytes, GCRY_STRONG_RANDOM);
+          _gcry_mpi_set_buffer (mpi_k, seckey, nbytes, 0);
+        }
+      while (mpi_cmp (mpi_k, ec->n) >= 0);
+    }
+  else if (ec->model == MPI_EC_MONTGOMERY)
+    {
+      if (pubkey_len != nbytes)
+        return GPG_ERR_INV_ARG;
+
+      _gcry_randomize (seckey, nbytes, GCRY_STRONG_RANDOM);
+      /* Existing ECC applications with libgcrypt (like gpg-agent in
+         GnuPG) assumes that scalar is tweaked at key generation time.
+         For the possible use case where generated key with this routine
+         may be used with those, we put compatibile behavior here.  */
+      ecc_tweak_bits (seckey, nbytes);
+      mpi_k = _gcry_mpi_set_opaque_copy (NULL, seckey, nbytes*8);
+    }
   else
-    return gpg_error (GPG_ERR_UNKNOWN_CURVE);
+    return GPG_ERR_UNKNOWN_CURVE;
+
+  x = mpi_new (nbits);
+  point_init (&Q);
+
+  _gcry_mpi_ec_mul_point (&Q, mpi_k, ec->G, ec);
+
+  if (ec->model == MPI_EC_WEIERSTRASS)
+    {
+      gcry_mpi_t y = mpi_new (nbits);
+      gcry_mpi_t negative = mpi_new (nbits);
+
+      _gcry_mpi_ec_get_affine (x, y, &Q, ec);
+      /* For the backward compatibility, we check if it's a
+         "compliant key".  */
+
+      mpi_sub (negative, ec->p, y);
+      if (mpi_cmp (negative, y) < 0)   /* p - y < p */
+        {
+          mpi_free (y);
+          y = negative;
+          mpi_sub (mpi_k, ec->n, mpi_k);
+          buf = _gcry_mpi_get_buffer (mpi_k, 0, &len, NULL);
+          memset (seckey, 0, nbytes - len);
+          memcpy (seckey + nbytes - len, buf, len);
+        }
+      else /* p - y >= p */
+        mpi_free (negative);
+
+      buf = _gcry_ecc_ec2os_buf (x, y, ec->p, &len);
+      if (!buf)
+        {
+          err = gpg_err_code_from_syserror ();
+          mpi_free (y);
+        }
+      else
+        {
+          if (len != 1 + 2*nbytes)
+            {
+              err = GPG_ERR_INV_ARG;
+              mpi_free (y);
+            }
+          else
+            {
+              /* (x,y) in SEC1 point encoding.  */
+              memcpy (pubkey, buf, len);
+              xfree (buf);
+              mpi_free (y);
+            }
+        }
+    }
+  else /* MPI_EC_MONTGOMERY */
+    {
+      _gcry_mpi_ec_get_affine (x, NULL, &Q, ec);
+
+      buf = _gcry_mpi_get_buffer (x, nbytes, &len, NULL);
+      if (!buf)
+        err = gpg_err_code_from_syserror ();
+      else
+        {
+          memcpy (pubkey, buf, nbytes);
+          xfree (buf);
+        }
+    }
+
+  mpi_free (x);
+  point_free (&Q);
+  mpi_free (mpi_k);
+  _gcry_mpi_ec_free (ec);
+  return err;
+}
+
+gpg_err_code_t
+_gcry_ecc_curve_mul_point (const char *curve,
+                           unsigned char *result, size_t result_len,
+                           const unsigned char *scalar, size_t scalar_len,
+                           const unsigned char *point, size_t point_len)
+{
+  unsigned int nbits;
+  unsigned int nbytes;
+  gpg_err_code_t err;
+  gcry_mpi_t mpi_k = NULL;
+  mpi_ec_t ec = NULL;
+  mpi_point_struct Q = { NULL, NULL, NULL };
+  gcry_mpi_t x = NULL;
+  unsigned int len;
+  unsigned char *buf;
 
   err = prepare_ec (&ec, curve);
   if (err)
@@ -87,18 +223,50 @@ _gcry_ecc_mul_point (int curveid, unsigned char *result,
   nbits = ec->nbits;
   nbytes = (nbits + 7)/8;
 
-  mpi_k = _gcry_mpi_set_opaque_copy (NULL, scalar, nbytes*8);
-  x = mpi_new (nbits);
+  if (ec->model == MPI_EC_WEIERSTRASS)
+    {
+      if (scalar_len != nbytes
+          || result_len != 1 + 2*nbytes
+          || point_len != 1 + 2*nbytes)
+        {
+          err = GPG_ERR_INV_ARG;
+          goto leave;
+        }
+
+      mpi_k = mpi_new (nbytes*8);
+      _gcry_mpi_set_buffer (mpi_k, scalar, nbytes, 0);
+    }
+  else if (ec->model == MPI_EC_MONTGOMERY)
+    {
+      if (scalar_len != nbytes
+          || result_len != nbytes
+          || point_len != nbytes)
+        {
+          err = GPG_ERR_INV_ARG;
+          goto leave;
+        }
+
+      mpi_k = _gcry_mpi_set_opaque_copy (NULL, scalar, nbytes*8);
+    }
+  else
+    {
+      err = GPG_ERR_UNKNOWN_CURVE;
+      goto leave;
+    }
+
   point_init (&Q);
 
   if (point)
     {
-      gcry_mpi_t mpi_u = _gcry_mpi_set_opaque_copy (NULL, point, nbytes*8);
+      gcry_mpi_t mpi_u = _gcry_mpi_set_opaque_copy (NULL, point, point_len*8);
       mpi_point_struct P;
 
       point_init (&P);
-      err = _gcry_ecc_mont_decodepoint (mpi_u, ec, &P);
-      _gcry_mpi_release (mpi_u);
+      if (ec->model == MPI_EC_WEIERSTRASS)
+        err = _gcry_ecc_sec_decodepoint (mpi_u, ec, &P);
+      else /* MPI_EC_MONTGOMERY */
+        err = _gcry_ecc_mont_decodepoint (mpi_u, ec, &P);
+      mpi_free (mpi_u);
       if (err)
         goto leave;
       _gcry_mpi_ec_mul_point (&Q, mpi_k, &P, ec);
@@ -107,21 +275,83 @@ _gcry_ecc_mul_point (int curveid, unsigned char *result,
   else
     _gcry_mpi_ec_mul_point (&Q, mpi_k, ec->G, ec);
 
-  _gcry_mpi_ec_get_affine (x, NULL, &Q, ec);
+  x = mpi_new (nbits);
+  if (ec->model == MPI_EC_WEIERSTRASS)
+    {
+      gcry_mpi_t y = mpi_new (nbits);
 
-  buf = _gcry_mpi_get_buffer (x, nbytes, &len, NULL);
-  if (!buf)
-    err = gpg_error_from_syserror ();
-  else
+      _gcry_mpi_ec_get_affine (x, y, &Q, ec);
+
+      buf = _gcry_ecc_ec2os_buf (x, y, ec->p, &len);
+      if (!buf)
+        {
+          err = gpg_err_code_from_syserror ();
+          mpi_free (y);
+        }
+      else
+        {
+          if (len != 1 + 2*nbytes)
+            {
+              err = GPG_ERR_INV_ARG;
+              mpi_free (y);
+            }
+          else
+            {
+              /* (x,y) in SEC1 point encoding.  */
+              memcpy (result, buf, len);
+              xfree (buf);
+              mpi_free (y);
+            }
+        }
+    }
+  else                          /* MPI_EC_MONTGOMERY */
     {
-      memcpy (result, buf, nbytes);
-      xfree (buf);
+      _gcry_mpi_ec_get_affine (x, NULL, &Q, ec);
+      buf = _gcry_mpi_get_buffer (x, nbytes, &len, NULL);
+      if (!buf)
+        err = gpg_err_code_from_syserror ();
+      else
+        {
+          if (len != nbytes)
+            err = GPG_ERR_INV_ARG;
+          else
+            {
+              /* x in little endian.  */
+              memcpy (result, buf, nbytes);
+              xfree (buf);
+            }
+        }
     }
+  mpi_free (x);
 
  leave:
-  _gcry_mpi_release (x);
   point_free (&Q);
-  _gcry_mpi_release (mpi_k);
+  mpi_free (mpi_k);
   _gcry_mpi_ec_free (ec);
   return err;
 }
+
+gpg_err_code_t
+_gcry_ecc_mul_point (int curveid, unsigned char *result,
+                     const unsigned char *scalar, const unsigned char *point)
+{
+  const char *curve;
+  size_t pubkey_len, seckey_len;
+
+  if (curveid == GCRY_ECC_CURVE25519)
+    {
+      curve = "Curve25519";
+      pubkey_len = seckey_len = 32;
+    }
+  else if (curveid == GCRY_ECC_CURVE448)
+    {
+      curve = "X448";
+      pubkey_len = seckey_len = 56;
+    }
+  else
+    return gpg_error (GPG_ERR_UNKNOWN_CURVE);
+
+  return _gcry_ecc_curve_mul_point (curve, result, pubkey_len,
+                                    scalar, seckey_len,
+                                    point, pubkey_len);
+}
index 3f3ef97b24cfae21077c773d15bd05bed468720d..871b0371a3238131aabb15fc975fa2f5787e80c5 100644 (file)
@@ -106,6 +106,14 @@ _gcry_ecc_ecdsa_sign (gcry_mpi_t input, gcry_mpi_t k_supplied, mpi_ec_t ec,
               k = NULL;
               if ((flags & PUBKEY_FLAG_RFC6979) && hashalgo)
                 {
+                  if (fips_mode () &&
+                      (hashalgo == GCRY_MD_SHAKE128
+                       || hashalgo == GCRY_MD_SHAKE256))
+                    {
+                      rc = GPG_ERR_DIGEST_ALGO;
+                      goto leave;
+                    }
+
                   /* Use Pornin's method for deterministic DSA.  If this
                      flag is set, it is expected that HASH is an opaque
                      MPI with the to be signed hash.  That hash is also
index cf3fc6618b570a8a5afc43c966fc0cf38724c7ab..65525207275ab9b236de04eba57c12b43e3849d7 100644 (file)
@@ -75,7 +75,7 @@ static const char *ecc_names[] =
 
 
 /* Sample NIST P-256 key from RFC 6979 A.2.5 */
-static const char sample_public_key_secp256[] =
+static const char ecdsa_sample_public_key_secp256[] =
   "(public-key"
   " (ecc"
   "  (curve secp256r1)"
@@ -83,7 +83,7 @@ static const char sample_public_key_secp256[] =
   /**/  "60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6"
   /**/  "7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299#)))";
 
-static const char sample_secret_key_secp256[] =
+static const char ecdsa_sample_secret_key_secp256[] =
   "(private-key"
   " (ecc"
   "  (curve secp256r1)"
@@ -92,6 +92,87 @@ static const char sample_secret_key_secp256[] =
   /**/  "60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6"
   /**/  "7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299#)))";
 
+/* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
+static const char ecdsa_sample_data[] =
+  "(data (flags rfc6979 prehash)"
+  " (hash-algo sha256)"
+  " (value 6:sample))";
+
+static const char ecdsa_sample_data_bad[] =
+  "(data (flags rfc6979)"
+  " (hash sha256 #bf2bdbe1aa9b6ec1e2ade1d694f41fc71a831d0268e98915"
+  /**/           "62113d8a62add1bf#))";
+
+static const char ecdsa_signature_r[] =
+  "efd48b2aacb6a8fd1140dd9cd45e81d69d2c877b56aaf991c34d0ea84eaf3716";
+
+static const char ecdsa_signature_s[] =
+  "f7cb1c942d657c41d436c7a1b6e29f65f3e900dbb9aff4064dc4ab2f843acda8";
+
+static const char *ecdsa_data_tmpl = "(data (flags rfc6979) (hash %s %b))";
+/* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
+static const char ecdsa_sample_data_string[] = "sample";
+static const char ecdsa_sample_data_bad_string[] = "sbmple";
+
+
+/* Ed25519 test vector from RFC 8032 7.1.  */
+static const char ed25519_sample_public_key[] =
+  "(public-key"
+  " (ecc"
+  "  (curve Ed25519)"
+  "  (flags eddsa)"
+  "  (q #3d4017c3e843895a92b70aa74d1b7ebc9c982ccf2ec4968cc0cd55f12af4660c#)))";
+static const char ed25519_sample_secret_key[] =
+  "(private-key"
+  " (ecc"
+  "  (curve Ed25519)"
+  "  (flags eddsa)"
+  "  (d #4ccd089b28ff96da9db6c346ec114e0f5b8a319f35aba624da8cf6ed4fb8a6fb#)"
+  "  (q #3d4017c3e843895a92b70aa74d1b7ebc9c982ccf2ec4968cc0cd55f12af4660c#)))";
+static const char ed25519_sample_data[] =
+  "(data (value #72#))";
+static const char ed25519_sample_data_bad[] =
+  "(data (value #72727272#))";
+static const char ed25519_signature_r[] =
+  "92a009a9f0d4cab8720e820b5f642540a2b27b5416503f8fb3762223ebdb69da";
+static const char ed25519_signature_s[] =
+  "085ac1e43e15996e458f3613d0f11d8c387b2eaeb4302aeeb00d291612bb0c00";
+static const char *ed25519_data_tmpl = "(data (value %b))";
+/* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
+static const char ed25519_sample_data_string[] = "\x72";
+static const char ed25519_sample_data_bad_string[] = "\x72\x72\x72\x72";
+
+
+/* Ed448 test vector from RFC 8032 7.4.  */
+static const char ed448_sample_public_key[] =
+  "(public-key"
+  " (ecc"
+  "  (curve Ed448)"
+  "  (q #43ba28f430cdff456ae531545f7ecd0ac834a55d9358c0372bfa0c6c6798c086"
+  /**/  "6aea01eb00742802b8438ea4cb82169c235160627b4c3a9480#)))";
+static const char ed448_sample_secret_key[] =
+  "(private-key"
+  " (ecc"
+  "  (curve Ed448)"
+  "  (d #c4eab05d357007c632f3dbb48489924d552b08fe0c353a0d4a1f00acda2c463a"
+  /**/  "fbea67c5e8d2877c5e3bc397a659949ef8021e954e0a12274e#)"
+  "  (q #43ba28f430cdff456ae531545f7ecd0ac834a55d9358c0372bfa0c6c6798c086"
+  /**/  "6aea01eb00742802b8438ea4cb82169c235160627b4c3a9480#)))";
+static const char ed448_sample_data[] =
+  "(data (value #03#))";
+static const char ed448_sample_data_bad[] =
+  "(data (value #030303#))";
+static const char ed448_signature_r[] =
+  "26b8f91727bd62897af15e41eb43c377efb9c610d48f2335cb0bd0087810f435"
+  "2541b143c4b981b7e18f62de8ccdf633fc1bf037ab7cd77980";
+static const char ed448_signature_s[] =
+  "5e0dbcc0aae1cbcee1afb2e027df36bc04dcecbf154336c19f0af7e0a6472905"
+  "e799f1953d2a0ff3348ab21aa4adafd1d234441cf807c03a00";
+static const char *ed448_data_tmpl = "(data (value %b))";
+/* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
+static const char ed448_sample_data_string[] = "\x03";
+static const char ed448_sample_data_bad_string[] = "\x03\x03\x03";
+
 
 /* Registered progress function and its callback value. */
 static void (*progress_cb) (void *, const char*, int, int, int);
@@ -364,6 +445,70 @@ leave:
   return result;
 }
 
+static int
+test_keys_eddsa_fips (gcry_sexp_t skey)
+{
+  int result = -1; /* Default to failure */
+  gcry_ctx_t ctx = NULL;
+  const char *data_tmpl = "(data (value %b))";
+  gcry_sexp_t sig = NULL;
+  char plaintext[128];
+  int rc;
+
+  /* Create a random plaintext.  */
+  _gcry_randomize (plaintext, sizeof plaintext, GCRY_WEAK_RANDOM);
+
+  rc = _gcry_pk_single_data_push (&ctx, (void *)plaintext, sizeof(plaintext));
+  if (rc)
+    {
+      log_error ("EdDSA operation: failed to push input data: %s\n",
+                 gpg_strerror (rc));
+      goto leave;
+    }
+
+  /* Sign the data */
+  rc = _gcry_pk_sign_md (&sig, data_tmpl, NULL, skey, ctx);
+  if (rc)
+    {
+      log_error ("EdDSA operation: signing failed: %s\n", gpg_strerror (rc));
+      goto leave;
+    }
+
+  /* Verify this signature.  */
+  rc = _gcry_pk_verify_md (sig, data_tmpl, NULL, skey, ctx);
+  if (rc)
+    {
+      log_error ("EdDSA operation: verification failed: %s\n", gpg_strerror (rc));
+      goto leave;
+    }
+
+  _gcry_ctx_release (ctx);
+  ctx = NULL;
+
+  /* Modify the data and check that the signing fails.  */
+  plaintext[sizeof plaintext / 2] ^= 1;
+  rc = _gcry_pk_single_data_push (&ctx, (void *)plaintext, sizeof(plaintext));
+  if (rc)
+    {
+      log_error ("EdDSA operation: failed to push input data: %s\n",
+                 gpg_strerror (rc));
+      goto leave;
+    }
+
+  rc = _gcry_pk_verify_md (sig, data_tmpl, NULL, skey, ctx);
+  if (rc != GPG_ERR_BAD_SIGNATURE)
+    {
+      log_error ("EdDSA operation: signature verification worked on modified data\n");
+      goto leave;
+    }
+
+  result = 0;
+leave:
+  _gcry_ctx_release (ctx);
+  sexp_release (sig);
+  return result;
+}
+
 
 static void
 test_ecdh_only_keys (mpi_ec_t ec, unsigned int nbits, int flags)
@@ -505,10 +650,7 @@ check_secret_key (mpi_ec_t ec, int flags)
       goto leave;
     }
 
-  if ((flags & PUBKEY_FLAG_EDDSA)
-      || (ec->model == MPI_EC_EDWARDS && ec->dialect == ECC_DIALECT_SAFECURVE))
-    ; /* Fixme: EdDSA is special.  */
-  else if (!mpi_cmp_ui (ec->Q->z, 1))
+  if (!mpi_cmp_ui (ec->Q->z, 1))
     {
       /* Fast path if Q is already in affine coordinates.  */
       if (mpi_cmp (x1, ec->Q->x) || (y1 && mpi_cmp (y1, ec->Q->y)))
@@ -693,12 +835,21 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
         log_debug ("ecgen result  using Ed25519+EdDSA\n");
     }
 
-  if (fips_mode () && test_keys_fips (*r_skey))
+  if (fips_mode ())
     {
-      sexp_release (*r_skey);
-      *r_skey = NULL;
-      fips_signal_error ("self-test after key generation failed");
-      rc = GPG_ERR_SELFTEST_FAILED;
+      int result;
+
+      if (ec->model == MPI_EC_EDWARDS)
+        result = test_keys_eddsa_fips (*r_skey);
+      else
+        result = test_keys_fips (*r_skey);
+      if (result)
+        {
+          sexp_release (*r_skey);
+          *r_skey = NULL;
+          fips_signal_error ("self-test after key generation failed");
+          rc = GPG_ERR_SELFTEST_FAILED;
+        }
     }
 
  leave:
@@ -790,13 +941,28 @@ ecc_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   if (rc)
     goto leave;
 
-  /* Hash algo is determined by curve in EdDSA.  Fill it if not specified.  */
-  if ((ctx.flags & PUBKEY_FLAG_EDDSA) && !ctx.hash_algo)
+  /* Hash algo is determined by curve in EdDSA.  */
+  if ((ctx.flags & PUBKEY_FLAG_EDDSA))
     {
-      if (ec->dialect == ECC_DIALECT_ED25519)
-        ctx.hash_algo = GCRY_MD_SHA512;
-      else if (ec->dialect == ECC_DIALECT_SAFECURVE)
-        ctx.hash_algo = GCRY_MD_SHAKE256;
+      if (ctx.hash_algo)
+        {
+          if (fips_mode ()
+              && ((ec->dialect == ECC_DIALECT_ED25519
+                   &&ctx.hash_algo != GCRY_MD_SHA512)
+                  || (ec->dialect == ECC_DIALECT_SAFECURVE
+                      && ctx.hash_algo != GCRY_MD_SHAKE256)))
+            {
+              rc = GPG_ERR_DIGEST_ALGO;
+              goto leave;
+            }
+        }
+      else
+        {
+          if (ec->dialect == ECC_DIALECT_ED25519)
+            ctx.hash_algo = GCRY_MD_SHA512;
+          else if (ec->dialect == ECC_DIALECT_SAFECURVE)
+            ctx.hash_algo = GCRY_MD_SHAKE256;
+        }
     }
 
   sig_r = mpi_new (0);
@@ -897,13 +1063,28 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
   if (DBG_CIPHER)
     log_mpidump ("ecc_verify data", data);
 
-  /* Hash algo is determined by curve in EdDSA.  Fill it if not specified.  */
-  if ((ctx.flags & PUBKEY_FLAG_EDDSA) && !ctx.hash_algo)
+  /* Hash algo is determined by curve in EdDSA.  */
+  if ((ctx.flags & PUBKEY_FLAG_EDDSA))
     {
-      if (ec->dialect == ECC_DIALECT_ED25519)
-        ctx.hash_algo = GCRY_MD_SHA512;
-      else if (ec->dialect == ECC_DIALECT_SAFECURVE)
-        ctx.hash_algo = GCRY_MD_SHAKE256;
+      if (ctx.hash_algo)
+        {
+          if (fips_mode ()
+              && ((ec->dialect == ECC_DIALECT_ED25519
+                   &&ctx.hash_algo != GCRY_MD_SHA512)
+                  || (ec->dialect == ECC_DIALECT_SAFECURVE
+                      && ctx.hash_algo != GCRY_MD_SHAKE256)))
+            {
+              rc = GPG_ERR_DIGEST_ALGO;
+              goto leave;
+            }
+        }
+      else
+        {
+          if (ec->dialect == ECC_DIALECT_ED25519)
+            ctx.hash_algo = GCRY_MD_SHA512;
+          else if (ec->dialect == ECC_DIALECT_SAFECURVE)
+            ctx.hash_algo = GCRY_MD_SHAKE256;
+        }
     }
 
   /*
@@ -1747,19 +1928,12 @@ _gcry_pk_ecc_get_sexp (gcry_sexp_t *r_sexp, int mode, mpi_ec_t ec)
  */
 
 static const char *
-selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
+selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey, const char *tmpl,
+                    const char *input_str, const char *input_bad_str,
+                    const char *signature_r, const char *signature_s)
 {
   int md_algo = GCRY_MD_SHA256;
   gcry_md_hd_t hd = NULL;
-  const char *data_tmpl = "(data (flags rfc6979) (hash %s %b))";
-  /* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
-  static const char sample_data[] = "sample";
-  static const char sample_data_bad[] = "sbmple";
-  static const char signature_r[] =
-    "efd48b2aacb6a8fd1140dd9cd45e81d69d2c877b56aaf991c34d0ea84eaf3716";
-  static const char signature_s[] =
-    "f7cb1c942d657c41d436c7a1b6e29f65f3e900dbb9aff4064dc4ab2f843acda8";
-
   const char *errtxt = NULL;
   gcry_error_t err;
   gcry_sexp_t sig = NULL;
@@ -1778,7 +1952,7 @@ selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
       goto leave;
     }
 
-  _gcry_md_write (hd, sample_data, strlen(sample_data));
+  _gcry_md_write (hd, input_str, strlen (input_str));
 
   err = _gcry_mpi_scan (&r, GCRYMPI_FMT_HEX, signature_r, 0, NULL);
   if (!err)
@@ -1790,7 +1964,7 @@ selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
       goto leave;
     }
 
-  err = _gcry_pk_sign_md (&sig, data_tmpl, hd, skey, NULL);
+  err = _gcry_pk_sign_md (&sig, tmpl, hd, skey, NULL);
   if (err)
     {
       errtxt = "signing failed";
@@ -1802,12 +1976,9 @@ selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
   l1 = _gcry_sexp_find_token (sig, "sig-val", 0);
   if (!l1)
     goto leave;
-  l2 = _gcry_sexp_find_token (l1, "ecdsa", 0);
-  if (!l2)
-    goto leave;
 
-  sexp_release (l1);
-  l1 = l2;
+  /* Here, we have the ECC name like: "ecdsa", "eddsa"...,
+     But we skip parsing the name.  */
 
   l2 = _gcry_sexp_find_token (l1, "r", 0);
   if (!l2)
@@ -1836,7 +2007,7 @@ selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
   errtxt = NULL;
 
   /* verify generated signature */
-  err = _gcry_pk_verify_md (sig, data_tmpl, hd, pkey, NULL);
+  err = _gcry_pk_verify_md (sig, tmpl, hd, pkey, NULL);
   if (err)
     {
       errtxt = "verify failed";
@@ -1844,8 +2015,8 @@ selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
     }
 
   _gcry_md_reset(hd);
-  _gcry_md_write (hd, sample_data_bad, strlen(sample_data_bad));
-  err = _gcry_pk_verify_md (sig, data_tmpl, hd, pkey, NULL);
+  _gcry_md_write (hd, input_bad_str, strlen (input_bad_str));
+  err = _gcry_pk_verify_md (sig, tmpl, hd, pkey, NULL);
   if (gcry_err_code (err) != GPG_ERR_BAD_SIGNATURE)
     {
       errtxt = "bad signature not detected";
@@ -1867,22 +2038,125 @@ selftest_hash_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
 
 
 static const char *
-selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
+selftest_hash_sign_eddsa (gcry_sexp_t pkey, gcry_sexp_t skey, const char *tmpl,
+                          const char *input_str, const char *input_bad_str,
+                          const char *signature_r, const char *signature_s)
 {
-  /* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
-  static const char sample_data[] =
-    "(data (flags rfc6979 prehash)"
-    " (hash-algo sha256)"
-    " (value 6:sample))";
-  static const char sample_data_bad[] =
-    "(data (flags rfc6979)"
-    " (hash sha256 #bf2bdbe1aa9b6ec1e2ade1d694f41fc71a831d0268e98915"
-    /**/           "62113d8a62add1bf#))";
-  static const char signature_r[] =
-    "efd48b2aacb6a8fd1140dd9cd45e81d69d2c877b56aaf991c34d0ea84eaf3716";
-  static const char signature_s[] =
-    "f7cb1c942d657c41d436c7a1b6e29f65f3e900dbb9aff4064dc4ab2f843acda8";
+  gcry_ctx_t ctx = NULL;
+  const char *errtxt = NULL;
+  gcry_error_t err;
+  gcry_sexp_t sig = NULL;
+  gcry_sexp_t l1 = NULL;
+  gcry_sexp_t l2 = NULL;
+  unsigned char *r = NULL;
+  unsigned char *s = NULL;
+  size_t r_len, s_len;
+  unsigned char *calculated_r = NULL;
+  unsigned char *calculated_s = NULL;
+  size_t calculated_r_len, calculated_s_len;
+
+  err = _gcry_pk_single_data_push (&ctx, (void *)input_str, strlen (input_str));
+  if (err)
+    {
+      errtxt = "error setting input data";
+      goto leave;
+    }
+
+  r = _gcry_hex2buffer (signature_r, &r_len);
+  s = _gcry_hex2buffer (signature_s, &s_len);
+  if (!r || !s)
+    {
+      errtxt = "converting data failed";
+      goto leave;
+    }
+
+  err = _gcry_pk_sign_md (&sig, tmpl, NULL, skey, ctx);
+  if (err)
+    {
+      errtxt = "signing failed";
+      goto leave;
+    }
+
+  /* check against known signature */
+  errtxt = "signature validity failed";
+  l1 = _gcry_sexp_find_token (sig, "sig-val", 0);
+  if (!l1)
+    goto leave;
+
+  /* Here we have the ECC algorithm name (e.g. "ecdsa" or "eddsa"),
+     but we skip parsing the name.  */
+
+  l2 = _gcry_sexp_find_token (l1, "r", 0);
+  if (!l2)
+    goto leave;
+  calculated_r = _gcry_sexp_nth_buffer (l2, 1, &calculated_r_len);
+  if (!calculated_r)
+    goto leave;
+
+  sexp_release (l2);
+  l2 = _gcry_sexp_find_token (l1, "s", 0);
+  if (!l2)
+    goto leave;
+  calculated_s = _gcry_sexp_nth_buffer (l2, 1, &calculated_s_len);
+  if (!calculated_s)
+    goto leave;
+
+  errtxt = "known sig check failed";
 
+  if (r_len != calculated_r_len)
+    goto leave;
+  if (s_len != calculated_s_len)
+    goto leave;
+  if (memcmp (r, calculated_r, r_len))
+    goto leave;
+  if (memcmp (s, calculated_s, s_len))
+    goto leave;
+
+  errtxt = NULL;
+
+  /* verify generated signature */
+  err = _gcry_pk_verify_md (sig, tmpl, NULL, pkey, ctx);
+  if (err)
+    {
+      errtxt = "verify failed";
+      goto leave;
+    }
+
+  _gcry_ctx_release (ctx);
+  ctx = NULL;
+  err = _gcry_pk_single_data_push (&ctx, (void *)input_bad_str,
+                                   strlen (input_bad_str));
+  if (err)
+    {
+      errtxt = "error setting input data";
+      goto leave;
+    }
+
+  err = _gcry_pk_verify_md (sig, tmpl, NULL, pkey, ctx);
+  if (gcry_err_code (err) != GPG_ERR_BAD_SIGNATURE)
+    {
+      errtxt = "bad signature not detected";
+      goto leave;
+    }
+
+ leave:
+  _gcry_ctx_release (ctx);
+  sexp_release (sig);
+  sexp_release (l1);
+  sexp_release (l2);
+  xfree (r);
+  xfree (s);
+  xfree (calculated_r);
+  xfree (calculated_s);
+  return errtxt;
+}
+
+
+static const char *
+selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey,
+               const char *input, const char *input_bad,
+               const char *signature_r, const char *signature_s)
+{
   const char *errtxt = NULL;
   gcry_error_t err;
   gcry_sexp_t data = NULL;
@@ -1896,10 +2170,10 @@ selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
   gcry_mpi_t calculated_s = NULL;
   int cmp;
 
-  err = sexp_sscan (&data, NULL, sample_data, strlen (sample_data));
+  err = sexp_sscan (&data, NULL, input, strlen (input));
   if (!err)
     err = sexp_sscan (&data_bad, NULL,
-                      sample_data_bad, strlen (sample_data_bad));
+                      input_bad, strlen (input_bad));
   if (!err)
     err = _gcry_mpi_scan (&r, GCRYMPI_FMT_HEX, signature_r, 0, NULL);
   if (!err)
@@ -1923,12 +2197,9 @@ selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
   l1 = _gcry_sexp_find_token (sig, "sig-val", 0);
   if (!l1)
     goto leave;
-  l2 = _gcry_sexp_find_token (l1, "ecdsa", 0);
-  if (!l2)
-    goto leave;
 
-  sexp_release (l1);
-  l1 = l2;
+  /* Here we have the ECC algorithm name (e.g. "ecdsa" or "eddsa"),
+     but we skip parsing the name.  */
 
   l2 = _gcry_sexp_find_token (l1, "r", 0);
   if (!l2)
@@ -1986,7 +2257,12 @@ selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
 
 
 static gpg_err_code_t
-selftests_ecdsa (selftest_report_func_t report, int extended)
+selftests_ecc (selftest_report_func_t report, int extended, int is_eddsa,
+               const char *secret_key, const char *public_key,
+               const char *input, const char *input_bad,
+               const char *tmpl,
+               const char *input_str, const char *input_bad_str,
+               const char *signature_r, const char *signature_s)
 {
   const char *what;
   const char *errtxt;
@@ -1995,11 +2271,9 @@ selftests_ecdsa (selftest_report_func_t report, int extended)
   gcry_sexp_t pkey = NULL;
 
   what = "convert";
-  err = sexp_sscan (&skey, NULL, sample_secret_key_secp256,
-                    strlen (sample_secret_key_secp256));
+  err = sexp_sscan (&skey, NULL, secret_key, strlen (secret_key));
   if (!err)
-    err = sexp_sscan (&pkey, NULL, sample_public_key_secp256,
-                      strlen (sample_public_key_secp256));
+    err = sexp_sscan (&pkey, NULL, public_key, strlen (public_key));
   if (err)
     {
       errtxt = _gcry_strerror (err);
@@ -2007,7 +2281,7 @@ selftests_ecdsa (selftest_report_func_t report, int extended)
     }
 
   what = "key consistency";
-  err = ecc_check_secret_key(skey);
+  err = ecc_check_secret_key (skey);
   if (err)
     {
       errtxt = _gcry_strerror (err);
@@ -2017,13 +2291,21 @@ selftests_ecdsa (selftest_report_func_t report, int extended)
   if (extended)
     {
       what = "sign";
-      errtxt = selftest_sign (pkey, skey);
+      errtxt = selftest_sign (pkey, skey, input, input_bad,
+                              signature_r, signature_s);
       if (errtxt)
         goto failed;
     }
 
   what = "digest sign";
-  errtxt = selftest_hash_sign (pkey, skey);
+  if (is_eddsa)
+    errtxt = selftest_hash_sign_eddsa (pkey, skey, tmpl,
+                                       input_str, input_bad_str,
+                                       signature_r, signature_s);
+  else
+    errtxt = selftest_hash_sign (pkey, skey, tmpl,
+                                 input_str, input_bad_str,
+                                 signature_r, signature_s);
   if (errtxt)
     goto failed;
 
@@ -2044,14 +2326,40 @@ selftests_ecdsa (selftest_report_func_t report, int extended)
 static gpg_err_code_t
 run_selftests (int algo, int extended, selftest_report_func_t report)
 {
+  int r;
+
   if (algo != GCRY_PK_ECC)
     return GPG_ERR_PUBKEY_ALGO;
 
-  return selftests_ecdsa (report, extended);
+  r = selftests_ecc (report, extended, 0,
+                     ecdsa_sample_secret_key_secp256,
+                     ecdsa_sample_public_key_secp256,
+                     ecdsa_sample_data, ecdsa_sample_data_bad,
+                     ecdsa_data_tmpl,
+                     ecdsa_sample_data_string, ecdsa_sample_data_bad_string,
+                     ecdsa_signature_r, ecdsa_signature_s);
+  if (r)
+    return r;
+
+  r = selftests_ecc (report, extended, 1,
+                     ed25519_sample_secret_key,
+                     ed25519_sample_public_key,
+                     ed25519_sample_data, ed25519_sample_data_bad,
+                     ed25519_data_tmpl,
+                     ed25519_sample_data_string, ed25519_sample_data_bad_string,
+                     ed25519_signature_r, ed25519_signature_s);
+  if (r)
+    return r;
+
+  r = selftests_ecc (report, extended, 1,
+                     ed448_sample_secret_key,
+                     ed448_sample_public_key,
+                     ed448_sample_data, ed448_sample_data_bad,
+                     ed448_data_tmpl,
+                     ed448_sample_data_string, ed448_sample_data_bad_string,
+                     ed448_signature_r, ed448_signature_s);
+  return r;
 }
-
-
-
 \f
 gcry_pk_spec_t _gcry_pubkey_spec_ecc =
   {
index ed2d7cacd134fb8254c9f09e17ca26bc0f8323ee..deff40d73d7be6ff801e21dfa35a2906ff0785f7 100644 (file)
@@ -51,12 +51,10 @@ _gcry_hash_selftest_check_one (int algo,
   gcry_md_hd_t hd;
   unsigned char *digest;
   char aaa[1000];
-  int xof = 0;
+  int expect_xof = 0;
 
-  if (_gcry_md_get_algo_dlen (algo) == 0)
-    xof = 1;
-  else if (_gcry_md_get_algo_dlen (algo) != expectlen)
-    return "digest size does not match expected size";
+  if (_gcry_md_get_algo_dlen (algo) != expectlen)
+    expect_xof = 1;
 
   err = _gcry_md_open (&hd, algo, 0);
   if (err)
@@ -85,7 +83,7 @@ _gcry_hash_selftest_check_one (int algo,
 
   if (!result)
     {
-      if (!xof)
+      if (!expect_xof)
        {
          digest = _gcry_md_read (hd, algo);
 
index 9e9a432e5ed7f744771c4942478db0647b2e422e..9f4f3c489fa8bcfe477880f69903c0e59a824343 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -36,8 +36,4 @@ _gcry_kdf_scrypt (const unsigned char *passwd, size_t passwdlen,
                   unsigned long iterations,
                   size_t dklen, unsigned char *dk);
 
-/*-- blake2.c --*/
-gcry_err_code_t
-blake2b_vl_hash (const void *in, size_t inlen, size_t outputlen, void *output);
-
 #endif /*GCRY_KDF_INTERNAL_H*/
index 9f67e4d9fb77333e10fc73c4ffda4a7dd06ae461..b4c5f83aee697fcc04fe5eea02f7efa40aa401ed 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -419,10 +419,13 @@ argon2_fill_first_blocks (argon2_ctx_t a)
   iov[iov_count].len = 4 * 7;
   iov[iov_count].off = 0;
   iov_count++;
-  iov[iov_count].data = (void *)a->password;
-  iov[iov_count].len = a->passwordlen;
-  iov[iov_count].off = 0;
-  iov_count++;
+  if (a->passwordlen)
+    {
+      iov[iov_count].data = (void *)a->password;
+      iov[iov_count].len = a->passwordlen;
+      iov[iov_count].off = 0;
+      iov_count++;
+    }
 
   buf_put_le32 (buf[7], a->saltlen);
   iov[iov_count].data = buf[7];
@@ -897,133 +900,1293 @@ argon2_open (gcry_kdf_hd_t *hd, int subalgo,
   *hd = (void *)a;
   return 0;
 }
+\f
+typedef struct balloon_context *balloon_ctx_t;
+
+/* Per thread data for Balloon.  */
+struct balloon_thread_data {
+  balloon_ctx_t b;
+  gpg_err_code_t ec;
+  unsigned int idx;
+  unsigned char *block;
+};
+
+/* Balloon context */
+struct balloon_context {
+  int algo;
+  int prng_type;
+
+  unsigned int blklen;
+  const gcry_md_spec_t *md_spec;
+
+  const unsigned char *password;
+  size_t passwordlen;
+
+  const unsigned char *salt;
+  /* Length of salt is fixed.  */
+
+  unsigned int s_cost;
+  unsigned int t_cost;
+  unsigned int parallelism;
+
+  u64 n_blocks;
+
+  unsigned char *block;
+
+  /* In future, we may use flexible array member.  */
+  struct balloon_thread_data thread_data[1];
+};
+
+/* Maximum size of the underlying digest.  */
+#define BALLOON_BLOCK_LEN_MAX 64
+
+static gpg_err_code_t
+prng_aes_ctr_init (gcry_cipher_hd_t *hd_p, balloon_ctx_t b,
+                   gcry_buffer_t *iov, unsigned int iov_count)
+{
+  gpg_err_code_t ec;
+  gcry_cipher_hd_t hd;
+  unsigned char key[BALLOON_BLOCK_LEN_MAX];
+  int cipher_algo;
+  unsigned int keylen, blklen;
+
+  switch (b->blklen)
+    {
+    case 64:
+      cipher_algo = GCRY_CIPHER_AES256;
+      break;
+
+    case 48:
+      cipher_algo = GCRY_CIPHER_AES192;
+      break;
+
+    default:
+    case 32:
+      cipher_algo = GCRY_CIPHER_AES;
+      break;
+    }
+
+  keylen = _gcry_cipher_get_algo_keylen (cipher_algo);
+  blklen = _gcry_cipher_get_algo_blklen (cipher_algo);
+
+  b->md_spec->hash_buffers (key, b->blklen, iov, iov_count);
+  ec = _gcry_cipher_open (&hd, cipher_algo, GCRY_CIPHER_MODE_CTR, 0);
+  if (ec)
+    return ec;
+
+  ec = _gcry_cipher_setkey (hd, key, keylen);
+  if (ec)
+    {
+      _gcry_cipher_close (hd);
+      return ec;
+    }
+
+  if (cipher_algo == GCRY_CIPHER_AES
+      && b->md_spec == &_gcry_digest_spec_sha256)
+    /* Original Balloon uses zero IV.  */
+    ;
+  else
+    {
+      ec = _gcry_cipher_setiv (hd, key+keylen, blklen);
+      if (ec)
+        {
+          _gcry_cipher_close (hd);
+          return ec;
+        }
+    }
+
+  wipememory (key, BALLOON_BLOCK_LEN_MAX);
+  *hd_p = hd;
+  return ec;
+}
+
+static u64
+prng_aes_ctr_get_rand64 (gcry_cipher_hd_t hd)
+{
+  static const unsigned char zero64[8];
+  unsigned char rand64[8];
+
+  _gcry_cipher_encrypt (hd, rand64, sizeof (rand64), zero64, sizeof (zero64));
+  return buf_get_le64 (rand64);
+}
+
+static void
+prng_aes_ctr_fini (gcry_cipher_hd_t hd)
+{
+  _gcry_cipher_close (hd);
+}
 
+static size_t
+ballon_context_size (unsigned int parallelism)
+{
+  size_t n;
+
+  n = offsetof (struct balloon_context, thread_data)
+    + parallelism * sizeof (struct balloon_thread_data);
+  return n;
+}
 
 static gpg_err_code_t
 balloon_open (gcry_kdf_hd_t *hd, int subalgo,
               const unsigned long *param, unsigned int paramlen,
-              const void *passphrase, size_t passphraselen,
+              const void *password, size_t passwordlen,
               const void *salt, size_t saltlen)
 {
+  unsigned int blklen;
+  int hash_type;
+  unsigned int s_cost;
+  unsigned int t_cost;
+  unsigned int parallelism = 1;
+  balloon_ctx_t b;
+  gpg_err_code_t ec;
+  size_t n;
+  unsigned char *block;
+  unsigned int i;
+  const gcry_md_spec_t *md_spec;
+
+  hash_type = subalgo;
+  switch (hash_type)
+    {
+    case GCRY_MD_SHA256:
+      md_spec = &_gcry_digest_spec_sha256;
+      break;
+
+    case GCRY_MD_SHA384:
+      md_spec = &_gcry_digest_spec_sha384;
+      break;
+
+    case GCRY_MD_SHA512:
+      md_spec = &_gcry_digest_spec_sha512;
+      break;
+
+    case GCRY_MD_SHA3_256:
+      md_spec = &_gcry_digest_spec_sha3_256;
+      break;
+
+    case GCRY_MD_SHA3_384:
+      md_spec = &_gcry_digest_spec_sha3_384;
+      break;
+
+    case GCRY_MD_SHA3_512:
+      md_spec = &_gcry_digest_spec_sha3_512;
+      break;
+
+    default:
+      return GPG_ERR_NOT_SUPPORTED;
+    }
+
+  blklen = _gcry_md_get_algo_dlen (hash_type);
+  if (!blklen || blklen > BALLOON_BLOCK_LEN_MAX)
+    return GPG_ERR_NOT_SUPPORTED;
+
+  if (saltlen != blklen)
+    return GPG_ERR_NOT_SUPPORTED;
+
   /*
    * It should have space_cost and time_cost.
    * Optionally, for parallelised version, it has parallelism.
+   * Possibly (in future), it may have option to specify PRNG type.
    */
   if (paramlen != 2 && paramlen != 3)
     return GPG_ERR_INV_VALUE;
+  else
+    {
+      s_cost = (unsigned int)param[0];
+      t_cost = (unsigned int)param[1];
+      if (paramlen >= 3)
+        parallelism = (unsigned int)param[2];
+    }
+
+  if (s_cost < 1)
+    return GPG_ERR_INV_VALUE;
+
+  n = ballon_context_size (parallelism);
+  b = xtrymalloc (n);
+  if (!b)
+    return gpg_err_code_from_errno (errno);
+
+  b->algo = GCRY_KDF_BALLOON;
+  b->md_spec = md_spec;
+  b->blklen = blklen;
+
+  b->password = password;
+  b->passwordlen = passwordlen;
+  b->salt = salt;
+
+  b->s_cost = s_cost;
+  b->t_cost = t_cost;
+  b->parallelism = parallelism;
+
+  b->n_blocks = (s_cost * 1024) / b->blklen;
+
+  block = xtrycalloc (parallelism * b->n_blocks, b->blklen);
+  if (!block)
+    {
+      ec = gpg_err_code_from_errno (errno);
+      xfree (b);
+      return ec;
+    }
+  b->block = block;
+
+  for (i = 0; i < parallelism; i++)
+    {
+      struct balloon_thread_data *t = &b->thread_data[i];
+
+      t->b = b;
+      t->ec = 0;
+      t->idx = i;
+      t->block = block;
+      block += b->blklen * b->n_blocks;
+    }
 
-  (void)param;
-  (void)subalgo;
-  (void)passphrase;
-  (void)passphraselen;
-  (void)salt;
-  (void)saltlen;
-  *hd = NULL;
-  return GPG_ERR_NOT_IMPLEMENTED;
+  *hd = (void *)b;
+  return 0;
 }
 
 
-struct gcry_kdf_handle {
-  int algo;
-  /* And algo specific parts come.  */
-};
+static void
+balloon_xor_block (balloon_ctx_t b, u64 *dst, const u64 *src)
+{
+  int i;
 
-gpg_err_code_t
-_gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo,
-                const unsigned long *param, unsigned int paramlen,
-                const void *passphrase, size_t passphraselen,
-                const void *salt, size_t saltlen,
-                const void *key, size_t keylen,
-                const void *ad, size_t adlen)
+  for (i = 0; i < b->blklen/8; i++)
+    dst[i] ^= src[i];
+}
+
+#define BALLOON_COMPRESS_BLOCKS 5
+
+static void
+balloon_compress (balloon_ctx_t b, u64 *counter_p, unsigned char *out,
+                  const unsigned char *blocks[BALLOON_COMPRESS_BLOCKS])
 {
-  gpg_err_code_t ec;
+  gcry_buffer_t iov[1+BALLOON_COMPRESS_BLOCKS];
+  unsigned char octet_counter[sizeof (u64)];
+  unsigned int i;
 
-  switch (algo)
+  buf_put_le64 (octet_counter, *counter_p);
+  iov[0].data = octet_counter;
+  iov[0].len = sizeof (octet_counter);
+  iov[0].off = 0;
+
+  for (i = 1; i < 1+BALLOON_COMPRESS_BLOCKS; i++)
     {
-    case GCRY_KDF_ARGON2:
-      if (!passphraselen || !saltlen)
-        ec = GPG_ERR_INV_VALUE;
-      else
-        ec = argon2_open (hd, subalgo, param, paramlen,
-                          passphrase, passphraselen, salt, saltlen,
-                          key, keylen, ad, adlen);
-      break;
+      iov[i].data = (void *)blocks[i-1];
+      iov[i].len = b->blklen;
+      iov[i].off = 0;
+    }
 
-    case GCRY_KDF_BALLOON:
-      if (!passphraselen || !saltlen)
-        ec = GPG_ERR_INV_VALUE;
-      else
+  b->md_spec->hash_buffers (out, b->blklen, iov, 1+BALLOON_COMPRESS_BLOCKS);
+  *counter_p += 1;
+}
+
+static void
+balloon_expand (balloon_ctx_t b, u64 *counter_p, unsigned char *block,
+                u64 n_blocks)
+{
+  gcry_buffer_t iov[2];
+  unsigned char octet_counter[sizeof (u64)];
+  u64 i;
+
+  iov[0].data = octet_counter;
+  iov[0].len = sizeof (octet_counter);
+  iov[0].off = 0;
+  iov[1].len = b->blklen;
+  iov[1].off = 0;
+
+  for (i = 1; i < n_blocks; i++)
+    {
+      buf_put_le64 (octet_counter, *counter_p);
+      iov[1].data = block;
+      block += b->blklen;
+      b->md_spec->hash_buffers (block, b->blklen, iov, 2);
+      *counter_p += 1;
+    }
+}
+
+static void
+balloon_compute_fill (balloon_ctx_t b,
+                      struct balloon_thread_data *t,
+                      const unsigned char *salt,
+                      u64 *counter_p)
+{
+  gcry_buffer_t iov[6];
+  unsigned char octet_counter[sizeof (u64)];
+  unsigned char octet_s_cost[4];
+  unsigned char octet_t_cost[4];
+  unsigned char octet_parallelism[4];
+
+  buf_put_le64 (octet_counter, *counter_p);
+  buf_put_le32 (octet_s_cost, b->s_cost);
+  buf_put_le32 (octet_t_cost, b->t_cost);
+  buf_put_le32 (octet_parallelism, b->parallelism);
+
+  iov[0].data = octet_counter;
+  iov[0].len = sizeof (octet_counter);
+  iov[0].off = 0;
+  iov[1].data = (void *)salt;
+  iov[1].len = b->blklen;
+  iov[1].off = 0;
+  iov[2].data = (void *)b->password;
+  iov[2].len = b->passwordlen;
+  iov[2].off = 0;
+  iov[3].data = octet_s_cost;
+  iov[3].len = 4;
+  iov[3].off = 0;
+  iov[4].data = octet_t_cost;
+  iov[4].len = 4;
+  iov[4].off = 0;
+  iov[5].data = octet_parallelism;
+  iov[5].len = 4;
+  iov[5].off = 0;
+  b->md_spec->hash_buffers (t->block, b->blklen, iov, 6);
+  *counter_p += 1;
+  balloon_expand (b, counter_p, t->block, b->n_blocks);
+}
+
+static void
+balloon_compute_mix (gcry_cipher_hd_t prng,
+                     balloon_ctx_t b, struct balloon_thread_data *t,
+                     u64 *counter_p)
+{
+  u64 i;
+
+  for (i = 0; i < b->n_blocks; i++)
+    {
+      unsigned char *cur_block = t->block + (b->blklen * i);
+      const unsigned char *blocks[BALLOON_COMPRESS_BLOCKS];
+      const unsigned char *prev_block;
+      unsigned int n;
+
+      prev_block = i
+        ? cur_block - b->blklen
+        : t->block + (b->blklen * (t->b->n_blocks - 1));
+
+      n = 0;
+      blocks[n++] = prev_block;
+      blocks[n++] = cur_block;
+
+      for (; n < BALLOON_COMPRESS_BLOCKS; n++)
         {
-          (void)key;
-          (void)keylen;
-          (void)ad;
-          (void)adlen;
-          ec = balloon_open (hd, subalgo, param, paramlen,
-                             passphrase, passphraselen, salt, saltlen);
+          u64 rand64 = prng_aes_ctr_get_rand64 (prng);
+          blocks[n] = t->block + (b->blklen * (rand64 % b->n_blocks));
         }
-      break;
 
-    default:
-      ec = GPG_ERR_UNKNOWN_ALGORITHM;
-      break;
+      balloon_compress (b, counter_p, cur_block, blocks);
     }
+}
 
-  return ec;
+
+static void
+balloon_compute (void *priv)
+{
+  struct balloon_thread_data *t = (struct balloon_thread_data *)priv;
+  balloon_ctx_t b = t->b;
+  gcry_cipher_hd_t prng;
+  gcry_buffer_t iov[4];
+  unsigned char salt[BALLOON_BLOCK_LEN_MAX];
+  unsigned char octet_s_cost[4];
+  unsigned char octet_t_cost[4];
+  unsigned char octet_parallelism[4];
+  u32 u;
+  u64 counter;
+  unsigned int i;
+
+  counter = 0;
+
+  memcpy (salt, b->salt, b->blklen);
+  u = buf_get_le32 (b->salt) + t->idx;
+  buf_put_le32 (salt, u);
+
+  buf_put_le32 (octet_s_cost, b->s_cost);
+  buf_put_le32 (octet_t_cost, b->t_cost);
+  buf_put_le32 (octet_parallelism, b->parallelism);
+
+  iov[0].data = salt;
+  iov[0].len = b->blklen;
+  iov[0].off = 0;
+  iov[1].data = octet_s_cost;
+  iov[1].len = 4;
+  iov[1].off = 0;
+  iov[2].data = octet_t_cost;
+  iov[2].len = 4;
+  iov[2].off = 0;
+  iov[3].data = octet_parallelism;
+  iov[3].len = 4;
+  iov[3].off = 0;
+
+  t->ec = prng_aes_ctr_init (&prng, b, iov, 4);
+  if (t->ec)
+    return;
+
+  balloon_compute_fill (b, t, salt, &counter);
+
+  for (i = 0; i < b->t_cost; i++)
+    balloon_compute_mix (prng, b, t, &counter);
+
+  /* The result is now at the last block.  */
+
+  prng_aes_ctr_fini (prng);
 }
 
-gpg_err_code_t
-_gcry_kdf_compute (gcry_kdf_hd_t h, const struct gcry_kdf_thread_ops *ops)
+static gpg_err_code_t
+balloon_compute_all (balloon_ctx_t b, const struct gcry_kdf_thread_ops *ops)
 {
-  gpg_err_code_t ec;
+  unsigned int parallelism = b->parallelism;
+  unsigned int i;
+  int ret;
 
-  switch (h->algo)
+  for (i = 0; i < parallelism; i++)
     {
-    case GCRY_KDF_ARGON2:
-      ec = argon2_compute ((argon2_ctx_t)(void *)h, ops);
-      break;
+      struct balloon_thread_data *t = &b->thread_data[i];
 
-    default:
-      ec = GPG_ERR_UNKNOWN_ALGORITHM;
-      break;
+      if (ops)
+        {
+          ret = ops->dispatch_job (ops->jobs_context, balloon_compute, t);
+          if (ret < 0)
+            return GPG_ERR_CANCELED;
+        }
+      else
+        balloon_compute (t);
     }
 
-  return ec;
-}
+  if (ops)
+    {
+      ret = ops->wait_all_jobs (ops->jobs_context);
+      if (ret < 0)
+        return GPG_ERR_CANCELED;
+    }
 
+  return 0;
+}
 
-gpg_err_code_t
-_gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result)
+static gpg_err_code_t
+balloon_final (balloon_ctx_t b, size_t resultlen, void *result)
 {
-  gpg_err_code_t ec;
+  unsigned int parallelism = b->parallelism;
+  unsigned int i;
+  u64 out[BALLOON_BLOCK_LEN_MAX/8];
 
-  switch (h->algo)
+  if (resultlen != b->blklen)
+    return GPG_ERR_INV_VALUE;
+
+  memset (out, 0, b->blklen);
+  for (i = 0; i < parallelism; i++)
     {
-    case GCRY_KDF_ARGON2:
-      ec = argon2_final ((argon2_ctx_t)(void *)h, resultlen, result);
-      break;
+      struct balloon_thread_data *t = &b->thread_data[i];
+      const unsigned char *last_block;
 
-    default:
-      ec = GPG_ERR_UNKNOWN_ALGORITHM;
-      break;
+      if (t->ec)
+        return t->ec;
+
+      last_block = t->block + (b->blklen * (t->b->n_blocks - 1));
+      balloon_xor_block (b, out, (const u64 *)(void *)last_block);
     }
 
-  return ec;
+  memcpy (result, out, b->blklen);
+
+  return 0;
 }
 
-void
-_gcry_kdf_close (gcry_kdf_hd_t h)
+static void
+balloon_close (balloon_ctx_t b)
 {
-  switch (h->algo)
-    {
-    case GCRY_KDF_ARGON2:
-      argon2_close ((argon2_ctx_t)(void *)h);
-      break;
+  unsigned int parallelism = b->parallelism;
+  size_t n = ballon_context_size (parallelism);
 
-    default:
-      break;
+  if (b->block)
+    {
+      wipememory (b->block, parallelism * b->n_blocks * b->blklen);
+      xfree (b->block);
     }
+
+  wipememory (b, n);
+  xfree (b);
 }
+\f
+typedef struct onestep_kdf_context *onestep_kdf_ctx_t;
+
+/* OneStepKDF context */
+struct onestep_kdf_context {
+  int algo;
+  gcry_md_hd_t md;
+  unsigned int blklen;
+  unsigned int outlen;
+  const void *input;
+  size_t inputlen;
+  const void *fixedinfo;
+  size_t fixedinfolen;
+};
+
+static gpg_err_code_t
+onestep_kdf_open (gcry_kdf_hd_t *hd, int hashalgo,
+                  const unsigned long *param, unsigned int paramlen,
+                  const void *input, size_t inputlen,
+                  const void *fixedinfo, size_t fixedinfolen)
+{
+  gpg_err_code_t ec;
+  unsigned int outlen;
+  onestep_kdf_ctx_t o;
+  size_t n;
 
+  if (paramlen != 1)
+    return GPG_ERR_INV_VALUE;
+  else
+    outlen = (unsigned int)param[0];
+
+  n = sizeof (struct onestep_kdf_context);
+  o = xtrymalloc (n);
+  if (!o)
+    return gpg_err_code_from_errno (errno);
+
+  o->blklen = _gcry_md_get_algo_dlen (hashalgo);
+  if (!o->blklen)
+    {
+      xfree (o);
+      return GPG_ERR_DIGEST_ALGO;
+    }
+  ec = _gcry_md_open (&o->md, hashalgo, 0);
+  if (ec)
+    {
+      xfree (o);
+      return ec;
+    }
+  o->algo = GCRY_KDF_ONESTEP_KDF;
+  o->outlen = outlen;
+  o->input = input;
+  o->inputlen = inputlen;
+  o->fixedinfo = fixedinfo;
+  o->fixedinfolen = fixedinfolen;
+
+  *hd = (void *)o;
+  return 0;
+}
+
+
+static gpg_err_code_t
+onestep_kdf_compute (onestep_kdf_ctx_t o, const struct gcry_kdf_thread_ops *ops)
+{
+  (void)o;
+
+  if (ops != NULL)
+    return GPG_ERR_INV_VALUE;
+
+  return 0;
+}
+
+static gpg_err_code_t
+onestep_kdf_final (onestep_kdf_ctx_t o, size_t resultlen, void *result)
+{
+  u32 counter = 0;
+  unsigned char cnt[4];
+  int i;
+
+  if (resultlen != o->outlen)
+    return GPG_ERR_INV_VALUE;
+
+  for (i = 0; i < o->outlen / o->blklen; i++)
+    {
+      counter++;
+      buf_put_be32 (cnt, counter);
+      _gcry_md_write (o->md, cnt, sizeof (cnt));
+      _gcry_md_write (o->md, o->input, o->inputlen);
+      _gcry_md_write (o->md, o->fixedinfo, o->fixedinfolen);
+      _gcry_md_final (o->md);
+      memcpy ((char *)result + o->blklen * i,
+              _gcry_md_read (o->md, 0), o->blklen);
+      resultlen -= o->blklen;
+      _gcry_md_reset (o->md);
+    }
+
+  if (resultlen)
+    {
+      counter++;
+      buf_put_be32 (cnt, counter);
+      _gcry_md_write (o->md, cnt, sizeof (cnt));
+      _gcry_md_write (o->md, o->input, o->inputlen);
+      _gcry_md_write (o->md, o->fixedinfo, o->fixedinfolen);
+      _gcry_md_final (o->md);
+      memcpy ((char *)result + o->blklen * i,
+              _gcry_md_read (o->md, 0), resultlen);
+    }
+
+  return 0;
+}
+
+static void
+onestep_kdf_close (onestep_kdf_ctx_t o)
+{
+  _gcry_md_close (o->md);
+  xfree (o);
+}
+\f
+typedef struct onestep_kdf_mac_context *onestep_kdf_mac_ctx_t;
+
+/* OneStep_KDF_MAC context */
+struct onestep_kdf_mac_context {
+  int algo;
+  gcry_mac_hd_t md;
+  unsigned int blklen;
+  unsigned int outlen;
+  const void *input;
+  size_t inputlen;
+  const void *salt;
+  size_t saltlen;
+  const void *fixedinfo;
+  size_t fixedinfolen;
+};
+
+static gpg_err_code_t
+onestep_kdf_mac_open (gcry_kdf_hd_t *hd, int macalgo,
+                      const unsigned long *param, unsigned int paramlen,
+                      const void *input, size_t inputlen,
+                      const void *key, size_t keylen,
+                      const void *fixedinfo, size_t fixedinfolen)
+{
+  gpg_err_code_t ec;
+  unsigned int outlen;
+  onestep_kdf_mac_ctx_t o;
+  size_t n;
+
+  if (paramlen != 1)
+    return GPG_ERR_INV_VALUE;
+  else
+    outlen = (unsigned int)param[0];
+
+  n = sizeof (struct onestep_kdf_mac_context);
+  o = xtrymalloc (n);
+  if (!o)
+    return gpg_err_code_from_errno (errno);
+
+  o->blklen = _gcry_mac_get_algo_maclen (macalgo);
+  if (!o->blklen)
+    {
+      xfree (o);
+      return GPG_ERR_MAC_ALGO;
+    }
+  ec = _gcry_mac_open (&o->md, macalgo, 0, NULL);
+  if (ec)
+    {
+      xfree (o);
+      return ec;
+    }
+  o->algo = GCRY_KDF_ONESTEP_KDF_MAC;
+  o->outlen = outlen;
+  o->input = input;
+  o->inputlen = inputlen;
+  o->salt = key;
+  o->saltlen = keylen;
+  o->fixedinfo = fixedinfo;
+  o->fixedinfolen = fixedinfolen;
+
+  *hd = (void *)o;
+  return 0;
+}
+
+
+static gpg_err_code_t
+onestep_kdf_mac_compute (onestep_kdf_mac_ctx_t o,
+                         const struct gcry_kdf_thread_ops *ops)
+{
+  (void)o;
+
+  if (ops != NULL)
+    return GPG_ERR_INV_VALUE;
+
+  return 0;
+}
+
+static gpg_err_code_t
+onestep_kdf_mac_final (onestep_kdf_mac_ctx_t o, size_t resultlen, void *result)
+{
+  u32 counter = 0;
+  unsigned char cnt[4];
+  int i;
+  gcry_err_code_t ec;
+  size_t len = o->blklen;
+
+  if (resultlen != o->outlen)
+    return GPG_ERR_INV_VALUE;
+
+  ec = _gcry_mac_setkey (o->md, o->salt, o->saltlen);
+  if (ec)
+    return ec;
+
+  for (i = 0; i < o->outlen / o->blklen; i++)
+    {
+      counter++;
+      buf_put_be32 (cnt, counter);
+      ec = _gcry_mac_write (o->md, cnt, sizeof (cnt));
+      if (ec)
+        return ec;
+      ec = _gcry_mac_write (o->md, o->input, o->inputlen);
+      if (ec)
+        return ec;
+      ec = _gcry_mac_write (o->md, o->fixedinfo, o->fixedinfolen);
+      if (ec)
+        return ec;
+      ec = _gcry_mac_read (o->md, (char *)result + o->blklen * i, &len);
+      if (ec)
+        return ec;
+      resultlen -= o->blklen;
+      ec = _gcry_mac_ctl (o->md, GCRYCTL_RESET, NULL, 0);
+      if (ec)
+        return ec;
+    }
+
+  if (resultlen)
+    {
+      counter++;
+      len = resultlen;
+      buf_put_be32 (cnt, counter);
+      ec = _gcry_mac_write (o->md, cnt, sizeof (cnt));
+      if (ec)
+        return ec;
+      ec = _gcry_mac_write (o->md, o->input, o->inputlen);
+      if (ec)
+        return ec;
+      ec = _gcry_mac_write (o->md, o->fixedinfo, o->fixedinfolen);
+      if (ec)
+        return ec;
+      ec = _gcry_mac_read (o->md, (char *)result + o->blklen * i, &len);
+      if (ec)
+        return ec;
+    }
+
+  return 0;
+}
+
+static void
+onestep_kdf_mac_close (onestep_kdf_mac_ctx_t o)
+{
+  _gcry_mac_close (o->md);
+  xfree (o);
+}
+\f
+typedef struct hkdf_context *hkdf_ctx_t;
+
+/* Hkdf context */
+struct hkdf_context {
+  int algo;
+  gcry_mac_hd_t md;
+  int mode;
+  unsigned int blklen;
+  unsigned int outlen;
+  const void *input;
+  size_t inputlen;
+  const void *salt;
+  size_t saltlen;
+  const void *fixedinfo;
+  size_t fixedinfolen;
+  unsigned char *prk;
+};
+
+static gpg_err_code_t
+hkdf_open (gcry_kdf_hd_t *hd, int macalgo,
+           const unsigned long *param, unsigned int paramlen,
+           const void *input, size_t inputlen,
+           const void *salt, size_t saltlen,
+           const void *fixedinfo, size_t fixedinfolen)
+{
+  gpg_err_code_t ec;
+  unsigned int outlen;
+  int mode;
+  hkdf_ctx_t h;
+  size_t n;
+  unsigned char *prk;
+
+  if (paramlen != 1 && paramlen != 2)
+    return GPG_ERR_INV_VALUE;
+  else
+    {
+      outlen = (unsigned int)param[0];
+      /* MODE: support extract only, expand only: FIXME */
+      if (paramlen == 2)
+        mode = (unsigned int)param[1];
+      else
+        mode = 0;
+    }
+
+  n = sizeof (struct hkdf_context);
+  h = xtrymalloc (n);
+  if (!h)
+    return gpg_err_code_from_errno (errno);
+
+  h->blklen = _gcry_mac_get_algo_maclen (macalgo);
+  if (!h->blklen)
+    {
+      xfree (h);
+      return GPG_ERR_MAC_ALGO;
+    }
+
+  if (outlen > 255 * h->blklen)
+    {
+      xfree (h);
+      return GPG_ERR_INV_VALUE;
+    }
+
+  ec = _gcry_mac_open (&h->md, macalgo, 0, NULL);
+  if (ec)
+    {
+      xfree (h);
+      return ec;
+    }
+  prk = xtrymalloc (h->blklen);
+  if (!prk)
+    {
+      _gcry_mac_close (h->md);
+      xfree (h);
+      return gpg_err_code_from_errno (errno);
+    }
+  h->prk = prk;
+  h->algo = GCRY_KDF_HKDF;
+  h->outlen = outlen;
+  h->mode = mode;
+  h->input = input;
+  h->inputlen = inputlen;
+  h->salt = salt;
+  h->saltlen = saltlen;
+  h->fixedinfo = fixedinfo;
+  h->fixedinfolen = fixedinfolen;
+
+  *hd = (void *)h;
+  return 0;
+}
+
+
+static gpg_err_code_t
+hkdf_compute (hkdf_ctx_t h, const struct gcry_kdf_thread_ops *ops)
+{
+  gcry_err_code_t ec;
+  size_t len = h->blklen;
+
+  if (ops != NULL)
+    return GPG_ERR_INV_VALUE;
+
+  /* Extract */
+  ec = _gcry_mac_setkey (h->md, h->salt, h->saltlen);
+  if (ec)
+    return ec;
+
+  ec = _gcry_mac_write (h->md, h->input, h->inputlen);
+  if (ec)
+    return ec;
+
+  ec = _gcry_mac_read (h->md, h->prk, &len);
+  if (ec)
+    return ec;
+
+  ec = _gcry_mac_ctl (h->md, GCRYCTL_RESET, NULL, 0);
+  if (ec)
+    return ec;
+
+  return 0;
+}
+
+static gpg_err_code_t
+hkdf_final (hkdf_ctx_t h, size_t resultlen, void *result)
+{
+  unsigned char counter = 0;
+  int i;
+  gcry_err_code_t ec;
+  size_t len = h->blklen;
+
+  if (resultlen != h->outlen)
+    return GPG_ERR_INV_VALUE;
+
+  /* Expand */
+  ec = _gcry_mac_setkey (h->md, h->prk, h->blklen);
+  if (ec)
+    return ec;
+
+  /* We re-use the memory of ->prk.  */
+
+  for (i = 0; i < h->outlen / h->blklen; i++)
+    {
+      counter++;
+      if (i)
+        {
+          ec = _gcry_mac_write (h->md, h->prk, h->blklen);
+          if (ec)
+            return ec;
+        }
+      if (h->fixedinfo)
+        {
+          ec = _gcry_mac_write (h->md, h->fixedinfo, h->fixedinfolen);
+          if (ec)
+            return ec;
+        }
+      ec = _gcry_mac_write (h->md, &counter, 1);
+      if (ec)
+        return ec;
+      ec = _gcry_mac_read (h->md, h->prk, &len);
+      if (ec)
+        return ec;
+      memcpy ((char *)result + h->blklen * i, h->prk, len);
+      resultlen -= h->blklen;
+      ec = _gcry_mac_ctl (h->md, GCRYCTL_RESET, NULL, 0);
+      if (ec)
+        return ec;
+    }
+
+  if (resultlen)
+    {
+      counter++;
+      len = resultlen;
+      if (i)
+        {
+          ec = _gcry_mac_write (h->md, h->prk, h->blklen);
+          if (ec)
+            return ec;
+        }
+      if (h->fixedinfo)
+        {
+          ec = _gcry_mac_write (h->md, h->fixedinfo, h->fixedinfolen);
+          if (ec)
+            return ec;
+        }
+      ec = _gcry_mac_write (h->md, &counter, 1);
+      if (ec)
+        return ec;
+      ec = _gcry_mac_read (h->md, (char *)result + h->blklen * i, &len);
+      if (ec)
+        return ec;
+    }
+
+  return 0;
+}
+
+static void
+hkdf_close (hkdf_ctx_t h)
+{
+  _gcry_mac_close (h->md);
+  xfree (h->prk);
+  xfree (h);
+}
+\f
+typedef struct x963_kdf_context *x963_kdf_ctx_t;
+
+/* X963KDF context */
+struct x963_kdf_context {
+  int algo;
+  gcry_md_hd_t md;
+  unsigned int blklen;
+  unsigned int outlen;
+  const void *input;
+  size_t inputlen;
+  const void *sharedinfo;
+  size_t sharedinfolen;
+};
+
+static gpg_err_code_t
+x963_kdf_open (gcry_kdf_hd_t *hd, int hashalgo,
+                  const unsigned long *param, unsigned int paramlen,
+                  const void *input, size_t inputlen,
+                  const void *sharedinfo, size_t sharedinfolen)
+{
+  gpg_err_code_t ec;
+  unsigned int outlen;
+  x963_kdf_ctx_t o;
+  size_t n;
+
+  if (paramlen != 1)
+    return GPG_ERR_INV_VALUE;
+  else
+    outlen = (unsigned int)param[0];
+
+  n = sizeof (struct x963_kdf_context);
+  o = xtrymalloc (n);
+  if (!o)
+    return gpg_err_code_from_errno (errno);
+
+  o->blklen = _gcry_md_get_algo_dlen (hashalgo);
+  if (!o->blklen)
+    {
+      xfree (o);
+      return GPG_ERR_DIGEST_ALGO;
+    }
+  ec = _gcry_md_open (&o->md, hashalgo, 0);
+  if (ec)
+    {
+      xfree (o);
+      return ec;
+    }
+  o->algo = GCRY_KDF_X963_KDF;
+  o->outlen = outlen;
+  o->input = input;
+  o->inputlen = inputlen;
+  o->sharedinfo = sharedinfo;
+  o->sharedinfolen = sharedinfolen;
+
+  *hd = (void *)o;
+  return 0;
+}
+
+
+static gpg_err_code_t
+x963_kdf_compute (x963_kdf_ctx_t o, const struct gcry_kdf_thread_ops *ops)
+{
+  (void)o;
+
+  if (ops != NULL)
+    return GPG_ERR_INV_VALUE;
+
+  return 0;
+}
+
+static gpg_err_code_t
+x963_kdf_final (x963_kdf_ctx_t o, size_t resultlen, void *result)
+{
+  u32 counter = 0;
+  unsigned char cnt[4];
+  int i;
+
+  if (resultlen != o->outlen)
+    return GPG_ERR_INV_VALUE;
+
+  for (i = 0; i < o->outlen / o->blklen; i++)
+    {
+      counter++;
+      _gcry_md_write (o->md, o->input, o->inputlen);
+      buf_put_be32 (cnt, counter);
+      _gcry_md_write (o->md, cnt, sizeof (cnt));
+      if (o->sharedinfolen)
+        _gcry_md_write (o->md, o->sharedinfo, o->sharedinfolen);
+      _gcry_md_final (o->md);
+      memcpy ((char *)result + o->blklen * i,
+              _gcry_md_read (o->md, 0), o->blklen);
+      resultlen -= o->blklen;
+      _gcry_md_reset (o->md);
+    }
+
+  if (resultlen)
+    {
+      counter++;
+      _gcry_md_write (o->md, o->input, o->inputlen);
+      buf_put_be32 (cnt, counter);
+      _gcry_md_write (o->md, cnt, sizeof (cnt));
+      if (o->sharedinfolen)
+        _gcry_md_write (o->md, o->sharedinfo, o->sharedinfolen);
+      _gcry_md_final (o->md);
+      memcpy ((char *)result + o->blklen * i,
+              _gcry_md_read (o->md, 0), resultlen);
+    }
+
+  return 0;
+}
+
+static void
+x963_kdf_close (x963_kdf_ctx_t o)
+{
+  _gcry_md_close (o->md);
+  xfree (o);
+}
+\f
+struct gcry_kdf_handle {
+  int algo;
+  /* And algo specific parts come.  */
+};
+
+gpg_err_code_t
+_gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo,
+                const unsigned long *param, unsigned int paramlen,
+                const void *input, size_t inputlen,
+                const void *salt, size_t saltlen,
+                const void *key, size_t keylen,
+                const void *ad, size_t adlen)
+{
+  gpg_err_code_t ec;
+
+  switch (algo)
+    {
+    case GCRY_KDF_ARGON2:
+      if (!saltlen)
+        ec = GPG_ERR_INV_VALUE;
+      else
+        ec = argon2_open (hd, subalgo, param, paramlen,
+                          input, inputlen, salt, saltlen,
+                          key, keylen, ad, adlen);
+      break;
+
+    case GCRY_KDF_BALLOON:
+      if (!inputlen || !saltlen || keylen || adlen)
+        ec = GPG_ERR_INV_VALUE;
+      else
+        {
+          (void)key;
+          (void)ad;
+          ec = balloon_open (hd, subalgo, param, paramlen,
+                             input, inputlen, salt, saltlen);
+        }
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF:
+      if (!inputlen || !paramlen || !adlen)
+        ec = GPG_ERR_INV_VALUE;
+      else
+        {
+          (void)salt;
+          (void)key;
+          ec = onestep_kdf_open (hd, subalgo, param, paramlen,
+                                 input, inputlen, ad, adlen);
+        }
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF_MAC:
+      if (!inputlen || !paramlen || !keylen || !adlen)
+        ec = GPG_ERR_INV_VALUE;
+      else
+        {
+          (void)salt;
+          ec = onestep_kdf_mac_open (hd, subalgo, param, paramlen,
+                                     input, inputlen, key, keylen, ad, adlen);
+        }
+      break;
+
+    case GCRY_KDF_HKDF:
+      if (!inputlen || !paramlen)
+        ec = GPG_ERR_INV_VALUE;
+      else
+        {
+          (void)salt;
+          ec = hkdf_open (hd, subalgo, param, paramlen,
+                          input, inputlen, key, keylen, ad, adlen);
+        }
+      break;
+
+    case GCRY_KDF_X963_KDF:
+      if (!inputlen || !paramlen)
+        ec = GPG_ERR_INV_VALUE;
+      else
+        {
+          (void)salt;
+          (void)key;
+          ec = x963_kdf_open (hd, subalgo, param, paramlen,
+                              input, inputlen, ad, adlen);
+        }
+      break;
+
+    default:
+      ec = GPG_ERR_UNKNOWN_ALGORITHM;
+      break;
+    }
+
+  return ec;
+}
+
+gpg_err_code_t
+_gcry_kdf_compute (gcry_kdf_hd_t h, const struct gcry_kdf_thread_ops *ops)
+{
+  gpg_err_code_t ec;
+
+  switch (h->algo)
+    {
+    case GCRY_KDF_ARGON2:
+      ec = argon2_compute ((argon2_ctx_t)(void *)h, ops);
+      break;
+
+    case GCRY_KDF_BALLOON:
+      ec = balloon_compute_all ((balloon_ctx_t)(void *)h, ops);
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF:
+      ec = onestep_kdf_compute ((onestep_kdf_ctx_t)(void *)h, ops);
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF_MAC:
+      ec = onestep_kdf_mac_compute ((onestep_kdf_mac_ctx_t)(void *)h, ops);
+      break;
+
+    case GCRY_KDF_HKDF:
+      ec = hkdf_compute ((hkdf_ctx_t)(void *)h, ops);
+      break;
+
+    case GCRY_KDF_X963_KDF:
+      ec = x963_kdf_compute ((x963_kdf_ctx_t)(void *)h, ops);
+      break;
+
+    default:
+      ec = GPG_ERR_UNKNOWN_ALGORITHM;
+      break;
+    }
+
+  return ec;
+}
+
+
+gpg_err_code_t
+_gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result)
+{
+  gpg_err_code_t ec;
+
+  switch (h->algo)
+    {
+    case GCRY_KDF_ARGON2:
+      ec = argon2_final ((argon2_ctx_t)(void *)h, resultlen, result);
+      break;
+
+    case GCRY_KDF_BALLOON:
+      ec = balloon_final ((balloon_ctx_t)(void *)h, resultlen, result);
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF:
+      ec = onestep_kdf_final ((onestep_kdf_ctx_t)(void *)h, resultlen, result);
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF_MAC:
+      ec = onestep_kdf_mac_final ((onestep_kdf_mac_ctx_t)(void *)h,
+                                  resultlen, result);
+      break;
+
+    case GCRY_KDF_HKDF:
+      ec = hkdf_final ((hkdf_ctx_t)(void *)h, resultlen, result);
+      break;
+
+    case GCRY_KDF_X963_KDF:
+      ec = x963_kdf_final ((x963_kdf_ctx_t)(void *)h, resultlen, result);
+      break;
+
+    default:
+      ec = GPG_ERR_UNKNOWN_ALGORITHM;
+      break;
+    }
+
+  return ec;
+}
+
+void
+_gcry_kdf_close (gcry_kdf_hd_t h)
+{
+  switch (h->algo)
+    {
+    case GCRY_KDF_ARGON2:
+      argon2_close ((argon2_ctx_t)(void *)h);
+      break;
+
+    case GCRY_KDF_BALLOON:
+      balloon_close ((balloon_ctx_t)(void *)h);
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF:
+      onestep_kdf_close ((onestep_kdf_ctx_t)(void *)h);
+      break;
+
+    case GCRY_KDF_ONESTEP_KDF_MAC:
+      onestep_kdf_mac_close ((onestep_kdf_mac_ctx_t)(void *)h);
+      break;
+
+    case GCRY_KDF_HKDF:
+      hkdf_close ((hkdf_ctx_t)(void *)h);
+      break;
+
+    case GCRY_KDF_X963_KDF:
+      x963_kdf_close ((x963_kdf_ctx_t)(void *)h);
+      break;
+
+    default:
+      break;
+    }
+}
+\f
 /* Check one KDF call with ALGO and HASH_ALGO using the regular KDF
  * API. (passphrase,passphraselen) is the password to be derived,
  * (salt,saltlen) the salt for the key derivation,
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
new file mode 100644 (file)
index 0000000..b1fc7b6
--- /dev/null
@@ -0,0 +1,587 @@
+/* keccak-amd64-avx512.S  -  x86-64 AVX512 implementation of Keccak
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * ---
+ *
+ * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
+ * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
+ * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
+ * preprocessed assembly and fitted with new absorb function optimized for
+ * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
+ *
+ * Original copyright license follows:
+ *
+ *  Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *        * Redistributions of source code must retain copyright notices,
+ *          this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above
+ *          copyright notice, this list of conditions and the following
+ *          disclaimer in the documentation and/or other materials
+ *          provided with the distribution.
+ *
+ *        * Neither the name of the CRYPTOGAMS nor the names of its
+ *          copyright holder and contributors may be used to endorse or
+ *          promote products derived from this software without specific
+ *          prior written permission.
+ *
+ *  ALTERNATIVELY, provided that this notice is retained in full, this
+ *  product may be distributed under the terms of the GNU General Public
+ *  License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+ *  those given above.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* Register macros. */
+#define A_0_0 %xmm31
+#define A_0_1 %xmm30
+#define A_0_2 %xmm29
+#define A_0_3 %xmm28
+#define A_0_4 %xmm27
+#define A_1_0 %xmm26
+#define A_1_1 %xmm25
+#define A_1_2 %xmm24
+#define A_1_3 %xmm23
+#define A_1_4 %xmm22
+#define A_2_0 %xmm21
+#define A_2_1 %xmm20
+#define A_2_2 %xmm19
+#define A_2_3 %xmm18
+#define A_2_4 %xmm17
+#define A_3_0 %xmm16
+#define A_3_1 %xmm15
+#define A_3_2 %xmm14
+#define A_3_3 %xmm13
+#define A_3_4 %xmm12
+#define A_4_0 %xmm11
+#define A_4_1 %xmm10
+#define A_4_2 %xmm9
+#define A_4_3 %xmm8
+#define A_4_4 %xmm7
+
+#define C_0 %xmm6
+#define C_1 %xmm5
+#define C_2 %xmm4
+#define C_3 %xmm3
+#define C_4 %xmm2
+#define C_5 %xmm1
+#define C_6 %xmm0
+
+#define D_0 C_4
+#define D_1 C_5
+#define D_2 C_6
+#define D_3 C_2
+#define D_4 C_3
+
+/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */
+#define eor3_d(dst_s1, s2, s3) \
+       vpternlogq $0x96, s3, s2, dst_s1;
+
+#define eor3(dst, s1, s2, s3) \
+       vmovdqa s1, dst; \
+       eor3_d(dst, s2, s3);
+
+#define rax1_c(dst, s1, s2_rol1) \
+       vprolq $1, s2_rol1, dst; \
+       vpxor s1, dst, dst;
+
+#define rax1_t(dst_s1, s2_rol1, tmp) \
+       vprolq $1, s2_rol1, tmp; \
+       vpxor tmp, dst_s1, dst_s1;
+
+#define rax1_s(dst_s1, s2_rol1) \
+       vprolq $1, s2_rol1, s2_rol1; \
+       vpxor s2_rol1, dst_s1, dst_s1;
+
+#define xar(dst, s1, s2, rol) \
+       vpxorq s2, s1, dst; \
+       vprolq $(rol), dst, dst;
+
+#define xar_x(dst, s1, s2, rol) \
+       vpxor s2, s1, dst; \
+       vprolq $(rol), dst, dst;
+
+#define bcax_d(dst_s1, s2, s3) \
+       vpternlogq $0xb4, s3, s2, dst_s1;
+
+#define bcax(dst, s1, s2, s3) \
+       vmovdqa64 s1, dst; \
+       bcax_d(dst, s2, s3);
+
+#define bcax_x(dst, s1, s2, s3) \
+       vmovdqa s1, dst; \
+       bcax_d(dst, s2, s3);
+
+#define eor(dst, s1, s2) \
+       vpxorq s2, s1, dst;
+
+/* Misc helper macros. */
+#define clear_avx512_4regs(a, b, c, d) \
+       eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d);
+
+#define clear_regs() \
+       vzeroall; /* xmm0-xmm15 */ \
+       clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \
+       clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \
+       clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \
+       clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31);
+
+ELF(.type      KeccakF1600_ce,@function)
+.align 64, 0xcc
+KeccakF1600_ce:
+.Loop_ce:
+       CFI_STARTPROC()
+
+       ////////////////////////////////////////////////// Theta
+       eor3(   C_0, A_4_0, A_3_0, A_2_0)
+       eor3(   C_1, A_4_1, A_3_1, A_2_1)
+       eor3(   C_3, A_4_3, A_3_3, A_2_3)
+       eor3(   C_2, A_4_2, A_3_2, A_2_2)
+       eor3(   C_4, A_4_4, A_3_4, A_2_4)
+       eor3_d( C_0, A_1_0, A_0_0)
+       eor3_d( C_1, A_1_1, A_0_1)
+       eor3_d( C_3, A_1_3, A_0_3)
+       eor3_d( C_2, A_1_2, A_0_2)
+       eor3_d( C_4, A_1_4, A_0_4)
+
+       rax1_c( C_5, C_0, C_2)                  // D[1]
+       rax1_t( C_2, C_4, C_6)                  // D[3]
+       rax1_c( C_6, C_1, C_3)                  // D[2]
+       rax1_s( C_3, C_0)                       // D[4]
+       rax1_s( C_4, C_1)                       // D[0]
+
+       ////////////////////////////////////////////////// Theta+Rho+Pi
+       xar(    C_0, A_0_1, D_1, 1)             // C[0]=A[2][0]
+
+       xar(    A_0_1, A_1_1, D_1, 44)
+       xar(    A_1_1, A_1_4, D_4, 20)
+       xar(    A_1_4, A_4_2, D_2, 61)
+       xar(    A_4_2, A_2_4, D_4, 39)
+       xar(    A_2_4, A_4_0, D_0, 18)
+
+       xar(    C_1, A_0_2, D_2, 62)            // C[1]=A[4][0]
+
+       xar(    A_0_2, A_2_2, D_2, 43)
+       xar(    A_2_2, A_2_3, D_3, 25)
+       xar(    A_2_3, A_3_4, D_4, 8)
+       xar_x(  A_3_4, A_4_3, D_3, 56)
+       xar(    A_4_3, A_3_0, D_0, 41)
+
+       xar(    A_3_0, A_0_4, D_4, 27)
+
+       xar_x(  D_4, A_4_4, D_4, 14)            // D[4]=A[0][4]
+       xar_x(  A_4_4, A_4_1, D_1, 2)
+       xar(    A_1_3, A_1_3, D_3, 55)          // A[1][3]=A[4][1]
+       xar(    A_0_4, A_3_1, D_1, 45)          // A[0][4]=A[1][3]
+       xar(    A_3_1, A_1_0, D_0, 36)
+
+       xar(    A_1_0, A_0_3, D_3, 28)
+
+       eor(    A_0_0, A_0_0, D_0)
+
+       xar_x(  D_3, A_3_3, D_3, 21)            // D[3]=A[0][3]
+       xar(    A_0_3, A_3_2, D_2, 15)          // A[0][3]=A[3][3]
+       xar(    D_1, A_2_1, D_1, 10)            // D[1]=A[3][2]
+       xar(    D_2, A_1_2, D_2, 6)             // D[2]=A[2][1]
+       xar(    D_0, A_2_0, D_0, 3)             // D[0]=A[1][2]
+
+       ////////////////////////////////////////////////// Chi+Iota
+       bcax_x( A_4_0, C_1, A_4_2, A_1_3)       // A[1][3]=A[4][1]
+       bcax(   A_4_1, A_1_3, A_4_3, A_4_2)     // A[1][3]=A[4][1]
+       bcax_d( A_4_2, A_4_4, A_4_3)
+       bcax_d( A_4_3, C_1, A_4_4)
+       bcax_d( A_4_4, A_1_3, C_1)              // A[1][3]=A[4][1]
+
+       bcax_x( A_3_2, D_1, A_3_4, A_0_3)       // A[0][3]=A[3][3]
+       bcax(   A_3_3, A_0_3, A_3_0, A_3_4)     // A[0][3]=A[3][3]
+       bcax_d( A_3_4, A_3_1, A_3_0)
+       bcax_d( A_3_0, D_1, A_3_1)
+       bcax_d( A_3_1, A_0_3, D_1)              // A[0][3]=A[3][3]
+
+       bcax(   A_2_0, C_0, A_2_2, D_2)
+       bcax(   A_2_1, D_2, A_2_3, A_2_2)
+       bcax_d( A_2_2, A_2_4, A_2_3)
+       bcax_d( A_2_3, C_0, A_2_4)
+       bcax_d( A_2_4, D_2, C_0)
+
+       bcax(   A_1_2, D_0, A_1_4, A_0_4)       // A[0][4]=A[1][3]
+       bcax(   A_1_3, A_0_4, A_1_0, A_1_4)     // A[0][4]=A[1][3]
+       bcax_d( A_1_4, A_1_1, A_1_0)
+       bcax_d( A_1_0, D_0, A_1_1)
+       bcax_d( A_1_1, A_0_4, D_0)              // A[0][4]=A[1][3]
+
+       bcax(   A_0_3, D_3, A_0_0, D_4)
+       bcax(   A_0_4, D_4, A_0_1, A_0_0)
+       bcax_d( A_0_0, A_0_2, A_0_1)
+       bcax_d( A_0_1, D_3, A_0_2)
+       bcax_d( A_0_2, D_4, D_3)
+       eor(    A_0_0, A_0_0, (%r10))
+
+       cmpq    %r10, %r11
+       je      .Lend_ce
+
+       addq    $8, %r10
+       jmp     .Loop_ce
+
+.align 64, 0xcc
+.Lend_ce:
+       ret_spec_stop
+       CFI_ENDPROC()
+ELF(.size      KeccakF1600_ce,.-KeccakF1600_ce)
+
+.globl         _gcry_keccak_f1600_state_permute64_avx512
+ELF(.type      _gcry_keccak_f1600_state_permute64_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_f1600_state_permute64_avx512:
+       /* input:
+        *      %rdi: state
+        *      %rsi: round constants
+        */
+       CFI_STARTPROC()
+
+       spec_stop_avx512;
+
+       leaq            12*8(%rdi), %rax
+       leaq            (24-1)*8(%rsi), %r11
+
+       vmovdqu64       0*8(%rdi), A_0_0
+       vmovdqu64       1*8(%rdi), A_0_1
+       vmovdqu64       2*8(%rdi), A_0_2
+       vmovdqu64       3*8(%rdi), A_0_3
+       vmovdqu64       4*8(%rdi), A_0_4
+       vmovdqu64       5*8(%rdi), A_1_0
+       vmovdqu64       6*8(%rdi), A_1_1
+       vmovdqu64       7*8(%rdi), A_1_2
+       vmovdqu64       8*8(%rdi), A_1_3
+       vmovdqu64       9*8(%rdi), A_1_4
+       vmovdqu64       10*8(%rdi), A_2_0
+       vmovdqu64       11*8(%rdi), A_2_1
+       vmovdqu64       0*8(%rax), A_2_2
+       vmovdqu64       1*8(%rax), A_2_3
+       vmovdqu64       2*8(%rax), A_2_4
+       vmovdqu64       3*8(%rax), A_3_0
+       vmovdqu         4*8(%rax), A_3_1
+       vmovdqu         5*8(%rax), A_3_2
+       vmovdqu         6*8(%rax), A_3_3
+       vmovdqu         7*8(%rax), A_3_4
+       vmovdqu         8*8(%rax), A_4_0
+       vmovdqu         9*8(%rax), A_4_1
+       vmovdqu         10*8(%rax), A_4_2
+       vmovdqu         11*8(%rax), A_4_3
+       vmovq           12*8(%rax), A_4_4
+
+       movq            %rsi, %r10
+       call            KeccakF1600_ce
+
+       vpunpcklqdq     A_0_1, A_0_0, A_0_0
+       vpunpcklqdq     A_0_3, A_0_2, A_0_2
+       vpunpcklqdq     A_1_0, A_0_4, A_0_4
+       vpunpcklqdq     A_1_2, A_1_1, A_1_1
+       vpunpcklqdq     A_1_4, A_1_3, A_1_3
+       vpunpcklqdq     A_2_1, A_2_0, A_2_0
+       vpunpcklqdq     A_2_3, A_2_2, A_2_2
+       vpunpcklqdq     A_3_0, A_2_4, A_2_4
+       vpunpcklqdq     A_3_2, A_3_1, A_3_1
+       vpunpcklqdq     A_3_4, A_3_3, A_3_3
+       vpunpcklqdq     A_4_1, A_4_0, A_4_0
+       vpunpcklqdq     A_4_3, A_4_2, A_4_2
+       vmovdqu64       A_0_0, 0*8(%rdi)
+       vmovdqu64       A_0_2, 2*8(%rdi)
+       vmovdqu64       A_0_4, 4*8(%rdi)
+       vmovdqu64       A_1_1, 6*8(%rdi)
+       vmovdqu64       A_1_3, 8*8(%rdi)
+       vmovdqu64       A_2_0, 10*8(%rdi)
+       vmovdqu64       A_2_2, 0*8(%rax)
+       vmovdqu64       A_2_4, 2*8(%rax)
+       vmovdqu         A_3_1, 4*8(%rax)
+       vmovdqu         A_3_3, 6*8(%rax)
+       vmovdqu         A_4_0, 8*8(%rax)
+       vmovdqu         A_4_2, 10*8(%rax)
+       vmovq           A_4_4, 12*8(%rax)
+
+       xorl            %eax, %eax
+
+       clear_regs()
+       ret_spec_stop
+       CFI_ENDPROC()
+ELF(.size      _gcry_keccak_f1600_state_permute64_avx512,
+               .-_gcry_keccak_f1600_state_permute64_avx512)
+
+.globl         _gcry_keccak_absorb_blocks_avx512
+ELF(.type      _gcry_keccak_absorb_blocks_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_absorb_blocks_avx512:
+       /* input:
+        *      %rdi: state
+        *      %rsi: round constants
+        *      %rdx: lanes
+        *      %rcx: nlanes
+        *      %r8 : blocklanes
+        *      %r9 : lanes output pointer
+        */
+       CFI_STARTPROC()
+
+       spec_stop_avx512;
+
+       leaq            12*8(%rdi), %rax
+       leaq            (24-1)*8(%rsi), %r11
+
+       vmovdqu64       0*8(%rdi), A_0_0
+       vmovdqu64       1*8(%rdi), A_0_1
+       vmovdqu64       2*8(%rdi), A_0_2
+       vmovdqu64       3*8(%rdi), A_0_3
+       vmovdqu64       4*8(%rdi), A_0_4
+       vmovdqu64       5*8(%rdi), A_1_0
+       vmovdqu64       6*8(%rdi), A_1_1
+       vmovdqu64       7*8(%rdi), A_1_2
+       vmovdqu64       8*8(%rdi), A_1_3
+       vmovdqu64       9*8(%rdi), A_1_4
+       vmovdqu64       10*8(%rdi), A_2_0
+       vmovdqu64       11*8(%rdi), A_2_1
+       vmovdqu64       0*8(%rax), A_2_2
+       vmovdqu64       1*8(%rax), A_2_3
+       vmovdqu64       2*8(%rax), A_2_4
+       vmovdqu64       3*8(%rax), A_3_0
+       vmovdqu         4*8(%rax), A_3_1
+       vmovdqu         5*8(%rax), A_3_2
+       vmovdqu         6*8(%rax), A_3_3
+       vmovdqu         7*8(%rax), A_3_4
+       vmovdqu         8*8(%rax), A_4_0
+       vmovdqu         9*8(%rax), A_4_1
+       vmovdqu         10*8(%rax), A_4_2
+       vmovdqu         11*8(%rax), A_4_3
+       vmovq           12*8(%rax), A_4_4
+
+       cmpq            $(104 >> 3), %r8
+       jb              .Loop_absorb_72_ce
+       je              .Loop_absorb_104_ce
+       cmpq            $(144 >> 3), %r8
+       jb              .Loop_absorb_136_ce
+       je              .Loop_absorb_144_ce
+       jmp             .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_168_ce:
+       subq            %r8, %rcx       // len - bsz
+       jb              .Labsorbed_ce
+
+       vpxorq          0*8(%rdx), A_0_0, A_0_0
+       vpxorq          1*8(%rdx), A_0_1, A_0_1
+       vpxorq          2*8(%rdx), A_0_2, A_0_2
+       vpxorq          3*8(%rdx), A_0_3, A_0_3
+       vpxorq          4*8(%rdx), A_0_4, A_0_4
+       vpxorq          5*8(%rdx), A_1_0, A_1_0
+       vpxorq          6*8(%rdx), A_1_1, A_1_1
+       vpxorq          7*8(%rdx), A_1_2, A_1_2
+       vpxorq          8*8(%rdx), A_1_3, A_1_3
+       vpxorq          9*8(%rdx), A_1_4, A_1_4
+       vpxorq          10*8(%rdx), A_2_0, A_2_0
+       vpxorq          11*8(%rdx), A_2_1, A_2_1
+       vpxorq          12*8(%rdx), A_2_2, A_2_2
+       vpxorq          13*8(%rdx), A_2_3, A_2_3
+       vpxorq          14*8(%rdx), A_2_4, A_2_4
+       vpxorq          15*8(%rdx), A_3_0, A_3_0
+       vpxor           16*8(%rdx), A_3_1, A_3_1
+       vpxor           17*8(%rdx), A_3_2, A_3_2
+       vpxor           18*8(%rdx), A_3_3, A_3_3
+       vpxor           19*8(%rdx), A_3_4, A_3_4
+       vmovq           20*8(%rdx), C_0
+       leaq            21*8(%rdx), %rdx
+       vpxorq          C_0, A_4_0, A_4_0
+
+       movq            %rsi, %r10
+       call            KeccakF1600_ce
+
+       jmp             .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_144_ce:
+       subq            %r8, %rcx       // len - bsz
+       jb              .Labsorbed_ce
+
+       vpxorq          0*8(%rdx), A_0_0, A_0_0
+       vpxorq          1*8(%rdx), A_0_1, A_0_1
+       vpxorq          2*8(%rdx), A_0_2, A_0_2
+       vpxorq          3*8(%rdx), A_0_3, A_0_3
+       vpxorq          4*8(%rdx), A_0_4, A_0_4
+       vpxorq          5*8(%rdx), A_1_0, A_1_0
+       vpxorq          6*8(%rdx), A_1_1, A_1_1
+       vpxorq          7*8(%rdx), A_1_2, A_1_2
+       vpxorq          8*8(%rdx), A_1_3, A_1_3
+       vpxorq          9*8(%rdx), A_1_4, A_1_4
+       vpxorq          10*8(%rdx), A_2_0, A_2_0
+       vpxorq          11*8(%rdx), A_2_1, A_2_1
+       vpxorq          12*8(%rdx), A_2_2, A_2_2
+       vpxorq          13*8(%rdx), A_2_3, A_2_3
+       vpxorq          14*8(%rdx), A_2_4, A_2_4
+       vpxorq          15*8(%rdx), A_3_0, A_3_0
+       vpxor           16*8(%rdx), A_3_1, A_3_1
+       vmovq           17*8(%rdx), C_0
+       leaq            18*8(%rdx), %rdx
+       vpxor           C_0, A_3_2, A_3_2
+
+       movq            %rsi, %r10
+       call            KeccakF1600_ce
+
+       jmp             .Loop_absorb_144_ce
+
+.align 64, 0xcc
+.Loop_absorb_136_ce:
+       subq            %r8, %rcx       // len - bsz
+       jb              .Labsorbed_ce
+
+       vpxorq          0*8(%rdx), A_0_0, A_0_0
+       vpxorq          1*8(%rdx), A_0_1, A_0_1
+       vpxorq          2*8(%rdx), A_0_2, A_0_2
+       vpxorq          3*8(%rdx), A_0_3, A_0_3
+       vpxorq          4*8(%rdx), A_0_4, A_0_4
+       vpxorq          5*8(%rdx), A_1_0, A_1_0
+       vpxorq          6*8(%rdx), A_1_1, A_1_1
+       vpxorq          7*8(%rdx), A_1_2, A_1_2
+       vpxorq          8*8(%rdx), A_1_3, A_1_3
+       vpxorq          9*8(%rdx), A_1_4, A_1_4
+       vpxorq          10*8(%rdx), A_2_0, A_2_0
+       vpxorq          11*8(%rdx), A_2_1, A_2_1
+       vpxorq          12*8(%rdx), A_2_2, A_2_2
+       vpxorq          13*8(%rdx), A_2_3, A_2_3
+       vpxorq          14*8(%rdx), A_2_4, A_2_4
+       vpxorq          15*8(%rdx), A_3_0, A_3_0
+       vmovq           16*8(%rdx), C_0
+       leaq            17*8(%rdx), %rdx
+       vpxor           C_0, A_3_1, A_3_1
+
+       movq            %rsi, %r10
+       call            KeccakF1600_ce
+
+       jmp             .Loop_absorb_136_ce
+
+.align 64, 0xcc
+.Loop_absorb_104_ce:
+       subq            %r8, %rcx       // len - bsz
+       jb              .Labsorbed_ce
+
+       vpxorq          0*8(%rdx), A_0_0, A_0_0
+       vpxorq          1*8(%rdx), A_0_1, A_0_1
+       vpxorq          2*8(%rdx), A_0_2, A_0_2
+       vpxorq          3*8(%rdx), A_0_3, A_0_3
+       vpxorq          4*8(%rdx), A_0_4, A_0_4
+       vpxorq          5*8(%rdx), A_1_0, A_1_0
+       vpxorq          6*8(%rdx), A_1_1, A_1_1
+       vpxorq          7*8(%rdx), A_1_2, A_1_2
+       vpxorq          8*8(%rdx), A_1_3, A_1_3
+       vpxorq          9*8(%rdx), A_1_4, A_1_4
+       vpxorq          10*8(%rdx), A_2_0, A_2_0
+       vpxorq          11*8(%rdx), A_2_1, A_2_1
+       vmovq           12*8(%rdx), C_0
+       leaq            13*8(%rdx), %rdx
+       vpxorq          C_0, A_2_2, A_2_2
+
+       movq            %rsi, %r10
+       call            KeccakF1600_ce
+
+       jmp             .Loop_absorb_104_ce
+
+.align 64, 0xcc
+.Loop_absorb_72_ce:
+       subq            %r8, %rcx       // len - bsz
+       jb              .Labsorbed_ce
+
+       vpxorq          0*8(%rdx), A_0_0, A_0_0
+       vpxorq          1*8(%rdx), A_0_1, A_0_1
+       vpxorq          2*8(%rdx), A_0_2, A_0_2
+       vpxorq          3*8(%rdx), A_0_3, A_0_3
+       vpxorq          4*8(%rdx), A_0_4, A_0_4
+       vpxorq          5*8(%rdx), A_1_0, A_1_0
+       vpxorq          6*8(%rdx), A_1_1, A_1_1
+       vpxorq          7*8(%rdx), A_1_2, A_1_2
+       vmovq           8*8(%rdx), C_0
+       leaq            9*8(%rdx), %rdx
+       vpxorq          C_0, A_1_3, A_1_3
+
+       movq            %rsi, %r10
+       call            KeccakF1600_ce
+
+       jmp             .Loop_absorb_72_ce
+
+.align 64, 0xcc
+.Labsorbed_ce:
+       vpunpcklqdq     A_0_1, A_0_0, A_0_0
+       vpunpcklqdq     A_0_3, A_0_2, A_0_2
+       vpunpcklqdq     A_1_0, A_0_4, A_0_4
+       vpunpcklqdq     A_1_2, A_1_1, A_1_1
+       vpunpcklqdq     A_1_4, A_1_3, A_1_3
+       vpunpcklqdq     A_2_1, A_2_0, A_2_0
+       vpunpcklqdq     A_2_3, A_2_2, A_2_2
+       vpunpcklqdq     A_3_0, A_2_4, A_2_4
+       vpunpcklqdq     A_3_2, A_3_1, A_3_1
+       vpunpcklqdq     A_3_4, A_3_3, A_3_3
+       vpunpcklqdq     A_4_1, A_4_0, A_4_0
+       vpunpcklqdq     A_4_3, A_4_2, A_4_2
+       vmovdqu64       A_0_0, 0*8(%rdi)
+       vmovdqu64       A_0_2, 2*8(%rdi)
+       vmovdqu64       A_0_4, 4*8(%rdi)
+       vmovdqu64       A_1_1, 6*8(%rdi)
+       vmovdqu64       A_1_3, 8*8(%rdi)
+       vmovdqu64       A_2_0, 10*8(%rdi)
+       vmovdqu64       A_2_2, 0*8(%rax)
+       vmovdqu64       A_2_4, 2*8(%rax)
+       vmovdqu         A_3_1, 4*8(%rax)
+       vmovdqu         A_3_3, 6*8(%rax)
+       vmovdqu         A_4_0, 8*8(%rax)
+       vmovdqu         A_4_2, 10*8(%rax)
+       vmovq           A_4_4, 12*8(%rax)
+
+       leaq            (%r8, %rcx), %rax               // return value
+       movq            %rdx, (%r9)                     // return buffer pointer
+
+       clear_regs()
+       ret_spec_stop
+       CFI_ENDPROC()
+ELF(.size      _gcry_keccak_absorb_blocks_avx512,
+               .-_gcry_keccak_absorb_blocks_avx512)
+
+#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
+#endif /* __x86_64 */
index 11e64b3e7851ada06c74c5c24f0f140e3bd0ed01..aaf83a62bfd2060741d32681aa3505c1fc43b284 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 #endif
 
 
+/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_64BIT_AVX512
+#if defined(USE_64BIT) && defined(__x86_64__) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_64BIT_AVX512 1
+#endif
+
+
 /* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
  * code. */
 #undef USE_64BIT_ARM_NEON
 #endif /* USE_S390X_CRYPTO */
 
 
+/* x86-64 vector register assembly implementations use SystemV ABI, ABI
+ * conversion needed on Win64 through function attribute. */
+#undef ASM_FUNC_ABI
+#if defined(USE_64BIT_AVX512) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define ASM_FUNC_ABI
+#endif
+
+
 #if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
 
 #define SHA3_DELIMITED_SUFFIX 0x06
 #define SHAKE_DELIMITED_SUFFIX 0x1F
-
+#define CSHAKE_DELIMITED_SUFFIX 0x04
 
 typedef struct
 {
@@ -123,7 +143,9 @@ typedef struct KECCAK_CONTEXT_S
   unsigned int outlen;
   unsigned int blocksize;
   unsigned int count;
-  unsigned int suffix;
+  unsigned int suffix:8;
+  unsigned int shake_in_extract_mode:1;
+  unsigned int shake_in_read_mode:1;
   const keccak_ops_t *ops;
 #ifdef USE_S390X_CRYPTO
   unsigned int kimd_func;
@@ -428,6 +450,68 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit Intel AVX512 implementation. */
+#ifdef USE_64BIT_AVX512
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst);
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst,
+                                  const byte *lanes, u64 nlanes,
+                                  u64 blocklanes, u64 *new_lanes);
+
+static unsigned int
+keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_f1600_state_permute64_avx512 (
+                                hd->u.state64, _gcry_keccak_round_consts_64bit);
+}
+
+static unsigned int
+keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes,
+                            size_t nlanes, int blocklanes)
+{
+  while (nlanes)
+    {
+      if (pos == 0 && blocklanes > 0 && nlanes >= (size_t)blocklanes)
+        {
+          /* Get new pointer through u64 variable for "x32" compatibility. */
+          u64 new_lanes;
+          nlanes = _gcry_keccak_absorb_blocks_avx512 (
+                            hd->u.state64, _gcry_keccak_round_consts_64bit,
+                            lanes, nlanes, blocklanes, &new_lanes);
+          lanes = (const byte *)(uintptr_t)new_lanes;
+        }
+
+      while (nlanes)
+       {
+         hd->u.state64[pos] ^= buf_get_le64 (lanes);
+         lanes += 8;
+         nlanes--;
+
+         if (++pos == blocklanes)
+           {
+             keccak_f1600_state_permute64_avx512 (hd);
+             pos = 0;
+             break;
+           }
+       }
+    }
+
+  return 0;
+}
+
+static const keccak_ops_t keccak_avx512_64_ops =
+{
+  .permute = keccak_f1600_state_permute64_avx512,
+  .absorb = keccak_absorb_lanes64_avx512,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_AVX512 */
+
+
 /* 64-bit ARMv7/NEON implementation. */
 #ifdef USE_64BIT_ARM_NEON
 
@@ -884,6 +968,8 @@ keccak_init (int algo, void *context, unsigned int flags)
   memset (hd, 0, sizeof *hd);
 
   ctx->count = 0;
+  ctx->shake_in_extract_mode = 0;
+  ctx->shake_in_read_mode = 0;
 
   /* Select generic implementation. */
 #ifdef USE_64BIT
@@ -894,6 +980,10 @@ keccak_init (int algo, void *context, unsigned int flags)
 
   /* Select optimized implementation based in hw features. */
   if (0) {}
+#ifdef USE_64BIT_AVX512
+  else if (features & HWF_INTEL_AVX512)
+    ctx->ops = &keccak_avx512_64_ops;
+#endif
 #ifdef USE_64BIT_ARM_NEON
   else if (features & HWF_ARM_NEON)
     ctx->ops = &keccak_armv7_neon_64_ops;
@@ -935,15 +1025,17 @@ keccak_init (int algo, void *context, unsigned int flags)
       ctx->blocksize = 576 / 8;
       ctx->outlen = 512 / 8;
       break;
+    case GCRY_MD_CSHAKE128:
     case GCRY_MD_SHAKE128:
       ctx->suffix = SHAKE_DELIMITED_SUFFIX;
       ctx->blocksize = 1344 / 8;
-      ctx->outlen = 0;
+      ctx->outlen = 256 / 8;
       break;
+    case GCRY_MD_CSHAKE256:
     case GCRY_MD_SHAKE256:
       ctx->suffix = SHAKE_DELIMITED_SUFFIX;
       ctx->blocksize = 1088 / 8;
-      ctx->outlen = 0;
+      ctx->outlen = 512 / 8;
       break;
     default:
       BUG();
@@ -969,9 +1061,11 @@ keccak_init (int algo, void *context, unsigned int flags)
        case GCRY_MD_SHA3_512:
          kimd_func = KMID_FUNCTION_SHA3_512;
          break;
+       case GCRY_MD_CSHAKE128:
        case GCRY_MD_SHAKE128:
          kimd_func = KMID_FUNCTION_SHAKE128;
          break;
+       case GCRY_MD_CSHAKE256:
        case GCRY_MD_SHAKE256:
          kimd_func = KMID_FUNCTION_SHAKE256;
          break;
@@ -1095,8 +1189,8 @@ keccak_read (void *context)
 }
 
 
-static void
-keccak_extract (void *context, void *out, size_t outlen)
+static gcry_err_code_t
+do_keccak_extract (void *context, void *out, size_t outlen)
 {
   KECCAK_CONTEXT *ctx = context;
   KECCAK_STATE *hd = &ctx->state;
@@ -1113,7 +1207,7 @@ keccak_extract (void *context, void *out, size_t outlen)
   if (ctx->kimd_func)
     {
       keccak_extract_s390x (context, out, outlen);
-      return;
+      return 0;
     }
 #endif
 
@@ -1218,6 +1312,52 @@ keccak_extract (void *context, void *out, size_t outlen)
 
   if (burn)
     _gcry_burn_stack (burn);
+
+  return 0;
+}
+
+
+static gcry_err_code_t
+keccak_extract (void *context, void *out, size_t outlen)
+{
+  KECCAK_CONTEXT *ctx = context;
+
+  if (ctx->shake_in_read_mode)
+    return GPG_ERR_INV_STATE;
+  if (!ctx->shake_in_extract_mode)
+    ctx->shake_in_extract_mode = 1;
+
+  return do_keccak_extract (context, out, outlen);
+}
+
+
+static byte *
+keccak_shake_read (void *context)
+{
+  KECCAK_CONTEXT *ctx = (KECCAK_CONTEXT *) context;
+  KECCAK_STATE *hd = &ctx->state;
+
+  if (ctx->shake_in_extract_mode)
+    {
+      /* Already in extract mode. */
+      return NULL;
+    }
+
+  if (!ctx->shake_in_read_mode)
+    {
+      byte tmpbuf[64];
+
+      gcry_assert(sizeof(tmpbuf) >= ctx->outlen);
+
+      ctx->shake_in_read_mode = 1;
+
+      do_keccak_extract (context, tmpbuf, ctx->outlen);
+      buf_cpy (&hd->u, tmpbuf, ctx->outlen);
+
+      wipememory(tmpbuf, sizeof(tmpbuf));
+    }
+
+  return (byte *)&hd->u;
 }
 
 
@@ -1232,10 +1372,10 @@ _gcry_sha3_hash_buffers (void *outbuf, size_t nbytes, const gcry_buffer_t *iov,
   for (;iovcnt > 0; iov++, iovcnt--)
     keccak_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len);
   keccak_final (&hd);
-  if (spec->mdlen > 0)
+  if (hd.suffix == SHA3_DELIMITED_SUFFIX)
     memcpy (outbuf, keccak_read (&hd), spec->mdlen);
   else
-    keccak_extract (&hd, outbuf, nbytes);
+    do_keccak_extract (&hd, outbuf, nbytes);
 }
 
 
@@ -1287,6 +1427,146 @@ _gcry_shake256_hash_buffers (void *outbuf, size_t nbytes,
                           &_gcry_digest_spec_shake256);
 }
 
+
+static unsigned int
+cshake_input_n (KECCAK_CONTEXT *ctx, const void *n, unsigned int n_len)
+{
+  unsigned char buf[3];
+
+  buf[0] = 1;
+  buf[1] = ctx->blocksize;
+  keccak_write (ctx, buf, 2);
+
+  /* Here, N_LEN must be less than 255 */
+  if (n_len < 32)
+    {
+      buf[0] = 1;
+      buf[1] = n_len * 8;
+    }
+  else
+    {
+      buf[0] = 2;
+      buf[1] = (n_len * 8) >> 8;
+      buf[2] = (n_len * 8) & 0xff;
+    }
+
+  keccak_write (ctx, buf, buf[0] + 1);
+  keccak_write (ctx, n, n_len);
+  return 2 + buf[0] + 1 + n_len;
+}
+
+static void
+cshake_input_s (KECCAK_CONTEXT *ctx, const void *s, unsigned int s_len,
+                unsigned int len_written)
+{
+  unsigned char buf[168];
+  unsigned int padlen;
+
+  /* Here, S_LEN must be less than 255 */
+  if (s_len < 32)
+    {
+      buf[0] = 1;
+      buf[1] = s_len * 8;
+    }
+  else
+    {
+      buf[0] = 2;
+      buf[1] = (s_len * 8) >> 8;
+      buf[2] = (s_len * 8) & 0xff;
+    }
+
+  keccak_write (ctx, buf, buf[0] + 1);
+  keccak_write (ctx, s, s_len);
+
+  len_written += buf[0] + 1 + s_len;
+  padlen = ctx->blocksize - (len_written % ctx->blocksize);
+  memset (buf, 0, padlen);
+  keccak_write (ctx, buf, padlen);
+}
+
+gpg_err_code_t
+_gcry_cshake_customize (void *context, struct gcry_cshake_customization *p)
+{
+  KECCAK_CONTEXT *ctx = (KECCAK_CONTEXT *) context;
+  unsigned int len_written;
+
+  if (p->n_len >= 255 || p->s_len >= 255)
+    return GPG_ERR_TOO_LARGE;
+
+  if (p->n_len == 0 && p->s_len == 0)
+    /* No customization */
+    return 0;
+
+  len_written = cshake_input_n (ctx, p->n, p->n_len);
+  cshake_input_s (ctx, p->s, p->s_len, len_written);
+  ctx->suffix = CSHAKE_DELIMITED_SUFFIX;
+  return 0;
+}
+
+
+static void
+cshake128_init (void *context, unsigned int flags)
+{
+  keccak_init (GCRY_MD_CSHAKE128, context, flags);
+}
+
+static void
+cshake256_init (void *context, unsigned int flags)
+{
+  keccak_init (GCRY_MD_CSHAKE256, context, flags);
+}
+
+static void
+cshake_hash_buffers (const gcry_md_spec_t *spec, void *outbuf, size_t nbytes,
+                     const gcry_buffer_t *iov, int iovcnt)
+{
+  KECCAK_CONTEXT ctx;
+
+  spec->init (&ctx, 0);
+
+  if (iovcnt < 2)
+    ; /* No customization, do same as SHAKE does.  */
+  else
+    {
+      if (iov[0].len != 0 || iov[1].len != 0)
+        {
+          const void *n = (unsigned char *)iov[0].data + iov[0].off;
+          size_t n_len = iov[0].len;
+          const void *s = (unsigned char *)iov[1].data + iov[1].off;
+          size_t s_len = iov[1].len;
+          size_t len;
+
+          len = cshake_input_n (&ctx, n, n_len);
+          cshake_input_s (&ctx, s, s_len, len);
+          ctx.suffix = CSHAKE_DELIMITED_SUFFIX;
+        }
+      iovcnt -= 2;
+      iov += 2;
+    }
+
+  for (;iovcnt > 0; iov++, iovcnt--)
+    keccak_write (&ctx, (const char*)iov[0].data + iov[0].off, iov[0].len);
+  keccak_final (&ctx);
+  do_keccak_extract (&ctx, outbuf, nbytes);
+}
+
+static void
+_gcry_cshake128_hash_buffers (void *outbuf, size_t nbytes,
+                              const gcry_buffer_t *iov, int iovcnt)
+{
+  const gcry_md_spec_t *spec = &_gcry_digest_spec_shake128;
+
+  cshake_hash_buffers (spec, outbuf, nbytes, iov, iovcnt);
+}
+
+static void
+_gcry_cshake256_hash_buffers (void *outbuf, size_t nbytes,
+                              const gcry_buffer_t *iov, int iovcnt)
+{
+  const gcry_md_spec_t *spec = &_gcry_digest_spec_shake256;
+
+  cshake_hash_buffers (spec, outbuf, nbytes, iov, iovcnt);
+}
 \f
 /*
      Self-test section.
@@ -1369,6 +1649,7 @@ selftests_keccak (int algo, int extended, selftest_report_func_t report)
       hash_len = 64;
       break;
 
+    case GCRY_MD_CSHAKE128:
     case GCRY_MD_SHAKE128:
       short_hash =
        "\x58\x81\x09\x2d\xd8\x18\xbf\x5c\xf8\xa3\xdd\xb7\x93\xfb\xcb\xa7"
@@ -1382,6 +1663,7 @@ selftests_keccak (int algo, int extended, selftest_report_func_t report)
       hash_len = 32;
       break;
 
+    case GCRY_MD_CSHAKE256:
     case GCRY_MD_SHAKE256:
       short_hash =
        "\x48\x33\x66\x60\x13\x60\xa8\x77\x1c\x68\x63\x08\x0c\xc4\x11\x4d"
@@ -1441,7 +1723,9 @@ run_selftests (int algo, int extended, selftest_report_func_t report)
     case GCRY_MD_SHA3_256:
     case GCRY_MD_SHA3_384:
     case GCRY_MD_SHA3_512:
+    case GCRY_MD_CSHAKE128:
     case GCRY_MD_SHAKE128:
+    case GCRY_MD_CSHAKE256:
     case GCRY_MD_SHAKE256:
       ec = selftests_keccak (algo, extended, report);
       break;
@@ -1456,52 +1740,91 @@ run_selftests (int algo, int extended, selftest_report_func_t report)
 
 
 \f
-static const byte sha3_224_asn[] = { 0x30 };
+/* Object IDs obtained from
+ * https://csrc.nist.gov/projects/computer-security-objects-register/algorithm-registration#Hash
+ */
+static const byte sha3_224_asn[] =
+  { 0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+    0x01, 0x65, 0x03, 0x04, 0x02, 0x07, 0x05, 0x00, 0x04,
+    0x1c
+  };
 static const gcry_md_oid_spec_t oid_spec_sha3_224[] =
   {
     { "2.16.840.1.101.3.4.2.7" },
-    /* PKCS#1 sha3_224WithRSAEncryption */
-    { "?" },
+    /* id-rsassa-pkcs1-v1-5-with-sha3-224 */
+    { "2.16.840.1.101.3.4.3.13" },
+    /* id-ecdsa-with-sha3-224 */
+    { "2.16.840.1.101.3.4.3.9" },
     { NULL }
   };
-static const byte sha3_256_asn[] = { 0x30 };
+static const byte sha3_256_asn[] =
+  { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+    0x01, 0x65, 0x03, 0x04, 0x02, 0x08, 0x05, 0x00, 0x04,
+    0x20
+  };
 static const gcry_md_oid_spec_t oid_spec_sha3_256[] =
   {
     { "2.16.840.1.101.3.4.2.8" },
-    /* PKCS#1 sha3_256WithRSAEncryption */
-    { "?" },
+    /* id-rsassa-pkcs1-v1-5-with-sha3-256 */
+    { "2.16.840.1.101.3.4.3.14" },
+    /* id-ecdsa-with-sha3-256 */
+    { "2.16.840.1.101.3.4.3.10" },
     { NULL }
   };
-static const byte sha3_384_asn[] = { 0x30 };
+static const byte sha3_384_asn[] =
+  { 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+    0x01, 0x65, 0x03, 0x04, 0x02, 0x09, 0x05, 0x00, 0x04,
+    0x30
+  };
 static const gcry_md_oid_spec_t oid_spec_sha3_384[] =
   {
     { "2.16.840.1.101.3.4.2.9" },
-    /* PKCS#1 sha3_384WithRSAEncryption */
-    { "?" },
+    /* id-rsassa-pkcs1-v1-5-with-sha3-384 */
+    { "2.16.840.1.101.3.4.3.15" },
+    /* id-ecdsa-with-sha3-384 */
+    { "2.16.840.1.101.3.4.3.11" },
     { NULL }
   };
-static const byte sha3_512_asn[] = { 0x30 };
+static const byte sha3_512_asn[] =
+  { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+    0x01, 0x65, 0x03, 0x04, 0x02, 0x0a, 0x05, 0x00, 0x04,
+    0x40
+  };
 static const gcry_md_oid_spec_t oid_spec_sha3_512[] =
   {
     { "2.16.840.1.101.3.4.2.10" },
-    /* PKCS#1 sha3_512WithRSAEncryption */
-    { "?" },
+    /* id-rsassa-pkcs1-v1-5-with-sha3-512 */
+    { "2.16.840.1.101.3.4.3.16" },
+    /* id-ecdsa-with-sha3-512 */
+    { "2.16.840.1.101.3.4.3.12" },
     { NULL }
   };
-static const byte shake128_asn[] = { 0x30 };
+static const byte shake128_asn[] =
+  { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+    0x01, 0x65, 0x03, 0x04, 0x02, 0x0b, 0x05, 0x00, 0x04,
+    0x20
+  };
 static const gcry_md_oid_spec_t oid_spec_shake128[] =
   {
     { "2.16.840.1.101.3.4.2.11" },
-    /* PKCS#1 shake128WithRSAEncryption */
-    { "?" },
+    /* RFC 8692 id-RSASSA-PSS-SHAKE128 */
+    { "1.3.6.1.5.5.7.6.30" },
+    /* RFC 8692 id-ecdsa-with-shake128 */
+    { "1.3.6.1.5.5.7.6.32" },
     { NULL }
   };
-static const byte shake256_asn[] = { 0x30 };
+static const byte shake256_asn[] =
+  { 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+    0x01, 0x65, 0x03, 0x04, 0x02, 0x0c, 0x05, 0x00, 0x04,
+    0x40
+  };
 static const gcry_md_oid_spec_t oid_spec_shake256[] =
   {
     { "2.16.840.1.101.3.4.2.12" },
-    /* PKCS#1 shake256WithRSAEncryption */
-    { "?" },
+    /* RFC 8692 id-RSASSA-PSS-SHAKE256 */
+    { "1.3.6.1.5.5.7.6.31" },
+    /* RFC 8692 id-ecdsa-with-shake256 */
+    { "1.3.6.1.5.5.7.6.33" },
     { NULL }
   };
 
@@ -1544,8 +1867,9 @@ const gcry_md_spec_t _gcry_digest_spec_sha3_512 =
 const gcry_md_spec_t _gcry_digest_spec_shake128 =
   {
     GCRY_MD_SHAKE128, {0, 1},
-    "SHAKE128", shake128_asn, DIM (shake128_asn), oid_spec_shake128, 0,
-    shake128_init, keccak_write, keccak_final, NULL, keccak_extract,
+    "SHAKE128", shake128_asn, DIM (shake128_asn), oid_spec_shake128, 32,
+    shake128_init, keccak_write, keccak_final, keccak_shake_read,
+    keccak_extract,
     _gcry_shake128_hash_buffers,
     sizeof (KECCAK_CONTEXT),
     run_selftests
@@ -1553,9 +1877,28 @@ const gcry_md_spec_t _gcry_digest_spec_shake128 =
 const gcry_md_spec_t _gcry_digest_spec_shake256 =
   {
     GCRY_MD_SHAKE256, {0, 1},
-    "SHAKE256", shake256_asn, DIM (shake256_asn), oid_spec_shake256, 0,
-    shake256_init, keccak_write, keccak_final, NULL, keccak_extract,
+    "SHAKE256", shake256_asn, DIM (shake256_asn), oid_spec_shake256, 64,
+    shake256_init, keccak_write, keccak_final, keccak_shake_read,
+    keccak_extract,
     _gcry_shake256_hash_buffers,
     sizeof (KECCAK_CONTEXT),
     run_selftests
   };
+const gcry_md_spec_t _gcry_digest_spec_cshake128 =
+  {
+    GCRY_MD_CSHAKE128, {0, 1},
+    "CSHAKE128", NULL, 0, NULL, 32,
+    cshake128_init, keccak_write, keccak_final, keccak_shake_read,
+    keccak_extract, _gcry_cshake128_hash_buffers,
+    sizeof (KECCAK_CONTEXT),
+    run_selftests
+  };
+const gcry_md_spec_t _gcry_digest_spec_cshake256 =
+  {
+    GCRY_MD_CSHAKE256, {0, 1},
+    "CSHAKE256", NULL, 0, NULL, 64,
+    cshake256_init, keccak_write, keccak_final, keccak_shake_read,
+    keccak_extract, _gcry_cshake256_hash_buffers,
+    sizeof (KECCAK_CONTEXT),
+    run_selftests
+  };
index 1ce42a42fc75696e3f15fd2adc9ad716811111a1..dbeaed2cba0c784117553c3861ccd5e9f214493e 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 45ef462f2316a60f6cce5f5e009511c8959360b1..fbf5af2eec7f324dc5bfcd96c46365a4729be9f7 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
diff --git a/cipher/kem-ecc.c b/cipher/kem-ecc.c
new file mode 100644 (file)
index 0000000..5525312
--- /dev/null
@@ -0,0 +1,332 @@
+/* kem-ecc.c - Key Encapsulation Mechanism with ECC
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+
+#include "kem-ecc.h"
+
+#define ECC_PUBKEY_LEN_MAX 133
+#define ECC_SECKEY_LEN_MAX 66
+
+static const char *
+algo_to_curve (int algo)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_RAW_X25519:
+    case GCRY_KEM_DHKEM25519:
+      return "Curve25519";
+
+    case GCRY_KEM_RAW_X448:
+    case GCRY_KEM_DHKEM448:
+      return "X448";
+
+    case GCRY_KEM_RAW_BP256:
+      return "brainpoolP256r1";
+
+    case GCRY_KEM_RAW_BP384:
+      return "brainpoolP384r1";
+
+    case GCRY_KEM_RAW_BP512:
+      return "brainpoolP512r1";
+
+    case GCRY_KEM_RAW_P256R1:
+      return "NIST P-256";
+
+    case GCRY_KEM_RAW_P384R1:
+      return "NIST P-384";
+
+    case GCRY_KEM_RAW_P521R1:
+      return "NIST P-521";
+
+    default:
+      return 0;
+    }
+}
+
+
+static int
+algo_to_seckey_len (int algo)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_RAW_X25519:
+    case GCRY_KEM_DHKEM25519:
+      return 32;
+
+    case GCRY_KEM_RAW_X448:
+    case GCRY_KEM_DHKEM448:
+      return 56;
+
+    case GCRY_KEM_RAW_BP256:
+      return 32;
+
+    case GCRY_KEM_RAW_BP384:
+      return 48;
+
+    case GCRY_KEM_RAW_BP512:
+      return 64;
+
+    case GCRY_KEM_RAW_P256R1:
+      return 32;
+
+    case GCRY_KEM_RAW_P384R1:
+      return 48;
+
+    case GCRY_KEM_RAW_P521R1:
+      return 66;
+
+    default:
+      return 0;
+    }
+}
+
+
+static gpg_err_code_t
+ecc_mul_point (int algo, unsigned char *result, size_t result_len,
+               const unsigned char *scalar, size_t scalar_len,
+               const unsigned char *point, size_t point_len)
+{
+  const char *curve = algo_to_curve (algo);
+
+  return _gcry_ecc_curve_mul_point (curve, result, result_len,
+                                    scalar, scalar_len, point, point_len);
+}
+
+
+gpg_err_code_t
+_gcry_ecc_raw_keypair (int algo, void *pubkey, size_t pubkey_len,
+                       void *seckey, size_t seckey_len)
+{
+  const char *curve = algo_to_curve (algo);
+
+  return _gcry_ecc_curve_keypair (curve,
+                                  pubkey, pubkey_len, seckey, seckey_len);
+}
+
+gpg_err_code_t
+_gcry_ecc_raw_encap (int algo, const void *pubkey, size_t pubkey_len,
+                     void *ciphertext, size_t ciphertext_len,
+                     void *shared, size_t shared_len)
+{
+  gpg_err_code_t err;
+  unsigned char seckey_ephemeral[ECC_SECKEY_LEN_MAX];
+  void *pubkey_ephemeral = ciphertext;
+  size_t seckey_len;
+
+  if (ciphertext_len != pubkey_len)
+    return GPG_ERR_INV_VALUE;
+
+  seckey_len = algo_to_seckey_len (algo);
+  err = _gcry_ecc_raw_keypair (algo, pubkey_ephemeral, pubkey_len,
+                               seckey_ephemeral, seckey_len);
+  if (err)
+    return err;
+
+  /* Do ECDH.  */
+  return ecc_mul_point (algo, shared, shared_len, seckey_ephemeral, seckey_len,
+                        pubkey, pubkey_len);
+}
+
+gpg_err_code_t
+_gcry_ecc_raw_decap (int algo, const void *seckey, size_t seckey_len,
+                     const void *ciphertext, size_t ciphertext_len,
+                     void *shared, size_t shared_len)
+{
+  /* Do ECDH.  */
+  return ecc_mul_point (algo, shared, shared_len, seckey, seckey_len,
+                        ciphertext, ciphertext_len);
+}
+
+
+enum
+  {
+    DHKEM_X25519_HKDF_SHA256 = 0x20, /* Defined in RFC 9180.  */
+    DHKEM_X448_HKDF_SHA512   = 0x21
+  };
+
+static gpg_err_code_t
+ecc_dhkem_kdf (int kem_algo, size_t ecc_len,
+               const unsigned char *ecdh, const unsigned char *ciphertext,
+               const unsigned char *pubkey, void *shared)
+{
+  gpg_err_code_t err;
+  unsigned char *p;
+  unsigned char labeled_ikm[7+5+7+ECC_PUBKEY_LEN_MAX];
+  int labeled_ikm_size;
+  unsigned char labeled_info[2+7+5+13+2*ECC_PUBKEY_LEN_MAX];
+  int labeled_info_size;
+  gcry_kdf_hd_t hd;
+  unsigned long param[1];
+  int macalgo;
+  int mac_len;
+
+  if (kem_algo == DHKEM_X25519_HKDF_SHA256)
+    macalgo = GCRY_MAC_HMAC_SHA256;
+  else if (kem_algo == DHKEM_X448_HKDF_SHA512)
+    macalgo = GCRY_MAC_HMAC_SHA512;
+  else
+    return GPG_ERR_UNKNOWN_ALGORITHM;
+
+  mac_len = _gcry_mac_get_algo_maclen (macalgo);
+  param[0] = mac_len;
+  labeled_ikm_size = 7+5+7+ecc_len;
+  labeled_info_size = 2+7+5+13+ecc_len*2;
+
+  p = labeled_ikm;
+  memcpy (p, "HPKE-v1", 7);
+  p += 7;
+  memcpy (p, "KEM", 3);
+  p[3] = 0;
+  p[4] = kem_algo;
+  p += 5;
+  memcpy (p, "eae_prk", 7);
+  p += 7;
+  memcpy (p, ecdh, ecc_len);
+
+  p = labeled_info;
+  /* length */
+  p[0] = 0;
+  p[1] = mac_len;
+  p += 2;
+  memcpy (p, "HPKE-v1", 7);
+  p += 7;
+  memcpy (p, "KEM", 3);
+  p[3] = 0;
+  p[4] = kem_algo;
+  p += 5;
+  memcpy (p, "shared_secret", 13);
+  p += 13;
+  /* kem_context */
+  memcpy (p, ciphertext, ecc_len);
+  p += ecc_len;
+  memcpy (p, pubkey, ecc_len);
+  p += ecc_len;
+
+  err = _gcry_kdf_open (&hd, GCRY_KDF_HKDF, macalgo, param, 1,
+                        labeled_ikm, labeled_ikm_size,
+                        NULL, 0, NULL, 0, labeled_info, labeled_info_size);
+  if (err)
+    return err;
+
+  err = _gcry_kdf_compute (hd, NULL);
+  if (!err)
+    err = _gcry_kdf_final (hd, mac_len, shared);
+  _gcry_kdf_close (hd);
+  return err;
+}
+
+
+gpg_err_code_t
+_gcry_ecc_dhkem_encap (int algo, const void *pubkey, void *ciphertext,
+                       void *shared)
+{
+  gpg_err_code_t err;
+  unsigned char ecdh[ECC_PUBKEY_LEN_MAX];
+  unsigned char seckey_ephemeral[ECC_SECKEY_LEN_MAX];
+  void *pubkey_ephemeral = ciphertext;
+  int curveid;
+  int kem_algo;
+  size_t ecc_len;
+
+  if (algo == GCRY_KEM_DHKEM25519)
+    {
+      curveid = GCRY_ECC_CURVE25519;
+      kem_algo = DHKEM_X25519_HKDF_SHA256;
+    }
+  else if (algo == GCRY_KEM_DHKEM448)
+    {
+      curveid = GCRY_ECC_CURVE448;
+      kem_algo = DHKEM_X448_HKDF_SHA512;
+    }
+  else
+    return GPG_ERR_UNKNOWN_ALGORITHM;
+
+  ecc_len = _gcry_ecc_get_algo_keylen (curveid);
+
+  err = _gcry_ecc_raw_keypair (algo, pubkey_ephemeral, ecc_len,
+                               seckey_ephemeral, ecc_len);
+  if (err)
+    return err;
+
+  /* Do ECDH.  */
+  err = ecc_mul_point (algo, ecdh, ecc_len, seckey_ephemeral, ecc_len,
+                       pubkey, ecc_len);
+  if (err)
+    return err;
+
+  return ecc_dhkem_kdf (kem_algo, ecc_len, ecdh, ciphertext, pubkey, shared);
+}
+
+gpg_err_code_t
+_gcry_ecc_dhkem_decap (int algo, const void *seckey, const void *ciphertext,
+                       void *shared, const void *optional)
+{
+  gpg_err_code_t err;
+  unsigned char ecdh[ECC_PUBKEY_LEN_MAX];
+  unsigned char pubkey_computed[ECC_PUBKEY_LEN_MAX];
+  const unsigned char *pubkey;
+  int curveid;
+  int kem_algo;
+  size_t ecc_len;
+
+  if (algo == GCRY_KEM_DHKEM25519)
+    {
+      curveid = GCRY_ECC_CURVE25519;
+      kem_algo = DHKEM_X25519_HKDF_SHA256;
+    }
+  else if (algo == GCRY_KEM_DHKEM448)
+    {
+      curveid = GCRY_ECC_CURVE448;
+      kem_algo = DHKEM_X448_HKDF_SHA512;
+    }
+  else
+    return GPG_ERR_UNKNOWN_ALGORITHM;
+
+  ecc_len = _gcry_ecc_get_algo_keylen (curveid);
+
+  if (optional)
+    pubkey = optional;
+  else
+    {
+      err = ecc_mul_point (algo, pubkey_computed, ecc_len, seckey, ecc_len,
+                           NULL, ecc_len);
+      if (err)
+        return err;
+
+      pubkey = pubkey_computed;
+    }
+
+  /* Do ECDH.  */
+  err = ecc_mul_point (algo, ecdh, ecc_len, seckey, ecc_len,
+                       ciphertext, ecc_len);
+  if (err)
+    return err;
+
+  return ecc_dhkem_kdf (kem_algo, ecc_len, ecdh, ciphertext, pubkey, shared);
+}
diff --git a/cipher/kem-ecc.h b/cipher/kem-ecc.h
new file mode 100644 (file)
index 0000000..b320a71
--- /dev/null
@@ -0,0 +1,40 @@
+/* kem-ecc.h - Key Encapsulation Mechanism with ECC
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ */
+
+gpg_err_code_t _gcry_ecc_raw_keypair (int algo,
+                                      void *pubkey, size_t pubkey_len,
+                                      void *seckey, size_t seckey_len);
+gpg_err_code_t _gcry_ecc_raw_encap (int algo,
+                                    const void *pubkey, size_t pubkey_len,
+                                    void *ciphertext, size_t ciphertext_len,
+                                    void *shared, size_t shared_len);
+gpg_err_code_t _gcry_ecc_raw_decap (int algo,
+                                    const void *seckey, size_t seckey_len,
+                                    const void *ciphertext,
+                                    size_t ciphertext_len,
+                                    void *shared, size_t shared_len);
+
+gpg_err_code_t _gcry_ecc_dhkem_encap (int algo, const void *pubkey,
+                                      void *ciphertext,
+                                      void *shared);
+gpg_err_code_t _gcry_ecc_dhkem_decap (int algo, const void *seckey,
+                                      const void *ciphertext,
+                                      void *shared, const void *optional);
diff --git a/cipher/kem.c b/cipher/kem.c
new file mode 100644 (file)
index 0000000..45a818b
--- /dev/null
@@ -0,0 +1,435 @@
+/* kem.c  - Key Encapsulation Mechanisms
+ * Copyright (C) 2023 Simon Josefsson <simon@josefsson.org>
+ * Copyright (C) 2023 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "sntrup761.h"
+#include "mceliece6688128f.h"
+#include "kyber.h"
+#include "kem-ecc.h"
+
+
+/* Information about the KEM algorithms for use by the s-expression
+ * interface.  */
+static const struct
+{
+  const char *name;           /* Name of the algo.  */
+  unsigned int namelen;       /* Only here to avoid strlen calls.  */
+  int algo;                   /* KEM algo number.   */
+  unsigned int nbits;         /* Number of bits.    */
+  unsigned int fips:1;        /* True if this is a FIPS140-3 approved KEM. */
+  int pubkey_len;             /* Length of the public key.  */
+  int seckey_len;             /* Length of the secret key.  */
+} kem_infos[] =
+  {
+    { "sntrup761", 9, GCRY_KEM_SNTRUP761,  761, 0,
+      GCRY_KEM_SNTRUP761_PUBKEY_LEN, GCRY_KEM_SNTRUP761_SECKEY_LEN },
+    { "kyber512",  8, GCRY_KEM_MLKEM512,   512, 0,
+      GCRY_KEM_MLKEM512_PUBKEY_LEN,  GCRY_KEM_MLKEM512_SECKEY_LEN },
+    { "kyber768",  8, GCRY_KEM_MLKEM768,   768, 1,
+      GCRY_KEM_MLKEM768_PUBKEY_LEN,  GCRY_KEM_MLKEM768_SECKEY_LEN },
+    { "kyber1024", 9, GCRY_KEM_MLKEM1024, 1024, 1,
+      GCRY_KEM_MLKEM1024_PUBKEY_LEN, GCRY_KEM_MLKEM1024_SECKEY_LEN },
+    { NULL }  /* Sentinel: loops over this table stop at a NULL name.  */
+  };
+
+/* This is a short version of kem_infos from above.  It is required
+ * for the algorithm module interface.  Keep in sync.  */
+static const char *kem_names[] =
+  {
+    "sntrup761",
+    "kyber512",
+    "kyber768",
+    "kyber1024",
+    NULL        /* Sentinel.  */
+  };
+
+
+
+\f
+/* Helper for sntrup761: RNG callback with the signature expected by
+ * the sntrup761 code.  CTX is unused; LENGTH strong random bytes are
+ * written to DST.  */
+static void
+sntrup761_random (void *ctx, size_t length, uint8_t *dst)
+{
+  (void)ctx;
+
+  _gcry_randomize (dst, length, GCRY_STRONG_RANDOM);
+}
+
+
+/* Generate a keypair for KEM algorithm ALGO.  The public key is
+ * stored at PUBKEY and the secret key at SECKEY; the *_LEN arguments
+ * are validated against the algorithm's fixed sizes where such checks
+ * are implemented.  Returns 0 on success or an error code.  */
+gcry_err_code_t
+_gcry_kem_keypair (int algo,
+                   void *pubkey, size_t pubkey_len,
+                   void *seckey, size_t seckey_len)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_SNTRUP761:
+      if (seckey_len != GCRY_KEM_SNTRUP761_SECKEY_LEN
+          || pubkey_len != GCRY_KEM_SNTRUP761_PUBKEY_LEN)
+        return GPG_ERR_INV_ARG;
+      sntrup761_keypair (pubkey, seckey, NULL, sntrup761_random);
+      return 0;
+
+    case GCRY_KEM_CM6688128F:
+      /* NOTE(review): no length checks here -- the caller must supply
+       * correctly sized buffers for Classic McEliece.  */
+      mceliece6688128f_keypair (pubkey, seckey);
+      return 0;
+
+    case GCRY_KEM_MLKEM512:
+      if (seckey_len != GCRY_KEM_MLKEM512_SECKEY_LEN
+          || pubkey_len != GCRY_KEM_MLKEM512_PUBKEY_LEN)
+        return GPG_ERR_INV_ARG;
+      kyber_keypair (algo, pubkey, seckey);
+      return 0;
+
+    case GCRY_KEM_MLKEM768:
+      if (seckey_len != GCRY_KEM_MLKEM768_SECKEY_LEN
+          || pubkey_len != GCRY_KEM_MLKEM768_PUBKEY_LEN)
+        return GPG_ERR_INV_ARG;
+      kyber_keypair (algo, pubkey, seckey);
+      return 0;
+
+    case GCRY_KEM_MLKEM1024:
+      if (seckey_len != GCRY_KEM_MLKEM1024_SECKEY_LEN
+          || pubkey_len != GCRY_KEM_MLKEM1024_PUBKEY_LEN)
+        return GPG_ERR_INV_ARG;
+      kyber_keypair (algo, pubkey, seckey);
+      return 0;
+
+    /* All ECC based KEMs share one implementation which validates the
+     * lengths itself.  */
+    case GCRY_KEM_RAW_X25519:
+    case GCRY_KEM_RAW_X448:
+    case GCRY_KEM_RAW_BP256:
+    case GCRY_KEM_RAW_BP384:
+    case GCRY_KEM_RAW_BP512:
+    case GCRY_KEM_RAW_P256R1:
+    case GCRY_KEM_RAW_P384R1:
+    case GCRY_KEM_RAW_P521R1:
+    case GCRY_KEM_DHKEM25519:
+    case GCRY_KEM_DHKEM448:
+      return _gcry_ecc_raw_keypair (algo, pubkey, pubkey_len,
+                                    seckey, seckey_len);
+
+    default:
+      return GPG_ERR_UNKNOWN_ALGORITHM;
+    }
+
+  return GPG_ERR_UNKNOWN_ALGORITHM;
+}
+
+
+/* Encapsulate: using public key PUBKEY of algorithm ALGO, produce a
+ * CIPHERTEXT and the SHARED secret.  OPTIONAL data is currently not
+ * supported by any of these algorithms and must be NULL.  */
+gcry_err_code_t
+_gcry_kem_encap (int algo,
+                 const void *pubkey, size_t pubkey_len,
+                 void *ciphertext, size_t ciphertext_len,
+                 void *shared, size_t shared_len,
+                 const void *optional, size_t optional_len)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_SNTRUP761:
+      if (optional != NULL || optional_len != 0)
+        return GPG_ERR_INV_VALUE;
+      if (pubkey_len != GCRY_KEM_SNTRUP761_PUBKEY_LEN
+          || ciphertext_len != GCRY_KEM_SNTRUP761_ENCAPS_LEN
+          || shared_len != GCRY_KEM_SNTRUP761_SHARED_LEN)
+        return GPG_ERR_INV_VALUE;
+      sntrup761_enc (ciphertext, shared, pubkey, NULL, sntrup761_random);
+      return 0;
+
+    case GCRY_KEM_CM6688128F:
+      /* NOTE(review): only the OPTIONAL pointer is checked here, not
+       * OPTIONAL_LEN or the buffer lengths -- confirm intentional.  */
+      if (optional != NULL)
+       return GPG_ERR_INV_VALUE;
+      mceliece6688128f_enc (ciphertext, shared, pubkey);
+      return 0;
+
+    case GCRY_KEM_MLKEM512:
+    case GCRY_KEM_MLKEM768:
+    case GCRY_KEM_MLKEM1024:
+      if (optional != NULL)
+        return GPG_ERR_INV_VALUE;
+      kyber_encap (algo, ciphertext, shared, pubkey);
+      return 0;
+
+    case GCRY_KEM_RAW_X25519:
+    case GCRY_KEM_RAW_X448:
+    case GCRY_KEM_RAW_BP256:
+    case GCRY_KEM_RAW_BP384:
+    case GCRY_KEM_RAW_BP512:
+    case GCRY_KEM_RAW_P256R1:
+    case GCRY_KEM_RAW_P384R1:
+    case GCRY_KEM_RAW_P521R1:
+      if (optional != NULL)
+        return GPG_ERR_INV_VALUE;
+      return _gcry_ecc_raw_encap (algo, pubkey, pubkey_len,
+                                  ciphertext, ciphertext_len,
+                                  shared, shared_len);
+
+    case GCRY_KEM_DHKEM25519:
+    case GCRY_KEM_DHKEM448:
+      if (optional != NULL)
+        return GPG_ERR_INV_VALUE;
+      /* DHKEM lengths are fixed by ALGO; the *_len arguments are not
+       * forwarded.  */
+      return _gcry_ecc_dhkem_encap (algo, pubkey, ciphertext, shared);
+
+    default:
+      return GPG_ERR_UNKNOWN_ALGORITHM;
+    }
+  return GPG_ERR_UNKNOWN_ALGORITHM;
+}
+
+
+/* Decapsulate: using secret key SECKEY of algorithm ALGO, recover the
+ * SHARED secret from CIPHERTEXT.  OPTIONAL data is only forwarded for
+ * the DHKEM variants; all other algorithms require it to be NULL.  */
+gcry_err_code_t
+_gcry_kem_decap (int algo,
+                 const void *seckey, size_t seckey_len,
+                 const void *ciphertext, size_t ciphertext_len,
+                 void *shared, size_t shared_len,
+                 const void *optional, size_t optional_len)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_SNTRUP761:
+      if (optional != NULL || optional_len != 0)
+        return GPG_ERR_INV_VALUE;
+      if (seckey_len != GCRY_KEM_SNTRUP761_SECKEY_LEN
+          || ciphertext_len != GCRY_KEM_SNTRUP761_ENCAPS_LEN
+          || shared_len != GCRY_KEM_SNTRUP761_SHARED_LEN)
+        return GPG_ERR_INV_VALUE;
+      sntrup761_dec (shared, ciphertext, seckey);
+      return 0;
+
+    case GCRY_KEM_CM6688128F:
+      /* NOTE(review): only the OPTIONAL pointer is checked, not the
+       * buffer lengths -- confirm intentional.  */
+      if (optional != NULL)
+       return GPG_ERR_INV_VALUE;
+      mceliece6688128f_dec (shared, ciphertext, seckey);
+      return 0;
+
+    case GCRY_KEM_MLKEM512:
+    case GCRY_KEM_MLKEM768:
+    case GCRY_KEM_MLKEM1024:
+      if (optional != NULL)
+        return GPG_ERR_INV_VALUE;
+      kyber_decap (algo, shared, ciphertext, seckey);
+      return 0;
+
+    case GCRY_KEM_RAW_X25519:
+    case GCRY_KEM_RAW_X448:
+    case GCRY_KEM_RAW_BP256:
+    case GCRY_KEM_RAW_BP384:
+    case GCRY_KEM_RAW_BP512:
+    case GCRY_KEM_RAW_P256R1:
+    case GCRY_KEM_RAW_P384R1:
+    case GCRY_KEM_RAW_P521R1:
+      if (optional != NULL)
+        return GPG_ERR_INV_VALUE;
+      return _gcry_ecc_raw_decap (algo, seckey, seckey_len,
+                                  ciphertext, ciphertext_len,
+                                  shared, shared_len);
+
+    case GCRY_KEM_DHKEM25519:
+    case GCRY_KEM_DHKEM448:
+      /* OPTIONAL is passed through here (unlike encap).  */
+      return _gcry_ecc_dhkem_decap (algo, seckey, ciphertext, shared,
+                                    optional);
+
+    default:
+      return GPG_ERR_UNKNOWN_ALGORITHM;
+    }
+  return GPG_ERR_UNKNOWN_ALGORITHM;
+}
+
+
+
+/* Generate a KEM keypair using the s-expression interface.  The
+ * GENPARMS argument is pretty simple in this case because it has only
+ * the algorithm name.  For example:
+ *   (kyber768)
+ */
+static gcry_err_code_t
+kem_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
+{
+  gpg_err_code_t ec;
+  const char *algo;
+  size_t algolen;
+  const char *name;
+  int i;
+  int algoid;
+  void *pubkey = NULL;
+  void *seckey = NULL;
+  size_t pubkey_len, seckey_len;
+
+  /* Look up the algorithm by the name given as the first element of
+   * GENPARMS.  */
+  algo = sexp_nth_data (genparms, 0, &algolen);
+  if (!algo || !algolen)
+    return GPG_ERR_PUBKEY_ALGO;
+  for (i=0; (name=kem_infos[i].name); i++)
+    if (kem_infos[i].namelen == algolen && !memcmp (name, algo, algolen))
+      break;
+  if (!name)
+    return GPG_ERR_WRONG_PUBKEY_ALGO;
+  algoid = kem_infos[i].algo;
+  pubkey_len = kem_infos[i].pubkey_len;
+  seckey_len = kem_infos[i].seckey_len;
+  /* (from here on we can jump to leave for cleanup)  */
+
+  /* Allocate buffers for the created key.  The secret key lives in
+   * secure memory and is wiped before release.  */
+  seckey = xtrycalloc_secure (1, seckey_len);
+  if (!seckey)
+    {
+      ec = gpg_err_code_from_syserror ();
+      goto leave;
+    }
+  pubkey = xtrycalloc (1, pubkey_len);
+  if (!pubkey)
+    {
+      ec = gpg_err_code_from_syserror ();
+      goto leave;
+    }
+
+  /* Generate key.  */
+  ec = _gcry_kem_keypair (algoid, pubkey, pubkey_len, seckey, seckey_len);
+  if (ec)
+    goto leave;
+
+  /* Put the key into an s-expression.  Note that the public part "p"
+   * is repeated inside the private-key part.  */
+  ec = sexp_build (r_skey, NULL,
+                   "(key-data"
+                   " (public-key"
+                   "  (%s(p%b)))"
+                   " (private-key"
+                   "  (%s(p%b)(s%b))))",
+                   name,
+                   (int)pubkey_len, pubkey,
+                   name,
+                   (int)pubkey_len, pubkey,
+                   (int)seckey_len, seckey);
+
+
+  /* FIXME: Add FIPS selftest.  */
+
+ leave:
+  if (seckey)
+    {
+      wipememory (seckey, seckey_len);
+      xfree (seckey);
+    }
+  xfree (pubkey);
+  return ec;
+}
+
+
+/* Compute a keygrip.  MD is the hash context which we are going to
+ * update.  KEYPARAM is an S-expression with the key parameters, this
+ * is usually a public key but may also be a secret key.  An example
+ * of such an S-expression is:
+ *
+ *     (kyber768
+ *       (p #4243...#)
+ *       (s #1718...#))
+ *
+ * What we hash is the algorithm name, \x00 and the value of p.
+ * Including the algorithm name makes the keygrip differ between
+ * algorithms even when they use the same parameters.  Whether this is
+ * a good decision is not clear - but it should not harm.
+ */
+static gpg_err_code_t
+kem_compute_keygrip (gcry_md_hd_t md, gcry_sexp_t keyparam)
+{
+  gcry_sexp_t l1;
+  const char *algo, *data;
+  size_t algolen, datalen;
+  const char *name;
+  int i;
+
+  /* Identify the algorithm from the first element of KEYPARAM.  */
+  algo = sexp_nth_data (keyparam, 0, &algolen);
+  if (!algo || !algolen)
+    return GPG_ERR_PUBKEY_ALGO;
+  for (i=0; (name=kem_infos[i].name); i++)
+    if (kem_infos[i].namelen == algolen && !memcmp (name, algo, algolen))
+      break;
+  if (!name)
+    return GPG_ERR_WRONG_PUBKEY_ALGO;
+
+  /* NAME is nul-terminated, thus ALGOLEN+1 includes the nul.  */
+  _gcry_md_write (md, name, algolen+1); /* (also hash the nul) */
+
+  l1 = sexp_find_token (keyparam, "p", 1);
+  if (!l1)
+    return GPG_ERR_NO_OBJ;
+
+  data = sexp_nth_data (l1, 1, &datalen);
+  if (!data)
+    {
+      sexp_release (l1);
+      return GPG_ERR_NO_OBJ;
+    }
+
+  _gcry_md_write (md, data, datalen);
+  sexp_release (l1);
+
+  return 0;
+}
+
+
+/* Return the number of bits for the key described by PARMS.  On error
+ * 0 is returned. */
+static unsigned int
+kem_get_nbits (gcry_sexp_t keyparam)
+{
+  const char *algo;
+  size_t algolen;
+  const char *name;
+  int i;
+
+  /* Same table lookup as kem_generate, but errors map to 0 because
+   * the nbits interface has no error return.  */
+  algo = sexp_nth_data (keyparam, 0, &algolen);
+  if (!algo || !algolen)
+    return 0;  /* GPG_ERR_PUBKEY_ALGO */
+  for (i=0; (name=kem_infos[i].name); i++)
+    if (kem_infos[i].namelen == algolen && !memcmp (name, algo, algolen))
+      break;
+  if (!name)
+    return 0;  /* GPG_ERR_WRONG_PUBKEY_ALGO */
+
+  return kem_infos[i].nbits;
+}
+
+
+/* Generic structure to represent some KEM algorithms in our public
+ * key system.  */
+gcry_pk_spec_t _gcry_pubkey_spec_kem =
+  {
+    GCRY_PK_KEM, { 0, 0 },
+    GCRY_PK_USAGE_ENCR,
+    "KEM", kem_names,
+    "p", "s", "k", "", "p",     /* Element names (pub/sec/etc.).  */
+    kem_generate,
+    NULL,  /* kem_check_secret_key */
+    NULL,  /* encrypt_raw - Use gcry_kem_encap instead.  */
+    NULL,  /* decrypt_raw - Use gcry_kem_decap instead.  */
+    NULL,  /* sign */
+    NULL,  /* verify */
+    kem_get_nbits,
+    NULL,  /* selftests */
+    kem_compute_keygrip,
+    NULL,  /* get_curve */
+    NULL   /* get_curve_param */
+  };
diff --git a/cipher/kyber-common.c b/cipher/kyber-common.c
new file mode 100644 (file)
index 0000000..5419030
--- /dev/null
@@ -0,0 +1,766 @@
+/* kyber-common.c - the Kyber key encapsulation mechanism (common part)
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file was modified for use by Libgcrypt.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * You can also use this file under the same licence of original code.
+ * SPDX-License-Identifier: CC0 OR Apache-2.0
+ *
+ */
+/*
+  Original code from:
+
+  Repository: https://github.com/pq-crystals/kyber.git
+  Branch: standard
+  Commit: 11d00ff1f20cfca1f72d819e5a45165c1e0a2816
+
+  Licence:
+  Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
+  or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
+
+  Authors:
+        Joppe Bos
+        Léo Ducas
+        Eike Kiltz
+        Tancrède Lepoint
+        Vadim Lyubashevsky
+        John Schanck
+        Peter Schwabe
+        Gregor Seiler
+        Damien Stehlé
+
+  Kyber Home: https://www.pq-crystals.org/kyber/
+ */
+/*
+ * From original code, following modification was made.
+ *
+ * - C++ style comments are changed to C-style.
+ *
+ * - Functions "poly_cbd_eta1" "poly_cbd_eta2" are removed.
+ *
+ * - Constant "zeta" is static, not available outside.
+ *
+ * - "poly_compress" and "poly_decompress" are now two variants _128
+ *   and _160.
+ *
+ * - "poly_getnoise_eta1" is now two variants _2 and _3_4.
+ *
+ * - "poly_getnoise_eta2" directly uses "cbd2" function.
+ */
+
+/*************** kyber/ref/cbd.c */
+
+/*************************************************
+* Name:        load32_littleendian
+*
+* Description: load 4 bytes into a 32-bit integer
+*              in little-endian order
+*
+* Arguments:   - const uint8_t *x: pointer to input byte array
+*
+* Returns 32-bit unsigned integer loaded from x
+**************************************************/
+static uint32_t load32_littleendian(const uint8_t x[4])
+{
+  uint32_t r;
+  /* Assemble the bytes LSB-first; independent of host endianness.  */
+  r  = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  r |= (uint32_t)x[3] << 24;
+  return r;
+}
+
+/*************************************************
+* Name:        load24_littleendian
+*
+* Description: load 3 bytes into a 32-bit integer
+*              in little-endian order.
+*              This function is only needed for Kyber-512
+*
+* Arguments:   - const uint8_t *x: pointer to input byte array
+*
+* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+**************************************************/
+#if !defined(KYBER_K) || KYBER_K == 2
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  uint32_t r;
+  /* Assemble 3 bytes LSB-first; the top byte of R stays zero.  */
+  r  = (uint32_t)x[0];
+  r |= (uint32_t)x[1] << 8;
+  r |= (uint32_t)x[2] << 16;
+  return r;
+}
+#endif
+
+
+/*************************************************
+* Name:        cbd2
+*
+* Description: Given an array of uniformly random bytes, compute
+*              polynomial with coefficients distributed according to
+*              a centered binomial distribution with parameter eta=2
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *buf: pointer to input byte array
+**************************************************/
+static void cbd2(poly *r, const uint8_t buf[2*KYBER_N/4])
+{
+  unsigned int i,j;
+  uint32_t t,d;
+  int16_t a,b;
+
+  for(i=0;i<KYBER_N/8;i++) {
+    t  = load32_littleendian(buf+4*i);
+    /* Each 2-bit field of D becomes the popcount (0..2) of a pair of
+     * random bits of T.  */
+    d  = t & 0x55555555;
+    d += (t>>1) & 0x55555555;
+
+    for(j=0;j<8;j++) {
+      /* Coefficient = difference of two such popcounts, i.e. CBD
+       * with eta=2, range [-2,2].  */
+      a = (d >> (4*j+0)) & 0x3;
+      b = (d >> (4*j+2)) & 0x3;
+      r->coeffs[8*i+j] = a - b;
+    }
+  }
+}
+
+/*************************************************
+* Name:        cbd3
+*
+* Description: Given an array of uniformly random bytes, compute
+*              polynomial with coefficients distributed according to
+*              a centered binomial distribution with parameter eta=3.
+*              This function is only needed for Kyber-512
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *buf: pointer to input byte array
+**************************************************/
+#if !defined(KYBER_K) || KYBER_K == 2
+static void cbd3(poly *r, const uint8_t buf[3*KYBER_N/4])
+{
+  unsigned int i,j;
+  uint32_t t,d;
+  int16_t a,b;
+
+  for(i=0;i<KYBER_N/4;i++) {
+    t  = load24_littleendian(buf+3*i);
+    /* 0x00249249 selects every third bit; each 3-bit field of D
+     * becomes the popcount (0..3) of three random bits of T.  */
+    d  = t & 0x00249249;
+    d += (t>>1) & 0x00249249;
+    d += (t>>2) & 0x00249249;
+
+    for(j=0;j<4;j++) {
+      /* Coefficient = difference of two popcounts: CBD with eta=3,
+       * range [-3,3].  */
+      a = (d >> (6*j+0)) & 0x7;
+      b = (d >> (6*j+3)) & 0x7;
+      r->coeffs[4*i+j] = a - b;
+    }
+  }
+}
+#endif
+
+/*************** kyber/ref/indcpa.c */
+/*************************************************
+* Name:        rej_uniform
+*
+* Description: Run rejection sampling on uniform random bytes to generate
+*              uniform random integers mod q
+*
+* Arguments:   - int16_t *r: pointer to output buffer
+*              - unsigned int len: requested number of 16-bit integers (uniform mod q)
+*              - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
+*              - unsigned int buflen: length of input buffer in bytes
+*
+* Returns number of sampled 16-bit integers (at most len)
+**************************************************/
+static unsigned int rej_uniform(int16_t *r,
+                                unsigned int len,
+                                const uint8_t *buf,
+                                unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint16_t val0, val1;
+
+  ctr = pos = 0;
+  /* Unpack two 12-bit candidates from every 3 input bytes and keep
+   * those below KYBER_Q (rejection sampling).  */
+  while(ctr < len && pos + 3 <= buflen) {
+    val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
+    val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
+    pos += 3;
+
+    if(val0 < KYBER_Q)
+      r[ctr++] = val0;
+    if(ctr < len && val1 < KYBER_Q)
+      r[ctr++] = val1;
+  }
+
+  return ctr;
+}
+
+/*************** kyber/ref/ntt.c */
+/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
+
+#define KYBER_ROOT_OF_UNITY 17
+
+static const uint8_t tree[128] = {
+  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
+  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
+  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
+  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
+  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
+  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
+  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
+  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
+};
+
+void init_ntt() {
+  unsigned int i;
+  int16_t tmp[128];
+
+  tmp[0] = MONT;
+  for(i=1;i<128;i++)
+    tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);
+
+  for(i=0;i<128;i++) {
+    zetas[i] = tmp[tree[i]];
+    if(zetas[i] > KYBER_Q/2)
+      zetas[i] -= KYBER_Q;
+    if(zetas[i] < -KYBER_Q/2)
+      zetas[i] += KYBER_Q;
+  }
+}
+*/
+
+/* Precomputed NTT twiddle factors in Montgomery form; generated by
+ * the init_ntt() code shown in the comment above.  */
+static const int16_t zetas[128] = {
+  -1044,  -758,  -359, -1517,  1493,  1422,   287,   202,
+   -171,   622,  1577,   182,   962, -1202, -1474,  1468,
+    573, -1325,   264,   383,  -829,  1458, -1602,  -130,
+   -681,  1017,   732,   608, -1542,   411,  -205, -1571,
+   1223,   652,  -552,  1015, -1293,  1491,  -282, -1544,
+    516,    -8,  -320,  -666, -1618, -1162,   126,  1469,
+   -853,   -90,  -271,   830,   107, -1421,  -247,  -951,
+   -398,   961, -1508,  -725,   448, -1065,   677, -1275,
+  -1103,   430,   555,   843, -1251,   871,  1550,   105,
+    422,   587,   177,  -235,  -291,  -460,  1574,  1653,
+   -246,   778,  1159,  -147,  -777,  1483,  -602,  1119,
+  -1590,   644,  -872,   349,   418,   329,  -156,   -75,
+    817,  1097,   603,   610,  1322, -1285, -1465,   384,
+  -1215,  -136,  1218, -1335,  -874,   220, -1187, -1659,
+  -1185, -1530, -1278,   794, -1510,  -854,  -870,   478,
+   -108,  -308,   996,   991,   958, -1460,  1522,  1628
+};
+
+/*************************************************
+* Name:        fqmul
+*
+* Description: Multiplication followed by Montgomery reduction
+*
+* Arguments:   - int16_t a: first factor
+*              - int16_t b: second factor
+*
+* Returns 16-bit integer congruent to a*b*R^{-1} mod q
+**************************************************/
+static int16_t fqmul(int16_t a, int16_t b) {
+  /* The 32-bit product cannot overflow: |a*b| < 2^15 * 2^15 = 2^30.  */
+  return montgomery_reduce((int32_t)a*b);
+}
+
+/*************************************************
+* Name:        ntt
+*
+* Description: Inplace number-theoretic transform (NTT) in Rq.
+*              input is in standard order, output is in bitreversed order
+*
+* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
+**************************************************/
+void ntt(int16_t r[256]) {
+  unsigned int len, start, j, k;
+  int16_t t, zeta;
+
+  k = 1;
+  /* Halve the butterfly distance each layer (7 layers: 128..2).  */
+  for(len = 128; len >= 2; len >>= 1) {
+    for(start = 0; start < 256; start = j + len) {
+      zeta = zetas[k++];
+      for(j = start; j < start + len; j++) {
+        /* Cooley-Tukey butterfly: (a, b) -> (a + zeta*b, a - zeta*b). */
+        t = fqmul(zeta, r[j + len]);
+        r[j + len] = r[j] - t;
+        r[j] = r[j] + t;
+      }
+    }
+  }
+}
+
+/*************************************************
+* Name:        invntt_tomont
+*
+* Description: Inplace inverse number-theoretic transform in Rq and
+*              multiplication by Montgomery factor 2^16.
+*              Input is in bitreversed order, output is in standard order
+*
+* Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
+**************************************************/
+void invntt(int16_t r[256]) {
+  unsigned int start, len, j, k;
+  int16_t t, zeta;
+  const int16_t f = 1441; /* mont^2/128 */
+
+  /* Walk the zetas table backwards, doubling the butterfly distance.  */
+  k = 127;
+  for(len = 2; len <= 128; len <<= 1) {
+    for(start = 0; start < 256; start = j + len) {
+      zeta = zetas[k--];
+      for(j = start; j < start + len; j++) {
+        /* Gentleman-Sande butterfly; barrett_reduce keeps the sum in
+         * range across layers.  */
+        t = r[j];
+        r[j] = barrett_reduce(t + r[j + len]);
+        r[j + len] = r[j + len] - t;
+        r[j + len] = fqmul(zeta, r[j + len]);
+      }
+    }
+  }
+
+  /* Final scaling by f = mont^2/128 folds in the 1/128 factor and the
+   * Montgomery factor.  */
+  for(j = 0; j < 256; j++)
+    r[j] = fqmul(r[j], f);
+}
+
+/*************************************************
+* Name:        basemul
+*
+* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
+*              used for multiplication of elements in Rq in NTT domain
+*
+* Arguments:   - int16_t r[2]: pointer to the output polynomial
+*              - const int16_t a[2]: pointer to the first factor
+*              - const int16_t b[2]: pointer to the second factor
+*              - int16_t zeta: integer defining the reduction polynomial
+**************************************************/
+void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
+{
+  /* (a0 + a1 X)(b0 + b1 X) mod (X^2 - zeta):
+   * r0 = a0*b0 + zeta*a1*b1,  r1 = a0*b1 + a1*b0.  */
+  r[0]  = fqmul(a[1], b[1]);
+  r[0]  = fqmul(r[0], zeta);
+  r[0] += fqmul(a[0], b[0]);
+  r[1]  = fqmul(a[0], b[1]);
+  r[1] += fqmul(a[1], b[0]);
+}
+/*************** kyber/ref/poly.c */
+
+/*************************************************
+* Name:        poly_compress
+*
+* Description: Compression and subsequent serialization of a polynomial
+*
+* Arguments:   - uint8_t *r: pointer to output byte array
+*                            (of length KYBER_POLYCOMPRESSEDBYTES)
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+#if !defined(KYBER_K) || KYBER_K == 2 || KYBER_K == 3
+void poly_compress_128(uint8_t r[KYBER_POLYCOMPRESSEDBYTES_2_3], const poly *a)
+{
+  unsigned int i,j;
+  int32_t u;
+  uint32_t d0;
+  uint8_t t[8];
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      /* map to positive standard representatives */
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      /* Division-free equivalent of the formula above:
+       * 80635 = round(2^28/KYBER_Q), 1665 ~ KYBER_Q/2.  */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+
+    /* Pack eight 4-bit values into four bytes.  */
+    r[0] = t[0] | (t[1] << 4);
+    r[1] = t[2] | (t[3] << 4);
+    r[2] = t[4] | (t[5] << 4);
+    r[3] = t[6] | (t[7] << 4);
+    r += 4;
+  }
+}
+#endif
+
+#if !defined(KYBER_K) || KYBER_K == 4
+void poly_compress_160(uint8_t r[KYBER_POLYCOMPRESSEDBYTES_4], const poly *a)
+{
+  unsigned int i,j;
+  int32_t u;
+  uint32_t d0;
+  uint8_t t[8];
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      /* map to positive standard representatives */
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*    t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      /* Division-free equivalent: 40318 = round(2^27/KYBER_Q).  */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+    /* Pack eight 5-bit values into five bytes.  */
+    r[0] = (t[0] >> 0) | (t[1] << 5);
+    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
+    r[2] = (t[3] >> 1) | (t[4] << 4);
+    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
+    r[4] = (t[6] >> 2) | (t[7] << 3);
+    r += 5;
+  }
+}
+#endif
+
+/*************************************************
+* Name:        poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+*              approximate inverse of poly_compress
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: pointer to input byte array
+*                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+#if !defined(KYBER_K) || KYBER_K == 2 || KYBER_K == 3
+void poly_decompress_128(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES_2_3])
+{
+  unsigned int i;
+  /* Each input byte holds two 4-bit values; scale back to mod q with
+   * rounding (+8 before the >>4).  */
+  for(i=0;i<KYBER_N/2;i++) {
+    r->coeffs[2*i+0] = (((uint16_t)(a[0] & 15)*KYBER_Q) + 8) >> 4;
+    r->coeffs[2*i+1] = (((uint16_t)(a[0] >> 4)*KYBER_Q) + 8) >> 4;
+    a += 1;
+  }
+}
+#endif
+
+#if !defined(KYBER_K) || KYBER_K == 4
+void poly_decompress_160(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES_4])
+{
+  unsigned int i;
+  unsigned int j;
+  uint8_t t[8];
+  for(i=0;i<KYBER_N/8;i++) {
+    /* Unpack eight 5-bit values from five bytes.  */
+    t[0] = (a[0] >> 0);
+    t[1] = (a[0] >> 5) | (a[1] << 3);
+    t[2] = (a[1] >> 2);
+    t[3] = (a[1] >> 7) | (a[2] << 1);
+    t[4] = (a[2] >> 4) | (a[3] << 4);
+    t[5] = (a[3] >> 1);
+    t[6] = (a[3] >> 6) | (a[4] << 2);
+    t[7] = (a[4] >> 3);
+    a += 5;
+
+    /* Scale back to mod q with rounding (+16 before the >>5).  */
+    for(j=0;j<8;j++)
+      r->coeffs[8*i+j] = ((uint32_t)(t[j] & 31)*KYBER_Q + 16) >> 5;
+  }
+}
+#endif
+
+/*************************************************
+* Name:        poly_tobytes
+*
+* Description: Serialization of a polynomial
+*
+* Arguments:   - uint8_t *r: pointer to output byte array
+*                            (needs space for KYBER_POLYBYTES bytes)
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
+{
+  unsigned int i;
+  uint16_t t0, t1;
+
+  for(i=0;i<KYBER_N/2;i++) {
+    /* map to positive standard representatives */
+    t0  = a->coeffs[2*i];
+    t0 += ((int16_t)t0 >> 15) & KYBER_Q;
+    t1 = a->coeffs[2*i+1];
+    t1 += ((int16_t)t1 >> 15) & KYBER_Q;
+    /* Pack two 12-bit coefficients into three bytes.  */
+    r[3*i+0] = (t0 >> 0);
+    r[3*i+1] = (t0 >> 8) | (t1 << 4);
+    r[3*i+2] = (t1 >> 4);
+  }
+}
+
+/*************************************************
+* Name:        poly_frombytes
+*
+* Description: De-serialization of a polynomial;
+*              inverse of poly_tobytes
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: pointer to input byte array
+*                                  (of KYBER_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
+{
+  unsigned int i;
+  /* Unpack two 12-bit coefficients from every three bytes.  */
+  for(i=0;i<KYBER_N/2;i++) {
+    r->coeffs[2*i]   = ((a[3*i+0] >> 0) | ((uint16_t)a[3*i+1] << 8)) & 0xFFF;
+    r->coeffs[2*i+1] = ((a[3*i+1] >> 4) | ((uint16_t)a[3*i+2] << 4)) & 0xFFF;
+  }
+}
+
+/*************************************************
+* Name:        poly_frommsg
+*
+* Description: Convert 32-byte message to polynomial
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *msg: pointer to input message
+**************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
+{
+  unsigned int i,j;
+  int16_t mask;
+
+#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+#endif
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      /* Branch-free: MASK is all-ones iff message bit j is set, so
+       * each coefficient becomes 0 or (q+1)/2.  */
+      mask = -(int16_t)((msg[i] >> j)&1);
+      r->coeffs[8*i+j] = mask & ((KYBER_Q+1)/2);
+    }
+  }
+}
+
+/*************************************************
+* Name:        poly_tomsg
+*
+* Description: Convert polynomial to 32-byte message
+*
+* Arguments:   - uint8_t *msg: pointer to output message
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a)
+{
+  unsigned int i,j;
+  uint32_t t;
+
+  for(i=0;i<KYBER_N/8;i++) {
+    msg[i] = 0;
+    for(j=0;j<8;j++) {
+      t  = a->coeffs[8*i+j];
+      /* t += ((int16_t)t >> 15) & KYBER_Q; */
+      /* t  = (((t << 1) + KYBER_Q/2)/KYBER_Q) & 1; */
+      /* Division-free round-to-bit, see the commented formula above;
+       * 80635 = round(2^28/KYBER_Q).  */
+      t <<= 1;
+      t += 1665;
+      t *= 80635;
+      t >>= 28;
+      t &= 1;
+      msg[i] |= t << j;
+    }
+  }
+}
+
+/*************************************************
+* Name:        poly_getnoise_eta1
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA1
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *seed: pointer to input seed
+*                                     (of length KYBER_SYMBYTES bytes)
+*              - uint8_t nonce: one-byte input nonce
+**************************************************/
+#if !defined(KYBER_K) || KYBER_K == 2
+/* Variant for KYBER_K == 2 (Kyber-512) where eta1 = 3.  */
+void poly_getnoise_eta1_2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
+{
+  uint8_t buf[KYBER_ETA1_2*KYBER_N/4];
+  prf(buf, sizeof(buf), seed, nonce);
+  cbd3(r, buf);
+}
+#endif
+
+#if !defined(KYBER_K) || KYBER_K == 3 || KYBER_K == 4
+/* Variant for KYBER_K == 3 or 4 where eta1 = 2.  */
+void poly_getnoise_eta1_3_4(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
+{
+  uint8_t buf[KYBER_ETA1_3_4*KYBER_N/4];
+  prf(buf, sizeof(buf), seed, nonce);
+  cbd2(r, buf);
+}
+#endif
+
+/*************************************************
+* Name:        poly_getnoise_eta2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA2
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *seed: pointer to input seed
+*                                     (of length KYBER_SYMBYTES bytes)
+*              - uint8_t nonce: one-byte input nonce
+**************************************************/
+void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
+{
+  /* KYBER_ETA2 noise always uses the eta = 2 sampler, for every KYBER_K. */
+  uint8_t buf[KYBER_ETA2*KYBER_N/4];
+  prf(buf, sizeof(buf), seed, nonce);
+  cbd2(r, buf);
+}
+
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments:   - uint16_t *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r)
+{
+  ntt(r->coeffs);
+  /* Barrett-reduce after the forward transform so coefficient magnitudes
+   * stay small for subsequent arithmetic. */
+  poly_reduce(r);
+}
+
+/*************************************************
+* Name:        poly_invntt_tomont
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
+*              of a polynomial in place;
+*              inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments:   - uint16_t *a: pointer to in/output polynomial
+**************************************************/
+void poly_invntt_tomont(poly *r)
+{
+  /* invntt() leaves the result scaled into the Montgomery domain
+   * (hence the _tomont suffix); no extra reduction is applied here. */
+  invntt(r->coeffs);
+}
+
+/*************************************************
+* Name:        poly_basemul_montgomery
+*
+* Description: Multiplication of two polynomials in NTT domain
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
+{
+  unsigned int i;
+  /* Each iteration multiplies two adjacent coefficient pairs; the second
+   * pair uses the negated twiddle factor -zetas[64+i]. */
+  for(i=0;i<KYBER_N/4;i++) {
+    basemul(&r->coeffs[4*i], &a->coeffs[4*i], &b->coeffs[4*i], zetas[64+i]);
+    basemul(&r->coeffs[4*i+2], &a->coeffs[4*i+2], &b->coeffs[4*i+2], -zetas[64+i]);
+  }
+}
+
+/*************************************************
+* Name:        poly_tomont
+*
+* Description: Inplace conversion of all coefficients of a polynomial
+*              from normal domain to Montgomery domain
+*
+* Arguments:   - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_tomont(poly *r)
+{
+  unsigned int i;
+  /* f = 2^32 mod q = R^2 mod q with R = 2^16, so
+   * montgomery_reduce(x * f) = x * R mod q (conversion into Montgomery domain). */
+  const int16_t f = (1ULL << 32) % KYBER_Q;
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = montgomery_reduce((int32_t)r->coeffs[i]*f);
+}
+
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial
+*              for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments:   - poly *r: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r)
+{
+  unsigned int i;
+  /* Coefficient-wise Barrett reduction; see barrett_reduce() below. */
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = barrett_reduce(r->coeffs[i]);
+}
+
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add two polynomials; no modular reduction is performed
+*
+* Arguments: - poly *r: pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b)
+{
+  unsigned int i;
+  /* Coefficient-wise sum; the caller is responsible for any needed reduction. */
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
+}
+
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract two polynomials; no modular reduction is performed
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b)
+{
+  unsigned int i;
+  /* Coefficient-wise difference; the caller is responsible for any needed reduction. */
+  for(i=0;i<KYBER_N;i++)
+    r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
+}
+
+/*************** kyber/ref/reduce.c */
+
+/*************************************************
+* Name:        montgomery_reduce
+*
+* Description: Montgomery reduction; given a 32-bit integer a, computes
+*              16-bit integer congruent to a * R^-1 mod q, where R=2^16
+*
+* Arguments:   - int32_t a: input integer to be reduced;
+*                           has to be in {-q2^15,...,q2^15-1}
+*
+* Returns:     integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
+**************************************************/
+int16_t montgomery_reduce(int32_t a)
+{
+  int16_t t;
+
+  /* t = a * q^(-1) mod 2^16 (only the signed low half is kept). */
+  t = (int16_t)a*QINV;
+  /* (a - t*q) is divisible by 2^16; the arithmetic shift yields a*R^(-1) mod q. */
+  t = (a - (int32_t)t*KYBER_Q) >> 16;
+  return t;
+}
+
+/*************************************************
+* Name:        barrett_reduce
+*
+* Description: Barrett reduction; given a 16-bit integer a, computes
+*              centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2}
+*
+* Arguments:   - int16_t a: input integer to be reduced
+*
+* Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
+**************************************************/
+int16_t barrett_reduce(int16_t a) {
+  int16_t t;
+  /* v = round(2^26 / q), precomputed Barrett constant. */
+  const int16_t v = ((1<<26) + KYBER_Q/2)/KYBER_Q;
+
+  /* t = round(a/q); subtracting t*q gives the centered remainder. */
+  t  = ((int32_t)v*a + (1<<25)) >> 26;
+  t *= KYBER_Q;
+  return a - t;
+}
diff --git a/cipher/kyber-kdep.c b/cipher/kyber-kdep.c
new file mode 100644 (file)
index 0000000..85a51c6
--- /dev/null
@@ -0,0 +1,825 @@
+/* kyber-kdep.c - the Kyber key encapsulation mechanism (KYBER_K dependent part)
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file was modified for use by Libgcrypt.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * You can also use this file under the same licence of original code.
+ * SPDX-License-Identifier: CC0 OR Apache-2.0
+ *
+ */
+/*
+  Original code from:
+
+  Repository: https://github.com/pq-crystals/kyber.git
+  Branch: standard
+  Commit: 11d00ff1f20cfca1f72d819e5a45165c1e0a2816
+
+  Licence:
+  Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
+  or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
+
+  Authors:
+        Joppe Bos
+        Léo Ducas
+        Eike Kiltz
+        Tancrède Lepoint
+        Vadim Lyubashevsky
+        John Schanck
+        Peter Schwabe
+        Gregor Seiler
+        Damien Stehlé
+
+  Kyber Home: https://www.pq-crystals.org/kyber/
+ */
+/*
+ * From original code, following modification was made.
+ *
+ * - C++ style comments are changed to C-style.
+ *
+ * - With the change of "verify" routine (now "verify1"), no negation
+ *   for the cmov argument in crypto_kem_dec.
+ *
+ * - Call to xof_init and xof_close are added in gen_matrix.
+ */
+
+/*************** kyber/ref/polyvec.h */
+/* A vector of KYBER_K polynomials (module element over the polynomial ring). */
+typedef struct{
+  poly vec[KYBER_K];
+} polyvec;
+
+static void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
+static void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
+
+static void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
+static void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
+
+static void polyvec_ntt(polyvec *r);
+static void polyvec_invntt_tomont(polyvec *r);
+
+static void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);
+
+static void polyvec_reduce(polyvec *r);
+
+static void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
+
+/*************** kyber/ref/indcpa.h */
+static void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);
+
+static void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
+                                  uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
+                                  const uint8_t coins[KYBER_SYMBYTES]);
+
+static void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
+                       const uint8_t m[KYBER_INDCPA_MSGBYTES],
+                       const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
+                       const uint8_t coins[KYBER_SYMBYTES]);
+
+static void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
+                       const uint8_t c[KYBER_INDCPA_BYTES],
+                       const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);
+
+/*************** kyber/ref/kem.h */
+
+static int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins);
+
+static int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss, const uint8_t *pk, const uint8_t *coins);
+
+/*************** kyber/ref/indcpa.c */
+
+/*************************************************
+* Name:        pack_pk
+*
+* Description: Serialize the public key as concatenation of the
+*              serialized vector of polynomials pk
+*              and the public seed used to generate the matrix A.
+*
+* Arguments:   uint8_t *r: pointer to the output serialized public key
+*              polyvec *pk: pointer to the input public-key polyvec
+*              const uint8_t *seed: pointer to the input public seed
+**************************************************/
+static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
+                    polyvec *pk,
+                    const uint8_t seed[KYBER_SYMBYTES])
+{
+  /* Layout: serialized polyvec (KYBER_POLYVECBYTES) || seed (KYBER_SYMBYTES). */
+  polyvec_tobytes(r, pk);
+  memcpy(r+KYBER_POLYVECBYTES, seed, KYBER_SYMBYTES);
+}
+
+/*************************************************
+* Name:        unpack_pk
+*
+* Description: De-serialize public key from a byte array;
+*              approximate inverse of pack_pk
+*
+* Arguments:   - polyvec *pk: pointer to output public-key polynomial vector
+*              - uint8_t *seed: pointer to output seed to generate matrix A
+*              - const uint8_t *packedpk: pointer to input serialized public key
+**************************************************/
+static void unpack_pk(polyvec *pk,
+                      uint8_t seed[KYBER_SYMBYTES],
+                      const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES])
+{
+  /* Inverse of the pack_pk layout: polyvec bytes followed by the matrix seed. */
+  polyvec_frombytes(pk, packedpk);
+  memcpy(seed, packedpk+KYBER_POLYVECBYTES, KYBER_SYMBYTES);
+}
+
+/*************************************************
+* Name:        pack_sk
+*
+* Description: Serialize the secret key
+*
+* Arguments:   - uint8_t *r: pointer to output serialized secret key
+*              - polyvec *sk: pointer to input vector of polynomials (secret key)
+**************************************************/
+static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk)
+{
+  /* The IND-CPA secret key is just the serialized secret polyvec. */
+  polyvec_tobytes(r, sk);
+}
+
+/*************************************************
+* Name:        unpack_sk
+*
+* Description: De-serialize the secret key; inverse of pack_sk
+*
+* Arguments:   - polyvec *sk: pointer to output vector of polynomials (secret key)
+*              - const uint8_t *packedsk: pointer to input serialized secret key
+**************************************************/
+static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES])
+{
+  polyvec_frombytes(sk, packedsk);
+}
+
+/*************************************************
+* Name:        pack_ciphertext
+*
+* Description: Serialize the ciphertext as concatenation of the
+*              compressed and serialized vector of polynomials b
+*              and the compressed and serialized polynomial v
+*
+* Arguments:   uint8_t *r: pointer to the output serialized ciphertext
+*              poly *pk: pointer to the input vector of polynomials b
+*              poly *v: pointer to the input polynomial v
+**************************************************/
+static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v)
+{
+  /* Layout: compressed b (KYBER_POLYVECCOMPRESSEDBYTES) || compressed v. */
+  polyvec_compress(r, b);
+  poly_compress(r+KYBER_POLYVECCOMPRESSEDBYTES, v);
+}
+
+/*************************************************
+* Name:        unpack_ciphertext
+*
+* Description: De-serialize and decompress ciphertext from a byte array;
+*              approximate inverse of pack_ciphertext
+*
+* Arguments:   - polyvec *b: pointer to the output vector of polynomials b
+*              - poly *v: pointer to the output polynomial v
+*              - const uint8_t *c: pointer to the input serialized ciphertext
+**************************************************/
+static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES])
+{
+  /* Inverse of the pack_ciphertext layout; decompression is lossy by design. */
+  polyvec_decompress(b, c);
+  poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
+}
+
+#define gen_a(A,B)  gen_matrix(A,B,0)
+#define gen_at(A,B) gen_matrix(A,B,1)
+
+/*************************************************
+* Name:        gen_matrix
+*
+* Description: Deterministically generate matrix A (or the transpose of A)
+*              from a seed. Entries of the matrix are polynomials that look
+*              uniformly random. Performs rejection sampling on output of
+*              a XOF
+*
+* Arguments:   - polyvec *a: pointer to output matrix A
+*              - const uint8_t *seed: pointer to input seed
+*              - int transposed: boolean deciding whether A or A^T is generated
+**************************************************/
+#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
+void gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed)
+{
+  unsigned int ctr, i, j, k;
+  unsigned int buflen, off;
+  uint8_t buf[GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES+2];
+  xof_state state;
+
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_K;j++) {
+      /* xof_init/xof_close calls were added for the libgcrypt port
+       * (see the modification note in the file header). */
+      xof_init(&state);
+      if(transposed)
+        xof_absorb(&state, seed, i, j);
+      else
+        xof_absorb(&state, seed, j, i);
+
+      xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
+      buflen = GEN_MATRIX_NBLOCKS*XOF_BLOCKBYTES;
+      ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen);
+
+      /* Rejection sampling may run out of XOF output; squeeze one more
+       * block at a time until KYBER_N coefficients have been accepted. */
+      while(ctr < KYBER_N) {
+        /* Carry over the (up to two) unused tail bytes so that sampling,
+         * which consumes the buffer in 3-byte groups, stays aligned. */
+        off = buflen % 3;
+        for(k = 0; k < off; k++)
+          buf[k] = buf[buflen - off + k];
+        xof_squeezeblocks(buf + off, 1, &state);
+        buflen = off + XOF_BLOCKBYTES;
+        ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen);
+      }
+      xof_close (&state);
+    }
+  }
+}
+
+/*************************************************
+* Name:        indcpa_keypair_derand
+*
+* Description: Generates public and private key for the CPA-secure
+*              public-key encryption scheme underlying Kyber
+*
+* Arguments:   - uint8_t *pk: pointer to output public key
+*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key
+*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
+*              - const uint8_t *coins: pointer to input randomness
+*                             (of length KYBER_SYMBYTES bytes)
+**************************************************/
+void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
+                           uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES],
+                           const uint8_t coins[KYBER_SYMBYTES])
+{
+  unsigned int i;
+  uint8_t buf[2*KYBER_SYMBYTES];
+  const uint8_t *publicseed = buf;
+  const uint8_t *noiseseed = buf+KYBER_SYMBYTES;
+  uint8_t nonce = 0;
+  polyvec a[KYBER_K], e, pkpv, skpv;
+
+  /* Expand the coins with G into (publicseed, noiseseed). */
+  hash_g(buf, coins, KYBER_SYMBYTES);
+
+  gen_a(a, publicseed);
+
+  /* Sample secret vector s and error vector e with distinct nonces. */
+  for(i=0;i<KYBER_K;i++)
+    poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
+  for(i=0;i<KYBER_K;i++)
+    poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
+
+  polyvec_ntt(&skpv);
+  polyvec_ntt(&e);
+
+  /* matrix-vector multiplication */
+  for(i=0;i<KYBER_K;i++) {
+    polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
+    poly_tomont(&pkpv.vec[i]);
+  }
+
+  /* pk = A*s + e (all operations in the NTT domain). */
+  polyvec_add(&pkpv, &pkpv, &e);
+  polyvec_reduce(&pkpv);
+
+  pack_sk(sk, &skpv);
+  pack_pk(pk, &pkpv, publicseed);
+}
+
+
+/*************************************************
+* Name:        indcpa_enc
+*
+* Description: Encryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - uint8_t *c: pointer to output ciphertext
+*                            (of length KYBER_INDCPA_BYTES bytes)
+*              - const uint8_t *m: pointer to input message
+*                                  (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const uint8_t *pk: pointer to input public key
+*                                   (of length KYBER_INDCPA_PUBLICKEYBYTES)
+*              - const uint8_t *coins: pointer to input random coins used as seed
+*                                      (of length KYBER_SYMBYTES) to deterministically
+*                                      generate all randomness
+**************************************************/
+void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
+                const uint8_t m[KYBER_INDCPA_MSGBYTES],
+                const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
+                const uint8_t coins[KYBER_SYMBYTES])
+{
+  unsigned int i;
+  uint8_t seed[KYBER_SYMBYTES];
+  uint8_t nonce = 0;
+  polyvec sp, pkpv, ep, at[KYBER_K], b;
+  poly v, k, epp;
+
+  unpack_pk(&pkpv, seed, pk);
+  poly_frommsg(&k, m);
+  /* Regenerate A^T from the seed embedded in the public key. */
+  gen_at(at, seed);
+
+  /* Sample ephemeral secret sp (eta1) and errors ep, epp (eta2). */
+  for(i=0;i<KYBER_K;i++)
+    poly_getnoise_eta1(sp.vec+i, coins, nonce++);
+  for(i=0;i<KYBER_K;i++)
+    poly_getnoise_eta2(ep.vec+i, coins, nonce++);
+  poly_getnoise_eta2(&epp, coins, nonce++);
+
+  polyvec_ntt(&sp);
+
+  /* matrix-vector multiplication */
+  for(i=0;i<KYBER_K;i++)
+    polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
+
+  polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);
+
+  polyvec_invntt_tomont(&b);
+  poly_invntt_tomont(&v);
+
+  /* b = A^T*sp + ep;  v = pk^T*sp + epp + encode(m). */
+  polyvec_add(&b, &b, &ep);
+  poly_add(&v, &v, &epp);
+  poly_add(&v, &v, &k);
+  polyvec_reduce(&b);
+  poly_reduce(&v);
+
+  pack_ciphertext(c, &b, &v);
+}
+
+/*************************************************
+* Name:        indcpa_dec
+*
+* Description: Decryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - uint8_t *m: pointer to output decrypted message
+*                            (of length KYBER_INDCPA_MSGBYTES)
+*              - const uint8_t *c: pointer to input ciphertext
+*                                  (of length KYBER_INDCPA_BYTES)
+*              - const uint8_t *sk: pointer to input secret key
+*                                   (of length KYBER_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
+                const uint8_t c[KYBER_INDCPA_BYTES],
+                const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES])
+{
+  polyvec b, skpv;
+  poly v, mp;
+
+  unpack_ciphertext(&b, &v, c);
+  unpack_sk(&skpv, sk);
+
+  polyvec_ntt(&b);
+  polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
+  poly_invntt_tomont(&mp);
+
+  /* mp = v - s^T*b; rounding in poly_tomsg recovers the message bits. */
+  poly_sub(&mp, &v, &mp);
+  poly_reduce(&mp);
+
+  poly_tomsg(m, &mp);
+}
+
+/*************** kyber/ref/kem.c */
+/*************************************************
+* Name:        crypto_kem_keypair_derand
+*
+* Description: Generates public and private key
+*              for CCA-secure Kyber key encapsulation mechanism
+*
+* Arguments:   - uint8_t *pk: pointer to output public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key
+*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
+*              - uint8_t *coins: pointer to input randomness
+*                (an already allocated array filled with 2*KYBER_SYMBYTES random bytes)
+**
+* Returns 0 (success)
+**************************************************/
+int crypto_kem_keypair_derand(uint8_t *pk,
+                              uint8_t *sk,
+                              const uint8_t *coins)
+{
+  /* Secret-key layout: indcpa_sk || pk || H(pk) || z. */
+  indcpa_keypair_derand(pk, sk, coins);
+  memcpy(sk+KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES);
+  hash_h(sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
+  /* Value z for pseudo-random output on reject */
+  memcpy(sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES, coins+KYBER_SYMBYTES, KYBER_SYMBYTES);
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_keypair
+*
+* Description: Generates public and private key
+*              for CCA-secure Kyber key encapsulation mechanism
+*
+* Arguments:   - uint8_t *pk: pointer to output public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key
+*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_kem_keypair(uint8_t *pk,
+                       uint8_t *sk)
+{
+  /* Randomized wrapper: draw fresh coins and defer to the derandomized core. */
+  uint8_t coins[2*KYBER_SYMBYTES];
+  randombytes(coins, 2*KYBER_SYMBYTES);
+  crypto_kem_keypair_derand(pk, sk, coins);
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_enc_derand
+*
+* Description: Generates cipher text and shared
+*              secret for given public key
+*
+* Arguments:   - uint8_t *ct: pointer to output cipher text
+*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
+*              - uint8_t *ss: pointer to output shared secret
+*                (an already allocated array of KYBER_SSBYTES bytes)
+*              - const uint8_t *pk: pointer to input public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*              - const uint8_t *coins: pointer to input randomness
+*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
+**
+* Returns 0 (success)
+**************************************************/
+int crypto_kem_enc_derand(uint8_t *ct,
+                          uint8_t *ss,
+                          const uint8_t *pk,
+                          const uint8_t *coins)
+{
+  uint8_t buf[2*KYBER_SYMBYTES];
+  /* Will contain key, coins */
+  uint8_t kr[2*KYBER_SYMBYTES];
+
+  memcpy(buf, coins, KYBER_SYMBYTES);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  hash_h(buf+KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
+  /* kr = G(m || H(pk)) = (shared key, encryption coins). */
+  hash_g(kr, buf, 2*KYBER_SYMBYTES);
+
+  /* coins are in kr+KYBER_SYMBYTES */
+  indcpa_enc(ct, buf, pk, kr+KYBER_SYMBYTES);
+
+  memcpy(ss,kr,KYBER_SYMBYTES);
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_enc
+*
+* Description: Generates cipher text and shared
+*              secret for given public key
+*
+* Arguments:   - uint8_t *ct: pointer to output cipher text
+*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
+*              - uint8_t *ss: pointer to output shared secret
+*                (an already allocated array of KYBER_SSBYTES bytes)
+*              - const uint8_t *pk: pointer to input public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_kem_enc(uint8_t *ct,
+                   uint8_t *ss,
+                   const uint8_t *pk)
+{
+  /* Randomized wrapper: draw fresh coins and defer to the derandomized core. */
+  uint8_t coins[KYBER_SYMBYTES];
+  randombytes(coins, KYBER_SYMBYTES);
+  crypto_kem_enc_derand(ct, ss, pk, coins);
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_dec
+*
+* Description: Generates shared secret for given
+*              cipher text and private key
+*
+* Arguments:   - uint8_t *ss: pointer to output shared secret
+*                (an already allocated array of KYBER_SSBYTES bytes)
+*              - const uint8_t *ct: pointer to input cipher text
+*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
+*              - const uint8_t *sk: pointer to input private key
+*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
+*
+* Returns 0.
+*
+* On failure, ss will contain a pseudo-random value.
+**************************************************/
+int crypto_kem_dec(uint8_t *ss,
+                   const uint8_t *ct,
+                   const uint8_t *sk)
+{
+  unsigned int success;
+  uint8_t buf[2*KYBER_SYMBYTES];
+  /* Will contain key, coins */
+  uint8_t kr[2*KYBER_SYMBYTES];
+  uint8_t cmp[KYBER_CIPHERTEXTBYTES+KYBER_SYMBYTES];
+  const uint8_t *pk = sk+KYBER_INDCPA_SECRETKEYBYTES;
+
+  indcpa_dec(buf, ct, sk);
+
+  /* Multitarget countermeasure for coins + contributory KEM */
+  memcpy(buf+KYBER_SYMBYTES, sk+KYBER_SECRETKEYBYTES-2*KYBER_SYMBYTES, KYBER_SYMBYTES);
+  hash_g(kr, buf, 2*KYBER_SYMBYTES);
+
+  /* coins are in kr+KYBER_SYMBYTES */
+  /* Re-encrypt deterministically and compare with the received ciphertext. */
+  indcpa_enc(cmp, buf, pk, kr+KYBER_SYMBYTES);
+
+  /* NOTE(review): per the modification note in the file header, verify1's
+   * return convention differs from upstream verify, so `success` is passed
+   * to cmov below without negation. */
+  success = verify1(ct, cmp, KYBER_CIPHERTEXTBYTES);
+
+  /* Compute rejection key */
+  rkprf(ss,sk+KYBER_SECRETKEYBYTES-KYBER_SYMBYTES,ct);
+
+  /* Copy true key to return buffer if fail is false */
+  cmov(ss,kr,KYBER_SYMBYTES,success);
+
+  return 0;
+}
+
+/*************** kyber/ref/polyvec.c */
+
+/*************************************************
+* Name:        polyvec_compress
+*
+* Description: Compress and serialize vector of polynomials
+*
+* Arguments:   - uint8_t *r: pointer to output byte array
+*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
+*              - const polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
+{
+  unsigned int i,j,k;
+  uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  /* 11-bit compression: 8 coefficients are packed into 11 bytes. */
+  uint16_t t[8];
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_N/8;j++) {
+      for(k=0;k<8;k++) {
+        t[k]  = a->vec[i].coeffs[8*j+k];
+        /* Map a possibly-negative representative into [0, q). */
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        /* Multiply/shift computes the rounded division shown in the
+         * commented-out line above without a division instruction. */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+
+      }
+
+      r[ 0] = (t[0] >>  0);
+      r[ 1] = (t[0] >>  8) | (t[1] << 3);
+      r[ 2] = (t[1] >>  5) | (t[2] << 6);
+      r[ 3] = (t[2] >>  2);
+      r[ 4] = (t[2] >> 10) | (t[3] << 1);
+      r[ 5] = (t[3] >>  7) | (t[4] << 4);
+      r[ 6] = (t[4] >>  4) | (t[5] << 7);
+      r[ 7] = (t[5] >>  1);
+      r[ 8] = (t[5] >>  9) | (t[6] << 2);
+      r[ 9] = (t[6] >>  6) | (t[7] << 5);
+      r[10] = (t[7] >>  3);
+      r += 11;
+    }
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+  /* 10-bit compression: 4 coefficients are packed into 5 bytes. */
+  uint16_t t[4];
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_N/4;j++) {
+      for(k=0;k<4;k++) {
+        t[k]  = a->vec[i].coeffs[4*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
+/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+        /* Same rounded-division-by-multiplication trick as above. */
+        d0 = t[k];
+        d0 <<= 10;
+        d0 += 1665;
+        d0 *= 1290167;
+        d0 >>= 32;
+        t[k] = d0 & 0x3ff;
+      }
+
+      r[0] = (t[0] >> 0);
+      r[1] = (t[0] >> 8) | (t[1] << 2);
+      r[2] = (t[1] >> 6) | (t[2] << 4);
+      r[3] = (t[2] >> 4) | (t[3] << 6);
+      r[4] = (t[3] >> 2);
+      r += 5;
+    }
+  }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+/*************************************************
+* Name:        polyvec_decompress
+*
+* Description: De-serialize and decompress vector of polynomials;
+*              approximate inverse of polyvec_compress
+*
+* Arguments:   - polyvec *r:       pointer to output vector of polynomials
+*              - const uint8_t *a: pointer to input byte array
+*                                  (of length KYBER_POLYVECCOMPRESSEDBYTES)
+**************************************************/
+void polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES])
+{
+  unsigned int i,j,k;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  /* 11-bit decompression: 11 bytes unpack to 8 coefficients. */
+  uint16_t t[8];
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_N/8;j++) {
+      t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
+      t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
+      t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
+      t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
+      t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
+      t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
+      t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
+      t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
+      a += 11;
+
+      /* coeff = round(t * q / 2^11); the +1024 term performs the rounding. */
+      for(k=0;k<8;k++)
+        r->vec[i].coeffs[8*j+k] = ((uint32_t)(t[k] & 0x7FF)*KYBER_Q + 1024) >> 11;
+    }
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+  /* 10-bit decompression: 5 bytes unpack to 4 coefficients. */
+  uint16_t t[4];
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_N/4;j++) {
+      t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8);
+      t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6);
+      t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4);
+      t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2);
+      a += 5;
+
+      /* coeff = round(t * q / 2^10); the +512 term performs the rounding. */
+      for(k=0;k<4;k++)
+        r->vec[i].coeffs[4*j+k] = ((uint32_t)(t[k] & 0x3FF)*KYBER_Q + 512) >> 10;
+    }
+  }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+/*************************************************
+* Name:        polyvec_tobytes
+*
+* Description: Serialize vector of polynomials
+*
+* Arguments:   - uint8_t *r: pointer to output byte array
+*                            (needs space for KYBER_POLYVECBYTES)
+*              - const polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a)
+{
+  unsigned int i;
+  /* Serialize each component polynomial back-to-back, KYBER_POLYBYTES apart. */
+  for(i=0;i<KYBER_K;i++)
+    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_frombytes
+*
+* Description: De-serialize vector of polynomials;
+*              inverse of polyvec_tobytes
+*
+* Arguments:   - uint8_t *r:       pointer to output byte array
+*              - const polyvec *a: pointer to input vector of polynomials
+*                                  (of length KYBER_POLYVECBYTES)
+**************************************************/
+void polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES])
+{
+  unsigned int i;
+  /* De-serialize each component polynomial from its KYBER_POLYBYTES slot. */
+  for(i=0;i<KYBER_K;i++)
+    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
+}
+
+/*************************************************
+* Name:        polyvec_ntt
+*
+* Description: Apply forward NTT to all elements of a vector of polynomials
+*
+* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_ntt(polyvec *r)
+{
+  unsigned int i;
+  /* Component-wise forward transform. */
+  for(i=0;i<KYBER_K;i++)
+    poly_ntt(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_invntt_tomont
+*
+* Description: Apply inverse NTT to all elements of a vector of polynomials
+*              and multiply by Montgomery factor 2^16
+*
+* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_invntt_tomont(polyvec *r)
+{
+  unsigned int i;
+  /* Component-wise inverse transform (result scaled by the Montgomery factor). */
+  for(i=0;i<KYBER_K;i++)
+    poly_invntt_tomont(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_basemul_acc_montgomery
+*
+* Description: Multiply elements of a and b in NTT domain, accumulate into r,
+*              and multiply by 2^-16.
+*
+* Arguments: - poly *r: pointer to output polynomial
+*            - const polyvec *a: pointer to first input vector of polynomials
+*            - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b)
+{
+  unsigned int i;
+  poly t;
+
+  /* Inner product over the module: r = sum_i a[i] * b[i] (NTT domain). */
+  poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
+  for(i=1;i<KYBER_K;i++) {
+    poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
+    poly_add(r, r, &t);
+  }
+
+  /* Reduce once after accumulation to bound coefficient growth. */
+  poly_reduce(r);
+}
+
+/*************************************************
+* Name:        polyvec_reduce
+*
+* Description: Applies Barrett reduction to each coefficient
+*              of each element of a vector of polynomials;
+*              for details of the Barrett reduction see comments in reduce.c
+*
+* Arguments:   - polyvec *r: pointer to input/output polynomial
+**************************************************/
+void polyvec_reduce(polyvec *r)
+{
+  unsigned int i;
+  /* Component-wise Barrett reduction. */
+  for(i=0;i<KYBER_K;i++)
+    poly_reduce(&r->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvec_add
+*
+* Description: Add vectors of polynomials
+*
+* Arguments: - polyvec *r: pointer to output vector of polynomials
+*            - const polyvec *a: pointer to first input vector of polynomials
+*            - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
+{
+  unsigned int i;
+  /* Component-wise addition; no reduction is performed here. */
+  for(i=0;i<KYBER_K;i++)
+    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
+}
+
+/*****************/
+/* Undefine every KYBER_K-dependent name so this file can be included
+ * multiple times, once per parameter set, without macro collisions. */
+#undef KYBER_K
+#undef KYBER_POLYCOMPRESSEDBYTES
+#undef KYBER_POLYVECCOMPRESSEDBYTES
+#undef poly_compress
+#undef poly_decompress
+#undef poly_getnoise_eta1
+#undef crypto_kem_keypair_derand
+#undef crypto_kem_enc_derand
+#undef crypto_kem_keypair
+#undef crypto_kem_enc
+#undef crypto_kem_dec
+#undef polyvec
+#undef polyvec_compress
+#undef polyvec_decompress
+#undef polyvec_tobytes
+#undef polyvec_frombytes
+#undef polyvec_ntt
+#undef polyvec_invntt_tomont
+#undef polyvec_basemul_acc_montgomery
+#undef polyvec_reduce
+#undef polyvec_add
+#undef pack_pk
+#undef unpack_pk
+#undef pack_sk
+#undef unpack_sk
+#undef pack_ciphertext
+#undef unpack_ciphertext
+#undef gen_matrix
+#undef indcpa_keypair_derand
+#undef indcpa_enc
+#undef indcpa_dec
diff --git a/cipher/kyber.c b/cipher/kyber.c
new file mode 100644 (file)
index 0000000..e5e2193
--- /dev/null
@@ -0,0 +1,530 @@
+/* kyber.c - the Kyber key encapsulation mechanism (main part)
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file was modified for use by Libgcrypt.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * You can also use this file under the same licence of original code.
+ * SPDX-License-Identifier: CC0 OR Apache-2.0
+ *
+ */
+/*
+  Original code from:
+
+  Repository: https://github.com/pq-crystals/kyber.git
+  Branch: standard
+  Commit: 11d00ff1f20cfca1f72d819e5a45165c1e0a2816
+
+  Licence:
+  Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
+  or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
+
+  Authors:
+        Joppe Bos
+        Léo Ducas
+        Eike Kiltz
+        Tancrède Lepoint
+        Vadim Lyubashevsky
+        John Schanck
+        Peter Schwabe
+        Gregor Seiler
+        Damien Stehlé
+
+  Kyber Home: https://www.pq-crystals.org/kyber/
+ */
+/*
+ * This implementation consists of four files: kyber.h (header),
+ * kyber.c (this), kyber-common.c (common part), and kyber-kdep.c
+ * (KYBER_K dependent part).
+ *
+ * It is for inclusion in libgcrypt library.  Also, standalone use of
+ * the implementation is possible.  With KYBER_K defined, it can offer
+ * the variant of that KYBER_K specified.  Otherwise, three variants
+ * are offered.
+ *
+ * From original code, following modification was made.
+ *
+ * - C++ style comments are changed to C-style.
+ *
+ * - No use of KYBER_NAMESPACE and FIPS202_NAMESPACE.  Don't export
+ *   internal symbols.
+ *
+ * - "verify" routine is changed to return 1 on success, and now has
+ *   new name "verify1", so that the use of the routine won't need
+ *   negation (since negation might result non-constant-time code with
+ *   branch by some compiler).
+ *
+ * - For "xof" routines, definitions of xof_init and xof_close are
+ *   added, so that memory will be possible to be cleared after its
+ *   use.
+ *
+ * - Different external API for shake128, having _init and _close.
+ *
+ * - New implementation of kyber_shake128_absorb, with the shake128
+ *   API.
+ *
+ * - Added an external function: shake256v with variable arguments.
+ *
+ * - Macro definitions of xof_squeezeblocks, prf, and rkprf are
+ *   modified to use the shake128 API and the shake256v function.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef _GCRYPT_IN_LIBGCRYPT
+#include <stdarg.h>
+#include <gpg-error.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "gcrypt-int.h"
+#include "const-time.h"
+#include "kyber.h"
+
+static int crypto_kem_keypair_2(uint8_t *pk, uint8_t *sk);
+static int crypto_kem_keypair_3(uint8_t *pk, uint8_t *sk);
+static int crypto_kem_keypair_4(uint8_t *pk, uint8_t *sk);
+
+static int crypto_kem_enc_2(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+static int crypto_kem_enc_3(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+static int crypto_kem_enc_4(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+
+static int crypto_kem_dec_2(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+static int crypto_kem_dec_3(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+static int crypto_kem_dec_4(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+
+void
+kyber_keypair (int algo, uint8_t *pk, uint8_t *sk)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_MLKEM512:
+      crypto_kem_keypair_2 (pk, sk);
+      break;
+    case GCRY_KEM_MLKEM768:
+    default:
+      crypto_kem_keypair_3 (pk, sk);
+      break;
+    case GCRY_KEM_MLKEM1024:
+      crypto_kem_keypair_4 (pk, sk);
+      break;
+    }
+}
+
+void
+kyber_encap (int algo, uint8_t *ct, uint8_t *ss, const uint8_t *pk)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_MLKEM512:
+      crypto_kem_enc_2 (ct, ss, pk);
+      break;
+    case GCRY_KEM_MLKEM768:
+    default:
+      crypto_kem_enc_3 (ct, ss, pk);
+      break;
+    case GCRY_KEM_MLKEM1024:
+      crypto_kem_enc_4 (ct, ss, pk);
+      break;
+    }
+}
+
+void
+kyber_decap (int algo, uint8_t *ss, const uint8_t *ct, const uint8_t *sk)
+{
+  switch (algo)
+    {
+    case GCRY_KEM_MLKEM512:
+      crypto_kem_dec_2 (ss, ct, sk);
+      break;
+    case GCRY_KEM_MLKEM768:
+    default:
+      crypto_kem_dec_3 (ss, ct, sk);
+      break;
+    case GCRY_KEM_MLKEM1024:
+      crypto_kem_dec_4 (ss, ct, sk);
+      break;
+    }
+}
+
+static void
+randombytes (uint8_t *out, size_t outlen)
+{
+  _gcry_randomize (out, outlen, GCRY_VERY_STRONG_RANDOM);
+}
+
+typedef struct {
+  gcry_md_hd_t h;
+} keccak_state;
+
+static void
+shake128_init (keccak_state *state)
+{
+  gcry_err_code_t ec;
+
+  ec = _gcry_md_open (&state->h, GCRY_MD_SHAKE128, 0);
+  if (ec)
+    log_fatal ("internal md_open failed: %d\n", ec);
+}
+
+static void
+shake128_absorb (keccak_state *state, const uint8_t *in, size_t inlen)
+{
+  _gcry_md_write (state->h, in, inlen);
+}
+
+static void
+shake128_finalize (keccak_state *state)
+{
+  (void)state;
+}
+
+static void
+shake128_squeeze (keccak_state *state, uint8_t *out, size_t outlen)
+{
+  _gcry_md_extract (state->h, GCRY_MD_SHAKE128, out, outlen);
+}
+
+static void
+shake128_close (keccak_state *state)
+{
+  _gcry_md_close (state->h);
+}
+
+#define MAX_ARGS 16
+static void
+shake256v (uint8_t *out, size_t outlen, ...)
+{
+  gcry_buffer_t iov[MAX_ARGS];
+  va_list ap;
+  int i;
+  void *p;
+  size_t len;
+
+  va_start (ap, outlen);
+  for (i = 0; i < MAX_ARGS; i++)
+    {
+      p = va_arg (ap, void *);
+      len = va_arg (ap, size_t);
+      if (!p)
+        break;
+
+      iov[i].size = 0;
+      iov[i].data = p;
+      iov[i].off = 0;
+      iov[i].len = len;
+    }
+  va_end (ap);
+
+  _gcry_md_hash_buffers_extract (GCRY_MD_SHAKE256, 0, out, outlen,
+                                 iov, i);
+}
+
+static void
+sha3_256 (uint8_t h[32], const uint8_t *in, size_t inlen)
+{
+  _gcry_md_hash_buffer (GCRY_MD_SHA3_256, h, in, inlen);
+}
+
+static void
+sha3_512 (uint8_t h[64], const uint8_t *in, size_t inlen)
+{
+  _gcry_md_hash_buffer (GCRY_MD_SHA3_512, h, in, inlen);
+}
+
+#define verify1 ct_memequal
+#define cmov    ct_memmov_cond
+#else
+#include "kyber.h"
+
+void randombytes (uint8_t *out, size_t outlen);
+
+typedef struct {
+  uint64_t s[25];
+  unsigned int pos;
+} keccak_state;
+
+void shake128_init (keccak_state *state);
+void shake128_absorb (keccak_state *state, const uint8_t *in, size_t inlen);
+void shake128_finalize (keccak_state *state);
+void shake128_squeeze (keccak_state *state, uint8_t *out, size_t outlen);
+void shake128_close (keccak_state *state);
+
+void shake256v (uint8_t *out, size_t outlen, ...);
+void sha3_256 (uint8_t h[32], const uint8_t *in, size_t inlen);
+void sha3_512 (uint8_t h[64], const uint8_t *in, size_t inlen);
+
+/* Return 1 when success, 0 otherwise.  */
+unsigned int verify1 (const uint8_t *a, const uint8_t *b, size_t len);
+/* Conditional move.  */
+void cmov (uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+#endif
+
+/*************** kyber/ref/fips202.h */
+#define SHAKE128_RATE 168
+
+/*************** kyber/ref/params.h */
+#define KYBER_N 256
+#define KYBER_Q 3329
+
+#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
+#define KYBER_SSBYTES  32   /* size in bytes of shared key */
+
+#define KYBER_POLYBYTES          384
+
+#define KYBER_ETA2 2
+
+#define KYBER_INDCPA_MSGBYTES       (KYBER_SYMBYTES)
+
+/* KYBER_K dependent values (part 1) */
+#define KYBER_ETA1_2   3
+#define KYBER_ETA1_3_4 2
+
+#define KYBER_POLYCOMPRESSEDBYTES_2_3 128
+#define KYBER_POLYCOMPRESSEDBYTES_4   160
+
+/*************** kyber/ref/poly.h */
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct{
+  int16_t coeffs[KYBER_N];
+} poly;
+
+#if !defined(KYBER_K) || KYBER_K == 2 || KYBER_K == 3
+static void poly_compress_128(uint8_t r[KYBER_POLYCOMPRESSEDBYTES_2_3], const poly *a);
+static void poly_decompress_128(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES_2_3]);
+#endif
+#if !defined(KYBER_K) || KYBER_K == 4
+static void poly_compress_160(uint8_t r[KYBER_POLYCOMPRESSEDBYTES_4], const poly *a);
+static void poly_decompress_160(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES_4]);
+#endif
+static void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
+static void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);
+
+static void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
+static void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *r);
+#if !defined(KYBER_K) || KYBER_K == 2
+static void poly_getnoise_eta1_2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
+#endif
+#if !defined(KYBER_K) || KYBER_K == 3 || KYBER_K == 4
+static void poly_getnoise_eta1_3_4(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
+#endif
+static void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
+
+static void poly_ntt(poly *r);
+static void poly_invntt_tomont(poly *r);
+static void poly_basemul_montgomery(poly *r, const poly *a, const poly *b);
+static void poly_tomont(poly *r);
+
+static void poly_reduce(poly *r);
+
+static void poly_add(poly *r, const poly *a, const poly *b);
+static void poly_sub(poly *r, const poly *a, const poly *b);
+
+/*************** kyber/ref/ntt.h */
+static const int16_t zetas[128];
+
+static void ntt(int16_t poly[256]);
+
+static void invntt(int16_t poly[256]);
+
+static void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);
+
+/*************** kyber/ref/reduce.h */
+#define MONT -1044 /* 2^16 mod q */
+#define QINV -3327 /* q^-1 mod 2^16 */
+
+static int16_t montgomery_reduce(int32_t a);
+
+static int16_t barrett_reduce(int16_t a);
+
+/*************** kyber/ref/symmetric.h */
+typedef keccak_state xof_state;
+
+static void kyber_shake128_absorb (keccak_state *state,
+                                   const uint8_t seed[KYBER_SYMBYTES],
+                                   uint8_t x, uint8_t y)
+{
+  shake128_absorb (state, seed, KYBER_SYMBYTES);
+  shake128_absorb (state, &x, 1);
+  shake128_absorb (state, &y, 1);
+  shake128_finalize (state);
+}
+
+#define XOF_BLOCKBYTES SHAKE128_RATE
+
+#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
+#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
+#define xof_init(STATE) shake128_init(STATE)
+#define xof_close(STATE) shake128_close(STATE)
+#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
+#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeeze(STATE, OUT, SHAKE128_RATE * OUTBLOCKS)
+#define prf(OUT, OUTBYTES, KEY, NONCE) \
+  shake256v(OUT, OUTBYTES, (void *)(KEY), (size_t)KYBER_SYMBYTES, \
+                          (void *)&(NONCE), (size_t)1, \
+                          NULL, (size_t)0)
+#define rkprf(OUT, KEY, INPUT) \
+  shake256v(OUT, KYBER_SSBYTES, (void *)(KEY), (size_t)KYBER_SYMBYTES, \
+                               (void *)(INPUT), (size_t)KYBER_CIPHERTEXTBYTES, \
+                               NULL, (size_t)0)
+
+#include "kyber-common.c"
+
+#define VARIANT2(name) name ## _2
+#define VARIANT3(name) name ## _3
+#define VARIANT4(name) name ## _4
+
+/* KYBER_K dependent values (part 2) */
+#define KYBER_POLYVECBYTES      (KYBER_K * KYBER_POLYBYTES)
+#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
+#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
+#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
+
+#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
+/* 32 bytes of additional space to save H(pk) */
+#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
+#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)
+
+#ifdef KYBER_K
+# if KYBER_K == 2
+#  define KYBER_POLYCOMPRESSEDBYTES    128
+#  define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+#  define poly_compress poly_compress_128
+#  define poly_decompress poly_decompress_128
+#  define poly_getnoise_eta1 poly_getnoise_eta1_2
+# elif KYBER_K == 3
+#  define KYBER_POLYCOMPRESSEDBYTES    128
+#  define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+#  define poly_compress poly_compress_128
+#  define poly_decompress poly_decompress_128
+#  define poly_getnoise_eta1 poly_getnoise_eta1_3_4
+# elif KYBER_K == 4
+#  define KYBER_POLYCOMPRESSEDBYTES    160
+#  define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
+#  define poly_compress poly_compress_160
+#  define poly_decompress poly_decompress_160
+#  define poly_getnoise_eta1 poly_getnoise_eta1_3_4
+# endif
+# include "kyber-kdep.c"
+# else
+# define KYBER_K 2
+# define KYBER_POLYCOMPRESSEDBYTES    128
+# define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+# define poly_compress poly_compress_128
+# define poly_decompress poly_decompress_128
+# define poly_getnoise_eta1 poly_getnoise_eta1_2
+# define crypto_kem_keypair_derand VARIANT2(crypto_kem_keypair_derand)
+# define crypto_kem_enc_derand VARIANT2(crypto_kem_enc_derand)
+# define crypto_kem_keypair VARIANT2(crypto_kem_keypair)
+# define crypto_kem_enc VARIANT2(crypto_kem_enc)
+# define crypto_kem_dec VARIANT2(crypto_kem_dec)
+# define polyvec VARIANT2(polyvec)
+# define polyvec_compress VARIANT2(polyvec_compress)
+# define polyvec_decompress VARIANT2(polyvec_decompress)
+# define polyvec_tobytes VARIANT2(polyvec_tobytes)
+# define polyvec_frombytes VARIANT2(polyvec_frombytes)
+# define polyvec_ntt VARIANT2(polyvec_ntt)
+# define polyvec_invntt_tomont VARIANT2(polyvec_invntt_tomont)
+# define polyvec_basemul_acc_montgomery VARIANT2(polyvec_basemul_acc_montgomery)
+# define polyvec_reduce VARIANT2(polyvec_reduce)
+# define polyvec_add VARIANT2(polyvec_add)
+# define pack_pk VARIANT2(pack_pk)
+# define unpack_pk VARIANT2(unpack_pk)
+# define pack_sk VARIANT2(pack_sk)
+# define unpack_sk VARIANT2(unpack_sk)
+# define pack_ciphertext VARIANT2(pack_ciphertext)
+# define unpack_ciphertext VARIANT2(unpack_ciphertext)
+# define gen_matrix VARIANT2(gen_matrix)
+# define indcpa_keypair_derand VARIANT2(indcpa_keypair_derand)
+# define indcpa_enc VARIANT2(indcpa_enc)
+# define indcpa_dec VARIANT2(indcpa_dec)
+# include "kyber-kdep.c"
+
+# define KYBER_K 3
+# define KYBER_POLYCOMPRESSEDBYTES    128
+# define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+# define poly_compress poly_compress_128
+# define poly_decompress poly_decompress_128
+# define poly_getnoise_eta1 poly_getnoise_eta1_3_4
+# define crypto_kem_keypair_derand VARIANT3(crypto_kem_keypair_derand)
+# define crypto_kem_enc_derand VARIANT3(crypto_kem_enc_derand)
+# define crypto_kem_keypair VARIANT3(crypto_kem_keypair)
+# define crypto_kem_enc VARIANT3(crypto_kem_enc)
+# define crypto_kem_dec VARIANT3(crypto_kem_dec)
+# define polyvec VARIANT3(polyvec)
+# define polyvec_compress VARIANT3(polyvec_compress)
+# define polyvec_decompress VARIANT3(polyvec_decompress)
+# define polyvec_tobytes VARIANT3(polyvec_tobytes)
+# define polyvec_frombytes VARIANT3(polyvec_frombytes)
+# define polyvec_ntt VARIANT3(polyvec_ntt)
+# define polyvec_invntt_tomont VARIANT3(polyvec_invntt_tomont)
+# define polyvec_basemul_acc_montgomery VARIANT3(polyvec_basemul_acc_montgomery)
+# define polyvec_reduce VARIANT3(polyvec_reduce)
+# define polyvec_add VARIANT3(polyvec_add)
+# define pack_pk VARIANT3(pack_pk)
+# define unpack_pk VARIANT3(unpack_pk)
+# define pack_sk VARIANT3(pack_sk)
+# define unpack_sk VARIANT3(unpack_sk)
+# define pack_ciphertext VARIANT3(pack_ciphertext)
+# define unpack_ciphertext VARIANT3(unpack_ciphertext)
+# define gen_matrix VARIANT3(gen_matrix)
+# define indcpa_keypair_derand VARIANT3(indcpa_keypair_derand)
+# define indcpa_enc VARIANT3(indcpa_enc)
+# define indcpa_dec VARIANT3(indcpa_dec)
+# include "kyber-kdep.c"
+
+# define KYBER_K 4
+# define KYBER_POLYCOMPRESSEDBYTES    160
+# define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)
+# define poly_compress poly_compress_160
+# define poly_decompress poly_decompress_160
+# define poly_getnoise_eta1 poly_getnoise_eta1_3_4
+# define crypto_kem_keypair_derand VARIANT4(crypto_kem_keypair_derand)
+# define crypto_kem_enc_derand VARIANT4(crypto_kem_enc_derand)
+# define crypto_kem_keypair VARIANT4(crypto_kem_keypair)
+# define crypto_kem_enc VARIANT4(crypto_kem_enc)
+# define crypto_kem_dec VARIANT4(crypto_kem_dec)
+# define polyvec VARIANT4(polyvec)
+# define polyvec_compress VARIANT4(polyvec_compress)
+# define polyvec_decompress VARIANT4(polyvec_decompress)
+# define polyvec_tobytes VARIANT4(polyvec_tobytes)
+# define polyvec_frombytes VARIANT4(polyvec_frombytes)
+# define polyvec_ntt VARIANT4(polyvec_ntt)
+# define polyvec_invntt_tomont VARIANT4(polyvec_invntt_tomont)
+# define polyvec_basemul_acc_montgomery VARIANT4(polyvec_basemul_acc_montgomery)
+# define polyvec_reduce VARIANT4(polyvec_reduce)
+# define polyvec_add VARIANT4(polyvec_add)
+# define pack_pk VARIANT4(pack_pk)
+# define unpack_pk VARIANT4(unpack_pk)
+# define pack_sk VARIANT4(pack_sk)
+# define unpack_sk VARIANT4(unpack_sk)
+# define pack_ciphertext VARIANT4(pack_ciphertext)
+# define unpack_ciphertext VARIANT4(unpack_ciphertext)
+# define gen_matrix VARIANT4(gen_matrix)
+# define indcpa_keypair_derand VARIANT4(indcpa_keypair_derand)
+# define indcpa_enc VARIANT4(indcpa_enc)
+# define indcpa_dec VARIANT4(indcpa_dec)
+# include "kyber-kdep.c"
+#endif
diff --git a/cipher/kyber.h b/cipher/kyber.h
new file mode 100644 (file)
index 0000000..2fe6883
--- /dev/null
@@ -0,0 +1,130 @@
+/* kyber.h - the Kyber key encapsulation mechanism (header)
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file was modified for use by Libgcrypt.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * You can also use this file under the same licence of original code.
+ * SPDX-License-Identifier: CC0 OR Apache-2.0
+ *
+ */
+/*
+  Original code from:
+
+  Repository: https://github.com/pq-crystals/kyber.git
+  Branch: standard
+  Commit: 11d00ff1f20cfca1f72d819e5a45165c1e0a2816
+
+  Licence:
+  Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
+  or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
+
+  Authors:
+        Joppe Bos
+        Léo Ducas
+        Eike Kiltz
+        Tancrède Lepoint
+        Vadim Lyubashevsky
+        John Schanck
+        Peter Schwabe
+        Gregor Seiler
+        Damien Stehlé
+
+  Kyber Home: https://www.pq-crystals.org/kyber/
+ */
+/* Standalone use is possible either with KYBER_K defined with the
+ * value (2, 3, or 4), or not defined.  For the latter, routines for
+ * three variants are available.
+ */
+
+#ifndef KYBER_H
+#define KYBER_H
+
+#ifdef _GCRYPT_IN_LIBGCRYPT
+/**** Start of the glue code to libgcrypt ****/
+#define kyber_keypair   _gcry_mlkem_keypair
+#define kyber_encap     _gcry_mlkem_encap
+#define kyber_decap     _gcry_mlkem_decap
+/**** End of the glue code ****/
+
+void kyber_keypair (int algo, uint8_t *pk, uint8_t *sk);
+void kyber_encap (int algo, uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+void kyber_decap (int algo, uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#elif defined(KYBER_K)
+int crypto_kem_keypair (uint8_t *pk, uint8_t *sk);
+int crypto_kem_enc (uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int crypto_kem_dec (uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+# if KYBER_K == 2
+#  define CRYPTO_SECRETKEYBYTES   (2*384+2*384+32+2*32)
+#  define CRYPTO_PUBLICKEYBYTES   (2*384+32)
+#  define CRYPTO_CIPHERTEXTBYTES  (128+2*320)
+#  define CRYPTO_BYTES            32
+#  define CRYPTO_ALGNAME "Kyber512"
+# elif KYBER_K == 3
+#  define CRYPTO_SECRETKEYBYTES   (3*384+3*384+32+2*32)
+#  define CRYPTO_PUBLICKEYBYTES   (3*384+32)
+#  define CRYPTO_CIPHERTEXTBYTES  (128+3*320)
+#  define CRYPTO_BYTES            32
+#  define CRYPTO_ALGNAME "Kyber768"
+# elif KYBER_K == 4
+#  define CRYPTO_SECRETKEYBYTES   (4*384+4*384+32+2*32)
+#  define CRYPTO_PUBLICKEYBYTES   (4*384+32)
+#  define CRYPTO_CIPHERTEXTBYTES  (160+4*352)
+#  define CRYPTO_BYTES            32
+#  define CRYPTO_ALGNAME "Kyber1024"
+# else
+#  define CRYPTO_SECRETKEYBYTES_512   (2*384+2*384+32+2*32)
+#  define CRYPTO_PUBLICKEYBYTES_512   (2*384+32)
+#  define CRYPTO_CIPHERTEXTBYTES_512  (128+2*320)
+#  define CRYPTO_BYTES_512            32
+
+#  define CRYPTO_SECRETKEYBYTES_768   (3*384+3*384+32+2*32)
+#  define CRYPTO_PUBLICKEYBYTES_768   (3*384+32)
+#  define CRYPTO_CIPHERTEXTBYTES_768  (128+3*320)
+#  define CRYPTO_BYTES_768            32
+
+#  define CRYPTO_SECRETKEYBYTES_1024  (4*384+4*384+32+2*32)
+#  define CRYPTO_PUBLICKEYBYTES_1024  (4*384+32)
+#  define CRYPTO_CIPHERTEXTBYTES_1024 (160+4*352)
+#  define CRYPTO_BYTES_1024           32
+
+#  define CRYPTO_ALGNAME "Kyber"
+
+#  define crypto_kem_keypair_2 crypto_kem_keypair_512
+#  define crypto_kem_keypair_3 crypto_kem_keypair_768
+#  define crypto_kem_keypair_4 crypto_kem_keypair_1024
+
+int crypto_kem_keypair_2 (uint8_t *pk, uint8_t *sk);
+int crypto_kem_keypair_3 (uint8_t *pk, uint8_t *sk);
+int crypto_kem_keypair_4 (uint8_t *pk, uint8_t *sk);
+
+#  define crypto_kem_enc_2 crypto_kem_enc_512
+#  define crypto_kem_enc_3 crypto_kem_enc_768
+#  define crypto_kem_enc_4 crypto_kem_enc_1024
+int crypto_kem_enc_2 (uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int crypto_kem_enc_3 (uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+int crypto_kem_enc_4 (uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+
+#  define crypto_kem_dec_2 crypto_kem_dec_512
+#  define crypto_kem_dec_3 crypto_kem_dec_768
+#  define crypto_kem_dec_4 crypto_kem_dec_1024
+int crypto_kem_dec_2 (uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+int crypto_kem_dec_3 (uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+int crypto_kem_dec_4 (uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+# endif
+#endif
+
+#endif /* KYBER_H */
index b80c3406ccf21cd3c94e503fa6682e5c3b4c29bf..2274bd8e6e5cf224d87dc2503c9372544eadc5b1 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -60,6 +60,8 @@ map_mac_algo_to_cipher (int mac_algo)
       return GCRY_CIPHER_GOST28147;
     case GCRY_MAC_CMAC_SM4:
       return GCRY_CIPHER_SM4;
+    case GCRY_MAC_CMAC_ARIA:
+      return GCRY_CIPHER_ARIA128;
     }
 }
 
@@ -522,3 +524,9 @@ const gcry_mac_spec_t _gcry_mac_type_spec_cmac_sm4 = {
   &cmac_ops
 };
 #endif
+#if USE_ARIA
+const gcry_mac_spec_t _gcry_mac_type_spec_cmac_aria = {
+  GCRY_MAC_CMAC_ARIA, {0, 0}, "CMAC_ARIA",
+  &cmac_ops
+};
+#endif
index 12f515ebb85cf5f8e8a2e25559947e032b6ec5b7..b5610c44e045c4d778aa979515ee23a62463aef8 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -45,6 +45,10 @@ map_mac_algo_to_cipher (int mac_algo)
       return GCRY_CIPHER_SERPENT128;
     case GCRY_MAC_GMAC_SEED:
       return GCRY_CIPHER_SEED;
+    case GCRY_MAC_GMAC_SM4:
+      return GCRY_CIPHER_SM4;
+    case GCRY_MAC_GMAC_ARIA:
+      return GCRY_CIPHER_ARIA128;
     }
 }
 
@@ -185,3 +189,15 @@ const gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia = {
   &gmac_ops
 };
 #endif
+#if USE_SM4
+const gcry_mac_spec_t _gcry_mac_type_spec_gmac_sm4 = {
+  GCRY_MAC_GMAC_SM4, {0, 0}, "GMAC_SM4",
+  &gmac_ops
+};
+#endif
+#if USE_ARIA
+const gcry_mac_spec_t _gcry_mac_type_spec_gmac_aria = {
+  GCRY_MAC_GMAC_ARIA, {0, 0}, "GMAC_ARIA",
+  &gmac_ops
+};
+#endif
index 9fac77dc79a0fcd326dd0da5d46d5bc3e54b93e2..a5acab708c58593998e7cca4e755f62387362f07 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 019981524d2639e8d053b7f03bb190f888ac0cea..113cf433fcd6d8efb335c845db5b07daaeffae39 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -234,6 +234,9 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_gost28147_imit;
 #if USE_SM4
 extern const gcry_mac_spec_t _gcry_mac_type_spec_cmac_sm4;
 #endif
+#if USE_ARIA
+extern const gcry_mac_spec_t _gcry_mac_type_spec_cmac_aria;
+#endif
 
 /*
  * The GMAC algorithm specifications (mac-gmac.c).
@@ -253,6 +256,12 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_seed;
 #if USE_CAMELLIA
 extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia;
 #endif
+#if USE_SM4
+extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_sm4;
+#endif
+#if USE_ARIA
+extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_aria;
+#endif
 
 /*
  * The Poly1305 MAC algorithm specifications (mac-poly1305.c).
@@ -273,3 +282,9 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent;
 #if USE_SEED
 extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed;
 #endif
+#if USE_SM4
+extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_sm4;
+#endif
+#if USE_ARIA
+extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aria;
+#endif
index 3abc77745e79e0c8a31b1845c5a2d3d32e218b70..dfaef4468b50507c88ce53f12bc90f0e0289f221 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -83,6 +83,12 @@ poly1305mac_open (gcry_mac_hd_t h)
     case GCRY_MAC_POLY1305_SEED:
       cipher_algo = GCRY_CIPHER_SEED;
       break;
+    case GCRY_MAC_POLY1305_SM4:
+      cipher_algo = GCRY_CIPHER_SM4;
+      break;
+    case GCRY_MAC_POLY1305_ARIA:
+      cipher_algo = GCRY_CIPHER_ARIA128;
+      break;
     }
 
   err = _gcry_cipher_open_internal (&mac_ctx->hd, cipher_algo,
@@ -362,3 +368,15 @@ const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed = {
   &poly1305mac_ops
 };
 #endif
+#if USE_SM4
+const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_sm4 = {
+  GCRY_MAC_POLY1305_SM4, {0, 0}, "POLY1305_SM4",
+  &poly1305mac_ops
+};
+#endif
+#if USE_ARIA
+const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aria = {
+  GCRY_MAC_POLY1305_ARIA, {0, 0}, "POLY1305_ARIA",
+  &poly1305mac_ops
+};
+#endif
index ba1eb300afa70109ec37b628beb2eb2a53b12f73..128ac53d991e393c07a5f122df3d21c9b028a735 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -132,8 +132,15 @@ static const gcry_mac_spec_t * const mac_list[] = {
   &_gcry_mac_type_spec_poly1305mac,
 #if USE_SM4
   &_gcry_mac_type_spec_cmac_sm4,
+  &_gcry_mac_type_spec_gmac_sm4,
+  &_gcry_mac_type_spec_poly1305mac_sm4,
 #endif
-  NULL,
+#if USE_ARIA
+  &_gcry_mac_type_spec_cmac_aria,
+  &_gcry_mac_type_spec_gmac_aria,
+  &_gcry_mac_type_spec_poly1305mac_aria,
+#endif
+  NULL
 };
 
 /* HMAC implementations start with index 101 (enum gcry_mac_algos) */
@@ -242,10 +249,10 @@ static const gcry_mac_spec_t * const mac_list_algo101[] =
 #endif
 #if USE_SHA512
     &_gcry_mac_type_spec_hmac_sha512_256,
-    &_gcry_mac_type_spec_hmac_sha512_224,
+    &_gcry_mac_type_spec_hmac_sha512_224
 #else
     NULL,
-    NULL,
+    NULL
 #endif
   };
 
@@ -308,7 +315,12 @@ static const gcry_mac_spec_t * const mac_list_algo201[] =
     NULL,
 #endif
 #if USE_SM4
-    &_gcry_mac_type_spec_cmac_sm4
+    &_gcry_mac_type_spec_cmac_sm4,
+#else
+    NULL,
+#endif
+#if USE_ARIA
+    &_gcry_mac_type_spec_cmac_aria
 #else
     NULL
 #endif
@@ -338,7 +350,17 @@ static const gcry_mac_spec_t * const mac_list_algo401[] =
     NULL,
 #endif
 #if USE_SEED
-    &_gcry_mac_type_spec_gmac_seed
+    &_gcry_mac_type_spec_gmac_seed,
+#else
+    NULL,
+#endif
+#if USE_SM4
+    &_gcry_mac_type_spec_gmac_sm4,
+#else
+    NULL,
+#endif
+#if USE_ARIA
+    &_gcry_mac_type_spec_gmac_aria
 #else
     NULL
 #endif
@@ -369,7 +391,17 @@ static const gcry_mac_spec_t * const mac_list_algo501[] =
     NULL,
 #endif
 #if USE_SEED
-    &_gcry_mac_type_spec_poly1305mac_seed
+    &_gcry_mac_type_spec_poly1305mac_seed,
+#else
+    NULL,
+#endif
+#if USE_SM4
+    &_gcry_mac_type_spec_poly1305mac_sm4,
+#else
+    NULL,
+#endif
+#if USE_ARIA
+    &_gcry_mac_type_spec_poly1305mac_aria
 #else
     NULL
 #endif
diff --git a/cipher/mceliece6688128f.c b/cipher/mceliece6688128f.c
new file mode 100644 (file)
index 0000000..63d20a6
--- /dev/null
@@ -0,0 +1,3673 @@
+/* mceliece6688128f.c - Classic McEliece for libgcrypt
+ * Copyright (C) 2023-2024 Simon Josefsson <simon@josefsson.org>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ */
+
+/* This file is extracted from libmceliece. */
+
+/*
+ * libmceliece is hereby placed into the public domain.
+ *
+ * [SPDX-License-Identifier](https://spdx.dev/ids/):
+ * [LicenseRef-PD-hp](https://cr.yp.to/spdx.html)
+ * OR
+ * [CC0-1.0](https://spdx.org/licenses/CC0-1.0.html)
+ * OR
+ * [0BSD](https://spdx.org/licenses/0BSD.html)
+ * OR
+ * [MIT-0](https://spdx.org/licenses/MIT-0.html)
+ * OR
+ * [MIT](https://spdx.org/licenses/MIT.html)
+ *
+ * libmceliece is based on the official Classic McEliece software, which
+ * was written by Tung Chou. See the following papers for the major
+ * algorithms used for speed inside that software:
+ *
+ * * Daniel J. Bernstein, Tung Chou, Peter Schwabe. "McBits: fast
+ *   constant-time code-based cryptography." CHES 2013.
+ *   [https://tungchou.github.io/papers/mcbits.pdf](https://tungchou.github.io/papers/mcbits.pdf)
+ *
+ * * Tung Chou. "McBits revisited." CHES 2017.
+ *   [https://tungchou.github.io/papers/mcbits_revisited.pdf](https://tungchou.github.io/papers/mcbits_revisited.pdf)
+ *
+ * The official Classic McEliece software includes `ref`, `vec`, `sse`, and
+ * `avx` implementations; libmceliece includes only `vec` and `avx`.
+ *
+ * The following components of libmceliece are from Daniel J. Bernstein:
+ *
+ * * Small [changes](download.html#changelog)
+ *   for namespacing, portability, etc.
+ *
+ * * Software to compute control bits (also used in the official software).
+ *   See the following paper: Daniel J. Bernstein. "Verified fast formulas
+ *   for control bits for permutation networks." 2020.
+ *   [https://cr.yp.to/papers.html#controlbits](https://cr.yp.to/papers.html#controlbits)
+ *
+ * * `crypto_sort/int32`. See [https://sorting.cr.yp.to](https://sorting.cr.yp.to).
+ *
+ * * Infrastructure to build a library with automatic run-time selection of
+ *   implementations based on the run-time CPU and a database of
+ *   benchmarks. This infrastructure was introduced in
+ *   [`lib25519`](https://lib25519.cr.yp.to), with some extensions and
+ *   adaptations in libmceliece.
+ *
+ * * Various software for tests and benchmarks. This is based on
+ *   public-domain code in the SUPERCOP benchmarking framework.
+ *
+ * This file is generated by mceliece6688128f.sh from these files:
+ *
+ * libmceliece-20230612/include-build/crypto_declassify.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/params.h
+ * libmceliece-20230612/inttypes/crypto_intN.h
+ * libmceliece-20230612/inttypes/crypto_intN.h
+ * libmceliece-20230612/inttypes/crypto_intN.h
+ * libmceliece-20230612/inttypes/crypto_uintN.h
+ * libmceliece-20230612/inttypes/crypto_uintN.h
+ * libmceliece-20230612/inttypes/crypto_uintN.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/vec.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/benes.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/bm.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/controlbits.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/decrypt.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/encrypt.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft_consts.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft_powers.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft_scalars_2x.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft_scalars_4x.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft_tr.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/gf.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/hash.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/int32_sort.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/operations.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/pk_gen.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/sk_gen.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/transpose.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/uint16_sort.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/uint64_sort.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/util.h
+ * libmceliece-20230612/crypto_kem/6688128f/vec/benes.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/bm.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/controlbits.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/decrypt.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/encrypt.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_consts.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_powers.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_scalars_2x.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_scalars_4x.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/fft_tr.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/gf.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/kem_dec.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/kem_enc.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/kem_keypair.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/pk_gen.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/sk_gen.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/vec.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/wrap_dec.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/wrap_enc.c
+ * libmceliece-20230612/crypto_kem/6688128f/vec/wrap_keypair.c
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "g10lib.h"
+#include "mceliece6688128f.h"
+
+#define int8 crypto_int8
+#define uint8 crypto_uint8
+#define int16 crypto_int16
+#define uint16 crypto_uint16
+#define int32 crypto_int32
+#define uint32 crypto_uint32
+#define int64 crypto_int64
+#define uint64 crypto_uint64
+
+static void
+randombytes (uint8_t *out, size_t outlen)
+{
+  _gcry_randomize (out, outlen, GCRY_STRONG_RANDOM);
+}
+
+/* from libmceliece-20230612/include-build/crypto_declassify.h */
+#ifndef crypto_declassify_h
+#define crypto_declassify_h
+
+static void crypto_declassify(void *crypto_declassify_v,long long crypto_declassify_vlen) {
+  (void) crypto_declassify_v;
+  (void) crypto_declassify_vlen;
+}
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/params.h */
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define GFBITS 13
+#define SYS_N 6688
+#define SYS_T 128
+
+#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1))
+#define IRR_BYTES (SYS_T * 2)
+
+#define PK_NROWS (SYS_T*GFBITS)
+#define PK_NCOLS (SYS_N - PK_NROWS)
+#define PK_ROW_BYTES ((PK_NCOLS + 7)/8)
+
+#define SYND_BYTES ((PK_NROWS + 7)/8)
+
+#define GFMASK ((1 << GFBITS) - 1)
+
+#endif
+
+
+/* from libmceliece-20230612/inttypes/crypto_intN.h */
+#ifndef crypto_int64_h
+#define crypto_int64_h
+
+#define crypto_int64 int64_t
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_negative_mask(crypto_int64 crypto_int64_x)
+{
+  return crypto_int64_x >> (64-1);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_nonzero_mask(crypto_int64 crypto_int64_x)
+{
+  return crypto_int64_negative_mask(crypto_int64_x) | crypto_int64_negative_mask(-crypto_int64_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_zero_mask(crypto_int64 crypto_int64_x)
+{
+  return ~crypto_int64_nonzero_mask(crypto_int64_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_positive_mask(crypto_int64 crypto_int64_x)
+{
+  crypto_int64 crypto_int64_z = -crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_x & crypto_int64_z;
+  return crypto_int64_negative_mask(crypto_int64_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_unequal_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y)
+{
+  crypto_int64 crypto_int64_xy = crypto_int64_x ^ crypto_int64_y;
+  return crypto_int64_nonzero_mask(crypto_int64_xy);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_equal_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y)
+{
+  return ~crypto_int64_unequal_mask(crypto_int64_x,crypto_int64_y);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_smaller_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y)
+{
+  crypto_int64 crypto_int64_xy = crypto_int64_x ^ crypto_int64_y;
+  crypto_int64 crypto_int64_z = crypto_int64_x - crypto_int64_y;
+  crypto_int64_z ^= crypto_int64_xy & (crypto_int64_z ^ crypto_int64_x);
+  return crypto_int64_negative_mask(crypto_int64_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_min(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y)
+{
+  crypto_int64 crypto_int64_xy = crypto_int64_y ^ crypto_int64_x;
+  crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_xy & (crypto_int64_z ^ crypto_int64_y);
+  crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z);
+  crypto_int64_z &= crypto_int64_xy;
+  return crypto_int64_x ^ crypto_int64_z;
+}
+
+GCC_ATTR_UNUSED
+static crypto_int64 crypto_int64_max(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y)
+{
+  crypto_int64 crypto_int64_xy = crypto_int64_y ^ crypto_int64_x;
+  crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_xy & (crypto_int64_z ^ crypto_int64_y);
+  crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z);
+  crypto_int64_z &= crypto_int64_xy;
+  return crypto_int64_y ^ crypto_int64_z;
+}
+
+GCC_ATTR_UNUSED
+static void crypto_int64_minmax(crypto_int64 *crypto_int64_a,crypto_int64 *crypto_int64_b)
+{
+  crypto_int64 crypto_int64_x = *crypto_int64_a;
+  crypto_int64 crypto_int64_y = *crypto_int64_b;
+  crypto_int64 crypto_int64_xy = crypto_int64_y ^ crypto_int64_x;
+  crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_xy & (crypto_int64_z ^ crypto_int64_y);
+  crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z);
+  crypto_int64_z &= crypto_int64_xy;
+  *crypto_int64_a = crypto_int64_x ^ crypto_int64_z;
+  *crypto_int64_b = crypto_int64_y ^ crypto_int64_z;
+}
+
+#endif
+
+/* from libmceliece-20230612/inttypes/crypto_intN.h */
+#ifndef crypto_int16_h
+#define crypto_int16_h
+
+#define crypto_int16 int16_t
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_negative_mask(crypto_int16 crypto_int16_x)
+{
+  return crypto_int16_x >> (16-1);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_nonzero_mask(crypto_int16 crypto_int16_x)
+{
+  return crypto_int16_negative_mask(crypto_int16_x) | crypto_int16_negative_mask(-crypto_int16_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_zero_mask(crypto_int16 crypto_int16_x)
+{
+  return ~crypto_int16_nonzero_mask(crypto_int16_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_positive_mask(crypto_int16 crypto_int16_x)
+{
+  crypto_int16 crypto_int16_z = -crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_x & crypto_int16_z;
+  return crypto_int16_negative_mask(crypto_int16_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_unequal_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y)
+{
+  crypto_int16 crypto_int16_xy = crypto_int16_x ^ crypto_int16_y;
+  return crypto_int16_nonzero_mask(crypto_int16_xy);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_equal_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y)
+{
+  return ~crypto_int16_unequal_mask(crypto_int16_x,crypto_int16_y);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_smaller_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y)
+{
+  crypto_int16 crypto_int16_xy = crypto_int16_x ^ crypto_int16_y;
+  crypto_int16 crypto_int16_z = crypto_int16_x - crypto_int16_y;
+  crypto_int16_z ^= crypto_int16_xy & (crypto_int16_z ^ crypto_int16_x);
+  return crypto_int16_negative_mask(crypto_int16_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_min(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y)
+{
+  crypto_int16 crypto_int16_xy = crypto_int16_y ^ crypto_int16_x;
+  crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_xy & (crypto_int16_z ^ crypto_int16_y);
+  crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z);
+  crypto_int16_z &= crypto_int16_xy;
+  return crypto_int16_x ^ crypto_int16_z;
+}
+
+GCC_ATTR_UNUSED
+static crypto_int16 crypto_int16_max(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y)
+{
+  crypto_int16 crypto_int16_xy = crypto_int16_y ^ crypto_int16_x;
+  crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_xy & (crypto_int16_z ^ crypto_int16_y);
+  crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z);
+  crypto_int16_z &= crypto_int16_xy;
+  return crypto_int16_y ^ crypto_int16_z;
+}
+
+GCC_ATTR_UNUSED
+static void crypto_int16_minmax(crypto_int16 *crypto_int16_a,crypto_int16 *crypto_int16_b)
+{
+  crypto_int16 crypto_int16_x = *crypto_int16_a;
+  crypto_int16 crypto_int16_y = *crypto_int16_b;
+  crypto_int16 crypto_int16_xy = crypto_int16_y ^ crypto_int16_x;
+  crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_xy & (crypto_int16_z ^ crypto_int16_y);
+  crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z);
+  crypto_int16_z &= crypto_int16_xy;
+  *crypto_int16_a = crypto_int16_x ^ crypto_int16_z;
+  *crypto_int16_b = crypto_int16_y ^ crypto_int16_z;
+}
+
+#endif
+
+/* from libmceliece-20230612/inttypes/crypto_intN.h */
+#ifndef crypto_int32_h
+#define crypto_int32_h
+
+#define crypto_int32 int32_t
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_negative_mask(crypto_int32 crypto_int32_x)
+{
+  return crypto_int32_x >> (32-1);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_nonzero_mask(crypto_int32 crypto_int32_x)
+{
+  return crypto_int32_negative_mask(crypto_int32_x) | crypto_int32_negative_mask(-crypto_int32_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_zero_mask(crypto_int32 crypto_int32_x)
+{
+  return ~crypto_int32_nonzero_mask(crypto_int32_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_positive_mask(crypto_int32 crypto_int32_x)
+{
+  crypto_int32 crypto_int32_z = -crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_x & crypto_int32_z;
+  return crypto_int32_negative_mask(crypto_int32_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_unequal_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y)
+{
+  crypto_int32 crypto_int32_xy = crypto_int32_x ^ crypto_int32_y;
+  return crypto_int32_nonzero_mask(crypto_int32_xy);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_equal_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y)
+{
+  return ~crypto_int32_unequal_mask(crypto_int32_x,crypto_int32_y);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_smaller_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y)
+{
+  crypto_int32 crypto_int32_xy = crypto_int32_x ^ crypto_int32_y;
+  crypto_int32 crypto_int32_z = crypto_int32_x - crypto_int32_y;
+  crypto_int32_z ^= crypto_int32_xy & (crypto_int32_z ^ crypto_int32_x);
+  return crypto_int32_negative_mask(crypto_int32_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_min(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y)
+{
+  crypto_int32 crypto_int32_xy = crypto_int32_y ^ crypto_int32_x;
+  crypto_int32 crypto_int32_z = crypto_int32_y - crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_xy & (crypto_int32_z ^ crypto_int32_y);
+  crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z);
+  crypto_int32_z &= crypto_int32_xy;
+  return crypto_int32_x ^ crypto_int32_z;
+}
+
+GCC_ATTR_UNUSED
+static crypto_int32 crypto_int32_max(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y)
+{
+  crypto_int32 crypto_int32_xy = crypto_int32_y ^ crypto_int32_x;
+  crypto_int32 crypto_int32_z = crypto_int32_y - crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_xy & (crypto_int32_z ^ crypto_int32_y);
+  crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z);
+  crypto_int32_z &= crypto_int32_xy;
+  return crypto_int32_y ^ crypto_int32_z;
+}
+
+GCC_ATTR_UNUSED
+static void crypto_int32_minmax(crypto_int32 *crypto_int32_a,crypto_int32 *crypto_int32_b)
+{
+  crypto_int32 crypto_int32_x = *crypto_int32_a;
+  crypto_int32 crypto_int32_y = *crypto_int32_b;
+  crypto_int32 crypto_int32_xy = crypto_int32_y ^ crypto_int32_x;
+  crypto_int32 crypto_int32_z = crypto_int32_y - crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_xy & (crypto_int32_z ^ crypto_int32_y);
+  crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z);
+  crypto_int32_z &= crypto_int32_xy;
+  *crypto_int32_a = crypto_int32_x ^ crypto_int32_z;
+  *crypto_int32_b = crypto_int32_y ^ crypto_int32_z;
+}
+
+#endif
+
+/* from libmceliece-20230612/inttypes/crypto_uintN.h */
+#ifndef crypto_uint64_h
+#define crypto_uint64_h
+
+#define crypto_uint64 uint64_t
+#define crypto_uint64_signed int64_t
+
+GCC_ATTR_UNUSED
+static crypto_uint64_signed crypto_uint64_signed_negative_mask(crypto_uint64_signed crypto_uint64_signed_x)
+{
+  return crypto_uint64_signed_x >> (64-1);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_nonzero_mask(crypto_uint64 crypto_uint64_x)
+{
+  return crypto_uint64_signed_negative_mask(crypto_uint64_x) | crypto_uint64_signed_negative_mask(-crypto_uint64_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_zero_mask(crypto_uint64 crypto_uint64_x)
+{
+  return ~crypto_uint64_nonzero_mask(crypto_uint64_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_unequal_mask(crypto_uint64 crypto_uint64_x,crypto_uint64 crypto_uint64_y)
+{
+  crypto_uint64 crypto_uint64_xy = crypto_uint64_x ^ crypto_uint64_y;
+  return crypto_uint64_nonzero_mask(crypto_uint64_xy);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_equal_mask(crypto_uint64 crypto_uint64_x,crypto_uint64 crypto_uint64_y)
+{
+  return ~crypto_uint64_unequal_mask(crypto_uint64_x,crypto_uint64_y);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_smaller_mask(crypto_uint64 crypto_uint64_x,crypto_uint64 crypto_uint64_y)
+{
+  crypto_uint64 crypto_uint64_xy = crypto_uint64_x ^ crypto_uint64_y;
+  crypto_uint64 crypto_uint64_z = crypto_uint64_x - crypto_uint64_y;
+  crypto_uint64_z ^= crypto_uint64_xy & (crypto_uint64_z ^ crypto_uint64_x ^ (((crypto_uint64) 1) << (64-1)));
+  return crypto_uint64_signed_negative_mask(crypto_uint64_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_min(crypto_uint64 crypto_uint64_x,crypto_uint64 crypto_uint64_y)
+{
+  crypto_uint64 crypto_uint64_xy = crypto_uint64_y ^ crypto_uint64_x;
+  crypto_uint64 crypto_uint64_z = crypto_uint64_y - crypto_uint64_x;
+  crypto_uint64_z ^= crypto_uint64_xy & (crypto_uint64_z ^ crypto_uint64_y ^ (((crypto_uint64) 1) << (64-1)));
+  crypto_uint64_z = crypto_uint64_signed_negative_mask(crypto_uint64_z);
+  crypto_uint64_z &= crypto_uint64_xy;
+  return crypto_uint64_x ^ crypto_uint64_z;
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint64 crypto_uint64_max(crypto_uint64 crypto_uint64_x,crypto_uint64 crypto_uint64_y)
+{
+  crypto_uint64 crypto_uint64_xy = crypto_uint64_y ^ crypto_uint64_x;
+  crypto_uint64 crypto_uint64_z = crypto_uint64_y - crypto_uint64_x;
+  crypto_uint64_z ^= crypto_uint64_xy & (crypto_uint64_z ^ crypto_uint64_y ^ (((crypto_uint64) 1) << (64-1)));
+  crypto_uint64_z = crypto_uint64_signed_negative_mask(crypto_uint64_z);
+  crypto_uint64_z &= crypto_uint64_xy;
+  return crypto_uint64_y ^ crypto_uint64_z;
+}
+
+GCC_ATTR_UNUSED
+static void crypto_uint64_minmax(crypto_uint64 *crypto_uint64_a,crypto_uint64 *crypto_uint64_b)
+{
+  crypto_uint64 crypto_uint64_x = *crypto_uint64_a;
+  crypto_uint64 crypto_uint64_y = *crypto_uint64_b;
+  crypto_uint64 crypto_uint64_xy = crypto_uint64_y ^ crypto_uint64_x;
+  crypto_uint64 crypto_uint64_z = crypto_uint64_y - crypto_uint64_x;
+  crypto_uint64_z ^= crypto_uint64_xy & (crypto_uint64_z ^ crypto_uint64_y ^ (((crypto_uint64) 1) << (64-1)));
+  crypto_uint64_z = crypto_uint64_signed_negative_mask(crypto_uint64_z);
+  crypto_uint64_z &= crypto_uint64_xy;
+  *crypto_uint64_a = crypto_uint64_x ^ crypto_uint64_z;
+  *crypto_uint64_b = crypto_uint64_y ^ crypto_uint64_z;
+}
+
+#endif
+
+/* from libmceliece-20230612/inttypes/crypto_uintN.h */
+#ifndef crypto_uint16_h
+#define crypto_uint16_h
+
+#define crypto_uint16 uint16_t
+#define crypto_uint16_signed int16_t
+
+GCC_ATTR_UNUSED
+static crypto_uint16_signed crypto_uint16_signed_negative_mask(crypto_uint16_signed crypto_uint16_signed_x)
+{
+  return crypto_uint16_signed_x >> (16-1);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_nonzero_mask(crypto_uint16 crypto_uint16_x)
+{
+  return crypto_uint16_signed_negative_mask(crypto_uint16_x) | crypto_uint16_signed_negative_mask(-crypto_uint16_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_zero_mask(crypto_uint16 crypto_uint16_x)
+{
+  return ~crypto_uint16_nonzero_mask(crypto_uint16_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_unequal_mask(crypto_uint16 crypto_uint16_x,crypto_uint16 crypto_uint16_y)
+{
+  crypto_uint16 crypto_uint16_xy = crypto_uint16_x ^ crypto_uint16_y;
+  return crypto_uint16_nonzero_mask(crypto_uint16_xy);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_equal_mask(crypto_uint16 crypto_uint16_x,crypto_uint16 crypto_uint16_y)
+{
+  return ~crypto_uint16_unequal_mask(crypto_uint16_x,crypto_uint16_y);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_smaller_mask(crypto_uint16 crypto_uint16_x,crypto_uint16 crypto_uint16_y)
+{
+  crypto_uint16 crypto_uint16_xy = crypto_uint16_x ^ crypto_uint16_y;
+  crypto_uint16 crypto_uint16_z = crypto_uint16_x - crypto_uint16_y;
+  crypto_uint16_z ^= crypto_uint16_xy & (crypto_uint16_z ^ crypto_uint16_x ^ (((crypto_uint16) 1) << (16-1)));
+  return crypto_uint16_signed_negative_mask(crypto_uint16_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_min(crypto_uint16 crypto_uint16_x,crypto_uint16 crypto_uint16_y)
+{
+  crypto_uint16 crypto_uint16_xy = crypto_uint16_y ^ crypto_uint16_x;
+  crypto_uint16 crypto_uint16_z = crypto_uint16_y - crypto_uint16_x;
+  crypto_uint16_z ^= crypto_uint16_xy & (crypto_uint16_z ^ crypto_uint16_y ^ (((crypto_uint16) 1) << (16-1)));
+  crypto_uint16_z = crypto_uint16_signed_negative_mask(crypto_uint16_z);
+  crypto_uint16_z &= crypto_uint16_xy;
+  return crypto_uint16_x ^ crypto_uint16_z;
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint16 crypto_uint16_max(crypto_uint16 crypto_uint16_x,crypto_uint16 crypto_uint16_y)
+{
+  crypto_uint16 crypto_uint16_xy = crypto_uint16_y ^ crypto_uint16_x;
+  crypto_uint16 crypto_uint16_z = crypto_uint16_y - crypto_uint16_x;
+  crypto_uint16_z ^= crypto_uint16_xy & (crypto_uint16_z ^ crypto_uint16_y ^ (((crypto_uint16) 1) << (16-1)));
+  crypto_uint16_z = crypto_uint16_signed_negative_mask(crypto_uint16_z);
+  crypto_uint16_z &= crypto_uint16_xy;
+  return crypto_uint16_y ^ crypto_uint16_z;
+}
+
+GCC_ATTR_UNUSED
+static void crypto_uint16_minmax(crypto_uint16 *crypto_uint16_a,crypto_uint16 *crypto_uint16_b)
+{
+  crypto_uint16 crypto_uint16_x = *crypto_uint16_a;
+  crypto_uint16 crypto_uint16_y = *crypto_uint16_b;
+  crypto_uint16 crypto_uint16_xy = crypto_uint16_y ^ crypto_uint16_x;
+  crypto_uint16 crypto_uint16_z = crypto_uint16_y - crypto_uint16_x;
+  crypto_uint16_z ^= crypto_uint16_xy & (crypto_uint16_z ^ crypto_uint16_y ^ (((crypto_uint16) 1) << (16-1)));
+  crypto_uint16_z = crypto_uint16_signed_negative_mask(crypto_uint16_z);
+  crypto_uint16_z &= crypto_uint16_xy;
+  *crypto_uint16_a = crypto_uint16_x ^ crypto_uint16_z;
+  *crypto_uint16_b = crypto_uint16_y ^ crypto_uint16_z;
+}
+
+#endif
+
+/* from libmceliece-20230612/inttypes/crypto_uintN.h */
+#ifndef crypto_uint32_h
+#define crypto_uint32_h
+
+#define crypto_uint32 uint32_t
+#define crypto_uint32_signed int32_t
+
+GCC_ATTR_UNUSED
+static crypto_uint32_signed crypto_uint32_signed_negative_mask(crypto_uint32_signed crypto_uint32_signed_x)
+{
+  return crypto_uint32_signed_x >> (32-1);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_nonzero_mask(crypto_uint32 crypto_uint32_x)
+{
+  return crypto_uint32_signed_negative_mask(crypto_uint32_x) | crypto_uint32_signed_negative_mask(-crypto_uint32_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_zero_mask(crypto_uint32 crypto_uint32_x)
+{
+  return ~crypto_uint32_nonzero_mask(crypto_uint32_x);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_unequal_mask(crypto_uint32 crypto_uint32_x,crypto_uint32 crypto_uint32_y)
+{
+  crypto_uint32 crypto_uint32_xy = crypto_uint32_x ^ crypto_uint32_y;
+  return crypto_uint32_nonzero_mask(crypto_uint32_xy);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_equal_mask(crypto_uint32 crypto_uint32_x,crypto_uint32 crypto_uint32_y)
+{
+  return ~crypto_uint32_unequal_mask(crypto_uint32_x,crypto_uint32_y);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_smaller_mask(crypto_uint32 crypto_uint32_x,crypto_uint32 crypto_uint32_y)
+{
+  crypto_uint32 crypto_uint32_xy = crypto_uint32_x ^ crypto_uint32_y;
+  crypto_uint32 crypto_uint32_z = crypto_uint32_x - crypto_uint32_y;
+  crypto_uint32_z ^= crypto_uint32_xy & (crypto_uint32_z ^ crypto_uint32_x ^ (((crypto_uint32) 1) << (32-1)));
+  return crypto_uint32_signed_negative_mask(crypto_uint32_z);
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_min(crypto_uint32 crypto_uint32_x,crypto_uint32 crypto_uint32_y)
+{
+  crypto_uint32 crypto_uint32_xy = crypto_uint32_y ^ crypto_uint32_x;
+  crypto_uint32 crypto_uint32_z = crypto_uint32_y - crypto_uint32_x;
+  crypto_uint32_z ^= crypto_uint32_xy & (crypto_uint32_z ^ crypto_uint32_y ^ (((crypto_uint32) 1) << (32-1)));
+  crypto_uint32_z = crypto_uint32_signed_negative_mask(crypto_uint32_z);
+  crypto_uint32_z &= crypto_uint32_xy;
+  return crypto_uint32_x ^ crypto_uint32_z;
+}
+
+GCC_ATTR_UNUSED
+static crypto_uint32 crypto_uint32_max(crypto_uint32 crypto_uint32_x,crypto_uint32 crypto_uint32_y)
+{
+  crypto_uint32 crypto_uint32_xy = crypto_uint32_y ^ crypto_uint32_x;
+  crypto_uint32 crypto_uint32_z = crypto_uint32_y - crypto_uint32_x;
+  crypto_uint32_z ^= crypto_uint32_xy & (crypto_uint32_z ^ crypto_uint32_y ^ (((crypto_uint32) 1) << (32-1)));
+  crypto_uint32_z = crypto_uint32_signed_negative_mask(crypto_uint32_z);
+  crypto_uint32_z &= crypto_uint32_xy;
+  return crypto_uint32_y ^ crypto_uint32_z;
+}
+
+GCC_ATTR_UNUSED
+static void crypto_uint32_minmax(crypto_uint32 *crypto_uint32_a,crypto_uint32 *crypto_uint32_b)
+{
+  crypto_uint32 crypto_uint32_x = *crypto_uint32_a;
+  crypto_uint32 crypto_uint32_y = *crypto_uint32_b;
+  crypto_uint32 crypto_uint32_xy = crypto_uint32_y ^ crypto_uint32_x;
+  crypto_uint32 crypto_uint32_z = crypto_uint32_y - crypto_uint32_x;
+  crypto_uint32_z ^= crypto_uint32_xy & (crypto_uint32_z ^ crypto_uint32_y ^ (((crypto_uint32) 1) << (32-1)));
+  crypto_uint32_z = crypto_uint32_signed_negative_mask(crypto_uint32_z);
+  crypto_uint32_z &= crypto_uint32_xy;
+  *crypto_uint32_a = crypto_uint32_x ^ crypto_uint32_z;
+  *crypto_uint32_b = crypto_uint32_y ^ crypto_uint32_z;
+}
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/vec.h */
+#ifndef VEC_H
+#define VEC_H
+
+
+
+typedef uint64_t vec;
+
+static inline vec vec_setbits(vec b)
+{
+       vec ret = -b;
+
+       return ret;
+}
+
+static inline vec vec_set1_16b(uint16_t v)
+{
+       vec ret;
+
+       ret = v;
+       ret |= ret << 16;
+       ret |= ret << 32;
+
+       return ret;
+}
+
+static inline void vec_copy(vec * out, vec * in)
+{
+       int i;
+
+       for (i = 0; i < GFBITS; i++)
+               out[i] = in[i];
+}
+
+static inline vec vec_or_reduce(vec * a)
+{
+       int i;
+       vec ret;
+
+       ret = a[0];
+       for (i = 1; i < GFBITS; i++)
+               ret |= a[i];
+
+       return ret;
+}
+
+static inline int vec_testz(vec a)
+{
+       a |= a >> 32;
+       a |= a >> 16;
+       a |= a >> 8;
+       a |= a >> 4;
+       a |= a >> 2;
+       a |= a >> 1;
+
+       return (a&1)^1;
+}
+
+static void vec_mul(vec *, const vec *, const vec *);
+static void vec_sq(vec *, vec *);
+static void vec_inv(vec *, vec *);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/benes.h */
+/*
+  This file is for Benes network related functions
+*/
+
+#ifndef BENES_H
+#define BENES_H
+
+
+static void benes(vec *, const unsigned char *, int);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/bm.h */
+/*
+  This file is for the inversion-free Berlekamp-Massey algorithm
+  see https://ieeexplore.ieee.org/document/87857
+*/
+
+#ifndef BM_H
+#define BM_H
+
+
+static void bm(vec [][GFBITS], vec [][ GFBITS ]);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/controlbits.h */
+/* This file is for implementing the Nassimi-Sahni algorithm */
+/* See David Nassimi, Sartaj Sahni "Parallel algorithms to set up the Benes permutation network" */
+/* See also https://cr.yp.to/papers/controlbits-20200923.pdf */
+
+#ifndef CONTROLBITS_H
+#define CONTROLBITS_H
+
+
+
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/decrypt.h */
+/*
+  This file is for Niederreiter decryption
+*/
+
+#ifndef DECRYPT_H
+#define DECRYPT_H
+
+static int decrypt(unsigned char *, const unsigned char *, const unsigned char *);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/encrypt.h */
+/*
+  This file is for Niederreiter encryption
+*/
+/* 20230102 djb: rename encrypt() as pke_encrypt() */
+
+#ifndef ENCRYPT_H
+#define ENCRYPT_H
+
+static void pke_encrypt(unsigned char *, const unsigned char *, unsigned char *);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft_consts.h */
+#ifndef fft_consts_h
+#define fft_consts_h
+
+
+
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft.h */
+/*
+  This file is for the Gao-Mateer FFT
+  see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
+*/
+
+#ifndef FFT_H
+#define FFT_H
+
+
+static void fft(vec [][GFBITS], vec [][GFBITS]);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft_powers.h */
+#ifndef fft_powers_h
+#define fft_powers_h
+
+
+
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft_scalars_2x.h */
+#ifndef fft_scalars_2x_h
+#define fft_scalars_2x_h
+
+
+
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft_scalars_4x.h */
+#ifndef fft_scalars_4x_h
+#define fft_scalars_4x_h
+
+
+
+
+#endif
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft_tr.h */
+/*
+  This file is for transpose of the Gao-Mateer FFT
+*/
+
+#ifndef FFT_TR_H
+#define FFT_TR_H
+
+
+static void fft_tr(vec out[][GFBITS], vec in[][ GFBITS ]);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/gf.h */
+/*
+  This file is for functions for field arithmetic
+*/
+/* 20221231 djb: const for GF_mul */
+
+#ifndef GF_H
+#define GF_H
+
+
+
+typedef uint16_t gf;
+
+gf gf_iszero(gf);
+gf gf_mul(gf, gf);
+gf gf_frac(gf, gf);
+gf gf_inv(gf);
+
+static void GF_mul(gf *, const gf *, const gf *);
+
+/* 2 field multiplications */
/* Two GF(2^13) multiplications at once: multiply the field element a by
   b0 and by b1 in parallel, with b0/b1 packed into the low/high 32-bit
   halves of one 64-bit word.  Returns a*b0 in bits 0..12 and a*b1 in
   bits 32..44.  Branch-free and data-independent (constant time). */
static inline uint64_t gf_mul2(gf a, gf b0, gf b1)
{
       int i;

       uint64_t tmp=0;
       uint64_t t0;
       uint64_t t1;
       uint64_t t;
       uint64_t mask = 0x0000000100000001;

       t0 = a;
       t1 = b1;
       t1 = (t1 << 32) | b0;

       /* schoolbook carry-less multiplication, one bit of b0/b1 per step */
       for (i = 0; i < GFBITS; i++)
       {
               tmp ^= t0 * (t1 & mask);
               mask += mask;
       }

       /* reduce modulo the field polynomial x^13 + x^4 + x^3 + x + 1,
          independently in each 32-bit lane (shifts 9/10/12/13 map bit
          13+k onto bits k+4, k+3, k+1, k) */

       t = tmp & 0x01FF000001FF0000;
       tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);

       t = tmp & 0x0000E0000000E000;
       tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);

       return tmp & 0x00001FFF00001FFF;
}
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/hash.h */
+
+#define shake crypto_xof_shake256
+
+#define crypto_hash_32b(out,in,inlen) \
+  shake(out,32,in,inlen)
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/int32_sort.h */
+#ifndef int32_sort_h
+#define int32_sort_h
+
+
+
/* Branchless compare-exchange: after this macro, a <= b.  The work is
   done in 64-bit arithmetic so the subtraction cannot overflow. */
#define int32_MINMAX(a,b) \
do { \
  int64_t ab = (int64_t)b ^ (int64_t)a; \
  int64_t c = (int64_t)b - (int64_t)a; \
  c ^= ab & (c ^ b); \
  c >>= 31; \
  c &= ab; \
  a ^= c; \
  b ^= c; \
} while(0)

/* In-place, data-oblivious sort of the n signed 32-bit values in x,
   using a Batcher-style odd-even merging network: the sequence of
   compare-exchanges depends only on n, never on the data. */
static void int32_sort(int32_t *x,long long n)
{
  long long half, pdist, qdist, rdist, idx;

  if (n < 2) return;

  /* smallest power of two with half < n <= 2*half */
  half = 1;
  while (half < n - half) half += half;

  for (pdist = half; pdist > 0; pdist >>= 1) {
    for (idx = 0; idx < n - pdist; ++idx) {
      if (!(idx & pdist))
        int32_MINMAX(x[idx], x[idx + pdist]);
    }
    idx = 0;
    for (qdist = half; qdist > pdist; qdist >>= 1) {
      for (; idx < n - qdist; ++idx) {
        if (!(idx & pdist)) {
          int32_t v = x[idx + pdist];
          for (rdist = qdist; rdist > pdist; rdist >>= 1)
            int32_MINMAX(v, x[idx + rdist]);
          x[idx + pdist] = v;
        }
      }
    }
  }
}
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/operations.h */
+#ifndef OPERATIONS_H
+#define OPERATIONS_H
+
+
+static void operation_enc(
+       unsigned char *c,
+       unsigned char *key,
+       const unsigned char *pk
+);
+
+static void operation_dec(
+       unsigned char *key,
+       const unsigned char *c,
+       const unsigned char *sk
+);
+
+static void operation_keypair
+(
+       unsigned char *pk,
+       unsigned char *sk
+);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/pk_gen.h */
+/*
+  This file is for public-key generation
+*/
+
+#ifndef PK_GEN_H
+#define PK_GEN_H
+
+
+static int pk_gen(unsigned char *, const unsigned char *, uint32_t *, int16_t *, uint64_t *);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/sk_gen.h */
+/*
+  This file is for secret-key generation
+*/
+
+#ifndef SK_GEN_H
+#define SK_GEN_H
+
+
+
+static int genpoly_gen(gf *, gf *);
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/transpose.h */
+/*
+  This file is for matrix transposition
+*/
+
+#ifndef TRANSPOSE_H
+#define TRANSPOSE_H
+
+/* input: in, a 64x64 matrix over GF(2) */
+/* output: out, transpose of in */
/* Transpose a 64x64 bit matrix (one row per uint64_t) with the classic
   recursive block-swap: at step d, 2^d x 2^d sub-blocks are exchanged
   using the complementary even/odd masks in masks[d].  Safe for
   out == in since the input is copied first. */
static inline void transpose_64x64(uint64_t * out, uint64_t * in)
{
       int i, j, s, d;

       uint64_t x, y;
       uint64_t masks[6][2] = {
                               {0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
                               {0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
                               {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
                               {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
                               {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
                               {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}
                              };

       for (i = 0; i < 64; i++)
               out[i] = in[i];

       for (d = 5; d >= 0; d--)
       {
               s = 1 << d;  /* block width at this level */

               for (i = 0; i < 64; i += s*2)
               for (j = i; j < i+s; j++)
               {
                       x = (out[j] & masks[d][0]) | ((out[j+s] & masks[d][0]) << s);
                       y = ((out[j] & masks[d][1]) >> s) | (out[j+s] & masks[d][1]);

                       out[j+0] = x;
                       out[j+s] = y;
               }
       }
}
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/uint16_sort.h */
+#ifndef uint16_sort_h
+#define uint16_sort_h
+
+
+
/* Branchless compare-exchange: after this macro, a <= b.  Only valid
   when b - a cannot wrap past 2^15, i.e. all inputs are below 2^15 —
   which holds for the 13-bit field elements and the indices < SYS_N
   that this sort is applied to. */
#define uint16_MINMAX(a,b) \
do { \
  uint16_t c = b - a; \
  c >>= 15; \
  c = -c; \
  c &= a ^ b; \
  a ^= c; \
  b ^= c; \
} while(0)

/* In-place, data-oblivious sort of the n 16-bit values in x using a
   Batcher odd-even merging network; the compare-exchange sequence
   depends only on n, not on the data (constant time). */
static void uint16_sort(uint16_t *x,long long n)
{
  long long top,p,q,r,i;

  if (n < 2) return;
  top = 1;
  while (top < n - top) top += top;

  for (p = top;p > 0;p >>= 1) {
    for (i = 0;i < n - p;++i)
      if (!(i & p))
        uint16_MINMAX(x[i],x[i+p]);
    i = 0;
    for (q = top;q > p;q >>= 1) {
      for (;i < n - q;++i) {
        if (!(i & p)) {
          /* was int16_t: use the unsigned element type to avoid the
             implementation-defined signed narrowing conversion */
          uint16_t a = x[i + p];
          for (r = q;r > p;r >>= 1)
            uint16_MINMAX(a,x[i+r]);
          x[i + p] = a;
        }
      }
    }
  }
}
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/uint64_sort.h */
+#ifndef uint64_sort_h
+#define uint64_sort_h
+
+
+
/* Branchless compare-exchange: after this macro, a <= b.  Only valid
   when b - a cannot wrap past 2^63. */
#define uint64_MINMAX(a,b) \
do { \
  uint64_t c = b - a; \
  c >>= 63; \
  c = -c; \
  c &= a ^ b; \
  a ^= c; \
  b ^= c; \
} while(0)

/* In-place, data-oblivious sort of the n 64-bit values in x using a
   Batcher-style odd-even merging network; the sequence of
   compare-exchanges depends only on n, never on the data. */
static void uint64_sort(uint64_t *x,long long n)
{
  long long half, pd, qd, rd, k;

  if (n < 2) return;

  /* smallest power of two with half < n <= 2*half */
  half = 1;
  while (half < n - half) half += half;

  for (pd = half; pd > 0; pd >>= 1) {
    for (k = 0; k < n - pd; ++k) {
      if (!(k & pd))
        uint64_MINMAX(x[k], x[k + pd]);
    }
    k = 0;
    for (qd = half; qd > pd; qd >>= 1) {
      for (; k < n - qd; ++k) {
        if (!(k & pd)) {
          uint64_t v = x[k + pd];
          for (rd = qd; rd > pd; rd >>= 1)
            uint64_MINMAX(v, x[k + rd]);
          x[k + pd] = v;
        }
      }
    }
  }
}
+
+#endif
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/util.h */
+/*
+  This file is for loading/storing data in a little-endian fashion
+*/
+
+#ifndef UTIL_H
+#define UTIL_H
+
+
+
/* Store the low i bytes of in to out, least-significant byte first. */
static inline void store_i(unsigned char *out, uint64_t in, int i)
{
       int k;

       for (k = 0; k < i; k++)
       {
               out[k] = (unsigned char) (in & 0xFF);
               in >>= 8;
       }
}
+
/* Store a field element as two bytes, little-endian. */
static inline void store_gf(unsigned char *dest, uint16_t a)
{
       dest[0] = (unsigned char) (a & 0xFF);
       dest[1] = (unsigned char) (a >> 8);
}
+
/* Load a field element from two little-endian bytes at src, masked
   down to the low GFBITS bits by GFMASK. */
static inline uint16_t load_gf(const unsigned char *src)
{
       uint16_t a;

       a = src[1];
       a <<= 8;
       a |= src[0];

       return a & GFMASK;
}
+
/* Load a 32-bit value from four little-endian bytes at src. */
static inline uint32_t load4(const unsigned char *src)
{
       uint32_t r = 0;
       int k;

       for (k = 3; k >= 0; k--)
               r = (r << 8) | src[k];

       return r;
}
+
/* Load the secret Goppa polynomial (SYS_T little-endian field elements
   at in) into bitsliced form: bit i of coefficients 0..63 goes into
   out[0][i], bit i of coefficients 64..127 into out[1][i], with the
   coefficient index selecting the bit position inside each word.
   NOTE(review): the loop reads irr[j+64], so this assumes SYS_T >= 128;
   holds for the 6688128 parameter set (SYS_T = 128) — confirm if reused. */
static inline void irr_load(vec out[][GFBITS], const unsigned char * in)
{
       int i, j;
       uint64_t v0 = 0, v1 = 0;
       uint16_t irr[ SYS_T ];

       for (i = 0; i < SYS_T; i++)
               irr[i] = load_gf(in + i*2);

       for (i = 0; i < GFBITS; i++)
       {
               for (j = 63; j >= 0; j--)
               {
                       v0 <<= 1;
                       v1 <<= 1;
                       v0 |= (irr[j] >> i) & 1;
                       v1 |= (irr[j+64] >> i) & 1;
               }

               out[0][i] = v0;
               out[1][i] = v1;
       }
}
+
/* Store a 64-bit value as eight bytes, least-significant byte first. */
static inline void store8(unsigned char *out, uint64_t in)
{
       int k;

       for (k = 0; k < 8; k++)
       {
               out[k] = (unsigned char) (in & 0xFF);
               in >>= 8;
       }
}
+
/* Load a 64-bit value from eight little-endian bytes at in. */
static inline uint64_t load8(const unsigned char * in)
{
       uint64_t r = 0;
       int k;

       for (k = 7; k >= 0; k--)
               r = (r << 8) | in[k];

       return r;
}
+
+#endif
+
+
/* SHAKE256 XOF via libgcrypt's message-digest API: absorb mlen bytes
   at m, then squeeze hlen bytes of output into h.  A failing
   _gcry_md_open can only indicate internal misconfiguration, so it is
   treated as fatal. */
static void crypto_xof_shake256(unsigned char *h,long long hlen,
                               const unsigned char *m,long long mlen)
{
  gcry_md_hd_t mdh;
  gcry_err_code_t ec;

  ec = _gcry_md_open (&mdh, GCRY_MD_SHAKE256, 0);
  if (ec)
    log_fatal ("internal md_open failed: %d\n", ec);
  _gcry_md_write (mdh, m, mlen);
  _gcry_md_extract (mdh, GCRY_MD_SHAKE256, h, hlen);
  _gcry_md_close (mdh);
}
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/benes.c */
+/*
+  This file is for Benes network related functions
+
+  For the implementation strategy, see
+  https://eprint.iacr.org/2017/793.pdf
+*/
+/* 20221230 djb: add linker lines */
+
+/* linker define benes */
+
+
+/* middle layers of the benes network */
/* Apply one conditional-swap layer of stride s = 2^lgs to both 64-word
   halves of data.  Each consumed word of bits selects, bit by bit,
   whether the corresponding pair of bits is exchanged; the XOR-mask
   form keeps this branch-free (constant time). */
static void layer_in(uint64_t data[2][64], uint64_t * bits, int lgs)
{
       int i, j, s;

       uint64_t d;

       s = 1 << lgs;

       for (i = 0; i < 64; i += s*2)
       for (j = i; j < i+s; j++)
       {

               d = (data[0][j+0] ^ data[0][j+s]);
               d &= (*bits++);
               data[0][j+0] ^= d;
               data[0][j+s] ^= d;

               d = (data[1][j+0] ^ data[1][j+s]);
               d &= (*bits++);
               data[1][j+0] ^= d;
               data[1][j+s] ^= d;
       }
}
+
+/* first and last layers of the benes network */
/* Apply one conditional-swap layer of stride s = 2^lgs across the full
   128-word array data (used for the outer Benes layers).  Same
   branch-free XOR-mask swap as layer_in. */
static void layer_ex(uint64_t * data, uint64_t * bits, int lgs)
{
       int i, j, s;

       uint64_t d;

       s = 1 << lgs;

       for (i = 0; i < 128; i += s*2)
       for (j = i; j < i+s; j++)
       {

               d = (data[j+0] ^ data[j+s]);
               d &= (*bits++);
               data[j+0] ^= d;
               data[j+s] ^= d;
       }
}
+
+/* input: r, sequence of bits to be permuted */
+/*        bits, condition bits of the Benes network */
+/*        rev, 0 for normal application; !0 for inverse */
+/* output: r, permuted bits */
/* Apply the Benes permutation network (condition bits taken from the
   secret key) to the 128 64-bit words in r.  The outer layers operate
   on a transposed (bit-matrix) view of r; the middle layers operate on
   the direct view.  For the inverse permutation (rev != 0) the
   condition-bit blocks are consumed back to front: start at
   bits + 12288 and step back 1024 bytes per 512-byte block read. */
static void benes(vec * r, const unsigned char * bits, int rev)
{
       int i, iter, inc;

       const unsigned char *bits_ptr;

       uint64_t r_int_v[2][64];
       uint64_t r_int_h[2][64];
       uint64_t b_int_v[64];
       uint64_t b_int_h[64];

       /* choose the direction through the condition-bit blocks */

       if (rev) { bits_ptr = bits + 12288; inc = -1024; }
       else     { bits_ptr = bits;         inc = 0;    }

       for (i = 0; i < 64; i++)
       {
               r_int_v[0][i] = r[i*2 + 0];
               r_int_v[1][i] = r[i*2 + 1];
       }

       transpose_64x64(r_int_h[0], r_int_v[0]);
       transpose_64x64(r_int_h[1], r_int_v[1]);

       /* outer layers 0..6, applied on the transposed representation */
       for (iter = 0; iter <= 6; iter++)
       {
               for (i = 0; i < 64; i++)
               {
                       b_int_v[i] = load8(bits_ptr); bits_ptr += 8;
               }

               bits_ptr += inc;

               transpose_64x64(b_int_h, b_int_v);

               layer_ex(r_int_h[0], b_int_h, iter);
       }

       transpose_64x64(r_int_v[0], r_int_h[0]);
       transpose_64x64(r_int_v[1], r_int_h[1]);

       /* middle layers: strides increase 0..5, then decrease 4..0 */
       for (iter = 0; iter <= 5; iter++)
       {
               for (i = 0; i < 64; i++) { b_int_v[i] = load8(bits_ptr); bits_ptr += 8; }
               bits_ptr += inc;

               layer_in(r_int_v, b_int_v, iter);
       }

       for (iter = 4; iter >= 0; iter--)
       {
               for (i = 0; i < 64; i++) { b_int_v[i] = load8(bits_ptr); bits_ptr += 8; }
               bits_ptr += inc;

               layer_in(r_int_v, b_int_v, iter);
       }

       transpose_64x64(r_int_h[0], r_int_v[0]);
       transpose_64x64(r_int_h[1], r_int_v[1]);

       /* outer layers again, mirrored: strides 6 down to 0 */
       for (iter = 6; iter >= 0; iter--)
       {
               for (i = 0; i < 64; i++)
               {
                       b_int_v[i] = load8(bits_ptr); bits_ptr += 8;
               }

               bits_ptr += inc;

               transpose_64x64(b_int_h, b_int_v);

               layer_ex(r_int_h[0], b_int_h, iter);
       }

       transpose_64x64(r_int_v[0], r_int_h[0]);
       transpose_64x64(r_int_v[1], r_int_h[1]);

       for (i = 0; i < 64; i++)
       {
               r[i*2+0] = r_int_v[0][i];
               r[i*2+1] = r_int_v[1][i];
       }
}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/bm.c */
+/*
+  This file is for implementating the inversion-free Berlekamp-Massey algorithm
+  see https://ieeexplore.ieee.org/document/87857
+
+  For the implementation strategy, see
+  https://eprint.iacr.org/2017/793.pdf
+*/
+/* 20221230 djb: add linker lines */
+
+/* linker define bm */
+/* linker use vec_mul */
+/* linker use gf_inv */
+
+
+
/* Return 0xFFFF if a is nonzero, 0 if a is zero — branch-free, for
   constant-time selection. */
static inline uint16_t mask_nonzero(gf a)
{
       uint32_t ret = a;

       ret -= 1;
       ret >>= 31;
       ret -= 1;

       return ret;
}
+
/* Return 0xFFFF if a <= b, else 0 — branch-free, for constant-time
   selection. */
static inline uint16_t mask_leq(uint16_t a, uint16_t b)
{
       uint32_t diff = (uint32_t) b - (uint32_t) a;

       /* the top bit of diff is set exactly when a > b */
       diff >>= 31;

       return (uint16_t) (diff - 1);
}
+
+static inline void vec_cmov(vec * out, vec * in, uint16_t mask)
+{
+       int i;
+
+       vec m0, m1;
+
+       m0 = vec_set1_16b(mask);
+       m1 = ~m0;
+
+       for (i = 0; i < GFBITS; i++)
+       {
+               out[i] = (in[i] & m0) | (out[i] & m1);
+               out[i] = (in[i] & m0) | (out[i] & m1);
+       }
+}
+
/* One butterfly step of the bit-matrix transposition in get_coefs:
   exchange the fields selected by the complementary masks mask[0]
   (low) and mask[1] (high) between in[idx0] and in[idx1], shifting by
   s = 2^b bit positions. */
static inline void interleave(vec *in, int idx0, int idx1, vec *mask, int b)
{
       int s = 1 << b;

       vec x, y;

       x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s);
       y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]);

       in[idx0] = x;
       in[idx1] = y;
}
+
+/* input: in, field elements in bitsliced form */
+/* output: out, field elements in non-bitsliced form */
/* Convert 13 bitsliced words (in) into 64 ordinary field elements
   (out): pad the slice count up to 16, run a 4-round interleave
   network (a 16x16 block transposition in disguise), then pick the
   13-bit elements out of the rearranged words. */
static inline void get_coefs(gf *out, vec *in)
{
       int i, k;

       vec mask[4][2];
       vec buf[16];

       /* pad 13 slices up to a power of two */
       for (i =  0; i < 13; i++) buf[i] = in[i];
       for (i = 13; i < 16; i++) buf[i] = 0;

       mask[0][0] = vec_set1_16b(0x5555);
       mask[0][1] = vec_set1_16b(0xAAAA);
       mask[1][0] = vec_set1_16b(0x3333);
       mask[1][1] = vec_set1_16b(0xCCCC);
       mask[2][0] = vec_set1_16b(0x0F0F);
       mask[2][1] = vec_set1_16b(0xF0F0);
       mask[3][0] = vec_set1_16b(0x00FF);
       mask[3][1] = vec_set1_16b(0xFF00);

       /* round 1: distance-8 pairs, 8-bit fields */
       interleave(buf,  0,  8, mask[3], 3);
       interleave(buf,  1,  9, mask[3], 3);
       interleave(buf,  2, 10, mask[3], 3);
       interleave(buf,  3, 11, mask[3], 3);
       interleave(buf,  4, 12, mask[3], 3);
       interleave(buf,  5, 13, mask[3], 3);
       interleave(buf,  6, 14, mask[3], 3);
       interleave(buf,  7, 15, mask[3], 3);

       /* round 2: distance-4 pairs, 4-bit fields */
       interleave(buf,  0,  4, mask[2], 2);
       interleave(buf,  1,  5, mask[2], 2);
       interleave(buf,  2,  6, mask[2], 2);
       interleave(buf,  3,  7, mask[2], 2);
       interleave(buf,  8, 12, mask[2], 2);
       interleave(buf,  9, 13, mask[2], 2);
       interleave(buf, 10, 14, mask[2], 2);
       interleave(buf, 11, 15, mask[2], 2);

       /* round 3: distance-2 pairs, 2-bit fields */
       interleave(buf,  0,  2, mask[1], 1);
       interleave(buf,  1,  3, mask[1], 1);
       interleave(buf,  4,  6, mask[1], 1);
       interleave(buf,  5,  7, mask[1], 1);
       interleave(buf,  8, 10, mask[1], 1);
       interleave(buf,  9, 11, mask[1], 1);
       interleave(buf, 12, 14, mask[1], 1);
       interleave(buf, 13, 15, mask[1], 1);

       /* round 4: adjacent pairs, 1-bit fields */
       interleave(buf,  0,  1, mask[0], 0);
       interleave(buf,  2,  3, mask[0], 0);
       interleave(buf,  4,  5, mask[0], 0);
       interleave(buf,  6,  7, mask[0], 0);
       interleave(buf,  8,  9, mask[0], 0);
       interleave(buf, 10, 11, mask[0], 0);
       interleave(buf, 12, 13, mask[0], 0);
       interleave(buf, 14, 15, mask[0], 0);

       /* extract the 64 field elements, 16 bits per lane */
       for (i = 0; i < 16; i++)
       for (k = 0; k <  4; k++)
               out[ k*16 + i ] = (buf[i] >> (k*16)) & GFMASK;
}
+
/* Shift the bitsliced 128-lane sequence in down by one position and
   insert the new field element e at the top lane (bit 63 of in[1]). */
static void update(vec in[][GFBITS], const gf e)
{
       int i;
       vec tmp;

       for (i = 0; i < GFBITS; i++)
       {
               tmp = (e >> i) & 1;

               in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63);
               in[1][i] = (in[1][i] >> 1) | (tmp      << 63);
       }
}
+
/* Reduce the two bitsliced accumulators to one field element: bit i of
   the result is the parity (XOR over all 128 lanes) of slice i, i.e.
   of in[0][i] ^ in[1][i]. */
static inline gf vec_reduce(vec in[][GFBITS])
{
       int i;
       vec tmp;
       gf ret = 0;

       for (i = GFBITS-1; i >= 0; i--)
       {
               tmp = in[0][i] ^ in[1][i];

               /* fold the 64-bit word down to its parity in bit 0 */
               tmp ^= tmp >> 32;
               tmp ^= tmp >> 16;
               tmp ^= tmp >> 8;
               tmp ^= tmp >> 4;
               tmp ^= tmp >> 2;
               tmp ^= tmp >> 1;

               ret <<= 1;
               ret |= tmp & 1;
       }

       return ret;
}
+
+/* input: in, sequence of field elements */
+/* output: out, minimal polynomial of in */
/* Inversion-free Berlekamp-Massey: given the 256 syndrome elements in
   bitsliced form (in), compute the minimal (error-locator) polynomial
   in bitsliced form (out).  All 256 iterations run unconditionally;
   data-dependent decisions are realized with masks (constant time). */
static void bm(vec out[][ GFBITS ], vec in[][ GFBITS ])
{
       int i;
       uint16_t N, L;
       uint16_t mask;
       uint64_t one = 1, t;

       vec prod[2][GFBITS];
       vec interval[2][GFBITS];
       vec dd[2][GFBITS], bb[2][GFBITS];
       vec B[2][GFBITS], C[2][GFBITS];
       vec B_tmp[2][GFBITS], C_tmp[2][GFBITS];
       vec v[GFBITS];

       gf d, b, c0 = 1;
       gf coefs[256];

       /* initialization: unpack the syndromes, set C = 1 (implicit in
          c0) and B = x (top lane of B[1]) */

       get_coefs(&coefs[  0], in[0]);
       get_coefs(&coefs[ 64], in[1]);
       get_coefs(&coefs[128], in[2]);
       get_coefs(&coefs[192], in[3]);

       C[0][0] = 0;
       C[1][0] = 0;
       B[0][0] = 0;
       B[1][0] = one << 63;

       for (i = 1; i < GFBITS; i++)
               C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0;

       b = 1;
       L = 0;

       /* interval holds the sliding window of recent syndromes */

       for (i = 0; i < GFBITS; i++)
               interval[0][i] = interval[1][i] = 0;

       for (N = 0; N < 256; N++)
       {
               /* discrepancy d = C * interval (reduced) + c0 * coefs[N] */
               vec_mul(prod[0], C[0], interval[0]);
               vec_mul(prod[1], C[1], interval[1]);
               update(interval, coefs[N]);
               d = vec_reduce(prod);

               t = gf_mul2(c0, coefs[N], b);
               d ^= t & 0xFFFFFFFF;

               /* update C and possibly swap in B, selected by mask */
               mask = mask_nonzero(d) & mask_leq(L*2, N);

               for (i = 0; i < GFBITS; i++)
               {
                       dd[0][i] = dd[1][i] = vec_setbits((d >> i) & 1);
                       bb[0][i] = bb[1][i] = vec_setbits((b >> i) & 1);
               }

               vec_mul(B_tmp[0], dd[0], B[0]);
               vec_mul(B_tmp[1], dd[1], B[1]);
               vec_mul(C_tmp[0], bb[0], C[0]);
               vec_mul(C_tmp[1], bb[1], C[1]);

               vec_cmov(B[0], C[0], mask);
               vec_cmov(B[1], C[1], mask);
               update(B, c0 & mask);

               for (i = 0; i < GFBITS; i++)
               {
                       C[0][i] = B_tmp[0][i] ^ C_tmp[0][i];
                       C[1][i] = B_tmp[1][i] ^ C_tmp[1][i];
               }

               c0 = t >> 32;
               b = (d & mask) | (b & ~mask);
               L = ((N+1-L) & mask) | (L & ~mask);
       }

       /* normalize: divide out the leading coefficient c0 */
       c0 = gf_inv(c0);

       for (i = 0; i < GFBITS; i++)
               v[i] = vec_setbits((c0 >> i) & 1);

       vec_mul(out[0], C[0], v);
       vec_mul(out[1], C[1], v);
}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/controlbits.c */
+/* This file is for implementing the Nassimi-Sahni algorithm */
+/* See David Nassimi, Sartaj Sahni "Parallel algorithms to set up the Benes permutationnetwork" */
+/* See also https://cr.yp.to/papers/controlbits-20200923.pdf */
+
+/* 20221230 djb: add linker line */
+
+/* linker define controlbitsfrompermutation */
+
+typedef int16_t int16;
+typedef int32_t int32;
+#define int32_min crypto_int32_min
+
+/* parameters: 1 <= w <= 14; n = 2^w */
+/* input: permutation pi of {0,1,...,n-1} */
+/* output: (2m-1)n/2 control bits at positions pos,pos+step,... */
+/* output position pos is by definition 1&(out[pos/8]>>(pos&7)) */
+/* caller must 0-initialize positions first */
+/* temp must have space for int32[2*n] */
/* Recursive core of the Nassimi-Sahni control-bit computation: emit
   the first-column bits, reduce pi to two half-size permutations, and
   recurse.  A, B and q alias regions of the caller-provided temp
   buffer (no extra allocation); the recursion reuses the same temp. */
static void cbrecursion(unsigned char *out,long long pos,long long step,const int16 *pi,long long w,long long n,int32 *temp)
{
#define A temp
#define B (temp+n)
#define q ((int16 *) (temp+n+n/4))
/* q can start anywhere between temp+n and temp+n/2 */

  long long x,i,j,k;

  /* base case: a single 2-way switch, controlled by pi[0] */
  if (w == 1) {
    out[pos>>3] ^= pi[0]<<(pos&7);
    return;
  }

  for (x = 0;x < n;++x) A[x] = ((pi[x]^1)<<16)|pi[x^1];
  int32_sort(A,n); /* A = (id<<16)+pibar */

  for (x = 0;x < n;++x) {
    int32 Ax = A[x];
    int32 px = Ax&0xffff;
    int32 cx = int32_min(px,x);
    B[x] = (px<<16)|cx;
  }
  /* B = (p<<16)+c */

  for (x = 0;x < n;++x) A[x] = (A[x]<<16)|x; /* A = (pibar<<16)+id */
  int32_sort(A,n); /* A = (id<<16)+pibar^-1 */

  for (x = 0;x < n;++x) A[x] = (A[x]<<16)+(B[x]>>16); /* A = (pibar^(-1)<<16)+pibar */
  int32_sort(A,n); /* A = (id<<16)+pibar^2 */

  /* iterate c(x) := min(c(x), c(p(x))) for powers of p; the packed
     representation differs for small vs large w to stay in 32 bits */
  if (w <= 10) {
    for (x = 0;x < n;++x) B[x] = ((A[x]&0xffff)<<10)|(B[x]&0x3ff);

    for (i = 1;i < w-1;++i) {
      /* B = (p<<10)+c */

      for (x = 0;x < n;++x) A[x] = ((B[x]&~0x3ff)<<6)|x; /* A = (p<<16)+id */
      int32_sort(A,n); /* A = (id<<16)+p^{-1} */

      for (x = 0;x < n;++x) A[x] = (A[x]<<20)|B[x]; /* A = (p^{-1}<<20)+(p<<10)+c */
      int32_sort(A,n); /* A = (id<<20)+(pp<<10)+cp */

      for (x = 0;x < n;++x) {
        int32 ppcpx = A[x]&0xfffff;
        int32 ppcx = (A[x]&0xffc00)|(B[x]&0x3ff);
        B[x] = int32_min(ppcx,ppcpx);
      }
    }
    for (x = 0;x < n;++x) B[x] &= 0x3ff;
  } else {
    for (x = 0;x < n;++x) B[x] = (A[x]<<16)|(B[x]&0xffff);

    for (i = 1;i < w-1;++i) {
      /* B = (p<<16)+c */

      for (x = 0;x < n;++x) A[x] = (B[x]&~0xffff)|x;
      int32_sort(A,n); /* A = (id<<16)+p^(-1) */

      for (x = 0;x < n;++x) A[x] = (A[x]<<16)|(B[x]&0xffff);
      /* A = p^(-1)<<16+c */

      if (i < w-2) {
        for (x = 0;x < n;++x) B[x] = (A[x]&~0xffff)|(B[x]>>16);
        /* B = (p^(-1)<<16)+p */
        int32_sort(B,n); /* B = (id<<16)+p^(-2) */
        for (x = 0;x < n;++x) B[x] = (B[x]<<16)|(A[x]&0xffff);
        /* B = (p^(-2)<<16)+c */
      }

      int32_sort(A,n);
      /* A = id<<16+cp */
      for (x = 0;x < n;++x) {
        int32 cpx = (B[x]&~0xffff)|(A[x]&0xffff);
        B[x] = int32_min(B[x],cpx);
      }
    }
    for (x = 0;x < n;++x) B[x] &= 0xffff;
  }

  for (x = 0;x < n;++x) A[x] = (((int32)pi[x])<<16)+x;
  int32_sort(A,n); /* A = (id<<16)+pi^(-1) */

  /* emit the first-column control bits f and form F */
  for (j = 0;j < n/2;++j) {
    long long lx = 2*j;
    int32 fj = B[lx]&1; /* f[j] */
    int32 Fx = lx+fj; /* F[x] */
    int32 Fx1 = Fx^1; /* F[x+1] */

    out[pos>>3] ^= fj<<(pos&7);
    pos += step;

    B[lx] = (A[lx]<<16)|Fx;
    B[lx+1] = (A[lx+1]<<16)|Fx1;
  }
  /* B = (pi^(-1)<<16)+F */

  int32_sort(B,n); /* B = (id<<16)+F(pi) */

  pos += (2*w-3)*step*(n/2);

  /* emit the last-column control bits l and form L */
  for (k = 0;k < n/2;++k) {
    long long y = 2*k;
    int32 lk = B[y]&1; /* l[k] */
    int32 Ly = y+lk; /* L[y] */
    int32 Ly1 = Ly^1; /* L[y+1] */

    out[pos>>3] ^= lk<<(pos&7);
    pos += step;

    A[y] = (Ly<<16)|(B[y]&0xffff);
    A[y+1] = (Ly1<<16)|(B[y+1]&0xffff);
  }
  /* A = (L<<16)+F(pi) */

  int32_sort(A,n); /* A = (id<<16)+F(pi(L)) = (id<<16)+M */

  pos -= (2*w-2)*step*(n/2);

  /* split M into the even/odd half-size sub-permutations and recurse */
  for (j = 0;j < n/2;++j) {
    q[j] = (A[2*j]&0xffff)>>1;
    q[j+n/2] = (A[2*j+1]&0xffff)>>1;
  }

  cbrecursion(out,pos,step*2,q,w-1,n/2,temp);
  cbrecursion(out,pos+step,step*2,q+n/2,w-1,n/2,temp);
}
+
+/* input: p, an array of int16 */
+/* input: n, length of p */
+/* input: s, meaning that stride-2^s cswaps are performed */
+/* input: cb, the control bits */
+/* output: the result of apply the control bits to p */
/* Apply one layer of stride-2^s conditional swaps to the n elements of
   p; the packed bits in cb (one bit per swap position, consumed in
   order) decide each swap.  Branch-free in the data (constant time). */
static void layer(int16_t *p, const unsigned char *cb, int s, int n)
{
  int base, off;
  int stride = 1 << s;
  int bitpos = 0;
  int16_t diff, mask;

  for (base = 0; base < n; base += 2 * stride)
  {
    for (off = 0; off < stride; off++)
    {
      diff = p[base + off] ^ p[base + off + stride];
      mask = (int16_t) (-((cb[bitpos >> 3] >> (bitpos & 7)) & 1));
      diff &= mask;
      p[base + off] ^= diff;
      p[base + off + stride] ^= diff;
      bitpos++;
    }
  }
}
+
+/* parameters: 1 <= w <= 14; n = 2^w */
+/* input: permutation pi of {0,1,...,n-1} */
+/* output: (2m-1)n/2 control bits at positions 0,1,... */
+/* output position pos is by definition 1&(out[pos/8]>>(pos&7)) */
/* Compute the (2w-1)*n/2 Benes control bits for permutation pi,
   then verify them by applying all layers to the identity and
   comparing against pi; retry on mismatch (the declassified diff
   reveals only success/failure, not the permutation).
   NOTE(review): temp/pi_test are VLAs of 2*n int32 + n int16 on the
   stack — for n = 2^13 that is ~80 KiB; confirm stack budget. */
static void controlbitsfrompermutation(unsigned char *out,const int16 *pi,long long w,long long n)
{
  int32 temp[2*n];
  int16 pi_test[n], diff;
  int i;
  unsigned char *ptr;

  while (1)
  {
    memset(out,0,(((2*w-1)*n/2)+7)/8);
    cbrecursion(out,0,1,pi,w,n,temp);

    /* check for correctness: apply the 2w-1 layers to the identity */

    for (i = 0; i < n; i++)
      pi_test[i] = i;

    ptr = out;
    for (i = 0; i < w; i++)
    {
      layer(pi_test, ptr, i, n);
      ptr += n >> 4;
    }

    for (i = w-2; i >= 0; i--)
    {
      layer(pi_test, ptr, i, n);
      ptr += n >> 4;
    }

    diff = 0;
    for (i = 0; i < n; i++)
      diff |= pi[i] ^ pi_test[i];

    diff = crypto_int16_nonzero_mask(diff);
    crypto_declassify(&diff,sizeof diff);
    if (diff == 0)
      break;
  }
}
+
+#undef A
+#undef B
+#undef q
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/decrypt.c */
+/*
+  This file is for Niederreiter decryption
+*/
+/* 20221230 djb: add linker lines */
+
+/* linker define decrypt */
+/* linker use benes bm fft fft_tr */
+/* linker use vec_mul vec_sq vec_inv */
+
+
+
+
/* Compute the bitsliced scaling factors inv[i] = 1/g(alpha)^2 over the
   128x64 support points (FFT of the secret polynomial, squared, then
   one batched inversion via prefix products), and out = inv masked by
   the received vector recv. */
static void scaling(vec out[][GFBITS], vec inv[][GFBITS], const unsigned char *sk, vec *recv)
{
       int i, j;

       vec irr_int[2][ GFBITS ];
       vec eval[128][ GFBITS ];
       vec tmp[ GFBITS ];

       /* eval = g(alpha)^2 at all support points */

       irr_load(irr_int, sk);

       fft(eval, irr_int);

       for (i = 0; i < 128; i++)
               vec_sq(eval[i], eval[i]);

       /* batched inversion: prefix products, one vec_inv, then walk back */

       vec_copy(inv[0], eval[0]);

       for (i = 1; i < 128; i++)
               vec_mul(inv[i], inv[i-1], eval[i]);

       vec_inv(tmp, inv[127]);

       for (i = 126; i >= 0; i--)
       {
               vec_mul(inv[i+1], tmp, inv[i]);
               vec_mul(tmp, tmp, eval[i+1]);
       }

       vec_copy(inv[0], tmp);

       /* mask by the received bits */

       for (i = 0; i < 128; i++)
       for (j = 0; j < GFBITS; j++)
               out[i][j] = inv[i][j] & recv[i];
}
+
/* Expand the SYND_BYTES-byte ciphertext s into 128 little-endian
   64-bit words (recv), zero-padding the tail. */
static void preprocess(vec *recv, const unsigned char *s)
{
       int i;
       unsigned char r[ 1024 ];

       for (i = 0; i < SYND_BYTES; i++)
               r[i] = s[i];

       for (i = SYND_BYTES; i < 1024; i++)
               r[i] = 0;

       for (i = 0; i < 128; i++)
               recv[i] = load8(r + i*8);
}
+
/* Pack the 128-word bitsliced error vector err into the first SYS_N/8
   bytes of e (little-endian per word). */
static void postprocess(unsigned char * e, vec * err)
{
       int i;
       unsigned char error8[ (1 << GFBITS)/8 ];

       for (i = 0; i < 128; i++)
               store8(error8 + i*8, err[i]);

       for (i = 0; i < SYS_N/8; i++)
               e[i] = error8[i];
}
+
+static void scaling_inv(vec out[][GFBITS], vec inv[][GFBITS], vec *recv)
+{
+       int i, j;
+
+       for (i = 0; i < 128; i++)
+       for (j = 0; j < GFBITS; j++)
+               out[i][j] = inv[i][j] & recv[i];
+}
+
/* Constant-time weight check: returns 1 iff the bitsliced vector error
   and the packed byte vector e both have Hamming weight exactly SYS_T;
   returns 0 otherwise. */
static int weight_check(unsigned char * e, vec * error)
{
       int i;
       uint16_t w0 = 0;
       uint16_t w1 = 0;
       uint16_t check;

       for (i = 0; i < (1 << GFBITS); i++)
               w0 += (error[i/64] >> (i%64)) & 1;

       for (i = 0; i < SYS_N; i++)
               w1 += (e[i/8] >> (i%8)) & 1;

       /* check becomes 1 exactly when w0 == SYS_T and w1 == SYS_T */
       check = (w0 ^ SYS_T) | (w1 ^ SYS_T);
       check -= 1;
       check >>= 15;

       return check;
}
+
/* Compare two bitsliced syndromes: XOR-accumulate all words and return
   vec_testz of the accumulated difference (the zero test of diff, so
   the result indicates equality) — branch-free. */
static uint16_t synd_cmp(vec s0[][ GFBITS ] , vec s1[][ GFBITS ])
{
       int i, j;
       vec diff = 0;

       for (i = 0; i < 4; i++)
       for (j = 0; j < GFBITS; j++)
               diff |= (s0[i][j] ^ s1[i][j]);

       return vec_testz(diff);
}
+
+/* Niederreiter decryption with the Berlekamp decoder */
+/* intput: sk, secret key */
+/*         s, ciphertext (syndrome) */
+/* output: e, error vector */
+/* return: 0 for success; 1 for failure */
/* Niederreiter decryption with the Berlekamp decoder.
   input:  sk, secret key; s, ciphertext (syndrome)
   output: e, error vector
   return: 0 for success; 1 for failure (wrong syndrome or weight).
   Both the syndrome re-check and the weight check are folded into the
   return value via masks, with no data-dependent branches. */
static int decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *s)
{
       int i;

       uint16_t check_synd;
       uint16_t check_weight;

       vec inv[ 128 ][ GFBITS ];
       vec scaled[ 128 ][ GFBITS ];
       vec eval[ 128 ][ GFBITS ];

       vec error[ 128 ];

       vec s_priv[ 4 ][ GFBITS ];
       vec s_priv_cmp[ 4 ][ GFBITS ];
       vec locator[2][ GFBITS ];

       vec recv[ 128 ];
       vec allone;

       /* Berlekamp decoder: syndrome -> locator polynomial -> roots */

       preprocess(recv, s);

       benes(recv, sk + IRR_BYTES, 1);
       scaling(scaled, inv, sk, recv);
       fft_tr(s_priv, scaled);
       bm(locator, s_priv);

       fft(eval, locator);

       /* reencryption and weight check */

       allone = vec_setbits(1);

       for (i = 0; i < 128; i++)
       {
               /* error bit set where the locator evaluates to zero */
               error[i] = vec_or_reduce(eval[i]);
               error[i] ^= allone;
       }

       scaling_inv(scaled, inv, error);
       fft_tr(s_priv_cmp, scaled);

       check_synd = synd_cmp(s_priv, s_priv_cmp);

       /* undo the secret support permutation and pack the result */

       benes(error, sk + IRR_BYTES, 0);

       postprocess(e, error);

       check_weight = weight_check(e, error);

#ifdef KAT
  {
    int k;
    printf("decrypt e: positions");
    for (k = 0;k < SYS_N;++k)
      if (e[k/8] & (1 << (k&7)))
        printf(" %d",k);
    printf("\n");
  }
#endif

       return 1 - (check_synd & check_weight);
}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/encrypt.c */
+/* 20230102 djb: rename encrypt() as pke_encrypt() */
+/* 20221231 djb: move encrypt.h last for macos portability; tnx thom wiggers */
+/* 20221230 djb: add linker line */
+
+/* linker define pke_encrypt */
+
+/*
+  This file is for Niederreiter encryption
+*/
+
+
+
+
+
/* Constant-time t < u test whose result is then declassified (the
   comparison outcome is public in the rejection-sampling loop);
   returns the all-ones/all-zeros mask from crypto_uint16_smaller_mask. */
static inline crypto_uint16 uint16_is_smaller_declassify(uint16_t t,uint16_t u)
{
  crypto_uint16 mask = crypto_uint16_smaller_mask(t,u);
  crypto_declassify(&mask,sizeof mask);
  return mask;
}
+
/* Constant-time t == u test whose result is then declassified (the
   repetition check's outcome is public); returns the mask from
   crypto_uint32_equal_mask. */
static inline crypto_uint32 uint32_is_equal_declassify(uint32_t t,uint32_t u)
{
  crypto_uint32 mask = crypto_uint32_equal_mask(t,u);
  crypto_declassify(&mask,sizeof mask);
  return mask;
}
+
+/* output: e, an error vector of weight t */
/* output: e, an error vector of weight t */
/* Rejection-sample SYS_T distinct positions below SYS_N from fresh
   randomness, then scatter the bits into e with a constant-time
   (mask-based) write so the positions are not leaked through memory
   access patterns. */
static void gen_e(unsigned char *e)
{
       int i, j, eq, count;

       union
       {
               uint16_t nums[ SYS_T*2 ];
               unsigned char bytes[ SYS_T*2 * sizeof(uint16_t) ];
       } buf;

       uint16_t ind[ SYS_T ];
       uint64_t e_int[ (SYS_N+63)/64 ];
       uint64_t one = 1;
       uint64_t mask;
       uint64_t val[ SYS_T ];

       while (1)
       {
               randombytes(buf.bytes, sizeof(buf));

               for (i = 0; i < SYS_T*2; i++)
                       buf.nums[i] = load_gf(buf.bytes + i*2);

               /* moving and counting indices in the correct range */

               count = 0;
               for (i = 0; i < SYS_T*2 && count < SYS_T; i++)
                       if (uint16_is_smaller_declassify(buf.nums[i],SYS_N))
                               ind[ count++ ] = buf.nums[i];

               if (count < SYS_T) continue;

               /* check for repetition (sort, then compare neighbors) */

               uint16_sort(ind, SYS_T);

               eq = 0;
               for (i = 1; i < SYS_T; i++)
                       if (uint32_is_equal_declassify(ind[i-1],ind[i]))
                               eq = 1;

               if (eq == 0)
                       break;
       }

       for (j = 0; j < SYS_T; j++)
               val[j] = one << (ind[j] & 63);

       /* constant-time scatter: every word is combined with every
          position, masked on whether the position lands in this word */
       for (i = 0; i < (SYS_N+63)/64; i++)
       {
               e_int[i] = 0;

               for (j = 0; j < SYS_T; j++)
               {
                       mask = i ^ (ind[j] >> 6);
                       mask -= 1;
                       mask >>= 63;
                       mask = -mask;

                       e_int[i] |= val[j] & mask;
               }
       }

       /* pack: full words, then the SYS_N % 64 remaining bits */
       for (i = 0; i < (SYS_N+63)/64 - 1; i++)
               { store8(e, e_int[i]); e += 8; }

       for (j = 0; j < (SYS_N % 64); j+=8)
               e[ j/8 ] = (e_int[i] >> j) & 0xFF;
}
+
+/* input: public key pk, error vector e */
+/* output: syndrome s */
/* input: public key pk, error vector e */
/* output: syndrome s */
/* s = (I | T) * e: copy the identity part, then for each public-key
   row accumulate the parity of row AND e.
   NOTE(review): the uint64_t casts assume pk rows and e + SYND_BYTES
   are suitably aligned for 64-bit loads (and rely on non-strict
   aliasing); this matches the upstream code but should be confirmed
   for the buffers this build passes in. */
static void syndrome(unsigned char *s, const unsigned char *pk, unsigned char *e)
{
       uint64_t b;

       const uint64_t *pk_ptr;
       const uint64_t *e_ptr = ((uint64_t *) (e + SYND_BYTES));

       int i, j;

       /* identity part of the parity-check matrix */

       for (i = 0; i < SYND_BYTES; i++)
               s[i] = e[i];

       for (i = 0; i < PK_NROWS; i++)
       {
               pk_ptr = ((uint64_t *) (pk + PK_ROW_BYTES * i));

               /* parity of <row_i, e>: 64-bit words plus a 32-bit tail */
               b = 0;
               for (j = 0; j < PK_NCOLS/64; j++)
                       b ^= pk_ptr[j] & e_ptr[j];

               b ^= ((uint32_t *) &pk_ptr[j])[0] & ((uint32_t *) &e_ptr[j])[0];

               b ^= b >> 32;
               b ^= b >> 16;
               b ^= b >> 8;
               b ^= b >> 4;
               b ^= b >> 2;
               b ^= b >> 1;
               b &= 1;

               s[ i/8 ] ^= (b << (i%8));
       }
}
+
+/* input: public key pk */
+/* output: error vector e, syndrome s */
+static void pke_encrypt(unsigned char *s, const unsigned char *pk, unsigned char *e)
+{
+	/* sample a fresh weight-SYS_T error vector */
+	gen_e(e);
+
+#ifdef KAT
+  {
+    /* known-answer-test builds dump the set bit positions of e */
+    int pos;
+    printf("encrypt e: positions");
+    for (pos = 0;pos < SYS_N;++pos)
+    {
+      if (e[pos/8] & (1 << (pos&7)))
+        printf(" %d",pos);
+    }
+    printf("\n");
+  }
+#endif
+
+	/* s = H * e, with H derived from pk */
+	syndrome(s, pk, e);
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_consts.c */
+/* linker define fft_consts */
+
+
+const vec fft_consts[128][GFBITS] = {
+{ 0x6969969669699696, 0x9966669966999966, 0x9966669966999966, 0xFF0000FF00FFFF00, 0xCC3333CCCC3333CC, 0x9966669966999966, 0x6666666666666666, 0xA55AA55AA55AA55A, 0xCCCC33333333CCCC, 0x5A5A5A5A5A5A5A5A, 0x55AAAA55AA5555AA, 0x0FF0F00FF00F0FF0, 0x5AA55AA5A55AA55A }, { 0x6969969669699696, 0x9966669966999966, 0x9966669966999966, 0xFF0000FF00FFFF00, 0xCC3333CCCC3333CC, 0x9966669966999966, 0x6666666666666666, 0xA55AA55AA55AA55A, 0xCCCC33333333CCCC, 0x5A5A5A5A5A5A5A5A, 0x55AAAA55AA5555AA, 0x0FF0F00FF00F0FF0, 0x5AA55AA5A55AA55A }, { 0xA55A5AA55AA5A55A, 0x6969696996969696, 0x5AA55AA5A55AA55A, 0x9999999966666666, 0x3C3CC3C3C3C33C3C, 0xFFFF0000FFFF0000, 0x0000000000000000, 0xCC33CC3333CC33CC, 0x0000000000000000, 0x3C3C3C3C3C3C3C3C, 0xAA5555AAAA5555AA, 0xC33C3CC33CC3C33C, 0x00FFFF0000FFFF00 }, { 0xA55A5AA55AA5A55A, 0x6969696996969696, 0x5AA55AA5A55AA55A, 0x6666666699999999, 0xC3C33C3C3C3CC3C3, 0x0000FFFF0000FFFF, 0x0000000000000000, 0x33CC33CCCC33CC33, 0x0000000000000000, 0x3C3C3C3C3C3C3C3C, 0xAA5555AAAA5555AA, 0xC33C3CC33CC3C33C, 0xFF0000FFFF0000FF }, { 0xFFFFFFFF00000000, 0xA5A5A5A55A5A5A5A, 0x0FF0F00FF00F0FF0, 0x9669966969966996, 0x0000FFFFFFFF0000, 0x33333333CCCCCCCC, 0xA55A5AA55AA5A55A, 0x00FFFF0000FFFF00, 0x0000000000000000, 0xC33CC33CC33CC33C, 0x0F0FF0F00F0FF0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAA55555555AAAA }, { 0xFFFFFFFF00000000, 0xA5A5A5A55A5A5A5A, 0x0FF0F00FF00F0FF0, 0x6996699696699669, 0xFFFF00000000FFFF, 0x33333333CCCCCCCC, 0x5AA5A55AA55A5AA5, 0xFF0000FFFF0000FF, 0xFFFFFFFFFFFFFFFF, 0xC33CC33CC33CC33C, 0x0F0FF0F00F0FF0F0, 0xCCCCCCCCCCCCCCCC, 0x5555AAAAAAAA5555 }, { 0xFFFFFFFF00000000, 0x5A5A5A5AA5A5A5A5, 0xF00F0FF00FF0F00F, 0x6996699696699669, 0x0000FFFFFFFF0000, 0x33333333CCCCCCCC, 0x5AA5A55AA55A5AA5, 0xFF0000FFFF0000FF, 0xFFFFFFFFFFFFFFFF, 0xC33CC33CC33CC33C, 0x0F0FF0F00F0FF0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAA55555555AAAA }, { 0xFFFFFFFF00000000, 0x5A5A5A5AA5A5A5A5, 0xF00F0FF00FF0F00F, 0x9669966969966996, 0xFFFF00000000FFFF, 0x33333333CCCCCCCC, 0xA55A5AA55AA5A55A, 
0x00FFFF0000FFFF00, 0x0000000000000000, 0xC33CC33CC33CC33C, 0x0F0FF0F00F0FF0F0, 0xCCCCCCCCCCCCCCCC, 0x5555AAAAAAAA5555 }, { 0xC33C3CC33CC3C33C, 0x9966669966999966, 0x9966996699669966, 0x6969969669699696, 0xAA55AA5555AA55AA, 0x9966996699669966, 0x5AA5A55A5AA5A55A, 0xC3C3C3C33C3C3C3C, 0x3CC33CC3C33CC33C, 0x3333CCCC3333CCCC, 0x9999999966666666, 0xC33CC33CC33CC33C, 0x6666999999996666 }, { 0x3CC3C33CC33C3CC3, 0x6699996699666699, 0x6699669966996699, 0x6969969669699696, 0xAA55AA5555AA55AA, 0x9966996699669966, 0xA55A5AA5A55A5AA5, 0xC3C3C3C33C3C3C3C, 0x3CC33CC3C33CC33C, 0x3333CCCC3333CCCC, 0x6666666699999999, 0x3CC33CC33CC33CC3, 0x9999666666669999 }, { 0xC33C3CC33CC3C33C, 0x9966669966999966, 0x6699669966996699, 0x6969969669699696, 0xAA55AA5555AA55AA, 0x6699669966996699, 0x5AA5A55A5AA5A55A, 0x3C3C3C3CC3C3C3C3, 0xC33CC33C3CC33CC3, 0xCCCC3333CCCC3333, 0x6666666699999999, 0xC33CC33CC33CC33C, 0x9999666666669999 }, { 0x3CC3C33CC33C3CC3, 0x6699996699666699, 0x9966996699669966, 0x6969969669699696, 0xAA55AA5555AA55AA, 0x6699669966996699, 0xA55A5AA5A55A5AA5, 0x3C3C3C3CC3C3C3C3, 0xC33CC33C3CC33CC3, 0xCCCC3333CCCC3333, 0x9999999966666666, 0x3CC33CC33CC33CC3, 0x6666999999996666 }, { 0xC33C3CC33CC3C33C, 0x6699996699666699, 0x6699669966996699, 0x6969969669699696, 0x55AA55AAAA55AA55, 0x9966996699669966, 0x5AA5A55A5AA5A55A, 0xC3C3C3C33C3C3C3C, 0xC33CC33C3CC33CC3, 0x3333CCCC3333CCCC, 0x9999999966666666, 0xC33CC33CC33CC33C, 0x6666999999996666 }, { 0x3CC3C33CC33C3CC3, 0x9966669966999966, 0x9966996699669966, 0x6969969669699696, 0x55AA55AAAA55AA55, 0x9966996699669966, 0xA55A5AA5A55A5AA5, 0xC3C3C3C33C3C3C3C, 0xC33CC33C3CC33CC3, 0x3333CCCC3333CCCC, 0x6666666699999999, 0x3CC33CC33CC33CC3, 0x9999666666669999 }, { 0xC33C3CC33CC3C33C, 0x6699996699666699, 0x9966996699669966, 0x6969969669699696, 0x55AA55AAAA55AA55, 0x6699669966996699, 0x5AA5A55A5AA5A55A, 0x3C3C3C3CC3C3C3C3, 0x3CC33CC3C33CC33C, 0xCCCC3333CCCC3333, 0x6666666699999999, 0xC33CC33CC33CC33C, 0x9999666666669999 }, { 0x3CC3C33CC33C3CC3, 
0x9966669966999966, 0x6699669966996699, 0x6969969669699696, 0x55AA55AAAA55AA55, 0x6699669966996699, 0xA55A5AA5A55A5AA5, 0x3C3C3C3CC3C3C3C3, 0x3CC33CC3C33CC33C, 0xCCCC3333CCCC3333, 0x9999999966666666, 0x3CC33CC33CC33CC3, 0x6666999999996666 }, { 0x3C3CC3C3C3C33C3C, 0x55555555AAAAAAAA, 0xF00FF00F0FF00FF0, 0x5AA55AA5A55AA55A, 0x55AAAA55AA5555AA, 0xF00F0FF0F00F0FF0, 0x9669699696696996, 0xA55AA55AA55AA55A, 0x55555555AAAAAAAA, 0xCCCC33333333CCCC, 0x0000FFFFFFFF0000, 0xFF0000FF00FFFF00, 0x6996699669966996 }, { 0xC3C33C3C3C3CC3C3, 0x55555555AAAAAAAA, 0x0FF00FF0F00FF00F, 0x5AA55AA5A55AA55A, 0x55AAAA55AA5555AA, 0xF00F0FF0F00F0FF0, 0x9669699696696996, 0x5AA55AA55AA55AA5, 0x55555555AAAAAAAA, 0x3333CCCCCCCC3333, 0x0000FFFFFFFF0000, 0x00FFFF00FF0000FF, 0x9669966996699669 }, { 0x3C3CC3C3C3C33C3C, 0x55555555AAAAAAAA, 0xF00FF00F0FF00FF0, 0xA55AA55A5AA55AA5, 0xAA5555AA55AAAA55, 0x0FF0F00F0FF0F00F, 0x9669699696696996, 0x5AA55AA55AA55AA5, 0xAAAAAAAA55555555, 0x3333CCCCCCCC3333, 0xFFFF00000000FFFF, 0xFF0000FF00FFFF00, 0x9669966996699669 }, { 0xC3C33C3C3C3CC3C3, 0x55555555AAAAAAAA, 0x0FF00FF0F00FF00F, 0xA55AA55A5AA55AA5, 0xAA5555AA55AAAA55, 0x0FF0F00F0FF0F00F, 0x9669699696696996, 0xA55AA55AA55AA55A, 0xAAAAAAAA55555555, 0xCCCC33333333CCCC, 0xFFFF00000000FFFF, 0x00FFFF00FF0000FF, 0x6996699669966996 }, { 0x3C3CC3C3C3C33C3C, 0x55555555AAAAAAAA, 0x0FF00FF0F00FF00F, 0xA55AA55A5AA55AA5, 0xAA5555AA55AAAA55, 0x0FF0F00F0FF0F00F, 0x6996966969969669, 0xA55AA55AA55AA55A, 0xAAAAAAAA55555555, 0xCCCC33333333CCCC, 0x0000FFFFFFFF0000, 0xFF0000FF00FFFF00, 0x6996699669966996 }, { 0xC3C33C3C3C3CC3C3, 0x55555555AAAAAAAA, 0xF00FF00F0FF00FF0, 0xA55AA55A5AA55AA5, 0xAA5555AA55AAAA55, 0x0FF0F00F0FF0F00F, 0x6996966969969669, 0x5AA55AA55AA55AA5, 0xAAAAAAAA55555555, 0x3333CCCCCCCC3333, 0x0000FFFFFFFF0000, 0x00FFFF00FF0000FF, 0x9669966996699669 }, { 0x3C3CC3C3C3C33C3C, 0x55555555AAAAAAAA, 0x0FF00FF0F00FF00F, 0x5AA55AA5A55AA55A, 0x55AAAA55AA5555AA, 0xF00F0FF0F00F0FF0, 0x6996966969969669, 0x5AA55AA55AA55AA5, 
0x55555555AAAAAAAA, 0x3333CCCCCCCC3333, 0xFFFF00000000FFFF, 0xFF0000FF00FFFF00, 0x9669966996699669 }, { 0xC3C33C3C3C3CC3C3, 0x55555555AAAAAAAA, 0xF00FF00F0FF00FF0, 0x5AA55AA5A55AA55A, 0x55AAAA55AA5555AA, 0xF00F0FF0F00F0FF0, 0x6996966969969669, 0xA55AA55AA55AA55A, 0x55555555AAAAAAAA, 0xCCCC33333333CCCC, 0xFFFF00000000FFFF, 0x00FFFF00FF0000FF, 0x6996699669966996 }, { 0x3C3CC3C3C3C33C3C, 0xAAAAAAAA55555555, 0x0FF00FF0F00FF00F, 0x5AA55AA5A55AA55A, 0xAA5555AA55AAAA55, 0xF00F0FF0F00F0FF0, 0x9669699696696996, 0xA55AA55AA55AA55A, 0x55555555AAAAAAAA, 0xCCCC33333333CCCC, 0x0000FFFFFFFF0000, 0xFF0000FF00FFFF00, 0x6996699669966996 }, { 0xC3C33C3C3C3CC3C3, 0xAAAAAAAA55555555, 0xF00FF00F0FF00FF0, 0x5AA55AA5A55AA55A, 0xAA5555AA55AAAA55, 0xF00F0FF0F00F0FF0, 0x9669699696696996, 0x5AA55AA55AA55AA5, 0x55555555AAAAAAAA, 0x3333CCCCCCCC3333, 0x0000FFFFFFFF0000, 0x00FFFF00FF0000FF, 0x9669966996699669 }, { 0x3C3CC3C3C3C33C3C, 0xAAAAAAAA55555555, 0x0FF00FF0F00FF00F, 0xA55AA55A5AA55AA5, 0x55AAAA55AA5555AA, 0x0FF0F00F0FF0F00F, 0x9669699696696996, 0x5AA55AA55AA55AA5, 0xAAAAAAAA55555555, 0x3333CCCCCCCC3333, 0xFFFF00000000FFFF, 0xFF0000FF00FFFF00, 0x9669966996699669 }, { 0xC3C33C3C3C3CC3C3, 0xAAAAAAAA55555555, 0xF00FF00F0FF00FF0, 0xA55AA55A5AA55AA5, 0x55AAAA55AA5555AA, 0x0FF0F00F0FF0F00F, 0x9669699696696996, 0xA55AA55AA55AA55A, 0xAAAAAAAA55555555, 0xCCCC33333333CCCC, 0xFFFF00000000FFFF, 0x00FFFF00FF0000FF, 0x6996699669966996 }, { 0x3C3CC3C3C3C33C3C, 0xAAAAAAAA55555555, 0xF00FF00F0FF00FF0, 0xA55AA55A5AA55AA5, 0x55AAAA55AA5555AA, 0x0FF0F00F0FF0F00F, 0x6996966969969669, 0xA55AA55AA55AA55A, 0xAAAAAAAA55555555, 0xCCCC33333333CCCC, 0x0000FFFFFFFF0000, 0xFF0000FF00FFFF00, 0x6996699669966996 }, { 0xC3C33C3C3C3CC3C3, 0xAAAAAAAA55555555, 0x0FF00FF0F00FF00F, 0xA55AA55A5AA55AA5, 0x55AAAA55AA5555AA, 0x0FF0F00F0FF0F00F, 0x6996966969969669, 0x5AA55AA55AA55AA5, 0xAAAAAAAA55555555, 0x3333CCCCCCCC3333, 0x0000FFFFFFFF0000, 0x00FFFF00FF0000FF, 0x9669966996699669 }, { 0x3C3CC3C3C3C33C3C, 0xAAAAAAAA55555555, 
0xF00FF00F0FF00FF0, 0x5AA55AA5A55AA55A, 0xAA5555AA55AAAA55, 0xF00F0FF0F00F0FF0, 0x6996966969969669, 0x5AA55AA55AA55AA5, 0x55555555AAAAAAAA, 0x3333CCCCCCCC3333, 0xFFFF00000000FFFF, 0xFF0000FF00FFFF00, 0x9669966996699669 }, { 0xC3C33C3C3C3CC3C3, 0xAAAAAAAA55555555, 0x0FF00FF0F00FF00F, 0x5AA55AA5A55AA55A, 0xAA5555AA55AAAA55, 0xF00F0FF0F00F0FF0, 0x6996966969969669, 0xA55AA55AA55AA55A, 0x55555555AAAAAAAA, 0xCCCC33333333CCCC, 0xFFFF00000000FFFF, 0x00FFFF00FF0000FF, 0x6996699669966996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 
0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 
0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0xAAAAAAAAAAAAAAAA, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 
0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0x0000FFFF0000FFFF, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0xC33C3CC3C33C3CC3, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 
0x55AA55AA55AA55AA, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0x55AA55AA55AA55AA, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0xFFFF0000FFFF0000, 0x0F0F0F0FF0F0F0F0, 0x00FFFF00FF0000FF, 0xCC3333CC33CCCC33, 0xFF0000FF00FFFF00, 0x6996966996696996, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x3CC3C33C3CC3C33C, 0x5555555555555555, 0xFFFF0000FFFF0000, 0x3CC3C33C3CC3C33C, 0xAA55AA55AA55AA55, 0x0000FFFF0000FFFF, 0xF0F0F0F00F0F0F0F, 0xFF0000FF00FFFF00, 0x33CCCC33CC3333CC, 0x00FFFF00FF0000FF, 0x9669699669969669, 0xA55A5AA55AA5A55A, 0x6996966996696996 }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 
0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 
0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 
0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 
0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 
0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 
0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 
0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 
0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00, 0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA },
+};
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_powers.c */
+/* linker define fft_powers */
+
+
+const vec fft_powers[128][GFBITS] = {
+{ 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 
0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 
0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 
0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 
0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 
0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 
0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 
0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 
0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 
0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 
0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 
0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 
0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 
0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 
0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 
0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0x00000000FFFFFFFF, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 
0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0xCC33CC33CC33CC33, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0xCC33CC33CC33CC33, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x9696969669696969, 0xA5A5A5A5A5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0x0F0FF0F00F0FF0F0 }, { 0xA55AA55A5AA55AA5, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0x5A5A5A5A5A5A5A5A, 0xA5A5A5A55A5A5A5A, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0x3CC33CC3C33CC33C, 0xA5A55A5AA5A55A5A, 0x0000FFFF0000FFFF, 0x33CC33CC33CC33CC, 0xF00FF00F0FF00FF0, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0x5555AAAAAAAA5555, 0xF00FF00FF00FF00F, 0xF0F00F0FF0F00F0F }, { 0x5AA55AA5A55AA55A, 0xC33CC33C3CC33CC3, 0xA5A55A5AA5A55A5A, 0xFFFF0000FFFF0000, 0x33CC33CC33CC33CC, 0x0FF00FF0F00FF00F, 0xFFFFFFFF00000000, 0x6969696996969696, 0xA5A5A5A5A5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xAAAA55555555AAAA, 0x0FF00FF00FF00FF0, 0x0F0FF0F00F0FF0F0 }
+};
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_scalars_2x.c */
+/* linker define fft_scalars_2x */
+
+
+/* Per-level scaling constants consumed by radix_conversions() in fft.c:
+   layout is [level 0..4][64-bit word 0..1][GFBITS bit-slices].  Pure data;
+   values must not be altered (they encode field-element multipliers in
+   bitsliced form). */
+const vec fft_scalars_2x[5][2][GFBITS] = {
+{{ 0X3C3CF30C0000C003, 0X0CCCC3F333C0000C, 0X03C33F33FCC0C03C, 0X0003000F3C03C0C0, 0XF33FF33030CF03F0, 0X0CF0303300F0CCC0, 0XFF3F0C0CC0FF3CC0, 0XCF3CF0FF003FC000, 0XC00FF3CF0303F300, 0X3CCC0CC00CF0CC00, 0XF30FFC3C3FCCFC00, 0X3F0FC3F0CCF0C000, 0X3000FF33CCF0F000 }, { 0X0C0F0FCF0F0CF330, 0XF0000FC33C3CCF3C, 0X3C0F3F00C3C300FC, 0X3C33CCC0F0F3CC30, 0XC0CFFFFFCCCC30CC, 0X3FC3F3CCFFFC033F, 0XFC3030CCCCC0CFCF, 0X0FCF0C00CCF333C3, 0XCFFCF33000CFF030, 0X00CFFCC330F30FCC, 0X3CCC3FCCC0F3FFF3, 0XF00F0C3FC003C0FF, 0X330CCFCC03C0FC33 }}, {{ 0X0F0F0FF0F000000F, 0X00FFFFFFFF0000F0, 0XFFFF00FF00000F00, 0XFFF000F00F0FF000, 0XFFF0000F0FF000F0, 0X00FF000FFF000000, 0XFF0F0FFF0F0FF000, 0X0FFF0000000F0000, 0X00F000F0FFF00F00, 0X00F00FF00F00F000, 0XFFF000F000F00000, 0X00F00F000FF00000, 0X0000FF0F0000F000 }, { 0XF0FFFFFFF0F00F00, 0X00FFF0FFFF0000FF, 0X00FF00000F0F0FFF, 0XF000F0000F00FF0F, 0XFF000000FFF00000, 0XF0FF000FF00F0FF0, 0X0F0F0F00FF000F0F, 0X0F0F00F0F0F0F000, 0X00F00F00F00F000F, 0X00F0F0F00000FFF0, 0XFFFFFF0FF00F0FFF, 0X0F0FFFF00FFFFFFF, 0XFFFF0F0FFF0FFF00 }}, {{ 0X00FF0000000000FF, 0XFFFFFFFFFF00FF00, 0XFF0000FF00FF0000, 0XFFFF000000FF0000, 0XFF00000000FF0000, 0X00FFFFFFFF000000, 0XFF0000FFFFFF0000, 0XFF00FF00FFFF0000, 0X00FFFFFFFF00FF00, 0XFFFF000000000000, 0X00FF0000FF000000, 0XFF00FF00FF000000, 0X00FF00FFFF000000 }, { 0X00FF00FF00FF0000, 0XFF00FFFF000000FF, 0X0000FFFF000000FF, 0X00FFFF00FF000000, 0XFFFFFF0000FF00FF, 0X0000FFFF00FFFF00, 0XFF00FF0000FFFF00, 0X00000000FFFFFFFF, 0X0000FF0000000000, 0XFF00FFFF00FFFF00, 0X00FFFF00000000FF, 0X0000FF00FF00FFFF, 0XFF0000FFFFFF0000 }}, {{ 0X000000000000FFFF, 0XFFFFFFFFFFFF0000, 0X0000000000000000, 0XFFFF0000FFFF0000, 0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X0000FFFFFFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF00000000, 0XFFFF000000000000, 0XFFFF000000000000, 0XFFFF000000000000, 0XFFFFFFFF00000000 }, { 0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X00000000FFFFFFFF, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFF0000, 
0X0000FFFF0000FFFF, 0XFFFFFFFF0000FFFF, 0X00000000FFFF0000, 0XFFFF0000FFFFFFFF, 0XFFFF0000FFFFFFFF, 0X0000000000000000 }}, {{ 0X00000000FFFFFFFF, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFF00000000 }, { 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000 }}
+};
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/shared-fft_scalars_4x.c */
+/* linker define fft_scalars_4x */
+
+
+/* Per-level scaling constants consumed by radix_conversions_tr() in fft_tr.c:
+   layout is [level 0..5][64-bit word 0..3][GFBITS bit-slices].  Pure data;
+   values must not be altered. */
+const vec fft_scalars_4x[6][4][GFBITS] = {
+{{ 0x3C3CF30C0000C003, 0x0CCCC3F333C0000C, 0x03C33F33FCC0C03C, 0x0003000F3C03C0C0, 0xF33FF33030CF03F0, 0x0CF0303300F0CCC0, 0xFF3F0C0CC0FF3CC0, 0xCF3CF0FF003FC000, 0xC00FF3CF0303F300, 0x3CCC0CC00CF0CC00, 0xF30FFC3C3FCCFC00, 0x3F0FC3F0CCF0C000, 0x3000FF33CCF0F000 }, { 0x0C0F0FCF0F0CF330, 0xF0000FC33C3CCF3C, 0x3C0F3F00C3C300FC, 0x3C33CCC0F0F3CC30, 0xC0CFFFFFCCCC30CC, 0x3FC3F3CCFFFC033F, 0xFC3030CCCCC0CFCF, 0x0FCF0C00CCF333C3, 0xCFFCF33000CFF030, 0x00CFFCC330F30FCC, 0x3CCC3FCCC0F3FFF3, 0xF00F0C3FC003C0FF, 0x330CCFCC03C0FC33 }, { 0xF0F30C33CF03F03F, 0x00F30FC00C3300FF, 0xF3CC3CF3F3FCF33F, 0x3C0FC0FC303C3F3C, 0xFC30CF303F3FF00F, 0x33300C0CC3300CF3, 0x3C030CF3F03FF3F3, 0x3CCC03FCCC3FFC03, 0x033C3C3CF0003FC3, 0xFFC0FF00F0FF0F03, 0xF3F30CF003FCC303, 0x30CFCFC3CC0F3000, 0x0CF30CCF3FCFCC0F }, { 0x3F30CC0C000F3FCC, 0xFC3CF030FC3FFF03, 0x33FFFCFF0CCF3CC3, 0x003CFF33C3CC30CF, 0xCFF3CF33C00F3003, 0x00F3CC0CF3003CCF, 0x3C000CFCCC3C3333, 0xF3CF03C0FCF03FF0, 0x3F3C3CF0C330330C, 0x33CCFCC0FF0033F0, 0x33C300C0F0C003F3, 0x003FF0003F00C00C, 0xCFF3C3033F030FFF }}, {{ 0x0F0F0FF0F000000F, 0x00FFFFFFFF0000F0, 0xFFFF00FF00000F00, 0xFFF000F00F0FF000, 0xFFF0000F0FF000F0, 0x00FF000FFF000000, 0xFF0F0FFF0F0FF000, 0x0FFF0000000F0000, 0x00F000F0FFF00F00, 0x00F00FF00F00F000, 0xFFF000F000F00000, 0x00F00F000FF00000, 0x0000FF0F0000F000 }, { 0xF0FFFFFFF0F00F00, 0x00FFF0FFFF0000FF, 0x00FF00000F0F0FFF, 0xF000F0000F00FF0F, 0xFF000000FFF00000, 0xF0FF000FF00F0FF0, 0x0F0F0F00FF000F0F, 0x0F0F00F0F0F0F000, 0x00F00F00F00F000F, 0x00F0F0F00000FFF0, 0xFFFFFF0FF00F0FFF, 0x0F0FFFF00FFFFFFF, 0xFFFF0F0FFF0FFF00 }, { 0x0F0F00FF0FF0FFFF, 0xF000F0F00F00FF0F, 0x000FFFF0FFF0FF0F, 0x00F00FFF00000FF0, 0xFFFFF0000FFFF00F, 0xFFF0FFF0000FFFF0, 0xF0F0F0000F0F0F00, 0x00F000F0F00FFF00, 0xF0FF0F0FFF00F0FF, 0xF0FF0FFFF0F0F0FF, 0x00FFFFFFFFFFFFF0, 0x00FFF0F0FF000F0F, 0x000FFFF0000FFF00 }, { 0xFF0F0F00F000F0FF, 0x0FFFFFFFFF00000F, 0xF0FFFF000F00F0FF, 0x0F0000F00FFF0FFF, 0x0F0F0F00FF0F000F, 0x000F0F0FFFF0F000, 0xF0FFFF0F00F0FF0F, 
0x0F0F000F0F00F0FF, 0x0000F0FF00FF0F0F, 0x00FFFF0FF0FFF0F0, 0x0000000F00F0FFF0, 0xF0F00000FF00F0F0, 0x0F0F0FFFFFFFFFFF }}, {{ 0x00FF0000000000FF, 0xFFFFFFFFFF00FF00, 0xFF0000FF00FF0000, 0xFFFF000000FF0000, 0xFF00000000FF0000, 0x00FFFFFFFF000000, 0xFF0000FFFFFF0000, 0xFF00FF00FFFF0000, 0x00FFFFFFFF00FF00, 0xFFFF000000000000, 0x00FF0000FF000000, 0xFF00FF00FF000000, 0x00FF00FFFF000000 }, { 0x00FF00FF00FF0000, 0xFF00FFFF000000FF, 0x0000FFFF000000FF, 0x00FFFF00FF000000, 0xFFFFFF0000FF00FF, 0x0000FFFF00FFFF00, 0xFF00FF0000FFFF00, 0x00000000FFFFFFFF, 0x0000FF0000000000, 0xFF00FFFF00FFFF00, 0x00FFFF00000000FF, 0x0000FF00FF00FFFF, 0xFF0000FFFFFF0000 }, { 0xFFFF00FF00FF00FF, 0x00FFFF000000FF00, 0xFFFF00FFFFFFFF00, 0x0000FFFF00FFFFFF, 0x00FF0000FF0000FF, 0xFFFF0000FF00FFFF, 0xFF000000FFFFFF00, 0x000000000000FFFF, 0xFF00FF00FFFF0000, 0xFFFF00FFFF00FFFF, 0xFFFFFFFFFF00FF00, 0xFFFF00FFFF0000FF, 0x0000FF00000000FF }, { 0xFF0000FFFFFF00FF, 0xFFFF0000FFFFFFFF, 0xFFFF000000FFFFFF, 0x00FFFF00FF0000FF, 0xFFFFFF00FFFFFF00, 0x00FFFF00FFFF00FF, 0x0000FFFF00FF0000, 0x000000FFFF000000, 0xFF00FF0000FF00FF, 0x00FF0000000000FF, 0xFF00FFFF00FF00FF, 0xFFFFFFFFFFFFFFFF, 0x0000FF000000FFFF }}, {{ 0x000000000000FFFF, 0xFFFFFFFFFFFF0000, 0x0000000000000000, 0xFFFF0000FFFF0000, 0xFFFFFFFFFFFF0000, 0x0000FFFF00000000, 0x0000FFFFFFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF00000000, 0xFFFF000000000000, 0xFFFF000000000000, 0xFFFF000000000000, 0xFFFFFFFF00000000 }, { 0x0000FFFF00000000, 0xFFFFFFFF0000FFFF, 0x00000000FFFFFFFF, 0x0000000000000000, 0x0000FFFF00000000, 0xFFFF0000FFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFF0000FFFF, 0xFFFFFFFF0000FFFF, 0x00000000FFFF0000, 0xFFFF0000FFFFFFFF, 0xFFFF0000FFFFFFFF, 0x0000000000000000 }, { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFF000000000000, 0x0000FFFF00000000, 0x00000000FFFF0000, 0x0000FFFFFFFFFFFF, 0x0000FFFFFFFFFFFF, 0xFFFFFFFF00000000, 0x000000000000FFFF, 0x000000000000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFF0000FFFF, 0xFFFF0000FFFFFFFF }, { 0x0000FFFFFFFFFFFF, 
0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0xFFFF0000FFFFFFFF, 0x00000000FFFF0000, 0xFFFF00000000FFFF, 0x0000FFFF0000FFFF, 0xFFFF00000000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF00000000, 0xFFFFFFFF00000000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF }}, {{ 0x00000000FFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0x0000000000000000, 0xFFFFFFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFF00000000 }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFFFFFF, 0xFFFFFFFF00000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000 }, { 0x00000000FFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0x00000000FFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF }, { 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFF00000000, 0x00000000FFFFFFFF, 0xFFFFFFFF00000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 }}, {{ 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }, { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 
0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF }, { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }},
+};
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft.c */
+/*
+  This file is for implementing the Gao-Mateer FFT, see
+  http://www.math.clemson.edu/~sgao/papers/GM10.pdf
+
+  For the implementation strategy, see
+  https://eprint.iacr.org/2017/793.pdf
+*/
+/* 20221230 djb: split these arrays into separate .c files */
+/* 20221230 djb: rename powers array as fft_powers */
+/* 20221230 djb: rename consts array as fft_consts */
+/* 20221230 djb: rename s array as fft_scalars_2x */
+/* 20221230 djb: add linker lines */
+
+/* linker define fft */
+/* linker use vec_mul */
+/* linker use fft_scalars_2x fft_consts fft_powers */
+
+
+
+
+/* input: in, polynomial in bitsliced form */
+/* output: in, result of applying the radix conversions on in */
+/* The polynomial has 128 coefficients split across two 64-bit words per
+   bit-plane: in[0][b] holds bit b of coefficients 0..63, in[1][b] of
+   coefficients 64..127. */
+static void radix_conversions(vec in[][GFBITS])
+{
+       int i, j, k;
+
+       /* mask[k] selects the bit positions folded down by (1 << k) at
+          stage k of the in-word shuffle */
+       const vec mask[5][2] =
+       {
+               {0x8888888888888888, 0x4444444444444444},
+               {0xC0C0C0C0C0C0C0C0, 0x3030303030303030},
+               {0xF000F000F000F000, 0x0F000F000F000F00},
+               {0xFF000000FF000000, 0x00FF000000FF0000},
+               {0xFFFF000000000000, 0x0000FFFF00000000}
+       };
+
+       for (j = 0; j <= 5; j++)
+       {
+               /* cross-word step: fold the upper 32 bits of the high word,
+                  then mix the high word into the top half of the low word */
+               for (i = 0; i < GFBITS; i++)
+               {
+                       in[1][i] ^= in[1][i] >> 32;
+                       in[0][i] ^= in[1][i] << 32;
+               }
+
+               /* in-word shift-and-xor steps, widest stage (k = 4) down to j */
+               for (i = 0; i < GFBITS; i++)
+               for (k = 4; k >= j; k--)
+               {
+                       in[0][i] ^= (in[0][i] & mask[k][0]) >> (1 << k);
+                       in[0][i] ^= (in[0][i] & mask[k][1]) >> (1 << k);
+                       in[1][i] ^= (in[1][i] & mask[k][0]) >> (1 << k);
+                       in[1][i] ^= (in[1][i] & mask[k][1]) >> (1 << k);
+               }
+
+               /* scale by the per-level constants; no scaling after the final level */
+               if (j < 5)
+               {
+                       vec_mul(in[0], in[0], fft_scalars_2x[j][0]);
+                       vec_mul(in[1], in[1], fft_scalars_2x[j][1]);
+               }
+       }
+}
+
+/* input: in, result of applying the radix conversions to the input polynomial */
+/* output: out, evaluation results (by applying the FFT butterflies) */
+static void butterflies(vec out[][ GFBITS ], vec in[][ GFBITS ])
+{
+       int i, j, k, s, b;
+
+       vec tmp[ GFBITS ];
+       /* pre[t] = in[1] * beta[t], broadcast-multiplied below */
+       vec pre[8][ GFBITS ];
+       /* one bit-plane of all 128 evaluation points */
+       vec buf[128];
+
+       /* index into fft_consts; stage i of the butterfly loop consumes
+          (1 << i) entries starting here (the first two entries are not
+          used by this forward pass) */
+       uint64_t consts_ptr = 2;
+
+       /* bit-reversal permutation of 7-bit indices (reversal[x] = x with
+          its 7 bits reversed) */
+       const unsigned char reversal[128] =
+       {
+         0, 64, 32, 96, 16, 80, 48, 112,
+         8, 72, 40, 104, 24, 88, 56, 120,
+         4, 68, 36, 100, 20, 84, 52, 116,
+         12, 76, 44, 108, 28, 92, 60, 124,
+         2, 66, 34, 98, 18, 82, 50, 114,
+         10, 74, 42, 106, 26, 90, 58, 122,
+         6, 70, 38, 102, 22, 86, 54, 118,
+         14, 78, 46, 110, 30, 94, 62, 126,
+         1, 65, 33, 97, 17, 81, 49, 113,
+         9, 73, 41, 105, 25, 89, 57, 121,
+         5, 69, 37, 101, 21, 85, 53, 117,
+         13, 77, 45, 109, 29, 93, 61, 125,
+         3, 67, 35, 99, 19, 83, 51, 115,
+         11, 75, 43, 107, 27, 91, 59, 123,
+         7, 71, 39, 103, 23, 87, 55, 119,
+         15, 79, 47, 111, 31, 95, 63, 127
+       };
+
+       /* basis elements; presumably the beta_i of the Gao-Mateer additive
+          FFT for this field -- confirm against the paper */
+       const uint16_t beta[7] = {2522, 7827, 7801, 8035, 6897, 8167, 3476};
+
+       /* pre[t] = in[1] * beta[t]: expand each bit of beta[t] to an
+          all-ones/all-zeros slice, then bitsliced field multiply */
+       for (i = 0; i < 7; i++)
+       {
+               for (j = 0; j < GFBITS; j++)
+               {
+                       pre[i][j] = (beta[i] >> j) & 1;
+                       pre[i][j] = -pre[i][j];
+               }
+
+               vec_mul(pre[i], in[1], pre[i]);
+       }
+
+       for (i = 0; i < GFBITS; i++)
+       {
+               /* Unrolled subset-sum accumulation: each buf[x] ends up equal
+                  to in[0][i] XORed with pre[t][i] for every set bit t of x.
+                  The interleaved order below reuses previously built entries
+                  (one XOR per new entry), so statement order matters. */
+               buf[0] = in[0][i];
+
+               buf[1] = buf[0] ^ pre[0][i];      buf[32] = in[0][i] ^ pre[5][i];
+               buf[3] = buf[1] ^ pre[1][i];      buf[96] = buf[32] ^ pre[6][i];
+                                              buf[97] = buf[96] ^ pre[0][i];
+               buf[2] = in[0][i] ^ pre[1][i];  buf[99] = buf[97] ^ pre[1][i];
+               buf[6] = buf[2] ^ pre[2][i];      buf[98] = buf[99] ^ pre[0][i];
+               buf[7] = buf[6] ^ pre[0][i];      buf[102] = buf[98] ^ pre[2][i];
+               buf[5] = buf[7] ^ pre[1][i];      buf[103] = buf[102] ^ pre[0][i];
+                                              buf[101] = buf[103] ^ pre[1][i];
+               buf[4] = in[0][i] ^ pre[2][i];  buf[100] = buf[101] ^ pre[0][i];
+               buf[12] = buf[4] ^ pre[3][i];     buf[108] = buf[100] ^ pre[3][i];
+               buf[13] = buf[12] ^ pre[0][i];    buf[109] = buf[108] ^ pre[0][i];
+               buf[15] = buf[13] ^ pre[1][i];    buf[111] = buf[109] ^ pre[1][i];
+               buf[14] = buf[15] ^ pre[0][i];    buf[110] = buf[111] ^ pre[0][i];
+               buf[10] = buf[14] ^ pre[2][i];    buf[106] = buf[110] ^ pre[2][i];
+               buf[11] = buf[10] ^ pre[0][i];    buf[107] = buf[106] ^ pre[0][i];
+               buf[9] = buf[11] ^ pre[1][i];     buf[105] = buf[107] ^ pre[1][i];
+                                              buf[104] = buf[105] ^ pre[0][i];
+               buf[8] = in[0][i] ^ pre[3][i];  buf[120] = buf[104] ^ pre[4][i];
+               buf[24] = buf[8] ^ pre[4][i];     buf[121] = buf[120] ^ pre[0][i];
+               buf[25] = buf[24] ^ pre[0][i];    buf[123] = buf[121] ^ pre[1][i];
+               buf[27] = buf[25] ^ pre[1][i];    buf[122] = buf[123] ^ pre[0][i];
+               buf[26] = buf[27] ^ pre[0][i];    buf[126] = buf[122] ^ pre[2][i];
+               buf[30] = buf[26] ^ pre[2][i];    buf[127] = buf[126] ^ pre[0][i];
+               buf[31] = buf[30] ^ pre[0][i];    buf[125] = buf[127] ^ pre[1][i];
+               buf[29] = buf[31] ^ pre[1][i];    buf[124] = buf[125] ^ pre[0][i];
+               buf[28] = buf[29] ^ pre[0][i];    buf[116] = buf[124] ^ pre[3][i];
+               buf[20] = buf[28] ^ pre[3][i];    buf[117] = buf[116] ^ pre[0][i];
+               buf[21] = buf[20] ^ pre[0][i];    buf[119] = buf[117] ^ pre[1][i];
+               buf[23] = buf[21] ^ pre[1][i];    buf[118] = buf[119] ^ pre[0][i];
+               buf[22] = buf[23] ^ pre[0][i];    buf[114] = buf[118] ^ pre[2][i];
+               buf[18] = buf[22] ^ pre[2][i];    buf[115] = buf[114] ^ pre[0][i];
+               buf[19] = buf[18] ^ pre[0][i];    buf[113] = buf[115] ^ pre[1][i];
+               buf[17] = buf[19] ^ pre[1][i];    buf[112] = buf[113] ^ pre[0][i];
+                                              buf[80] = buf[112] ^ pre[5][i];
+               buf[16] = in[0][i] ^ pre[4][i]; buf[81] = buf[80] ^ pre[0][i];
+               buf[48] = buf[16] ^ pre[5][i];    buf[83] = buf[81] ^ pre[1][i];
+               buf[49] = buf[48] ^ pre[0][i];    buf[82] = buf[83] ^ pre[0][i];
+               buf[51] = buf[49] ^ pre[1][i];    buf[86] = buf[82] ^ pre[2][i];
+               buf[50] = buf[51] ^ pre[0][i];    buf[87] = buf[86] ^ pre[0][i];
+               buf[54] = buf[50] ^ pre[2][i];    buf[85] = buf[87] ^ pre[1][i];
+               buf[55] = buf[54] ^ pre[0][i];    buf[84] = buf[85] ^ pre[0][i];
+               buf[53] = buf[55] ^ pre[1][i];    buf[92] = buf[84] ^ pre[3][i];
+               buf[52] = buf[53] ^ pre[0][i];    buf[93] = buf[92] ^ pre[0][i];
+               buf[60] = buf[52] ^ pre[3][i];    buf[95] = buf[93] ^ pre[1][i];
+               buf[61] = buf[60] ^ pre[0][i];    buf[94] = buf[95] ^ pre[0][i];
+               buf[63] = buf[61] ^ pre[1][i];    buf[90] = buf[94] ^ pre[2][i];
+               buf[62] = buf[63] ^ pre[0][i];    buf[91] = buf[90] ^ pre[0][i];
+               buf[58] = buf[62] ^ pre[2][i];    buf[89] = buf[91] ^ pre[1][i];
+               buf[59] = buf[58] ^ pre[0][i];    buf[88] = buf[89] ^ pre[0][i];
+               buf[57] = buf[59] ^ pre[1][i];    buf[72] = buf[88] ^ pre[4][i];
+               buf[56] = buf[57] ^ pre[0][i];    buf[73] = buf[72] ^ pre[0][i];
+               buf[40] = buf[56] ^ pre[4][i];    buf[75] = buf[73] ^ pre[1][i];
+               buf[41] = buf[40] ^ pre[0][i];    buf[74] = buf[75] ^ pre[0][i];
+               buf[43] = buf[41] ^ pre[1][i];    buf[78] = buf[74] ^ pre[2][i];
+               buf[42] = buf[43] ^ pre[0][i];    buf[79] = buf[78] ^ pre[0][i];
+               buf[46] = buf[42] ^ pre[2][i];    buf[77] = buf[79] ^ pre[1][i];
+               buf[47] = buf[46] ^ pre[0][i];    buf[76] = buf[77] ^ pre[0][i];
+               buf[45] = buf[47] ^ pre[1][i];    buf[68] = buf[76] ^ pre[3][i];
+               buf[44] = buf[45] ^ pre[0][i];    buf[69] = buf[68] ^ pre[0][i];
+               buf[36] = buf[44] ^ pre[3][i];    buf[71] = buf[69] ^ pre[1][i];
+               buf[37] = buf[36] ^ pre[0][i];    buf[70] = buf[71] ^ pre[0][i];
+               buf[39] = buf[37] ^ pre[1][i];    buf[66] = buf[70] ^ pre[2][i];
+               buf[38] = buf[39] ^ pre[0][i];    buf[67] = buf[66] ^ pre[0][i];
+               buf[34] = buf[38] ^ pre[2][i];    buf[65] = buf[67] ^ pre[1][i];
+               buf[35] = buf[34] ^ pre[0][i];
+               buf[33] = buf[35] ^ pre[1][i];    buf[64] = in[0][i] ^ pre[6][i];
+
+               /* transpose each 64-entry half so the butterfly loop below
+                  sees coefficient-major layout */
+               transpose_64x64(buf +  0, buf +  0);
+               transpose_64x64(buf + 64, buf + 64);
+
+               /* scatter into bit-reversed order */
+               for (j = 0; j < 128; j++)
+                       out[ reversal[j] ][i] = buf[j];
+       }
+
+       /* standard FFT butterflies: stage i combines pairs at stride s = 2^i,
+          scaling the upper element by the next run of fft_consts */
+       for (i = 1; i <= 6; i++)
+       {
+               s = 1 << i;
+
+               for (j = 0; j < 128; j += 2*s)
+               for (k = j; k < j+s; k++)
+               {
+                       vec_mul(tmp, out[k+s], fft_consts[ consts_ptr + (k-j) ]);
+
+                       for (b = 0; b < GFBITS; b++) out[k  ][b] ^= tmp[b];
+                       for (b = 0; b < GFBITS; b++) out[k+s][b] ^= out[k][b];
+               }
+
+               consts_ptr += (1 << i);
+       }
+
+       /* adding the part contributed by x^128 */
+
+       for (i = 0; i < 128; i++)
+       for (b = 0; b < GFBITS; b++)
+               out[i][b] ^= fft_powers[i][b];
+}
+
+/* input: in, polynomial in bitsliced form */
+/* output: out, bitsliced results of evaluating in all the field elements */
+/* Gao-Mateer additive FFT: radix conversions (in place on `in`), then the
+   butterfly network.  Note `in` is clobbered. */
+static void fft(vec out[][GFBITS], vec in[][GFBITS])
+{
+       radix_conversions(in);
+       butterflies(out, in);
+}
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/fft_tr.c */
+/*
+  This file is for transpose of the Gao-Mateer FFT
+  Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c
+
+  For the implementation strategy, see
+  https://eprint.iacr.org/2017/793.pdf
+*/
+/* 20221230 djb: split these arrays into separate .c files */
+/* 20221230 djb: rename consts array as fft_consts */
+/* 20221230 djb: rename s array as fft_scalars_4x */
+/* 20221230 djb: add linker lines */
+
+/* linker define fft_tr */
+/* linker use vec_mul */
+/* linker use fft_scalars_4x fft_consts */
+
+
+
+
+/* Transpose of radix_conversions(): undoes the conversion steps in reverse
+   order on a 4x64-bit (256-coefficient) bitsliced polynomial, scaling with
+   fft_scalars_4x at each level.  Operates in place on `in`. */
+static void radix_conversions_tr(vec in[][ GFBITS ])
+{
+       int i, j, k;
+
+       /* mask[k] selects the bit positions shifted up by (1 << k) at
+          stage k (transposed counterpart of the masks in radix_conversions) */
+       const vec mask[6][2] =
+       {
+               {0x2222222222222222, 0x4444444444444444},
+               {0x0C0C0C0C0C0C0C0C, 0x3030303030303030},
+               {0x00F000F000F000F0, 0x0F000F000F000F00},
+               {0x0000FF000000FF00, 0x00FF000000FF0000},
+               {0x00000000FFFF0000, 0x0000FFFF00000000},
+               {0xFFFFFFFF00000000, 0x00000000FFFFFFFF}
+       };
+
+       /* levels run backwards relative to the forward transform */
+
+       for (j = 6; j >= 0; j--)
+       {
+               if (j < 6)
+               {
+                       vec_mul(in[0], in[0], fft_scalars_4x[j][0]); /* scaling */
+                       vec_mul(in[1], in[1], fft_scalars_4x[j][1]); /* scaling */
+                       vec_mul(in[2], in[2], fft_scalars_4x[j][2]); /* scaling */
+                       vec_mul(in[3], in[3], fft_scalars_4x[j][3]); /* scaling */
+               }
+
+               /* in-word steps, level j upward (shift-and-xor, upward shifts) */
+               for (k = j; k <= 4; k++)
+               for (i = 0; i < GFBITS; i++)
+               {
+                       in[0][i] ^= (in[0][i] & mask[k][0]) << (1 << k);
+                       in[0][i] ^= (in[0][i] & mask[k][1]) << (1 << k);
+                       in[1][i] ^= (in[1][i] & mask[k][0]) << (1 << k);
+                       in[1][i] ^= (in[1][i] & mask[k][1]) << (1 << k);
+                       in[2][i] ^= (in[2][i] & mask[k][0]) << (1 << k);
+                       in[2][i] ^= (in[2][i] & mask[k][1]) << (1 << k);
+                       in[3][i] ^= (in[3][i] & mask[k][0]) << (1 << k);
+                       in[3][i] ^= (in[3][i] & mask[k][1]) << (1 << k);
+               }
+
+               /* cross-word step (transpose of the 32-bit fold in the
+                  forward direction), applied to each pair of words */
+               if (j <= 5)
+               for (i = 0; i < GFBITS; i++)
+               {
+                       in[1][i] ^= in[0][i] >> 32;
+                       in[1][i] ^= in[1][i] << 32;
+
+                       in[3][i] ^= in[2][i] >> 32;
+                       in[3][i] ^= in[3][i] << 32;
+               }
+
+               /* NOTE(review): chained compound assignment on distinct
+                  elements; equivalent to in[2] ^= in[1]; in[3] ^= in[2]; --
+                  consider splitting into two statements for clarity */
+               for (i = 0; i < GFBITS; i++)
+                       in[3][i] ^= in[2][i] ^= in[1][i];
+       }
+}
+
+/* Transpose of butterflies(): applies the inverse butterfly network to `in`
+   (clobbering it), then collapses the 128 points into the 4x64-bit result
+   written to out[0..3].  out[0..1] are produced in the per-bit loop;
+   out[2..3] additionally accumulate the beta-weighted sums at the end. */
+static void butterflies_tr(vec out[][ GFBITS ], vec in[][ GFBITS ])
+{
+       int i, j, k, s, b;
+
+       vec tmp[ GFBITS ];
+       /* pre[t][k] accumulates the coefficient that multiplies beta[t] */
+       vec pre[6][2][ GFBITS ];
+       vec buf[2][64];
+
+       /* walks backwards through fft_consts; stage i releases 2^i entries */
+       uint64_t consts_ptr = 128;
+
+       /* bit-reversal permutation of 7-bit indices (same table as in
+          butterflies()) */
+       const unsigned char reversal[128] =
+       {
+         0, 64, 32, 96, 16, 80, 48, 112,
+         8, 72, 40, 104, 24, 88, 56, 120,
+         4, 68, 36, 100, 20, 84, 52, 116,
+         12, 76, 44, 108, 28, 92, 60, 124,
+         2, 66, 34, 98, 18, 82, 50, 114,
+         10, 74, 42, 106, 26, 90, 58, 122,
+         6, 70, 38, 102, 22, 86, 54, 118,
+         14, 78, 46, 110, 30, 94, 62, 126,
+         1, 65, 33, 97, 17, 81, 49, 113,
+         9, 73, 41, 105, 25, 89, 57, 121,
+         5, 69, 37, 101, 21, 85, 53, 117,
+         13, 77, 45, 109, 29, 93, 61, 125,
+         3, 67, 35, 99, 19, 83, 51, 115,
+         11, 75, 43, 107, 27, 91, 59, 123,
+         7, 71, 39, 103, 23, 87, 55, 119,
+         15, 79, 47, 111, 31, 95, 63, 127
+       };
+
+       /* basis elements for the final beta-weighted accumulation;
+          presumably the transposed-FFT counterparts of the beta_i in
+          butterflies() -- confirm against the paper */
+       const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755};
+
+       /* inverse butterflies: stages in reverse order, each undoing the
+          corresponding forward stage with the same fft_consts run */
+
+       for (i = 6; i >= 0; i--)
+       {
+               s = 1 << i;
+               consts_ptr -= s;
+
+               for (j = 0; j < 128; j += 2*s)
+               for (k = j; k < j+s; k++)
+               {
+                       for (b = 0; b < GFBITS; b++) in[k][b] ^= in[k+s][b];
+
+                       vec_mul(tmp, in[k], fft_consts[ consts_ptr + (k-j) ]);
+
+                       for (b = 0; b < GFBITS; b++) in[k+s][b] ^= tmp[b];
+               }
+       }
+
+       for (i = 0; i < GFBITS; i++)
+       {
+               /* gather bit-plane i in bit-reversed order.
+                  NOTE(review): treats buf[2][64] as one flat 128-entry
+                  array via (&buf[0][0])[k]; relies on the two inner arrays
+                  being contiguous -- a standard bitslice idiom, though a
+                  strict reading of the C standard frowns on crossing the
+                  inner-array boundary */
+               for (k = 0; k < 128; k++)
+                       (&buf[0][0])[ k ] = in[ reversal[k] ][i];
+
+               transpose_64x64(buf[0], buf[0]);
+               transpose_64x64(buf[1], buf[1]);
+
+               /* Transpose of the unrolled subset-sum chain in butterflies():
+                  simultaneously folds buf down to buf[k][0]^buf[k][1] and
+                  accumulates into pre[0..5][k][i] the parts that multiply
+                  each beta.  Statement order is load-bearing; do not
+                  reorder. */
+               for (k = 0; k < 2; k++)
+               {
+                       pre[0][k][i] = buf[k][32]; buf[k][33] ^= buf[k][32];
+                       pre[1][k][i] = buf[k][33]; buf[k][35] ^= buf[k][33];
+                       pre[0][k][i] ^= buf[k][35]; buf[k][34] ^= buf[k][35];
+                       pre[2][k][i] = buf[k][34]; buf[k][38] ^= buf[k][34];
+                       pre[0][k][i] ^= buf[k][38]; buf[k][39] ^= buf[k][38];
+                       pre[1][k][i] ^= buf[k][39]; buf[k][37] ^= buf[k][39];
+                       pre[0][k][i] ^= buf[k][37]; buf[k][36] ^= buf[k][37];
+                       pre[3][k][i] = buf[k][36]; buf[k][44] ^= buf[k][36];
+                       pre[0][k][i] ^= buf[k][44]; buf[k][45] ^= buf[k][44];
+                       pre[1][k][i] ^= buf[k][45]; buf[k][47] ^= buf[k][45];
+                       pre[0][k][i] ^= buf[k][47]; buf[k][46] ^= buf[k][47];
+                       pre[2][k][i] ^= buf[k][46]; buf[k][42] ^= buf[k][46];
+                       pre[0][k][i] ^= buf[k][42]; buf[k][43] ^= buf[k][42];
+                       pre[1][k][i] ^= buf[k][43]; buf[k][41] ^= buf[k][43];
+                       pre[0][k][i] ^= buf[k][41]; buf[k][40] ^= buf[k][41];
+                       pre[4][k][i] = buf[k][40]; buf[k][56] ^= buf[k][40];
+                       pre[0][k][i] ^= buf[k][56]; buf[k][57] ^= buf[k][56];
+                       pre[1][k][i] ^= buf[k][57]; buf[k][59] ^= buf[k][57];
+                       pre[0][k][i] ^= buf[k][59]; buf[k][58] ^= buf[k][59];
+                       pre[2][k][i] ^= buf[k][58]; buf[k][62] ^= buf[k][58];
+                       pre[0][k][i] ^= buf[k][62]; buf[k][63] ^= buf[k][62];
+                       pre[1][k][i] ^= buf[k][63]; buf[k][61] ^= buf[k][63];
+                       pre[0][k][i] ^= buf[k][61]; buf[k][60] ^= buf[k][61];
+                       pre[3][k][i] ^= buf[k][60]; buf[k][52] ^= buf[k][60];
+                       pre[0][k][i] ^= buf[k][52]; buf[k][53] ^= buf[k][52];
+                       pre[1][k][i] ^= buf[k][53]; buf[k][55] ^= buf[k][53];
+                       pre[0][k][i] ^= buf[k][55]; buf[k][54] ^= buf[k][55];
+                       pre[2][k][i] ^= buf[k][54]; buf[k][50] ^= buf[k][54];
+                       pre[0][k][i] ^= buf[k][50]; buf[k][51] ^= buf[k][50];
+                       pre[1][k][i] ^= buf[k][51]; buf[k][49] ^= buf[k][51];
+                       pre[0][k][i] ^= buf[k][49]; buf[k][48] ^= buf[k][49];
+                       pre[5][k][i] = buf[k][48]; buf[k][16] ^= buf[k][48];
+                       pre[0][k][i] ^= buf[k][16]; buf[k][17] ^= buf[k][16];
+                       pre[1][k][i] ^= buf[k][17]; buf[k][19] ^= buf[k][17];
+                       pre[0][k][i] ^= buf[k][19]; buf[k][18] ^= buf[k][19];
+                       pre[2][k][i] ^= buf[k][18]; buf[k][22] ^= buf[k][18];
+                       pre[0][k][i] ^= buf[k][22]; buf[k][23] ^= buf[k][22];
+                       pre[1][k][i] ^= buf[k][23]; buf[k][21] ^= buf[k][23];
+                       pre[0][k][i] ^= buf[k][21]; buf[k][20] ^= buf[k][21];
+                       pre[3][k][i] ^= buf[k][20]; buf[k][28] ^= buf[k][20];
+                       pre[0][k][i] ^= buf[k][28]; buf[k][29] ^= buf[k][28];
+                       pre[1][k][i] ^= buf[k][29]; buf[k][31] ^= buf[k][29];
+                       pre[0][k][i] ^= buf[k][31]; buf[k][30] ^= buf[k][31];
+                       pre[2][k][i] ^= buf[k][30]; buf[k][26] ^= buf[k][30];
+                       pre[0][k][i] ^= buf[k][26]; buf[k][27] ^= buf[k][26];
+                       pre[1][k][i] ^= buf[k][27]; buf[k][25] ^= buf[k][27];
+                       pre[0][k][i] ^= buf[k][25]; buf[k][24] ^= buf[k][25];
+                       pre[4][k][i] ^= buf[k][24]; buf[k][8] ^= buf[k][24];
+                       pre[0][k][i] ^= buf[k][8]; buf[k][9] ^= buf[k][8];
+                       pre[1][k][i] ^= buf[k][9]; buf[k][11] ^= buf[k][9];
+                       pre[0][k][i] ^= buf[k][11]; buf[k][10] ^= buf[k][11];
+                       pre[2][k][i] ^= buf[k][10]; buf[k][14] ^= buf[k][10];
+                       pre[0][k][i] ^= buf[k][14]; buf[k][15] ^= buf[k][14];
+                       pre[1][k][i] ^= buf[k][15]; buf[k][13] ^= buf[k][15];
+                       pre[0][k][i] ^= buf[k][13]; buf[k][12] ^= buf[k][13];
+                       pre[3][k][i] ^= buf[k][12]; buf[k][4] ^= buf[k][12];
+                       pre[0][k][i] ^= buf[k][4]; buf[k][5] ^= buf[k][4];
+                       pre[1][k][i] ^= buf[k][5]; buf[k][7] ^= buf[k][5];
+                       pre[0][k][i] ^= buf[k][7]; buf[k][6] ^= buf[k][7];
+                       pre[2][k][i] ^= buf[k][6]; buf[k][2] ^= buf[k][6];
+                       pre[0][k][i] ^= buf[k][2]; buf[k][3] ^= buf[k][2];
+                       pre[1][k][i] ^= buf[k][3]; buf[k][1] ^= buf[k][3];
+
+                       pre[0][k][i] ^= buf[k][1]; out[k][i] = buf[k][0] ^ buf[k][1];
+               }
+       }
+
+       /* out[2], out[3] = sum over t of pre[t] * beta[t], with each beta
+          broadcast to a bitsliced mask before the field multiply */
+       for (j = 0; j < GFBITS; j++) tmp[j] = vec_setbits((beta[0] >> j) & 1);
+
+       vec_mul(out[2], pre[0][0], tmp);
+       vec_mul(out[3], pre[0][1], tmp);
+
+       for (i = 1; i < 6; i++)
+       {
+               for (j = 0; j < GFBITS; j++) tmp[j] = vec_setbits((beta[i] >> j) & 1);
+
+               vec_mul(pre[i][0], pre[i][0], tmp);
+               vec_mul(pre[i][1], pre[i][1], tmp);
+
+               for (b = 0; b < GFBITS; b++)
+               {
+                       out[2][b] ^= pre[i][0][b];
+                       out[3][b] ^= pre[i][1][b];
+               }
+       }
+
+}
+
+/* Transposed Gao-Mateer FFT: transposed butterflies first, then the
+   transposed radix conversions (the reverse composition of fft()).
+   Note `in` is clobbered by butterflies_tr(). */
+static void fft_tr(vec out[][GFBITS], vec in[][ GFBITS ])
+{
+       butterflies_tr(out, in);
+
+       radix_conversions_tr(out);
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/gf.c */
+/*
+  this file is for functions for field arithmetic
+*/
+/* 20221231 djb: const for GF_mul */
+/* 20221230 djb: add linker line */
+
+/* linker define gf_iszero gf_mul gf_inv gf_frac GF_mul */
+
+
+
+/* field multiplication */
+/* GF(2^13) multiply: schoolbook carry-less product, then reduction.  The
+   >>9/>>10/>>12/>>13 folds map each high bit onto x^4 + x^3 + x + 1,
+   i.e. reduction modulo x^13 + x^4 + x^3 + x + 1.  Constant-time: no
+   data-dependent branches or table lookups. */
+gf gf_mul(gf in0, gf in1)
+{
+       int i;
+
+       uint64_t tmp;
+       uint64_t t0;
+       uint64_t t1;
+       uint64_t t;
+
+       t0 = in0;
+       t1 = in1;
+
+       /* carry-less multiply: (t1 & (1<<i)) is 0 or 1<<i, so each product
+          term is t0 shifted by i (or zero), combined with XOR */
+       tmp = t0 * (t1 & 1);
+
+       for (i = 1; i < GFBITS; i++)
+               tmp ^= (t0 * (t1 & (1 << i)));
+
+       /* two-pass reduction: first fold bits 24..16, then the residual
+          bits 15..13 raised by the first pass */
+
+       t = tmp & 0x1FF0000;
+       tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+
+       t = tmp & 0x000E000;
+       tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+
+       return tmp & GFMASK;
+}
+
+/* 2 field squarings */
+static inline gf gf_sq2(gf in)
+{
+       int i;
+
+       const uint64_t B[] = {0x1111111111111111,
+                             0x0303030303030303,
+                             0x000F000F000F000F,
+                             0x000000FF000000FF};
+
+       const uint64_t M[] = {0x0001FF0000000000,
+                             0x000000FF80000000,
+                             0x000000007FC00000,
+                             0x00000000003FE000};
+
+       uint64_t x = in;
+       uint64_t t;
+
+       x = (x | (x << 24)) & B[3];
+       x = (x | (x << 12)) & B[2];
+       x = (x | (x << 6)) & B[1];
+       x = (x | (x << 3)) & B[0];
+
+       for (i = 0; i < 4; i++)
+       {
+               t = x & M[i];
+               x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+       }
+
+       return x & GFMASK;
+}
+
+/* square and multiply */
+static inline gf gf_sqmul(gf in, gf m)
+{
+       int i;
+
+       uint64_t x;
+       uint64_t t0;
+       uint64_t t1;
+       uint64_t t;
+
+       const uint64_t M[] = {0x0000001FF0000000,
+                             0x000000000FF80000,
+                             0x000000000007E000};
+
+       t0 = in;
+       t1 = m;
+
+       x = (t1 << 6) * (t0 & (1 << 6));
+
+       t0 ^= (t0 << 7);
+
+       x ^= (t1 * (t0 & (0x04001)));
+       x ^= (t1 * (t0 & (0x08002))) << 1;
+       x ^= (t1 * (t0 & (0x10004))) << 2;
+       x ^= (t1 * (t0 & (0x20008))) << 3;
+       x ^= (t1 * (t0 & (0x40010))) << 4;
+       x ^= (t1 * (t0 & (0x80020))) << 5;
+
+       for (i = 0; i < 3; i++)
+       {
+               t = x & M[i];
+               x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+       }
+
+       return x & GFMASK;
+}
+
+/* square twice and multiply */
+static inline gf gf_sq2mul(gf in, gf m)
+{
+       int i;
+
+       uint64_t x;
+       uint64_t t0;
+       uint64_t t1;
+       uint64_t t;
+
+       const uint64_t M[] = {0x1FF0000000000000,
+                             0x000FF80000000000,
+                             0x000007FC00000000,
+                             0x00000003FE000000,
+                             0x0000000001FE0000,
+                             0x000000000001E000};
+
+       t0 = in;
+       t1 = m;
+
+       x = (t1 << 18) * (t0 & (1 << 6));
+
+       t0 ^= (t0 << 21);
+
+       x ^= (t1 * (t0 & (0x010000001)));
+       x ^= (t1 * (t0 & (0x020000002))) << 3;
+       x ^= (t1 * (t0 & (0x040000004))) << 6;
+       x ^= (t1 * (t0 & (0x080000008))) << 9;
+       x ^= (t1 * (t0 & (0x100000010))) << 12;
+       x ^= (t1 * (t0 & (0x200000020))) << 15;
+
+       for (i = 0; i < 6; i++)
+       {
+               t = x & M[i];
+               x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+       }
+
+       return x & GFMASK;
+}
+
+/* return num/den */
+gf gf_frac(gf den, gf num)
+{
+       gf tmp_11;
+       gf tmp_1111;
+       gf out;
+
+       tmp_11 = gf_sqmul(den, den); /* 11 */
+       tmp_1111 = gf_sq2mul(tmp_11, tmp_11); /* 1111 */
+       out = gf_sq2(tmp_1111);
+       out = gf_sq2mul(out, tmp_1111); /* 11111111 */
+       out = gf_sq2(out);
+       out = gf_sq2mul(out, tmp_1111); /* 111111111111 */
+
+       return gf_sqmul(out, num); /* 1111111111110 */
+}
+
+/* return 1/den */
+gf gf_inv(gf den)
+{
+       return gf_frac(den, ((gf) 1));
+}
+
+/* check if a == 0 */
+gf gf_iszero(gf a)
+{
+       uint32_t t = a;
+
+       t -= 1;
+       t >>= 19;
+
+       return (gf) t;
+}
+
+/* multiplication in GF((2^m)^t) */
+static void GF_mul(gf *out, const gf *in0, const gf *in1)
+{
+       int i, j;
+
+       gf prod[255];
+
+       for (i = 0; i < 255; i++)
+               prod[i] = 0;
+
+       for (i = 0; i < 128; i++)
+               for (j = 0; j < 128; j++)
+                       prod[i+j] ^= gf_mul(in0[i], in1[j]);
+
+       /**/
+
+       for (i = 254; i >= 128; i--)
+       {
+               prod[i - 121] ^= prod[i];
+               prod[i - 126] ^= prod[i];
+               prod[i - 127] ^= prod[i];
+               prod[i - 128] ^= prod[i];
+       }
+
+       for (i = 0; i < 128; i++)
+               out[i] = prod[i];
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/kem_dec.c */
+/* 20221230 djb: add linker lines */
+/* 20221230 djb: split out of operations.c */
+
+/* linker define operation_dec */
+/* linker use decrypt */
+
+
+
+
+static void operation_dec(
+       unsigned char *key,
+       const unsigned char *c,
+       const unsigned char *sk
+)
+{
+       int i;
+
+       unsigned char ret_decrypt = 0;
+
+       uint16_t m;
+
+       unsigned char e[ SYS_N/8 ];
+       unsigned char preimage[ 1 + SYS_N/8 + SYND_BYTES ];
+       unsigned char *x = preimage;
+       const unsigned char *s = sk + 40 + IRR_BYTES + COND_BYTES;
+
+       /**/
+
+       ret_decrypt = decrypt(e, sk + 40, c);
+
+       m = ret_decrypt;
+       m -= 1;
+       m >>= 8;
+
+       *x++ = m & 1;
+       for (i = 0; i < SYS_N/8; i++)
+               *x++ = (~m & s[i]) | (m & e[i]);
+
+       for (i = 0; i < SYND_BYTES; i++)
+               *x++ = c[i];
+
+       crypto_hash_32b(key, preimage, sizeof(preimage));
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/kem_enc.c */
+/* 20230102 djb: rename encrypt() as pke_encrypt() */
+/* 20221230 djb: add linker lines */
+/* 20221230 djb: split out of operations.c */
+
+/* linker define operation_enc */
+/* linker use pke_encrypt */
+
+
+
+
+static void operation_enc(
+       unsigned char *c,
+       unsigned char *key,
+       const unsigned char *pk
+)
+{
+       unsigned char e[ SYS_N/8 ];
+       unsigned char one_ec[ 1 + SYS_N/8 + SYND_BYTES ] = {1};
+
+       /**/
+
+       pke_encrypt(c, pk, e);
+
+       memcpy(one_ec + 1, e, SYS_N/8);
+       memcpy(one_ec + 1 + SYS_N/8, c, SYND_BYTES);
+
+       crypto_hash_32b(key, one_ec, sizeof(one_ec));
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/kem_keypair.c */
+/* 20221230 djb: add linker lines */
+/* 20221230 djb: split out of operations.c */
+
+/* linker define operation_keypair */
+/* linker use controlbitsfrompermutation genpoly_gen pk_gen */
+
+
+
+
+static void operation_keypair
+(
+       unsigned char *pk,
+       unsigned char *sk
+)
+{
+       int i;
+       unsigned char seed[ 33 ] = {64};
+       unsigned char r[ SYS_N/8 + (1 << GFBITS)*sizeof(uint32_t) + SYS_T*2 + 32 ];
+       unsigned char *rp, *skp;
+       uint64_t pivots = 0;
+
+       gf f[ SYS_T ]; /* element in GF(2^mt) */
+       gf irr[ SYS_T ]; /* Goppa polynomial */
+       uint32_t perm[ 1 << GFBITS ]; /* random permutation as 32-bit integers */
+       int16_t pi[ 1 << GFBITS ]; /* random permutation */
+
+       randombytes(seed+1, 32);
+
+       while (1)
+       {
+               rp = &r[ sizeof(r)-32 ];
+               skp = sk;
+
+               /* expanding and updating the seed */
+
+               shake(r, sizeof(r), seed, 33);
+               memcpy(skp, seed+1, 32);
+               skp += 32 + 8;
+               memcpy(seed+1, &r[ sizeof(r)-32 ], 32);
+
+               /* generating irreducible polynomial */
+
+               rp -= sizeof(f);
+
+               for (i = 0; i < SYS_T; i++)
+                       f[i] = load_gf(rp + i*2);
+
+               if (genpoly_gen(irr, f))
+                       continue;
+
+               for (i = 0; i < SYS_T; i++)
+                       store_gf(skp + i*2, irr[i]);
+
+               skp += IRR_BYTES;
+
+               /* generating permutation */
+
+               rp -= sizeof(perm);
+
+               for (i = 0; i < (1 << GFBITS); i++)
+                       perm[i] = load4(rp + i*4);
+
+               if (pk_gen(pk, skp - IRR_BYTES, perm, pi, &pivots))
+                       continue;
+
+               controlbitsfrompermutation(skp, pi, GFBITS, 1 << GFBITS);
+               skp += COND_BYTES;
+
+               /* storing the random string s */
+
+               rp -= SYS_N/8;
+               memcpy(skp, rp, SYS_N/8);
+
+               /* storing positions of the 32 pivots */
+
+               store8(sk + 32, pivots);
+
+               break;
+       }
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/pk_gen.c */
+/*
+  This file is for public-key generation
+*/
+/* 20221231 djb: remove unused min definition */
+/* 20221231 djb: more 0 initialization to clarify data flow; tnx thom wiggers */
+/* 20221230 djb: add linker lines */
+
+/* linker define pk_gen */
+/* linker use fft vec_inv vec_mul */
+
+
+
+static crypto_uint64 uint64_is_equal_declassify(uint64_t t,uint64_t u)
+{
+  crypto_uint64 mask = crypto_uint64_equal_mask(t,u);
+  crypto_declassify(&mask,sizeof mask);
+  return mask;
+}
+
+static crypto_uint64 uint64_is_zero_declassify(uint64_t t)
+{
+  crypto_uint64 mask = crypto_uint64_zero_mask(t);
+  crypto_declassify(&mask,sizeof mask);
+  return mask;
+}
+
+
+static void de_bitslicing(uint64_t * out, const vec in[][GFBITS])
+{
+       int i, j, r;
+
+       for (i = 0; i < (1 << GFBITS); i++)
+               out[i] = 0 ;
+
+       for (i = 0; i < 128; i++)
+       for (j = GFBITS-1; j >= 0; j--)
+       for (r = 0; r < 64; r++)
+       {
+               out[i*64 + r] <<= 1;
+               out[i*64 + r] |= (in[i][j] >> r) & 1;
+       }
+}
+
+static void to_bitslicing_2x(vec out0[][GFBITS], vec out1[][GFBITS], const uint64_t * in)
+{
+       int i, j, r;
+
+       for (i = 0; i < 128; i++)
+       {
+               for (j = 0;j < GFBITS;++j) out0[i][j] = out1[i][j] = 0;
+
+               for (j = GFBITS-1; j >= 0; j--)
+               for (r = 63; r >= 0; r--)
+               {
+                       out1[i][j] <<= 1;
+                       out1[i][j] |= (in[i*64 + r] >> (j + GFBITS)) & 1;
+               }
+
+               for (j = GFBITS-1; j >= 0; j--)
+               for (r = 63; r >= 0; r--)
+               {
+                       out0[i][GFBITS-1-j] <<= 1;
+                       out0[i][GFBITS-1-j] |= (in[i*64 + r] >> j) & 1;
+               }
+       }
+}
+
+/* return number of trailing zeros of in */
+static inline int ctz(uint64_t in)
+{
+       int i, b, m = 0, r = 0;
+
+       for (i = 0; i < 64; i++)
+       {
+               b = (in >> i) & 1;
+               m |= b;
+               r += (m^1) & (b^1);
+       }
+
+       return r;
+}
+
+static inline uint64_t same_mask(uint16_t x, uint16_t y)
+{
+        uint64_t mask;
+
+        mask = x ^ y;
+        mask -= 1;
+        mask >>= 63;
+        mask = -mask;
+
+        return mask;
+}
+
+static int mov_columns(uint64_t mat[][ (SYS_N + 63) / 64 ], int16_t * pi, uint64_t * pivots)
+{
+       int i, j, k, s, block_idx, row;
+       uint64_t buf[64], ctz_list[32], t, d, mask, one = 1;
+
+       row = PK_NROWS - 32;
+       block_idx = row/64;
+
+       /* extract the 32x64 matrix */
+
+       for (i = 0; i < 32; i++)
+               buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) |
+                        (mat[ row + i ][ block_idx + 1 ] << 32);
+
+       /* compute the column indices of pivots by Gaussian elimination. */
+       /* the indices are stored in ctz_list */
+
+       *pivots = 0;
+
+       for (i = 0; i < 32; i++)
+       {
+               t = buf[i];
+               for (j = i+1; j < 32; j++)
+                       t |= buf[j];
+
+               if (uint64_is_zero_declassify(t)) return -1; /* return if buf is not full rank */
+
+               ctz_list[i] = s = ctz(t);
+               *pivots |= one << ctz_list[i];
+
+               for (j = i+1; j < 32; j++) { mask = (buf[i] >> s) & 1; mask -= 1;    buf[i] ^= buf[j] & mask; }
+               for (j = i+1; j < 32; j++) { mask = (buf[j] >> s) & 1; mask = -mask; buf[j] ^= buf[i] & mask; }
+       }
+
+       /* updating permutation */
+
+       for (j = 0;   j < 32; j++)
+       for (k = j+1; k < 64; k++)
+       {
+                       d = pi[ row + j ] ^ pi[ row + k ];
+                       d &= same_mask(k, ctz_list[j]);
+                       pi[ row + j ] ^= d;
+                       pi[ row + k ] ^= d;
+       }
+
+       /* moving columns of mat according to the column indices of pivots */
+
+       for (i = 0; i < PK_NROWS; i++)
+       {
+               t = (mat[ i ][ block_idx + 0 ] >> 32) |
+                   (mat[ i ][ block_idx + 1 ] << 32);
+
+               for (j = 0; j < 32; j++)
+               {
+                       d  = t >> j;
+                       d ^= t >> ctz_list[j];
+                       d &= 1;
+
+                       t ^= d << ctz_list[j];
+                       t ^= d << j;
+               }
+
+               mat[ i ][ block_idx + 0 ] = (mat[ i ][ block_idx + 0 ] << 32 >> 32) | (t << 32);
+               mat[ i ][ block_idx + 1 ] = (mat[ i ][ block_idx + 1 ] >> 32 << 32) | (t >> 32);
+       }
+
+       return 0;
+}
+
+static int pk_gen(unsigned char * pk, const unsigned char * irr, uint32_t * perm, int16_t * pi, uint64_t * pivots)
+{
+       const int nblocks_H = (SYS_N + 63) / 64;
+       const int nblocks_I = (PK_NROWS + 63) / 64;
+
+       int i, j, k;
+       int row, c;
+
+       uint64_t mat[ PK_NROWS ][ nblocks_H ];
+
+       uint64_t mask;
+
+       vec irr_int[2][ GFBITS ];
+
+       vec consts[ 128 ][ GFBITS ];
+       vec eval[ 128 ][ GFBITS ];
+       vec prod[ 128 ][ GFBITS ];
+       vec tmp[ GFBITS ];
+
+       uint64_t list[1 << GFBITS];
+
+       /* compute the inverses */
+
+       irr_load(irr_int, irr);
+
+       fft(eval, irr_int);
+
+       vec_copy(prod[0], eval[0]);
+
+       for (i = 1; i < 128; i++)
+               vec_mul(prod[i], prod[i-1], eval[i]);
+
+       vec_inv(tmp, prod[127]);
+
+       for (i = 126; i >= 0; i--)
+       {
+               vec_mul(prod[i+1], prod[i], tmp);
+               vec_mul(tmp, tmp, eval[i+1]);
+       }
+
+       vec_copy(prod[0], tmp);
+
+       /* fill matrix */
+
+       de_bitslicing(list, prod);
+
+       for (i = 0; i < (1 << GFBITS); i++)
+       {
+               list[i] <<= GFBITS;
+               list[i] |= i;
+               list[i] |= ((uint64_t) perm[i]) << 31;
+       }
+
+       uint64_sort(list, 1 << GFBITS);
+
+       for (i = 1; i < (1 << GFBITS); i++)
+               if (uint64_is_equal_declassify(list[i-1] >> 31,list[i] >> 31))
+                       return -1;
+
+       to_bitslicing_2x(consts, prod, list);
+
+       for (i = 0; i < (1 << GFBITS); i++)
+               pi[i] = list[i] & GFMASK;
+
+       for (j = 0; j < nblocks_H; j++)
+       for (k = 0; k < GFBITS; k++)
+               mat[ k ][ j ] = prod[ j ][ k ];
+
+       for (i = 1; i < SYS_T; i++)
+       for (j = 0; j < nblocks_H; j++)
+       {
+               vec_mul(prod[j], prod[j], consts[j]);
+
+               for (k = 0; k < GFBITS; k++)
+                       mat[ i*GFBITS + k ][ j ] = prod[ j ][ k ];
+       }
+
+       /* gaussian elimination */
+
+       for (row = 0; row < PK_NROWS; row++)
+       {
+               i = row >> 6;
+               j = row & 63;
+
+               if (row == PK_NROWS - 32)
+               {
+                       if (mov_columns(mat, pi, pivots))
+                               return -1;
+               }
+
+               for (k = row + 1; k < PK_NROWS; k++)
+               {
+                       mask = mat[ row ][ i ] >> j;
+                       mask &= 1;
+                       mask -= 1;
+
+                       for (c = 0; c < nblocks_H; c++)
+                               mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
+               }
+
+               if ( uint64_is_zero_declassify((mat[ row ][ i ] >> j) & 1) ) /* return if not systematic */
+               {
+                       return -1;
+               }
+
+               for (k = 0; k < row; k++)
+               {
+                       mask = mat[ k ][ i ] >> j;
+                       mask &= 1;
+                       mask = -mask;
+
+                       for (c = 0; c < nblocks_H; c++)
+                               mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
+               }
+
+               for (k = row+1; k < PK_NROWS; k++)
+               {
+                       mask = mat[ k ][ i ] >> j;
+                       mask &= 1;
+                       mask = -mask;
+
+                       for (c = 0; c < nblocks_H; c++)
+                               mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
+               }
+       }
+
+       for (i = 0; i < PK_NROWS; i++)
+       {
+               for (j = nblocks_I; j < nblocks_H-1; j++)
+               {
+                       store8(pk, mat[i][j]);
+                       pk += 8;
+               }
+
+               store_i(pk, mat[i][j], PK_ROW_BYTES % 8);
+
+                pk += PK_ROW_BYTES % 8;
+       }
+
+       /**/
+
+       return 0;
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/sk_gen.c */
+/*
+  This file is for secret-key generation
+*/
+/* 20221230 djb: add linker lines */
+
+/* linker define genpoly_gen */
+/* linker use gf_iszero gf_mul gf_inv GF_mul */
+
+
+
+static inline crypto_uint16 gf_is_zero_declassify(gf t)
+{
+  crypto_uint16 mask = crypto_uint16_zero_mask(t);
+  crypto_declassify(&mask,sizeof mask);
+  return mask;
+}
+
+/* input: f, element in GF((2^m)^t) */
+/* output: out, minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+static int genpoly_gen(gf *out, gf *f)
+{
+       int i, j, k, c;
+
+       gf mat[ SYS_T+1 ][ SYS_T ];
+       gf mask, inv, t;
+
+       /* fill matrix */
+
+       mat[0][0] = 1;
+
+       for (i = 1; i < SYS_T; i++)
+               mat[0][i] = 0;
+
+       for (i = 0; i < SYS_T; i++)
+               mat[1][i] = f[i];
+
+       for (j = 2; j <= SYS_T; j++)
+               GF_mul(mat[j], mat[j-1], f);
+
+       /* gaussian */
+
+       for (j = 0; j < SYS_T; j++)
+       {
+               for (k = j + 1; k < SYS_T; k++)
+               {
+                       mask = gf_iszero(mat[ j ][ j ]);
+
+                       for (c = j; c < SYS_T + 1; c++)
+                               mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+
+               }
+
+               if ( gf_is_zero_declassify(mat[ j ][ j ]) ) /* return if not systematic */
+               {
+                       return -1;
+               }
+
+               inv = gf_inv(mat[j][j]);
+
+               for (c = j; c < SYS_T + 1; c++)
+                       mat[ c ][ j ] = gf_mul(mat[ c ][ j ], inv) ;
+
+               for (k = 0; k < SYS_T; k++)
+               {
+                       if (k != j)
+                       {
+                               t = mat[ j ][ k ];
+
+                               for (c = j; c < SYS_T + 1; c++)
+                                       mat[ c ][ k ] ^= gf_mul(mat[ c ][ j ], t);
+                       }
+               }
+       }
+
+       for (i = 0; i < SYS_T; i++)
+               out[i] = mat[ SYS_T ][ i ];
+
+       return 0;
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/vec.c */
+/* 20221230 djb: add linker line */
+
+/* linker define vec_mul vec_sq vec_inv */
+
+
+
+static void vec_mul(vec * h, const vec * f, const vec * g)
+{
+       int i, j;
+       vec buf[ 2*GFBITS-1 ];
+
+       for (i = 0; i < 2*GFBITS-1; i++)
+               buf[i] = 0;
+
+       for (i = 0; i < GFBITS; i++)
+       for (j = 0; j < GFBITS; j++)
+               buf[i+j] ^= f[i] & g[j];
+
+       for (i = 2*GFBITS-2; i >= GFBITS; i--)
+       {
+               buf[i-GFBITS+4] ^= buf[i];
+               buf[i-GFBITS+3] ^= buf[i];
+               buf[i-GFBITS+1] ^= buf[i];
+               buf[i-GFBITS+0] ^= buf[i];
+       }
+
+       for (i = 0; i < GFBITS; i++)
+               h[i] = buf[i];
+}
+
+/* bitsliced field squarings */
+static void vec_sq(vec * out, vec * in)
+{
+       int i;
+       vec result[GFBITS], t;
+
+       t = in[11] ^ in[12];
+
+       result[0] = in[0] ^ in[11];
+       result[1] = in[7] ^ t;
+       result[2] = in[1] ^ in[7];
+       result[3] = in[8] ^ t;
+       result[4] = in[2] ^ in[7];
+       result[4] = result[4] ^ in[8];
+       result[4] = result[4] ^ t;
+       result[5] = in[7] ^ in[9];
+       result[6] = in[3] ^ in[8];
+       result[6] = result[6] ^ in[9];
+       result[6] = result[6] ^ in[12];
+       result[7] = in[8] ^ in[10];
+       result[8] = in[4] ^ in[9];
+       result[8] = result[8] ^ in[10];
+       result[9] = in[9] ^ in[11];
+       result[10] = in[5] ^ in[10];
+       result[10] = result[10] ^ in[11];
+       result[11] = in[10] ^ in[12];
+       result[12] = in[6] ^ t;
+
+       for (i = 0; i < GFBITS; i++)
+               out[i] = result[i];
+}
+
+/* bitsliced field inverses */
+static void vec_inv(vec * out, vec * in)
+{
+       vec tmp_11[ GFBITS ];
+       vec tmp_1111[ GFBITS ];
+
+       vec_copy(out, in);
+
+       vec_sq(out, out);
+       vec_mul(tmp_11, out, in); /* ^11 */
+
+       vec_sq(out, tmp_11);
+       vec_sq(out, out);
+       vec_mul(tmp_1111, out, tmp_11); /* ^1111 */
+
+       vec_sq(out, tmp_1111);
+       vec_sq(out, out);
+       vec_sq(out, out);
+       vec_sq(out, out);
+       vec_mul(out, out, tmp_1111); /* ^11111111 */
+
+       vec_sq(out, out);
+       vec_sq(out, out);
+       vec_sq(out, out);
+       vec_sq(out, out);
+       vec_mul(out, out, tmp_1111); /* ^111111111111 */
+
+       vec_sq(out, out); /* ^1111111111110 */
+}
+
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/wrap_dec.c */
+
+void mceliece6688128f_dec(uint8_t *key,
+                         const uint8_t *c,
+                         const uint8_t *sk)
+{
+  operation_dec((unsigned char*) key,
+               (unsigned char*) c,
+               (unsigned char*) sk);
+}
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/wrap_enc.c */
+
+void mceliece6688128f_enc(uint8_t *c,
+                         uint8_t *key,
+                         const uint8_t *pk)
+{
+  operation_enc((unsigned char*) c,
+               (unsigned char*) key,
+               (unsigned char*) pk);
+}
+
+/* from libmceliece-20230612/crypto_kem/6688128f/vec/wrap_keypair.c */
+
+void mceliece6688128f_keypair(uint8_t *pk,
+                             uint8_t *sk)
+{
+  operation_keypair((unsigned char*) pk, (unsigned char*) sk);
+}
diff --git a/cipher/mceliece6688128f.h b/cipher/mceliece6688128f.h
new file mode 100644 (file)
index 0000000..eb9f23a
--- /dev/null
@@ -0,0 +1,63 @@
+/* mceliece6688128f.h - Classic McEliece for libgcrypt
+ * Copyright (C) 2023-2024 Simon Josefsson <simon@josefsson.org>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ */
+
+#ifndef MCELIECE6688128F_H
+#define MCELIECE6688128F_H
+
+#include <string.h>
+#include <stdint.h>
+
+#ifdef _GCRYPT_IN_LIBGCRYPT
+/**** Start of the glue code to libgcrypt ****/
+#include "g10lib.h"             /* for GCC_ATTR_UNUSED */
+#include "gcrypt-int.h"
+
+#define mceliece6688128f_keypair _gcry_mceliece6688128f_keypair
+#define mceliece6688128f_enc     _gcry_mceliece6688128f_enc
+#define mceliece6688128f_dec     _gcry_mceliece6688128f_dec
+/**** End of the glue code ****/
+#else
+#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 5 )
+#define GCC_ATTR_UNUSED  __attribute__ ((unused))
+#else
+#define GCC_ATTR_UNUSED
+#endif
+
+#define MCELIECE6688128F_SECRETKEY_SIZE 13932
+#define MCELIECE6688128F_PUBLICKEY_SIZE 1044992
+#define MCELIECE6688128F_CIPHERTEXT_SIZE 208
+#define MCELIECE6688128F_SIZE 32
+#endif
+
+typedef void mceliece6688128f_random_func (void *ctx,
+                                          size_t length,
+                                          uint8_t *dst);
+
+void
+mceliece6688128f_keypair (uint8_t *pk, uint8_t *sk);
+
+void
+mceliece6688128f_enc (uint8_t *c, uint8_t *k, const uint8_t *pk);
+
+void
+mceliece6688128f_dec (uint8_t *k, const uint8_t *c, const uint8_t *sk);
+
+#endif /* MCELIECE6688128F_H */
index 34336b5cca3dac2d760d9604dc1ca7277a4844a9..1991c3316491af3ce8b1344b922c21670fa4a51a 100644 (file)
@@ -6,7 +6,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -58,6 +58,8 @@ static const gcry_md_spec_t * const digest_list[] =
      &_gcry_digest_spec_sha3_512,
      &_gcry_digest_spec_shake128,
      &_gcry_digest_spec_shake256,
+     &_gcry_digest_spec_cshake128,
+     &_gcry_digest_spec_cshake256,
 #endif
 #if USE_GOST_R_3411_94
      &_gcry_digest_spec_gost3411_94,
@@ -244,6 +246,13 @@ static const gcry_md_spec_t * const digest_list_algo301[] =
 #else
     NULL,
     NULL,
+#endif
+#if USE_SHA3
+    &_gcry_digest_spec_cshake128,
+    &_gcry_digest_spec_cshake256
+#else
+    NULL,
+    NULL
 #endif
   };
 
@@ -996,6 +1005,55 @@ prepare_macpads (gcry_md_hd_t a, const unsigned char *key, size_t keylen)
 }
 
 
+static gcry_err_code_t
+md_customize (gcry_md_hd_t h, void *buffer, size_t buflen)
+{
+  gcry_err_code_t rc = 0;
+  GcryDigestEntry *r;
+  int algo_had_customize = 0;
+
+  if (!h->ctx->list)
+    return GPG_ERR_DIGEST_ALGO; /* Might happen if no algo is enabled.  */
+
+  for (r = h->ctx->list; r; r = r->next)
+    {
+      switch (r->spec->algo)
+        {
+        case GCRY_MD_CSHAKE128:
+        case GCRY_MD_CSHAKE256:
+          algo_had_customize = 1;
+          if (buflen != sizeof (struct gcry_cshake_customization))
+            rc = GPG_ERR_INV_ARG;
+          else
+            rc = _gcry_cshake_customize (r->context, buffer);
+          break;
+        default:
+          rc = GPG_ERR_DIGEST_ALGO;
+          break;
+        }
+
+      if (rc)
+        break;
+    }
+
+  if (rc && !algo_had_customize)
+    {
+      /* None of algorithms had customize implementation, so contexts were not
+       * modified. Just return error. */
+      return rc;
+    }
+  else if (rc && algo_had_customize)
+    {
+      /* Some of the contexts have been modified, but got error. Reset
+       * all contexts. */
+      _gcry_md_reset (h);
+      return rc;
+    }
+
+  return 0;
+}
+
+
 gcry_err_code_t
 _gcry_md_ctl (gcry_md_hd_t hd, int cmd, void *buffer, size_t buflen)
 {
@@ -1014,6 +1072,9 @@ _gcry_md_ctl (gcry_md_hd_t hd, int cmd, void *buffer, size_t buflen)
     case GCRYCTL_STOP_DUMP:
       md_stop_debug ( hd );
       break;
+    case GCRYCTL_MD_CUSTOMIZE:
+      rc = md_customize (hd, buffer, buflen);
+      break;
     default:
       rc = GPG_ERR_INV_OP;
     }
@@ -1125,8 +1186,8 @@ md_extract(gcry_md_hd_t a, int algo, void *out, size_t outlen)
        {
          if (r->next)
            log_debug ("more than one algorithm in md_extract(0)\n");
-         r->spec->extract (r->context, out, outlen);
-         return 0;
+
+         return r->spec->extract (r->context, out, outlen);
        }
     }
   else
@@ -1134,8 +1195,7 @@ md_extract(gcry_md_hd_t a, int algo, void *out, size_t outlen)
       for (r = a->ctx->list; r; r = r->next)
        if (r->spec->algo == algo && r->spec->extract)
          {
-           r->spec->extract (r->context, out, outlen);
-           return 0;
+           return r->spec->extract (r->context, out, outlen);
          }
     }
 
@@ -1248,6 +1308,7 @@ _gcry_md_hash_buffers_extract (int algo, unsigned int flags, void *digest,
                               int iovcnt)
 {
   const gcry_md_spec_t *spec;
+  int is_xof;
   int hmac;
 
   if (!iov || iovcnt < 0)
@@ -1266,11 +1327,13 @@ _gcry_md_hash_buffers_extract (int algo, unsigned int flags, void *digest,
       return GPG_ERR_DIGEST_ALGO;
     }
 
-  if (spec->mdlen > 0 && digestlen != -1 && digestlen != spec->mdlen)
-    return GPG_ERR_DIGEST_ALGO;
-  if (spec->mdlen == 0 && digestlen == -1)
+  is_xof = spec->extract != NULL;
+  if (!is_xof && digestlen != -1 && digestlen != spec->mdlen)
     return GPG_ERR_DIGEST_ALGO;
 
+  if (digestlen == -1)
+    digestlen = spec->mdlen;
+
   if (!hmac && spec->hash_buffers)
     {
       if (spec->flags.disabled || (!spec->flags.fips && fips_mode ()))
@@ -1304,7 +1367,7 @@ _gcry_md_hash_buffers_extract (int algo, unsigned int flags, void *digest,
       for (;iovcnt; iov++, iovcnt--)
         md_write (h, (const char*)iov[0].data + iov[0].off, iov[0].len);
       md_final (h);
-      if (spec->mdlen > 0)
+      if (digestlen == spec->mdlen)
        memcpy (digest, md_read (h, algo), spec->mdlen);
       else if (digestlen > 0)
        md_extract (h, algo, digest, digestlen);
index 49b2af2a015e85df36624c401ab18ac2d8589758..530af54f17da85bd737c5d75343c7796a7851112 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Based on md5.c in libgcrypt, but rewritten to compute md4 checksums
  * using a public domain md4 implementation with the following comments:
index 744a2cc197d23c3ce0a7cb01c543894e75d8ff98..b807da557d916dc3796fe74dcd597bf907ece023 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * According to the definition of MD5 in RFC 1321 from April 1992.
  * NOTE: This is *not* the same file as the one from glibc.
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
new file mode 100644 (file)
index 0000000..250db07
--- /dev/null
@@ -0,0 +1,1626 @@
+/*
+;;
+;; Copyright (c) 2021-2022, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+*/
+/*
+ * From:
+ *  https://github.com/intel/intel-ipsec-mb/blob/f0cad21a644231c0f5d4af51f56061a5796343fb/lib/avx512/poly_fma_avx512.asm
+ *
+ * Conversion to GAS assembly and integration to libgcrypt
+ *  by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX512)
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+SECTION_RODATA
+
+ELF(.type _gcry_poly1305_avx512_consts,@object)
+_gcry_poly1305_avx512_consts:
+
+.align 64
+.Lmask_44:
+  .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff
+  .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff
+
+.align 64
+.Lmask_42:
+  .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff
+  .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff
+
+.align 64
+.Lhigh_bit:
+  .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000
+  .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000
+
+.Lbyte_len_to_mask_table:
+  .short 0x0000, 0x0001, 0x0003, 0x0007
+  .short 0x000f, 0x001f, 0x003f, 0x007f
+  .short 0x00ff, 0x01ff, 0x03ff, 0x07ff
+  .short 0x0fff, 0x1fff, 0x3fff, 0x7fff
+  .short 0xffff
+
+.align 64
+.Lbyte64_len_to_mask_table:
+  .quad 0x0000000000000000, 0x0000000000000001
+  .quad 0x0000000000000003, 0x0000000000000007
+  .quad 0x000000000000000f, 0x000000000000001f
+  .quad 0x000000000000003f, 0x000000000000007f
+  .quad 0x00000000000000ff, 0x00000000000001ff
+  .quad 0x00000000000003ff, 0x00000000000007ff
+  .quad 0x0000000000000fff, 0x0000000000001fff
+  .quad 0x0000000000003fff, 0x0000000000007fff
+  .quad 0x000000000000ffff, 0x000000000001ffff
+  .quad 0x000000000003ffff, 0x000000000007ffff
+  .quad 0x00000000000fffff, 0x00000000001fffff
+  .quad 0x00000000003fffff, 0x00000000007fffff
+  .quad 0x0000000000ffffff, 0x0000000001ffffff
+  .quad 0x0000000003ffffff, 0x0000000007ffffff
+  .quad 0x000000000fffffff, 0x000000001fffffff
+  .quad 0x000000003fffffff, 0x000000007fffffff
+  .quad 0x00000000ffffffff, 0x00000001ffffffff
+  .quad 0x00000003ffffffff, 0x00000007ffffffff
+  .quad 0x0000000fffffffff, 0x0000001fffffffff
+  .quad 0x0000003fffffffff, 0x0000007fffffffff
+  .quad 0x000000ffffffffff, 0x000001ffffffffff
+  .quad 0x000003ffffffffff, 0x000007ffffffffff
+  .quad 0x00000fffffffffff, 0x00001fffffffffff
+  .quad 0x00003fffffffffff, 0x00007fffffffffff
+  .quad 0x0000ffffffffffff, 0x0001ffffffffffff
+  .quad 0x0003ffffffffffff, 0x0007ffffffffffff
+  .quad 0x000fffffffffffff, 0x001fffffffffffff
+  .quad 0x003fffffffffffff, 0x007fffffffffffff
+  .quad 0x00ffffffffffffff, 0x01ffffffffffffff
+  .quad 0x03ffffffffffffff, 0x07ffffffffffffff
+  .quad 0x0fffffffffffffff, 0x1fffffffffffffff
+  .quad 0x3fffffffffffffff, 0x7fffffffffffffff
+  .quad 0xffffffffffffffff
+
+.Lqword_high_bit_mask:
+  .short 0, 0x1, 0x5, 0x15, 0x55, 0x57, 0x5f, 0x7f, 0xff
+
+ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts)
+
+#define raxd eax
+#define rbxd ebx
+#define rcxd ecx
+#define rdxd edx
+#define rsid esi
+#define rdid edi
+#define rbpd ebp
+#define rspd esp
+#define __DWORD(X) X##d
+#define DWORD(R) __DWORD(R)
+
+#define arg1    rdi
+#define arg2    rsi
+#define arg3    rdx
+#define arg4    rcx
+
+#define job     arg1
+#define gp1     rsi
+#define gp2     rcx
+
+/* ;; don't use rdx and rax - they are needed for multiply operation */
+#define gp3     rbp
+#define gp4     r8
+#define gp5     r9
+#define gp6     r10
+#define gp7     r11
+#define gp8     r12
+#define gp9     r13
+#define gp10    r14
+#define gp11    r15
+
+#define len     gp11
+#define msg     gp10
+
+#define POLY1305_BLOCK_SIZE 16
+
+#define STACK_r_save         0
+#define STACK_r_save_size    (6 * 64)
+#define STACK_gpr_save       (STACK_r_save + STACK_r_save_size)
+#define STACK_gpr_save_size  (8 * 8)
+#define STACK_rsp_save       (STACK_gpr_save + STACK_gpr_save_size)
+#define STACK_rsp_save_size  (1 * 8)
+#define STACK_SIZE           (STACK_rsp_save + STACK_rsp_save_size)
+
+#define A2_ZERO(...) /**/
+#define A2_ZERO_INVERT(...) __VA_ARGS__
+#define A2_NOT_ZERO(...) __VA_ARGS__
+#define A2_NOT_ZERO_INVERT(...) /**/
+
+#define clear_zmm(vec) vpxord vec, vec, vec
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for message length being multiple of block size
+;; =============================================================================
+;; Combining 64-bit x 64-bit multiplication with reduction steps
+;;
+;; NOTES:
+;;   1) A2 here is only two bits so anything above is subject of reduction.
+;;      Constant C1 = R1 + (R1 >> 2) simplifies multiply with less operations
+;;   2) Magic 5x comes from mod 2^130-5 property and incorporating
+;;      reduction into multiply phase.
+;;      See "Cheating at modular arithmetic" and "Poly1305's prime: 2^130 - 5"
+;;      paragraphs at https://loup-vaillant.fr/tutorials/poly1305-design for more details.
+;;
+;; Flow of the code below is as follows:
+;;
+;;          A2        A1        A0
+;;        x           R1        R0
+;;   -----------------------------
+;;       A2×R0     A1×R0     A0×R0
+;;   +             A0×R1
+;;   +           5xA2xR1   5xA1xR1
+;;   -----------------------------
+;;     [0|L2L] [L1H|L1L] [L0H|L0L]
+;;
+;;   Registers:  T3:T2     T1:A0
+;;
+;; Completing the multiply and adding (with carry) 3x128-bit limbs into
+;; 192-bits again (3x64-bits):
+;; A0 = L0L
+;; A1 = L0H + L1L
+;; T3 = L1H + L2L
+; A0     [in/out] GPR with accumulator bits 63:0
+; A1     [in/out] GPR with accumulator bits 127:64
+; A2     [in/out] GPR with accumulator bits 195:128
+; R0     [in] GPR with R constant bits 63:0
+; R1     [in] GPR with R constant bits 127:64
+; C1     [in] C1 = R1 + (R1 >> 2)
+; T1     [clobbered] GPR register
+; T2     [clobbered] GPR register
+; T3     [clobbered] GPR register
+; GP_RAX [clobbered] RAX register
+; GP_RDX [clobbered] RDX register
+; IF_A2  [in] Used if input A2 is not 0
+*/
+#define POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, C1, T1, T2, T3, GP_RAX, GP_RDX, IF_A2) \
+       /* T3:T2 = (A0 * R1) */ \
+       mov     GP_RAX, R1; \
+       mul     A0; \
+       mov     T2, GP_RAX; \
+       mov     GP_RAX, R0; \
+       mov     T3, GP_RDX; \
+       \
+       /* T1:A0 = (A0 * R0) */ \
+       mul     A0; \
+       mov     A0, GP_RAX; /* A0 not used in other operations */ \
+       mov     GP_RAX, R0; \
+       mov     T1, GP_RDX; \
+       \
+       /* T3:T2 += (A1 * R0) */ \
+       mul     A1; \
+       add     T2, GP_RAX; \
+       mov     GP_RAX, C1; \
+       adc     T3, GP_RDX; \
+       \
+       /* T1:A0 += (A1 * R1x5) */ \
+       mul     A1; \
+       IF_A2(mov A1, A2); /* use A1 for A2 */ \
+       add     A0, GP_RAX; \
+       adc     T1, GP_RDX; \
+       \
+       /* NOTE: A2 is clamped to 2-bits, */ \
+       /*       R1/R0 is clamped to 60-bits, */ \
+       /*       their product is less than 2^64. */ \
+       \
+       IF_A2(/* T3:T2 += (A2 * R1x5) */); \
+       IF_A2(imul    A1, C1); \
+       IF_A2(add     T2, A1); \
+       IF_A2(mov     A1, T1); /* T1:A0 => A1:A0 */ \
+       IF_A2(adc     T3, 0); \
+       \
+       IF_A2(/* T3:A1 += (A2 * R0) */); \
+       IF_A2(imul    A2, R0); \
+       IF_A2(add     A1, T2); \
+       IF_A2(adc     T3, A2); \
+       \
+       IF_A2##_INVERT(/* If A2 == 0, just move and add T1-T2 to A1 */); \
+       IF_A2##_INVERT(mov     A1, T1); \
+       IF_A2##_INVERT(add     A1, T2); \
+       IF_A2##_INVERT(adc     T3, 0); \
+       \
+       /* At this point, 3 64-bit limbs are in T3:A1:A0 */ \
+       /* T3 can span over more than 2 bits so final partial reduction step is needed. */ \
+       \
+       /* Partial reduction (just to fit into 130 bits) */ \
+       /*    A2 = T3 & 3 */ \
+       /*    k = (T3 & ~3) + (T3 >> 2) */ \
+       /*         Y    x4  +  Y    x1 */ \
+       /*    A2:A1:A0 += k */ \
+       \
+       /* Result will be in A2:A1:A0 */ \
+       mov     T1, T3; \
+       mov     DWORD(A2), DWORD(T3); \
+       and     T1, ~3; \
+       shr     T3, 2; \
+       and     DWORD(A2), 3; \
+       add     T1, T3; \
+       \
+       /* A2:A1:A0 += k (kept in T1) */ \
+       add     A0, T1; \
+       adc     A1, 0; \
+       adc     DWORD(A2), 0
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 8 16-byte message blocks,
+;; and adds new message blocks to accumulator.
+;;
+;; It first multiplies all 8 blocks with powers of R:
+;;
+;;      a2      a1      a0
+;; ×    b2      b1      b0
+;; ---------------------------------------
+;;     a2×b0   a1×b0   a0×b0
+;; +   a1×b1   a0×b1 5×a2×b1
+;; +   a0×b2 5×a2×b2 5×a1×b2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2.
+;;
+;A0    [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;A1    [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;A2    [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;R0    [in] ZMM register (R0) to include the 1st limb of R
+;R1    [in] ZMM register (R1) to include the 2nd limb of R
+;R2    [in] ZMM register (R2) to include the 3rd limb of R
+;R1P   [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5)
+;R2P   [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5)
+;P0_L  [clobbered] ZMM register to contain p[0] of the 8 blocks
+;P0_H  [clobbered] ZMM register to contain p[0] of the 8 blocks
+;P1_L  [clobbered] ZMM register to contain p[1] of the 8 blocks
+;P1_H  [clobbered] ZMM register to contain p[1] of the 8 blocks
+;P2_L  [clobbered] ZMM register to contain p[2] of the 8 blocks
+;P2_H  [clobbered] ZMM register to contain p[2] of the 8 blocks
+;ZTMP1 [clobbered] Temporary ZMM register
+*/
+#define POLY1305_MUL_REDUCE_VEC(A0, A1, A2, R0, R1, R2, R1P, R2P, P0_L, P0_H, \
+                               P1_L, P1_H, P2_L, P2_H, ZTMP1) \
+       /* ;; Reset accumulator */ \
+       vpxorq  P0_L, P0_L, P0_L; \
+       vpxorq  P0_H, P0_H, P0_H; \
+       vpxorq  P1_L, P1_L, P1_L; \
+       vpxorq  P1_H, P1_H, P1_H; \
+       vpxorq  P2_L, P2_L, P2_L; \
+       vpxorq  P2_H, P2_H, P2_H; \
+       \
+       /* ; Reset accumulator and calculate products */ \
+       vpmadd52luq P0_L, A2, R1P; \
+       vpmadd52huq P0_H, A2, R1P; \
+       vpmadd52luq P1_L, A2, R2P; \
+       vpmadd52huq P1_H, A2, R2P; \
+       vpmadd52luq P2_L, A2, R0; \
+       vpmadd52huq P2_H, A2, R0; \
+       \
+       vpmadd52luq P1_L, A0, R1; \
+       vpmadd52huq P1_H, A0, R1; \
+       vpmadd52luq P2_L, A0, R2; \
+       vpmadd52huq P2_H, A0, R2; \
+       vpmadd52luq P0_L, A0, R0; \
+       vpmadd52huq P0_H, A0, R0; \
+       \
+       vpmadd52luq P0_L, A1, R2P; \
+       vpmadd52huq P0_H, A1, R2P; \
+       vpmadd52luq P1_L, A1, R0; \
+       vpmadd52huq P1_H, A1, R0; \
+       vpmadd52luq P2_L, A1, R1; \
+       vpmadd52huq P2_H, A1, R1; \
+       \
+       /* ; Carry propagation (first pass) */ \
+       vpsrlq  ZTMP1, P0_L, 44; \
+       vpandq  A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpsllq  P0_H, P0_H, 8; \
+       vpaddq  P0_H, P0_H, ZTMP1; \
+       vpaddq  P1_L, P1_L, P0_H; \
+       vpandq  A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpsrlq  ZTMP1, P1_L, 44; \
+       vpsllq  P1_H, P1_H, 8; \
+       vpaddq  P1_H, P1_H, ZTMP1; \
+       vpaddq  P2_L, P2_L, P1_H; \
+       vpandq  A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+       vpsrlq  ZTMP1, P2_L, 42; \
+       vpsllq  P2_H, P2_H, 10; \
+       vpaddq  P2_H, P2_H, ZTMP1; \
+       \
+       /* ; Carry propagation (second pass) */ \
+       \
+       /* ; Multiply by 5 the highest bits (above 130 bits) */ \
+       vpaddq  A0, A0, P2_H; \
+       vpsllq  P2_H, P2_H, 2; \
+       vpaddq  A0, A0, P2_H; \
+       vpsrlq  ZTMP1, A0, 44; \
+       vpandq  A0, A0, [.Lmask_44 ADD_RIP]; \
+       vpaddq  A1, A1, ZTMP1;
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 16 16-byte message blocks,
+;; and adds new message blocks to accumulator,
+;; interleaving this computation with the loading and splatting
+;; of new data.
+;;
+;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2
+;; and 8 blocks from B0-B2, multiplied by R0-R2)
+;;
+;;      a2      a1      a0
+;; ×    b2      b1      b0
+;; ---------------------------------------
+;;     a2×b0   a1×b0   a0×b0
+;; +   a1×b1   a0×b1 5×a2×b1
+;; +   a0×b2 5×a2×b2 5×a1×b2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43)
+;; from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2, and adds
+;; the results to A0-A2 and B0-B2.
+;;
+;; =============================================================================
+;A0    [in/out] ZMM register containing 1st 44-bit limb of blocks 1-8
+;A1    [in/out] ZMM register containing 2nd 44-bit limb of blocks 1-8
+;A2    [in/out] ZMM register containing 3rd 44-bit limb of blocks 1-8
+;B0    [in/out] ZMM register containing 1st 44-bit limb of blocks 9-16
+;B1    [in/out] ZMM register containing 2nd 44-bit limb of blocks 9-16
+;B2    [in/out] ZMM register containing 3rd 44-bit limb of blocks 9-16
+;R0    [in] ZMM register (R0) to include the 1st limb of R
+;R1    [in] ZMM register (R1) to include the 2nd limb of R
+;R2    [in] ZMM register (R2) to include the 3rd limb of R
+;R1P   [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 5)
+;R2P   [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 5)
+;P0_L  [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8
+;P0_H  [clobbered] ZMM register to contain p[0] of the 8 blocks 1-8
+;P1_L  [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8
+;P1_H  [clobbered] ZMM register to contain p[1] of the 8 blocks 1-8
+;P2_L  [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8
+;P2_H  [clobbered] ZMM register to contain p[2] of the 8 blocks 1-8
+;Q0_L  [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16
+;Q0_H  [clobbered] ZMM register to contain p[0] of the 8 blocks 9-16
+;Q1_L  [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16
+;Q1_H  [clobbered] ZMM register to contain p[1] of the 8 blocks 9-16
+;Q2_L  [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16
+;Q2_H  [clobbered] ZMM register to contain p[2] of the 8 blocks 9-16
+;ZTMP1 [clobbered] Temporary ZMM register
+;ZTMP2 [clobbered] Temporary ZMM register
+;ZTMP3 [clobbered] Temporary ZMM register
+;ZTMP4 [clobbered] Temporary ZMM register
+;ZTMP5 [clobbered] Temporary ZMM register
+;ZTMP6 [clobbered] Temporary ZMM register
+;ZTMP7 [clobbered] Temporary ZMM register
+;ZTMP8 [clobbered] Temporary ZMM register
+;ZTMP9 [clobbered] Temporary ZMM register
+;MSG   [in/out] Pointer to message
+;LEN   [in/out] Length left of message
+*/
+#define POLY1305_MSG_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, \
+                                     R2P, P0_L, P0_H, P1_L, P1_H, P2_L, P2_H, \
+                                     Q0_L, Q0_H, Q1_L, Q1_H, Q2_L, Q2_H, \
+                                     ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, \
+                                     ZTMP6, ZTMP7, ZTMP8, ZTMP9, MSG, LEN) \
+       /* ;; Reset accumulator */ \
+       vpxorq  P0_L, P0_L, P0_L; \
+       vpxorq  P0_H, P0_H, P0_H; \
+       vpxorq  P1_L, P1_L, P1_L; \
+       vpxorq  P1_H, P1_H, P1_H; \
+       vpxorq  P2_L, P2_L, P2_L; \
+       vpxorq  P2_H, P2_H, P2_H; \
+       vpxorq  Q0_L, Q0_L, Q0_L; \
+       vpxorq  Q0_H, Q0_H, Q0_H; \
+       vpxorq  Q1_L, Q1_L, Q1_L; \
+       vpxorq  Q1_H, Q1_H, Q1_H; \
+       vpxorq  Q2_L, Q2_L, Q2_L; \
+       vpxorq  Q2_H, Q2_H, Q2_H; \
+       \
+       /* ;; This code interleaves hash computation with input loading/splatting */ \
+       \
+               /* ; Calculate products */ \
+               vpmadd52luq P0_L, A2, R1P; \
+               vpmadd52huq P0_H, A2, R1P; \
+       /* ;; input loading of new blocks */ \
+       add     MSG, POLY1305_BLOCK_SIZE*16; \
+       sub     LEN, POLY1305_BLOCK_SIZE*16; \
+       \
+               vpmadd52luq Q0_L, B2, R1P; \
+               vpmadd52huq Q0_H, B2, R1P; \
+               \
+               vpmadd52luq P1_L, A2, R2P; \
+               vpmadd52huq P1_H, A2, R2P; \
+       /* ; Load next block of data (128 bytes) */ \
+       vmovdqu64 ZTMP5, [MSG]; \
+       vmovdqu64 ZTMP2, [MSG + 64]; \
+       \
+               vpmadd52luq Q1_L, B2, R2P; \
+               vpmadd52huq Q1_H, B2, R2P; \
+       \
+       /* ; Interleave new blocks of data */ \
+       vpunpckhqdq ZTMP3, ZTMP5, ZTMP2; \
+       vpunpcklqdq ZTMP5, ZTMP5, ZTMP2; \
+       \
+               vpmadd52luq P0_L, A0, R0; \
+               vpmadd52huq P0_H, A0, R0; \
+       /* ; Highest 42-bit limbs of new blocks */ \
+       vpsrlq  ZTMP6, ZTMP3, 24; \
+       vporq   ZTMP6, ZTMP6, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \
+       \
+               vpmadd52luq Q0_L, B0, R0; \
+               vpmadd52huq Q0_H, B0, R0; \
+               \
+       /* ; Middle 44-bit limbs of new blocks */ \
+       vpsrlq  ZTMP2, ZTMP5, 44; \
+       vpsllq  ZTMP4, ZTMP3, 20; \
+       \
+               vpmadd52luq P2_L, A2, R0; \
+               vpmadd52huq P2_H, A2, R0; \
+       vpternlogq ZTMP2, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       /* ; Lowest 44-bit limbs of new blocks */ \
+       vpandq  ZTMP5, ZTMP5, [.Lmask_44 ADD_RIP]; \
+       \
+               vpmadd52luq Q2_L, B2, R0; \
+               vpmadd52huq Q2_H, B2, R0; \
+               \
+       /* ; Load next block of data (128 bytes) */ \
+       vmovdqu64 ZTMP8, [MSG + 64*2]; \
+       vmovdqu64 ZTMP9, [MSG + 64*3]; \
+       \
+               vpmadd52luq P1_L, A0, R1; \
+               vpmadd52huq P1_H, A0, R1; \
+       /* ; Interleave new blocks of data */ \
+       vpunpckhqdq ZTMP3, ZTMP8, ZTMP9; \
+       vpunpcklqdq ZTMP8, ZTMP8, ZTMP9; \
+       \
+               vpmadd52luq Q1_L, B0, R1; \
+               vpmadd52huq Q1_H, B0, R1; \
+       \
+       /* ; Highest 42-bit limbs of new blocks */ \
+       vpsrlq  ZTMP7, ZTMP3, 24; \
+       vporq   ZTMP7, ZTMP7, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \
+       \
+               vpmadd52luq P0_L, A1, R2P; \
+               vpmadd52huq P0_H, A1, R2P; \
+               \
+       /* ; Middle 44-bit limbs of new blocks */ \
+       vpsrlq  ZTMP9, ZTMP8, 44; \
+       vpsllq  ZTMP4, ZTMP3, 20; \
+       \
+               vpmadd52luq Q0_L, B1, R2P; \
+               vpmadd52huq Q0_H, B1, R2P; \
+               \
+       vpternlogq ZTMP9, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       /* ; Lowest 44-bit limbs of new blocks */ \
+       vpandq  ZTMP8, ZTMP8, [.Lmask_44 ADD_RIP]; \
+       \
+               vpmadd52luq P2_L, A0, R2; \
+               vpmadd52huq P2_H, A0, R2; \
+       /* ; Carry propagation (first pass) */ \
+       vpsrlq  ZTMP1, P0_L, 44; \
+       vpsllq  P0_H, P0_H, 8; \
+               vpmadd52luq Q2_L, B0, R2; \
+               vpmadd52huq Q2_H, B0, R2; \
+               \
+       vpsrlq  ZTMP3, Q0_L, 44; \
+       vpsllq  Q0_H, Q0_H, 8; \
+       \
+               vpmadd52luq P1_L, A1, R0; \
+               vpmadd52huq P1_H, A1, R0; \
+       /* ; Carry propagation (first pass) - continue */ \
+       vpandq  A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  P0_H, P0_H, ZTMP1; \
+               vpmadd52luq Q1_L, B1, R0; \
+               vpmadd52huq Q1_H, B1, R0; \
+       \
+       vpandq  B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  Q0_H, Q0_H, ZTMP3; \
+       \
+               vpmadd52luq P2_L, A1, R1; \
+               vpmadd52huq P2_H, A1, R1; \
+       /* ; Carry propagation (first pass) - continue */ \
+       vpaddq  P1_L, P1_L, P0_H; \
+       vpsllq  P1_H, P1_H, 8; \
+       vpsrlq  ZTMP1, P1_L, 44; \
+               vpmadd52luq Q2_L, B1, R1; \
+               vpmadd52huq Q2_H, B1, R1; \
+       \
+       vpandq  A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  Q1_L, Q1_L, Q0_H; \
+       vpsllq  Q1_H, Q1_H, 8; \
+       vpsrlq  ZTMP3, Q1_L, 44; \
+       vpandq  B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       \
+       vpaddq  P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \
+       vpaddq  P2_L, P2_L, ZTMP1; \
+       vpandq  A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+       vpaddq  A2, A2, ZTMP6; /* ; Add highest bits from new blocks to accumulator */ \
+       vpsrlq  ZTMP1, P2_L, 42; \
+       vpsllq  P2_H, P2_H, 10; \
+       vpaddq  P2_H, P2_H, ZTMP1; \
+       \
+       vpaddq  Q2_L, Q2_L, Q1_H; /* ; Q2_L += P1_H + P1_L[63:44] */ \
+       vpaddq  Q2_L, Q2_L, ZTMP3; \
+       vpandq  B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+       vpaddq  B2, B2, ZTMP7; /* ; Add highest bits from new blocks to accumulator */ \
+       vpsrlq  ZTMP3, Q2_L, 42; \
+       vpsllq  Q2_H, Q2_H, 10; \
+       vpaddq  Q2_H, Q2_H, ZTMP3; \
+       \
+       /* ; Carry propagation (second pass) */ \
+       /* ; Multiply by 5 the highest bits (above 130 bits) */ \
+       vpaddq  A0, A0, P2_H; \
+       vpsllq  P2_H, P2_H, 2; \
+       vpaddq  A0, A0, P2_H; \
+       vpaddq  B0, B0, Q2_H; \
+       vpsllq  Q2_H, Q2_H, 2; \
+       vpaddq  B0, B0, Q2_H; \
+       \
+       vpsrlq  ZTMP1, A0, 44; \
+       vpandq  A0, A0, [.Lmask_44 ADD_RIP]; \
+       vpaddq  A0, A0, ZTMP5; /* ; Add low 42-bit bits from new blocks to accumulator */ \
+       vpaddq  A1, A1, ZTMP2; /* ; Add medium 42-bit bits from new blocks to accumulator */ \
+       vpaddq  A1, A1, ZTMP1; \
+       vpsrlq  ZTMP3, B0, 44; \
+       vpandq  B0, B0, [.Lmask_44 ADD_RIP]; \
+       vpaddq  B0, B0, ZTMP8; /* ; Add low 42-bit bits from new blocks to accumulator */ \
+       vpaddq  B1, B1, ZTMP9; /* ; Add medium 42-bit bits from new blocks to accumulator */ \
+       vpaddq  B1, B1, ZTMP3
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 16 16-byte message blocks.
+;;
+;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2
+;; and 8 blocks from B0-B2, multiplied by R0-R2 and S0-S2)
+;;
+;;
+;;      a2      a1      a0
+;; ×    b2      b1      b0
+;; ---------------------------------------
+;;     a2×b0   a1×b0   a0×b0
+;; +   a1×b1   a0×b1 5×a2×b1
+;; +   a0×b2 5×a2×b2 5×a1×b2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2.
+;;
+;; =============================================================================
+;A0    [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;A1    [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;A2    [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;B0    [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;B1    [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;B2    [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;R0    [in] ZMM register (R0) to include the 1st limb in IDX
+;R1    [in] ZMM register (R1) to include the 2nd limb in IDX
+;R2    [in] ZMM register (R2) to include the 3rd limb in IDX
+;R1P   [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX
+;R2P   [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX
+;S0    [in] ZMM register (R0) to include the 1st limb in IDX
+;S1    [in] ZMM register (R1) to include the 2nd limb in IDX
+;S2    [in] ZMM register (R2) to include the 3rd limb in IDX
+;S1P   [in] ZMM register (R1') to include the 2nd limb (multiplied by 5) in IDX
+;S2P   [in] ZMM register (R2') to include the 3rd limb (multiplied by 5) in IDX
+;P0_L  [clobbered] ZMM register to contain p[0] of the 8 blocks
+;P0_H  [clobbered] ZMM register to contain p[0] of the 8 blocks
+;P1_L  [clobbered] ZMM register to contain p[1] of the 8 blocks
+;P1_H  [clobbered] ZMM register to contain p[1] of the 8 blocks
+;P2_L  [clobbered] ZMM register to contain p[2] of the 8 blocks
+;P2_H  [clobbered] ZMM register to contain p[2] of the 8 blocks
+;Q0_L  [clobbered] ZMM register to contain p[0] of the 8 blocks
+;Q0_H  [clobbered] ZMM register to contain p[0] of the 8 blocks
+;Q1_L  [clobbered] ZMM register to contain p[1] of the 8 blocks
+;Q1_H  [clobbered] ZMM register to contain p[1] of the 8 blocks
+;Q2_L  [clobbered] ZMM register to contain p[2] of the 8 blocks
+;Q2_H  [clobbered] ZMM register to contain p[2] of the 8 blocks
+;ZTMP1 [clobbered] Temporary ZMM register
+;ZTMP2 [clobbered] Temporary ZMM register
+*/
+#define POLY1305_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, R2P,\
+                                 S0, S1, S2, S1P, S2P, P0_L, P0_H, P1_L, P1_H,\
+                                 P2_L, P2_H, Q0_L, Q0_H, Q1_L, Q1_H, Q2_L,\
+                                 Q2_H, ZTMP1, ZTMP2) \
+       /* ;; Reset accumulator */ \
+       vpxorq  P0_L, P0_L, P0_L; \
+       vpxorq  P0_H, P0_H, P0_H; \
+       vpxorq  P1_L, P1_L, P1_L; \
+       vpxorq  P1_H, P1_H, P1_H; \
+       vpxorq  P2_L, P2_L, P2_L; \
+       vpxorq  P2_H, P2_H, P2_H; \
+       vpxorq  Q0_L, Q0_L, Q0_L; \
+       vpxorq  Q0_H, Q0_H, Q0_H; \
+       vpxorq  Q1_L, Q1_L, Q1_L; \
+       vpxorq  Q1_H, Q1_H, Q1_H; \
+       vpxorq  Q2_L, Q2_L, Q2_L; \
+       vpxorq  Q2_H, Q2_H, Q2_H; \
+       \
+       /* ;; This code interleaves hash computation with input loading/splatting */ \
+       \
+       /* ; Calculate products */ \
+       vpmadd52luq P0_L, A2, R1P; \
+       vpmadd52huq P0_H, A2, R1P; \
+       \
+       vpmadd52luq Q0_L, B2, S1P; \
+       vpmadd52huq Q0_H, B2, S1P; \
+       \
+       vpmadd52luq P1_L, A2, R2P; \
+       vpmadd52huq P1_H, A2, R2P; \
+       \
+       vpmadd52luq Q1_L, B2, S2P; \
+       vpmadd52huq Q1_H, B2, S2P; \
+       \
+       vpmadd52luq P0_L, A0, R0; \
+       vpmadd52huq P0_H, A0, R0; \
+       \
+       vpmadd52luq Q0_L, B0, S0; \
+       vpmadd52huq Q0_H, B0, S0; \
+       \
+       vpmadd52luq P2_L, A2, R0; \
+       vpmadd52huq P2_H, A2, R0; \
+       vpmadd52luq Q2_L, B2, S0; \
+       vpmadd52huq Q2_H, B2, S0; \
+       \
+       vpmadd52luq P1_L, A0, R1; \
+       vpmadd52huq P1_H, A0, R1; \
+       vpmadd52luq Q1_L, B0, S1; \
+       vpmadd52huq Q1_H, B0, S1; \
+       \
+       vpmadd52luq P0_L, A1, R2P; \
+       vpmadd52huq P0_H, A1, R2P; \
+       \
+       vpmadd52luq Q0_L, B1, S2P; \
+       vpmadd52huq Q0_H, B1, S2P; \
+       \
+       vpmadd52luq P2_L, A0, R2; \
+       vpmadd52huq P2_H, A0, R2; \
+       \
+       vpmadd52luq Q2_L, B0, S2; \
+       vpmadd52huq Q2_H, B0, S2; \
+       \
+       /* ; Carry propagation (first pass) */ \
+       vpsrlq  ZTMP1, P0_L, 44; \
+       vpsllq  P0_H, P0_H, 8; \
+       vpsrlq  ZTMP2, Q0_L, 44; \
+       vpsllq  Q0_H, Q0_H, 8; \
+       \
+       vpmadd52luq P1_L, A1, R0; \
+       vpmadd52huq P1_H, A1, R0; \
+       vpmadd52luq Q1_L, B1, S0; \
+       vpmadd52huq Q1_H, B1, S0; \
+       \
+       /* ; Carry propagation (first pass) - continue */ \
+       vpandq  A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  P0_H, P0_H, ZTMP1; \
+       vpandq  B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  Q0_H, Q0_H, ZTMP2; \
+       \
+       vpmadd52luq P2_L, A1, R1; \
+       vpmadd52huq P2_H, A1, R1; \
+       vpmadd52luq Q2_L, B1, S1; \
+       vpmadd52huq Q2_H, B1, S1; \
+       \
+       /* ; Carry propagation (first pass) - continue */ \
+       vpaddq  P1_L, P1_L, P0_H; \
+       vpsllq  P1_H, P1_H, 8; \
+       vpsrlq  ZTMP1, P1_L, 44; \
+       vpandq  A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  Q1_L, Q1_L, Q0_H; \
+       vpsllq  Q1_H, Q1_H, 8; \
+       vpsrlq  ZTMP2, Q1_L, 44; \
+       vpandq  B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       \
+       vpaddq  P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \
+       vpaddq  P2_L, P2_L, ZTMP1; \
+       vpandq  A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+       vpsrlq  ZTMP1, P2_L, 42; \
+       vpsllq  P2_H, P2_H, 10; \
+       vpaddq  P2_H, P2_H, ZTMP1; \
+       \
+       vpaddq  Q2_L, Q2_L, Q1_H; /* ; Q2_L += P1_H + P1_L[63:44] */ \
+       vpaddq  Q2_L, Q2_L, ZTMP2; \
+       vpandq  B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+       vpsrlq  ZTMP2, Q2_L, 42; \
+       vpsllq  Q2_H, Q2_H, 10; \
+       vpaddq  Q2_H, Q2_H, ZTMP2; \
+       \
+       /* ; Carry propagation (second pass) */ \
+       /* ; Multiply by 5 the highest bits (above 130 bits) */ \
+       vpaddq  A0, A0, P2_H; \
+       vpsllq  P2_H, P2_H, 2; \
+       vpaddq  A0, A0, P2_H; \
+       vpaddq  B0, B0, Q2_H; \
+       vpsllq  Q2_H, Q2_H, 2; \
+       vpaddq  B0, B0, Q2_H; \
+       \
+       vpsrlq  ZTMP1, A0, 44; \
+       vpandq  A0, A0, [.Lmask_44 ADD_RIP]; \
+       vpaddq  A1, A1, ZTMP1; \
+       vpsrlq  ZTMP2, B0, 44; \
+       vpandq  B0, B0, [.Lmask_44 ADD_RIP]; \
+       vpaddq  B1, B1, ZTMP2;
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Shuffle data blocks, so they match the right power of R.
+;; Powers of R are in this order: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R
+;; Data blocks are coming in this order: A0 A4 A1 A5 A2 A6 A3 A7
+;; Generally the computation is: A0*R^8 + A1*R^7 + A2*R^6 + A3*R^5 +
+;;                               A4*R^4 + A5*R^3 + A6*R^2 + A7*R
+;; When there are less data blocks, less powers of R are used, so data needs to
+;; be shuffled. Example: if 4 blocks are left, only A0-A3 are available and only
+;; R-R^4 are used (A0*R^4 + A1*R^3 + A2*R^2 + A3*R), so A0-A3 need to be shifted
+;; =============================================================================
+;A_L      [in/out] 0-43 bits of input data
+;A_M      [in/out] 44-87 bits of input data
+;A_H      [in/out] 88-129 bits of input data
+;TMP      [clobbered] Temporary GP register
+;N_BLOCKS [in] Number of remaining input blocks
+*/
+/* Per-block-count shuffle controls used by SHUFFLE_DATA_BLOCKS_GENERIC:
+   SMASK_n - vshufi64x2 immediate that rotates the 128-bit lanes of the ZMM
+             register so the data blocks line up with the stored powers of R.
+   KMASK_n - k-register write mask selecting which 32-bit elements take part
+             in the intra-lane qword swap (0xffff = all lanes). */
+#define SHUFFLE_DATA_SMASK_1 0x39
+#define SHUFFLE_DATA_KMASK_1 0xffff
+#define SHUFFLE_DATA_SMASK_2 0x4E
+#define SHUFFLE_DATA_KMASK_2 0xffff
+#define SHUFFLE_DATA_SMASK_3 0x93
+#define SHUFFLE_DATA_KMASK_3 0xffff
+#define SHUFFLE_DATA_KMASK_4 0xffff
+#define SHUFFLE_DATA_SMASK_5 0x39
+#define SHUFFLE_DATA_KMASK_5 0xfff0
+#define SHUFFLE_DATA_SMASK_6 0x4E
+#define SHUFFLE_DATA_KMASK_6 0xff00
+#define SHUFFLE_DATA_SMASK_7 0x93
+#define SHUFFLE_DATA_KMASK_7 0xf000
+
+/* Rotate the three 44-bit limb vectors of the message blocks so that the
+   remaining N_BLOCKS blocks match the stored power-of-R order (see the
+   big comment above).  First the two qwords inside each k1-selected
+   128-bit lane are swapped (vpshufd 0x4E), then whole 128-bit lanes are
+   rotated with the per-count vshufi64x2 immediate. */
+#define SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, N_BLOCKS) \
+       mov     TMP, SHUFFLE_DATA_KMASK_##N_BLOCKS; \
+       kmovq   k1, TMP; \
+       vpshufd A_L{k1}, A_L, 0x4E; \
+       vpshufd A_M{k1}, A_M, 0x4E; \
+       vpshufd A_H{k1}, A_H, 0x4E; \
+       vshufi64x2 A_L, A_L, A_L, SHUFFLE_DATA_SMASK_##N_BLOCKS; \
+       vshufi64x2 A_M, A_M, A_M, SHUFFLE_DATA_SMASK_##N_BLOCKS; \
+       vshufi64x2 A_H, A_H, A_H, SHUFFLE_DATA_SMASK_##N_BLOCKS
+
+#define SHUFFLE_DATA_BLOCKS_1(A_L, A_M, A_H, TMP) \
+       SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 1)
+
+#define SHUFFLE_DATA_BLOCKS_2(A_L, A_M, A_H, TMP) \
+       SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 2)
+
+#define SHUFFLE_DATA_BLOCKS_3(A_L, A_M, A_H, TMP) \
+       SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 3)
+
+/* 4-block case: the 128-bit lanes are already in the right order for
+   R^4..R, so only the intra-lane qword swap is needed (KMASK_4 selects
+   all lanes, and there is no SMASK_4 / vshufi64x2 step). */
+#define SHUFFLE_DATA_BLOCKS_4(A_L, A_M, A_H, TMP) \
+       mov     TMP, SHUFFLE_DATA_KMASK_4; \
+       kmovq   k1, TMP; \
+       vpshufd A_L{k1}, A_L, 0x4E; \
+       vpshufd A_M{k1}, A_M, 0x4E; \
+       vpshufd A_H{k1}, A_H, 0x4E;
+
+#define SHUFFLE_DATA_BLOCKS_5(A_L, A_M, A_H, TMP) \
+       SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 5)
+
+#define SHUFFLE_DATA_BLOCKS_6(A_L, A_M, A_H, TMP) \
+       SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 6)
+
+#define SHUFFLE_DATA_BLOCKS_7(A_L, A_M, A_H, TMP) \
+       SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 7)
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for message length being multiple of block size
+;; =============================================================================
+;MSG    [in/out] GPR pointer to input message (updated)
+;LEN    [in/out] GPR in: length in bytes / out: length mod 16
+;A0     [in/out] accumulator bits 63..0
+;A1     [in/out] accumulator bits 127..64
+;A2     [in/out] accumulator bits 195..128
+;R0     [in] R constant bits 63..0
+;R1     [in] R constant bits 127..64
+;T0     [clobbered] GPR register
+;T1     [clobbered] GPR register
+;T2     [clobbered] GPR register
+;T3     [clobbered] GPR register
+;GP_RAX [clobbered] RAX register
+;GP_RDX [clobbered] RDX register
+*/
+#define POLY1305_BLOCKS(MSG, LEN, A0, A1, A2, R0, R1, T0, T1, T2, T3, \
+                       GP_RAX, GP_RDX) \
+       /* ; Minimum of 256 bytes to run vectorized code */ \
+       cmp     LEN, POLY1305_BLOCK_SIZE*16; \
+       jb      .L_final_loop; \
+       \
+       /* ; Spread accumulator into 44-bit limbs in quadwords */ \
+       mov     T0, A0; \
+       and     T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (A[43:0]) */ \
+       vmovq   xmm5, T0; \
+       \
+       mov     T0, A1; \
+       shrd    A0, T0, 44; \
+       and     A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (A[87:44]) */ \
+       vmovq   xmm6, A0; \
+       \
+       shrd    A1, A2, 24; \
+       and     A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (A[129:88]) */ \
+       vmovq   xmm7, A1; \
+       \
+       /* ; Load first block of data (128 bytes) */ \
+       vmovdqu64 zmm0, [MSG]; \
+       vmovdqu64 zmm1, [MSG + 64]; \
+       \
+       /* ; Interleave the data to form 44-bit limbs */ \
+       /* ; */ \
+       /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+       /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+       /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+       vpunpckhqdq zmm15, zmm0, zmm1; \
+       vpunpcklqdq zmm13, zmm0, zmm1; \
+       \
+       vpsrlq  zmm14, zmm13, 44; \
+       vpsllq  zmm18, zmm15, 20; \
+       vpternlogq zmm14, zmm18, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       vpandq  zmm13, zmm13, [.Lmask_44 ADD_RIP]; \
+       vpsrlq  zmm15, zmm15, 24; \
+       \
+       /* ; Add 2^128 to all 8 final qwords of the message */ \
+       vporq   zmm15, zmm15, [.Lhigh_bit ADD_RIP]; \
+       \
+       vpaddq  zmm13, zmm13, zmm5; \
+       vpaddq  zmm14, zmm14, zmm6; \
+       vpaddq  zmm15, zmm15, zmm7; \
+       \
+       /* ; Load next blocks of data (128 bytes) */ \
+       vmovdqu64 zmm0, [MSG + 64*2]; \
+       vmovdqu64 zmm1, [MSG + 64*3]; \
+       \
+       /* ; Interleave the data to form 44-bit limbs */ \
+       /* ; */ \
+       /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+       /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+       /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+       vpunpckhqdq zmm18, zmm0, zmm1; \
+       vpunpcklqdq zmm16, zmm0, zmm1; \
+       \
+       vpsrlq  zmm17, zmm16, 44; \
+       vpsllq  zmm19, zmm18, 20; \
+       vpternlogq zmm17, zmm19, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       vpandq  zmm16, zmm16, [.Lmask_44 ADD_RIP]; \
+       vpsrlq  zmm18, zmm18, 24; \
+       \
+       /* ; Add 2^128 to all 8 final qwords of the message */ \
+       vporq   zmm18, zmm18, [.Lhigh_bit ADD_RIP]; \
+       \
+       /* ; Use memory in stack to save powers of R, before loading them into ZMM registers */ \
+       /* ; The first 16*8 bytes will contain the 16 bytes of the 8 powers of R */ \
+       /* ; The last 64 bytes will contain the last 2 bits of powers of R, spread in 8 qwords, */ \
+       /* ; to be OR'd with the highest qwords (in zmm26) */ \
+       vmovq   xmm3, R0; \
+       vpinsrq xmm3, xmm3, R1, 1; \
+       vinserti32x4 zmm1, zmm1, xmm3, 3; \
+       \
+       vpxorq  zmm0, zmm0, zmm0; \
+       vpxorq  zmm2, zmm2, zmm2; \
+       \
+       /* ; Calculate R^2 */ \
+       mov     T0, R1; \
+       shr     T0, 2; \
+       add     T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \
+       \
+       mov     A0, R0; \
+       mov     A1, R1; \
+       \
+       POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_ZERO); \
+       \
+       vmovq   xmm3, A0; \
+       vpinsrq xmm3, xmm3, A1, 1; \
+       vinserti32x4 zmm1, zmm1, xmm3, 2; \
+       \
+       vmovq   xmm4, A2; \
+       vinserti32x4 zmm2, zmm2, xmm4, 2; \
+       \
+       /* ; Calculate R^3 */ \
+       POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \
+       \
+       vmovq   xmm3, A0; \
+       vpinsrq xmm3, xmm3, A1, 1; \
+       vinserti32x4 zmm1, zmm1, xmm3, 1; \
+       \
+       vmovq   xmm4, A2; \
+       vinserti32x4 zmm2, zmm2, xmm4, 1; \
+       \
+       /* ; Calculate R^4 */ \
+       POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \
+       \
+       vmovq   xmm3, A0; \
+       vpinsrq xmm3, xmm3, A1, 1; \
+       vinserti32x4 zmm1, zmm1, xmm3, 0; \
+       \
+       vmovq   xmm4, A2; \
+       vinserti32x4 zmm2, zmm2, xmm4, 0; \
+       \
+       /* ; Move 2 MSbits to top 24 bits, to be OR'ed later */ \
+       vpsllq  zmm2, zmm2, 40; \
+       \
+       vpunpckhqdq zmm21, zmm1, zmm0; \
+       vpunpcklqdq zmm19, zmm1, zmm0; \
+       \
+       vpsrlq  zmm20, zmm19, 44; \
+       vpsllq  zmm4, zmm21, 20; \
+       vpternlogq zmm20, zmm4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       vpandq  zmm19, zmm19, [.Lmask_44 ADD_RIP]; \
+       vpsrlq  zmm21, zmm21, 24; \
+       \
+       /* ; zmm2 contains the 2 highest bits of the powers of R */ \
+       vporq   zmm21, zmm21, zmm2; \
+       \
+       /* ; Broadcast 44-bit limbs of R^4 */ \
+       mov     T0, A0; \
+       and     T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (R^4[43:0]) */ \
+       vpbroadcastq zmm22, T0; \
+       \
+       mov     T0, A1; \
+       shrd    A0, T0, 44; \
+       and     A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (R^4[87:44]) */ \
+       vpbroadcastq zmm23, A0; \
+       \
+       shrd    A1, A2, 24; \
+       and     A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (R^4[129:88]) */ \
+       vpbroadcastq zmm24, A1; \
+       \
+       /* ; Generate 4*5*R^4 */ \
+       vpsllq  zmm25, zmm23, 2; \
+       vpsllq  zmm26, zmm24, 2; \
+       \
+       /* ; 5*R^4 */ \
+       vpaddq  zmm25, zmm25, zmm23; \
+       vpaddq  zmm26, zmm26, zmm24; \
+       \
+       /* ; 4*5*R^4 */ \
+       vpsllq  zmm25, zmm25, 2; \
+       vpsllq  zmm26, zmm26, 2; \
+       \
+       vpslldq zmm29, zmm19, 8; \
+       vpslldq zmm30, zmm20, 8; \
+       vpslldq zmm31, zmm21, 8; \
+       \
+       /* ; Calculate R^8-R^5 */ \
+       POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \
+                               zmm22, zmm23, zmm24, \
+                               zmm25, zmm26, \
+                               zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                               zmm11); \
+       \
+       /* ; Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R */ \
+       vporq   zmm19, zmm19, zmm29; \
+       vporq   zmm20, zmm20, zmm30; \
+       vporq   zmm21, zmm21, zmm31; \
+       \
+       /* ; Broadcast R^8 */ \
+       vpbroadcastq zmm22, xmm19; \
+       vpbroadcastq zmm23, xmm20; \
+       vpbroadcastq zmm24, xmm21; \
+       \
+       /* ; Generate 4*5*R^8 */ \
+       vpsllq  zmm25, zmm23, 2; \
+       vpsllq  zmm26, zmm24, 2; \
+       \
+       /* ; 5*R^8 */ \
+       vpaddq  zmm25, zmm25, zmm23; \
+       vpaddq  zmm26, zmm26, zmm24; \
+       \
+       /* ; 4*5*R^8 */ \
+       vpsllq  zmm25, zmm25, 2; \
+       vpsllq  zmm26, zmm26, 2; \
+       \
+       cmp     LEN, POLY1305_BLOCK_SIZE*32; \
+       jb      .L_len_256_511; \
+       \
+       /* ; Store R^8-R for later use */ \
+       vmovdqa64 [rsp + STACK_r_save], zmm19; \
+       vmovdqa64 [rsp + STACK_r_save + 64], zmm20; \
+       vmovdqa64 [rsp + STACK_r_save + 64*2], zmm21; \
+       \
+       /* ; Calculate R^16-R^9 */ \
+       POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \
+                               zmm22, zmm23, zmm24, \
+                               zmm25, zmm26, \
+                               zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                               zmm11); \
+       \
+       /* ; Store R^16-R^9 for later use */ \
+       vmovdqa64 [rsp + STACK_r_save + 64*3], zmm19; \
+       vmovdqa64 [rsp + STACK_r_save + 64*4], zmm20; \
+       vmovdqa64 [rsp + STACK_r_save + 64*5], zmm21; \
+       \
+       /* ; Broadcast R^16 */ \
+       vpbroadcastq zmm22, xmm19; \
+       vpbroadcastq zmm23, xmm20; \
+       vpbroadcastq zmm24, xmm21; \
+       \
+       /* ; Generate 4*5*R^16 */ \
+       vpsllq  zmm25, zmm23, 2; \
+       vpsllq  zmm26, zmm24, 2; \
+       \
+       /* ; 5*R^16 */ \
+       vpaddq  zmm25, zmm25, zmm23; \
+       vpaddq  zmm26, zmm26, zmm24; \
+       \
+       /* ; 4*5*R^16 */ \
+       vpsllq  zmm25, zmm25, 2; \
+       vpsllq  zmm26, zmm26, 2; \
+       \
+       mov     T0, LEN; \
+       and     T0, 0xffffffffffffff00; /* ; multiple of 256 bytes */ \
+       \
+.L_poly1305_blocks_loop: \
+       cmp     T0, POLY1305_BLOCK_SIZE*16; \
+       jbe     .L_poly1305_blocks_loop_end; \
+       \
+       /* ; zmm13-zmm18 contain the 16 blocks of message plus the previous accumulator */ \
+       /* ; zmm22-24 contain the 5x44-bit limbs of the powers of R */ \
+       /* ; zmm25-26 contain the 5x44-bit limbs of the powers of R' (5*4*R) */ \
+       POLY1305_MSG_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \
+                                     zmm22, zmm23, zmm24, zmm25, zmm26, \
+                                     zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                                     zmm19, zmm20, zmm21, zmm27, zmm28, zmm29, \
+                                     zmm30, zmm31, zmm11, zmm0, zmm1, \
+                                     zmm2, zmm3, zmm4, zmm12, MSG, T0); \
+       \
+       jmp     .L_poly1305_blocks_loop; \
+       \
+.L_poly1305_blocks_loop_end: \
+       \
+       /* ;; Need to multiply by r^16, r^15, r^14... r */ \
+       \
+       /* ; First multiply by r^16-r^9 */ \
+       \
+       /* ; Read R^16-R^9 */ \
+       vmovdqa64 zmm19, [rsp + STACK_r_save + 64*3]; \
+       vmovdqa64 zmm20, [rsp + STACK_r_save + 64*4]; \
+       vmovdqa64 zmm21, [rsp + STACK_r_save + 64*5]; \
+       /* ; Read R^8-R */ \
+       vmovdqa64 zmm22, [rsp + STACK_r_save]; \
+       vmovdqa64 zmm23, [rsp + STACK_r_save + 64]; \
+       vmovdqa64 zmm24, [rsp + STACK_r_save + 64*2]; \
+       \
+       /* ; zmm27 to have bits 87-44 of all 9-16th powers of R' in 8 qwords */ \
+       /* ; zmm28 to have bits 129-88 of all 9-16th powers of R' in 8 qwords */ \
+       vpsllq  zmm0, zmm20, 2; \
+       vpaddq  zmm27, zmm20, zmm0; /* ; R1' (R1*5) */ \
+       vpsllq  zmm1, zmm21, 2; \
+       vpaddq  zmm28, zmm21, zmm1; /* ; R2' (R2*5) */ \
+       \
+       /* ; 4*5*R */ \
+       vpsllq  zmm27, zmm27, 2; \
+       vpsllq  zmm28, zmm28, 2; \
+       \
+       /* ; Then multiply by r^8-r */ \
+       \
+       /* ; zmm25 to have bits 87-44 of all 1-8th powers of R' in 8 qwords */ \
+       /* ; zmm26 to have bits 129-88 of all 1-8th powers of R' in 8 qwords */ \
+       vpsllq  zmm2, zmm23, 2; \
+       vpaddq  zmm25, zmm23, zmm2; /* ; R1' (R1*5) */ \
+       vpsllq  zmm3, zmm24, 2; \
+       vpaddq  zmm26, zmm24, zmm3; /* ; R2' (R2*5) */ \
+       \
+       /* ; 4*5*R */ \
+       vpsllq  zmm25, zmm25, 2; \
+       vpsllq  zmm26, zmm26, 2; \
+       \
+       POLY1305_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \
+                                 zmm19, zmm20, zmm21, zmm27, zmm28, \
+                                 zmm22, zmm23, zmm24, zmm25, zmm26, \
+                                 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, \
+                                 zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm29); \
+       \
+       /* ;; Add all blocks (horizontally) */ \
+       vpaddq  zmm13, zmm13, zmm16; \
+       vpaddq  zmm14, zmm14, zmm17; \
+       vpaddq  zmm15, zmm15, zmm18; \
+       \
+       vextracti64x4   ymm0, zmm13, 1; \
+       vextracti64x4   ymm1, zmm14, 1; \
+       vextracti64x4   ymm2, zmm15, 1; \
+       \
+       vpaddq  ymm13, ymm13, ymm0; \
+       vpaddq  ymm14, ymm14, ymm1; \
+       vpaddq  ymm15, ymm15, ymm2; \
+       \
+       vextracti32x4   xmm10, ymm13, 1; \
+       vextracti32x4   xmm11, ymm14, 1; \
+       vextracti32x4   xmm12, ymm15, 1; \
+       \
+       vpaddq  xmm13, xmm13, xmm10; \
+       vpaddq  xmm14, xmm14, xmm11; \
+       vpaddq  xmm15, xmm15, xmm12; \
+       \
+       vpsrldq xmm10, xmm13, 8; \
+       vpsrldq xmm11, xmm14, 8; \
+       vpsrldq xmm12, xmm15, 8; \
+       \
+       /* ; Finish folding and clear second qword */ \
+       mov     T0, 0xfd; \
+       kmovq   k1, T0; \
+       vpaddq  xmm13{k1}{z}, xmm13, xmm10; \
+       vpaddq  xmm14{k1}{z}, xmm14, xmm11; \
+       vpaddq  xmm15{k1}{z}, xmm15, xmm12; \
+       \
+       add     MSG, POLY1305_BLOCK_SIZE*16; \
+       \
+       and     LEN, (POLY1305_BLOCK_SIZE*16 - 1); /* ; Get remaining lengths (LEN < 256 bytes) */ \
+       \
+.L_less_than_256: \
+       \
+       cmp     LEN, POLY1305_BLOCK_SIZE*8; \
+       jb      .L_less_than_128; \
+       \
+       /* ; Read next 128 bytes */ \
+       /* ; Load first block of data (128 bytes) */ \
+       vmovdqu64 zmm0, [MSG]; \
+       vmovdqu64 zmm1, [MSG + 64]; \
+       \
+       /* ; Interleave the data to form 44-bit limbs */ \
+       /* ; */ \
+       /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+       /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+       /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+       vpunpckhqdq zmm5, zmm0, zmm1; \
+       vpunpcklqdq zmm3, zmm0, zmm1; \
+       \
+       vpsrlq  zmm4, zmm3, 44; \
+       vpsllq  zmm8, zmm5, 20; \
+       vpternlogq zmm4, zmm8, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       vpandq  zmm3, zmm3, [.Lmask_44 ADD_RIP]; \
+       vpsrlq  zmm5, zmm5, 24; \
+       \
+       /* ; Add 2^128 to all 8 final qwords of the message */ \
+       vporq   zmm5, zmm5, [.Lhigh_bit ADD_RIP]; \
+       \
+       vpaddq  zmm13, zmm13, zmm3; \
+       vpaddq  zmm14, zmm14, zmm4; \
+       vpaddq  zmm15, zmm15, zmm5; \
+       \
+       add     MSG, POLY1305_BLOCK_SIZE*8; \
+       sub     LEN, POLY1305_BLOCK_SIZE*8; \
+       \
+       POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+                               zmm22, zmm23, zmm24, \
+                               zmm25, zmm26, \
+                               zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                               zmm11); \
+       \
+       /* ;; Add all blocks (horizontally) */ \
+       vextracti64x4   ymm0, zmm13, 1; \
+       vextracti64x4   ymm1, zmm14, 1; \
+       vextracti64x4   ymm2, zmm15, 1; \
+       \
+       vpaddq  ymm13, ymm13, ymm0; \
+       vpaddq  ymm14, ymm14, ymm1; \
+       vpaddq  ymm15, ymm15, ymm2; \
+       \
+       vextracti32x4   xmm10, ymm13, 1; \
+       vextracti32x4   xmm11, ymm14, 1; \
+       vextracti32x4   xmm12, ymm15, 1; \
+       \
+       vpaddq  xmm13, xmm13, xmm10; \
+       vpaddq  xmm14, xmm14, xmm11; \
+       vpaddq  xmm15, xmm15, xmm12; \
+       \
+       vpsrldq xmm10, xmm13, 8; \
+       vpsrldq xmm11, xmm14, 8; \
+       vpsrldq xmm12, xmm15, 8; \
+       \
+       /* ; Finish folding and clear second qword */ \
+       mov     T0, 0xfd; \
+       kmovq   k1, T0; \
+       vpaddq  xmm13{k1}{z}, xmm13, xmm10; \
+       vpaddq  xmm14{k1}{z}, xmm14, xmm11; \
+       vpaddq  xmm15{k1}{z}, xmm15, xmm12; \
+       \
+.L_less_than_128: \
+       cmp     LEN, 32; /* ; If remaining bytes is <= 32, perform last blocks in scalar */ \
+       jbe     .L_simd_to_gp; \
+       \
+       mov     T0, LEN; \
+       and     T0, 0x3f; \
+       lea     T1, [.Lbyte64_len_to_mask_table ADD_RIP]; \
+       mov     T1, [T1 + 8*T0]; \
+       \
+       /* ; Load default byte masks */ \
+       mov     T2, 0xffffffffffffffff; \
+       xor     T3, T3; \
+       \
+       cmp     LEN, 64; \
+       cmovb   T2, T1; /* ; Load mask for first 64 bytes */ \
+       cmovg   T3, T1; /* ; Load mask for second 64 bytes */ \
+       \
+       kmovq   k1, T2; \
+       kmovq   k2, T3; \
+       vmovdqu8 zmm0{k1}{z}, [MSG]; \
+       vmovdqu8 zmm1{k2}{z}, [MSG + 64]; \
+       \
+       /* ; Pad last block message, if partial */ \
+       mov     T0, LEN; \
+       and     T0, 0x70; /* ; Multiple of 16 bytes */ \
+       /* ; Load last block of data (up to 112 bytes) */ \
+       shr     T0, 3; /* ; Get number of full qwords */ \
+       \
+       /* ; Interleave the data to form 44-bit limbs */ \
+       /* ; */ \
+       /* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+       /* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+       /* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+       vpunpckhqdq zmm4, zmm0, zmm1; \
+       vpunpcklqdq zmm2, zmm0, zmm1; \
+       \
+       vpsrlq  zmm3, zmm2, 44; \
+       vpsllq  zmm28, zmm4, 20; \
+       vpternlogq zmm3, zmm28, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+       \
+       vpandq  zmm2, zmm2, [.Lmask_44 ADD_RIP]; \
+       vpsrlq  zmm4, zmm4, 24; \
+       \
+       lea     T1, [.Lqword_high_bit_mask ADD_RIP]; \
+       kmovb   k1, [T1 + T0]; \
+       /* ; Add 2^128 to final qwords of the message (all full blocks and partial block, */ \
+       /* ; if "pad_to_16" is selected) */ \
+       vporq   zmm4{k1}, zmm4, [.Lhigh_bit ADD_RIP]; \
+       \
+       vpaddq  zmm13, zmm13, zmm2; \
+       vpaddq  zmm14, zmm14, zmm3; \
+       vpaddq  zmm15, zmm15, zmm4; \
+       \
+       mov     T0, LEN; \
+       add     T0, 15; \
+       shr     T0, 4;      /* ; Get number of 16-byte blocks (including partial blocks) */ \
+       xor     LEN, LEN; /* ; All length will be consumed */ \
+       \
+       /* ; No need to shuffle data blocks (data is in the right order) */ \
+       cmp     T0, 8; \
+       je      .L_end_shuffle; \
+       \
+       cmp     T0, 4; \
+       je      .L_shuffle_blocks_4; \
+       jb      .L_shuffle_blocks_3; \
+       \
+       /* ; Number of 16-byte blocks > 4 */ \
+       cmp     T0, 6; \
+       je      .L_shuffle_blocks_6; \
+       ja      .L_shuffle_blocks_7; \
+       jmp     .L_shuffle_blocks_5; \
+       \
+.L_shuffle_blocks_3: \
+       SHUFFLE_DATA_BLOCKS_3(zmm13, zmm14, zmm15, T1); \
+       jmp     .L_end_shuffle; \
+.L_shuffle_blocks_4: \
+       SHUFFLE_DATA_BLOCKS_4(zmm13, zmm14, zmm15, T1); \
+       jmp     .L_end_shuffle; \
+.L_shuffle_blocks_5: \
+       SHUFFLE_DATA_BLOCKS_5(zmm13, zmm14, zmm15, T1); \
+       jmp     .L_end_shuffle; \
+.L_shuffle_blocks_6: \
+       SHUFFLE_DATA_BLOCKS_6(zmm13, zmm14, zmm15, T1); \
+       jmp     .L_end_shuffle; \
+.L_shuffle_blocks_7: \
+       SHUFFLE_DATA_BLOCKS_7(zmm13, zmm14, zmm15, T1); \
+       \
+.L_end_shuffle: \
+       \
+       /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \
+       /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \
+       /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \
+       POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+                               zmm22, zmm23, zmm24, \
+                               zmm25, zmm26, \
+                               zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                               zmm11); \
+       \
+       /* ;; Add all blocks (horizontally) */ \
+       vextracti64x4   ymm0, zmm13, 1; \
+       vextracti64x4   ymm1, zmm14, 1; \
+       vextracti64x4   ymm2, zmm15, 1; \
+       \
+       vpaddq  ymm13, ymm13, ymm0; \
+       vpaddq  ymm14, ymm14, ymm1; \
+       vpaddq  ymm15, ymm15, ymm2; \
+       \
+       vextracti32x4   xmm10, ymm13, 1; \
+       vextracti32x4   xmm11, ymm14, 1; \
+       vextracti32x4   xmm12, ymm15, 1; \
+       \
+       vpaddq  xmm13, xmm13, xmm10; \
+       vpaddq  xmm14, xmm14, xmm11; \
+       vpaddq  xmm15, xmm15, xmm12; \
+       \
+       vpsrldq xmm10, xmm13, 8; \
+       vpsrldq xmm11, xmm14, 8; \
+       vpsrldq xmm12, xmm15, 8; \
+       \
+       vpaddq  xmm13, xmm13, xmm10; \
+       vpaddq  xmm14, xmm14, xmm11; \
+       vpaddq  xmm15, xmm15, xmm12; \
+       \
+.L_simd_to_gp: \
+       /* ; Carry propagation */ \
+       vpsrlq  xmm0, xmm13, 44; \
+       vpandq  xmm13, xmm13, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  xmm14, xmm14, xmm0; \
+       vpsrlq  xmm0, xmm14, 44; \
+       vpandq  xmm14, xmm14, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+       vpaddq  xmm15, xmm15, xmm0; \
+       vpsrlq  xmm0, xmm15, 42; \
+       vpandq  xmm15, xmm15, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+       vpsllq  xmm1, xmm0, 2; \
+       vpaddq  xmm0, xmm0, xmm1; \
+       vpaddq  xmm13, xmm13, xmm0; \
+       \
+       /* ; Put together A */ \
+       vmovq   A0, xmm13; \
+       \
+       vmovq   T0, xmm14; \
+       mov     T1, T0; \
+       shl     T1, 44; \
+       or      A0, T1; \
+       \
+       shr     T0, 20; \
+       vmovq   A2, xmm15; \
+       mov     A1, A2; \
+       shl     A1, 24; \
+       or      A1, T0; \
+       shr     A2, 40; \
+       \
+       /* ; Clear powers of R */ \
+       vpxorq  zmm0, zmm0, zmm0; \
+       vmovdqa64 [rsp + STACK_r_save], zmm0; \
+       vmovdqa64 [rsp + STACK_r_save + 64], zmm0; \
+       vmovdqa64 [rsp + STACK_r_save + 64*2], zmm0; \
+       vmovdqa64 [rsp + STACK_r_save + 64*3], zmm0; \
+       vmovdqa64 [rsp + STACK_r_save + 64*4], zmm0; \
+       vmovdqa64 [rsp + STACK_r_save + 64*5], zmm0; \
+       \
+       vzeroall; \
+       clear_zmm(ymm16); clear_zmm(ymm20); clear_zmm(ymm24); clear_zmm(ymm28); \
+       clear_zmm(ymm17); clear_zmm(ymm21); clear_zmm(ymm25); clear_zmm(ymm29); \
+       clear_zmm(ymm18); clear_zmm(ymm22); clear_zmm(ymm26); clear_zmm(ymm30); \
+       clear_zmm(ymm19); clear_zmm(ymm23); clear_zmm(ymm27); clear_zmm(ymm31); \
+       \
+.L_final_loop: \
+       cmp     LEN, POLY1305_BLOCK_SIZE; \
+       jb      .L_poly1305_blocks_exit; \
+       \
+       /* ;; A += MSG[i] */ \
+       add     A0, [MSG + 0]; \
+       adc     A1, [MSG + 8]; \
+       adc     A2, 1; /* ;; add the 2^128 padding bit (all blocks here are full 16 bytes) */ \
+       \
+       mov     T0, R1; \
+       shr     T0, 2; \
+       add     T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \
+       \
+       POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, \
+                           T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \
+       \
+       add     MSG, POLY1305_BLOCK_SIZE; \
+       sub     LEN, POLY1305_BLOCK_SIZE; \
+       \
+       jmp     .L_final_loop; \
+       \
+.L_len_256_511: \
+       \
+       /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \
+       /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \
+       /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \
+       POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+                               zmm22, zmm23, zmm24, \
+                               zmm25, zmm26, \
+                               zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                               zmm11); \
+       \
+       /* ; Then multiply by r^8-r */ \
+       \
+       /* ; zmm19-zmm21 contains R^8-R, need to move it to zmm22-24, */ \
+       /* ; as it might be used in other part of the code */ \
+       vmovdqa64 zmm22, zmm19; \
+       vmovdqa64 zmm23, zmm20; \
+       vmovdqa64 zmm24, zmm21; \
+       \
+       /* ; zmm25 to have bits 87-44 of all 8 powers of R' in 8 qwords */ \
+       /* ; zmm26 to have bits 129-88 of all 8 powers of R' in 8 qwords */ \
+       vpsllq  zmm0, zmm23, 2; \
+       vpaddq  zmm25, zmm23, zmm0; /* ; R1' (R1*5) */ \
+       vpsllq  zmm1, zmm24, 2; \
+       vpaddq  zmm26, zmm24, zmm1; /* ; R2' (R2*5) */ \
+       \
+       /* ; 4*5*R^8 */ \
+       vpsllq  zmm25, zmm25, 2; \
+       vpsllq  zmm26, zmm26, 2; \
+       \
+       vpaddq  zmm13, zmm13, zmm16; \
+       vpaddq  zmm14, zmm14, zmm17; \
+       vpaddq  zmm15, zmm15, zmm18; \
+       \
+       /* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \
+       /* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \
+       /* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \
+       POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+                               zmm22, zmm23, zmm24, \
+                               zmm25, zmm26, \
+                               zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+                               zmm11); \
+       \
+       /* ;; Add all blocks (horizontally) */ \
+       vextracti64x4   ymm0, zmm13, 1; \
+       vextracti64x4   ymm1, zmm14, 1; \
+       vextracti64x4   ymm2, zmm15, 1; \
+       \
+       vpaddq  ymm13, ymm13, ymm0; \
+       vpaddq  ymm14, ymm14, ymm1; \
+       vpaddq  ymm15, ymm15, ymm2; \
+       \
+       vextracti32x4   xmm10, ymm13, 1; \
+       vextracti32x4   xmm11, ymm14, 1; \
+       vextracti32x4   xmm12, ymm15, 1; \
+       \
+       vpaddq  xmm13, xmm13, xmm10; \
+       vpaddq  xmm14, xmm14, xmm11; \
+       vpaddq  xmm15, xmm15, xmm12; \
+       \
+       vpsrldq xmm10, xmm13, 8; \
+       vpsrldq xmm11, xmm14, 8; \
+       vpsrldq xmm12, xmm15, 8; \
+       \
+       /* ; Finish folding and clear second qword */ \
+       mov     T0, 0xfd; \
+       kmovq   k1, T0; \
+       vpaddq  xmm13{k1}{z}, xmm13, xmm10; \
+       vpaddq  xmm14{k1}{z}, xmm14, xmm11; \
+       vpaddq  xmm15{k1}{z}, xmm15, xmm12; \
+       \
+       add     MSG, POLY1305_BLOCK_SIZE*16; \
+       sub     LEN, POLY1305_BLOCK_SIZE*16; \
+       \
+       jmp     .L_less_than_256; \
+.L_poly1305_blocks_exit: \
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Creates stack frame and saves registers
+;; =============================================================================
+*/
+/* Prologue: save the incoming rsp in rax, carve out STACK_SIZE bytes and
+   align rsp down to 64 (required by the vmovdqa64 spills of ZMM-sized
+   data), then preserve all callee-saved GPRs plus the original rsp so
+   FUNC_EXIT can undo everything. */
+#define FUNC_ENTRY() \
+       mov     rax, rsp; \
+       CFI_DEF_CFA_REGISTER(rax); \
+       sub     rsp, STACK_SIZE; \
+       and     rsp, -64; \
+       \
+       mov     [rsp + STACK_gpr_save + 8*0], rbx; \
+       mov     [rsp + STACK_gpr_save + 8*1], rbp; \
+       mov     [rsp + STACK_gpr_save + 8*2], r12; \
+       mov     [rsp + STACK_gpr_save + 8*3], r13; \
+       mov     [rsp + STACK_gpr_save + 8*4], r14; \
+       mov     [rsp + STACK_gpr_save + 8*5], r15; \
+       mov     [rsp + STACK_rsp_save], rax; \
+       CFI_CFA_ON_STACK(STACK_rsp_save, 0)
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Restores registers and removes the stack frame
+;; =============================================================================
+*/
+/* Epilogue: restore the callee-saved GPRs spilled by FUNC_ENTRY and
+   reload the original rsp (which also discards the aligned frame).
+   Note rsp is restored last, since the other loads address the frame. */
+#define FUNC_EXIT() \
+       mov     rbx, [rsp + STACK_gpr_save + 8*0]; \
+       mov     rbp, [rsp + STACK_gpr_save + 8*1]; \
+       mov     r12, [rsp + STACK_gpr_save + 8*2]; \
+       mov     r13, [rsp + STACK_gpr_save + 8*3]; \
+       mov     r14, [rsp + STACK_gpr_save + 8*4]; \
+       mov     r15, [rsp + STACK_gpr_save + 8*5]; \
+       mov     rsp, [rsp + STACK_rsp_save]; \
+       CFI_DEF_CFA_REGISTER(rsp)
+
+.text
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; void _gcry_poly1305_amd64_avx512_blocks(const void *msg, const uint64_t msg_len,
+;;                                         void *hash, const void *key)
+;; arg1 - Input message
+;; arg2 - Message length
+;; arg3 - Input/output hash
+;; arg4 - Poly1305 key
+*/
+.align 32
+.globl _gcry_poly1305_amd64_avx512_blocks
+ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;)
+_gcry_poly1305_amd64_avx512_blocks:
+       CFI_STARTPROC()
+       spec_stop_avx512_intel_syntax;
+       FUNC_ENTRY()
+
+/* Register aliases for the accumulator (a0..a2), key (r0..r1) and length.
+   (A duplicated "#define _a0 gp3" line was removed here.) */
+#define _a0 gp3
+#define _a1 gp4
+#define _a2 gp5
+#define _r0 gp6
+#define _r1 gp7
+#define _len arg2
+#define _arg3 arg4             /* ; use rcx, arg3 = rdx */
+
+       /* ;; load R (two 64-bit halves of the clamped key) */
+       mov     _r0, [arg4 + 0 * 8]
+       mov     _r1, [arg4 + 1 * 8]
+
+       /* ;; load accumulator / current hash value */
+       /* ;; note: arg4 can't be used beyond this point */
+       mov     _arg3, arg3             /* ; note: _arg3 = arg4 (linux) */
+       mov     _a0, [_arg3 + 0 * 8]
+       mov     _a1, [_arg3 + 1 * 8]
+       mov     DWORD(_a2), [_arg3 + 2 * 8]    /* ; note: _a2 = arg4 (win) */
+
+       POLY1305_BLOCKS(arg1, _len, _a0, _a1, _a2, _r0, _r1,
+                       gp10, gp11, gp8, gp9, rax, rdx)
+
+       /* ;; save accumulator back */
+       mov     [_arg3 + 0 * 8], _a0
+       mov     [_arg3 + 1 * 8], _a1
+       mov     [_arg3 + 2 * 8], DWORD(_a2)
+
+       FUNC_EXIT()
+       /* ;; scrub remaining key-dependent state before returning */
+       xor eax, eax
+       kxorw k1, k1, k1
+       kxorw k2, k2, k2
+       ret_spec_stop
+       CFI_ENDPROC()
+ELF(.size _gcry_poly1305_amd64_avx512_blocks,
+         .-_gcry_poly1305_amd64_avx512_blocks;)
+
+#endif
+#endif
index 19cee5f6f3ecad5cd113f65a807b38a41ef52de6..809a2850c631a15c144451a5a4b04dab92b21baf 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 #define POLY1305_BLOCKSIZE 16
 
 
+/* POLY1305_USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef POLY1305_USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define POLY1305_USE_AVX512 1
+#endif
+
+/* POLY1305_USE_PPC_VEC indicates whether to enable PowerPC vector code. */
+#undef POLY1305_USE_PPC_VEC
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+     !defined(WORDS_BIGENDIAN)
+#  if __GNUC__ >= 4
+#   define POLY1305_USE_PPC_VEC 1
+#  endif
+# endif
+#endif
+
+
 typedef struct
 {
   u32 k[4];
@@ -46,6 +68,12 @@ typedef struct poly1305_context_s
   POLY1305_STATE state;
   byte buffer[POLY1305_BLOCKSIZE];
   unsigned int leftover;
+#ifdef POLY1305_USE_AVX512
+  unsigned int use_avx512:1;
+#endif
+#ifdef POLY1305_USE_PPC_VEC
+  unsigned int use_p10:1;
+#endif
 } poly1305_context_t;
 
 
diff --git a/cipher/poly1305-p10le.s b/cipher/poly1305-p10le.s
new file mode 100644 (file)
index 0000000..4202b41
--- /dev/null
@@ -0,0 +1,841 @@
+# Copyright 2021- IBM Inc. All rights reserved
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+#  - 26 bits limbs
+#  - Handles multiple 64-byte blocks, but needs at least two 64-byte blocks
+#
+# Improve performance by breaking down the polynomial to the sum of products with
+#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+#  07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
+#             to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+#    vs    [r^1, r^3, r^2, r^4]
+#    vs0 = [r0,.....]
+#    vs1 = [r1,.....]
+#    vs2 = [r2,.....]
+#    vs3 = [r3,.....]
+#    vs4 = [r4,.....]
+#    vs5 = [r4*5,...]
+#    vs6 = [r3*5,...]
+#    vs7 = [r2*5,...]
+#    vs8 = [r1*5,...]
+#
+#  Each word in a vector contains a member of "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+#
+# gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+.text
+
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+# 16 bytes (a)
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
+# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
+#
+#    [r^2, r^3, r^1, r^4]
+#    [m3,  m2,  m4,  m1]
+#
+# multiply odd and even words
+.macro mul_odd
+       vmulouw 14, 4, 26
+       vmulouw 10, 5, 3
+       vmulouw 11, 6, 2
+       vmulouw 12, 7, 1
+       vmulouw 13, 8, 0
+       vmulouw 15, 4, 27
+       vaddudm 14, 14, 10
+       vaddudm 14, 14, 11
+       vmulouw 10, 5, 26
+       vmulouw 11, 6, 3
+       vaddudm 14, 14, 12
+       vaddudm 14, 14, 13      # x0
+       vaddudm 15, 15, 10
+       vaddudm 15, 15, 11
+       vmulouw 12, 7, 2
+       vmulouw 13, 8, 1
+       vaddudm 15, 15, 12
+       vaddudm 15, 15, 13      # x1
+       vmulouw 16, 4, 28
+       vmulouw 10, 5, 27
+       vmulouw 11, 6, 26
+       vaddudm 16, 16, 10
+       vaddudm 16, 16, 11
+       vmulouw 12, 7, 3
+       vmulouw 13, 8, 2
+       vaddudm 16, 16, 12
+       vaddudm 16, 16, 13      # x2
+       vmulouw 17, 4, 29
+       vmulouw 10, 5, 28
+       vmulouw 11, 6, 27
+       vaddudm 17, 17, 10
+       vaddudm 17, 17, 11
+       vmulouw 12, 7, 26
+       vmulouw 13, 8, 3
+       vaddudm 17, 17, 12
+       vaddudm 17, 17, 13      # x3
+       vmulouw 18, 4, 30
+       vmulouw 10, 5, 29
+       vmulouw 11, 6, 28
+       vaddudm 18, 18, 10
+       vaddudm 18, 18, 11
+       vmulouw 12, 7, 27
+       vmulouw 13, 8, 26
+       vaddudm 18, 18, 12
+       vaddudm 18, 18, 13      # x4
+.endm
+
+.macro mul_even
+       vmuleuw 9, 4, 26
+       vmuleuw 10, 5, 3
+       vmuleuw 11, 6, 2
+       vmuleuw 12, 7, 1
+       vmuleuw 13, 8, 0
+       vaddudm 14, 14, 9
+       vaddudm 14, 14, 10
+       vaddudm 14, 14, 11
+       vaddudm 14, 14, 12
+       vaddudm 14, 14, 13      # x0
+
+       vmuleuw 9, 4, 27
+       vmuleuw 10, 5, 26
+       vmuleuw 11, 6, 3
+       vmuleuw 12, 7, 2
+       vmuleuw 13, 8, 1
+       vaddudm 15, 15, 9
+       vaddudm 15, 15, 10
+       vaddudm 15, 15, 11
+       vaddudm 15, 15, 12
+       vaddudm 15, 15, 13      # x1
+
+       vmuleuw 9, 4, 28
+       vmuleuw 10, 5, 27
+       vmuleuw 11, 6, 26
+       vmuleuw 12, 7, 3
+       vmuleuw 13, 8, 2
+       vaddudm 16, 16, 9
+       vaddudm 16, 16, 10
+       vaddudm 16, 16, 11
+       vaddudm 16, 16, 12
+       vaddudm 16, 16, 13      # x2
+
+       vmuleuw 9, 4, 29
+       vmuleuw 10, 5, 28
+       vmuleuw 11, 6, 27
+       vmuleuw 12, 7, 26
+       vmuleuw 13, 8, 3
+       vaddudm 17, 17, 9
+       vaddudm 17, 17, 10
+       vaddudm 17, 17, 11
+       vaddudm 17, 17, 12
+       vaddudm 17, 17, 13      # x3
+
+       vmuleuw 9, 4, 30
+       vmuleuw 10, 5, 29
+       vmuleuw 11, 6, 28
+       vmuleuw 12, 7, 27
+       vmuleuw 13, 8, 26
+       vaddudm 18, 18, 9
+       vaddudm 18, 18, 10
+       vaddudm 18, 18, 11
+       vaddudm 18, 18, 12
+       vaddudm 18, 18, 13      # x4
+.endm
+
+# setup r^4, r^3, r^2, r vectors
+#    [r, r^3, r^2, r^4]
+#    vs0 = [r0,...]
+#    vs1 = [r1,...]
+#    vs2 = [r2,...]
+#    vs3 = [r3,...]
+#    vs4 = [r4,...]
+#    vs5 = [r4*5,...]
+#    vs6 = [r3*5,...]
+#    vs7 = [r2*5,...]
+#    vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+.macro poly1305_setup_r
+
+       # save r
+       xxlor   26, 58, 58
+       xxlor   27, 59, 59
+       xxlor   28, 60, 60
+       xxlor   29, 61, 61
+       xxlor   30, 62, 62
+
+       xxlxor  31, 31, 31
+
+#    [r, r^3, r^2, r^4]
+       # compute r^2
+       vmr     4, 26
+       vmr     5, 27
+       vmr     6, 28
+       vmr     7, 29
+       vmr     8, 30
+       bl      do_mul          # r^2 r^1
+       xxpermdi 58, 58, 36, 0x3                # r0
+       xxpermdi 59, 59, 37, 0x3                # r1
+       xxpermdi 60, 60, 38, 0x3                # r2
+       xxpermdi 61, 61, 39, 0x3                # r3
+       xxpermdi 62, 62, 40, 0x3                # r4
+       xxpermdi 36, 36, 36, 0x3
+       xxpermdi 37, 37, 37, 0x3
+       xxpermdi 38, 38, 38, 0x3
+       xxpermdi 39, 39, 39, 0x3
+       xxpermdi 40, 40, 40, 0x3
+       vspltisb 13, 2
+       vsld    9, 27, 13
+       vsld    10, 28, 13
+       vsld    11, 29, 13
+       vsld    12, 30, 13
+       vaddudm 0, 9, 27
+       vaddudm 1, 10, 28
+       vaddudm 2, 11, 29
+       vaddudm 3, 12, 30
+
+       bl      do_mul          # r^4 r^3
+       vmrgow  26, 26, 4
+       vmrgow  27, 27, 5
+       vmrgow  28, 28, 6
+       vmrgow  29, 29, 7
+       vmrgow  30, 30, 8
+       vspltisb 13, 2
+       vsld    9, 27, 13
+       vsld    10, 28, 13
+       vsld    11, 29, 13
+       vsld    12, 30, 13
+       vaddudm 0, 9, 27
+       vaddudm 1, 10, 28
+       vaddudm 2, 11, 29
+       vaddudm 3, 12, 30
+
+       # r^2 r^4
+       xxlor   0, 58, 58
+       xxlor   1, 59, 59
+       xxlor   2, 60, 60
+       xxlor   3, 61, 61
+       xxlor   4, 62, 62
+       xxlor   5, 32, 32
+       xxlor   6, 33, 33
+       xxlor   7, 34, 34
+       xxlor   8, 35, 35
+
+       vspltw  9, 26, 3
+       vspltw  10, 26, 2
+       vmrgow  26, 10, 9
+       vspltw  9, 27, 3
+       vspltw  10, 27, 2
+       vmrgow  27, 10, 9
+       vspltw  9, 28, 3
+       vspltw  10, 28, 2
+       vmrgow  28, 10, 9
+       vspltw  9, 29, 3
+       vspltw  10, 29, 2
+       vmrgow  29, 10, 9
+       vspltw  9, 30, 3
+       vspltw  10, 30, 2
+       vmrgow  30, 10, 9
+
+       vsld    9, 27, 13
+       vsld    10, 28, 13
+       vsld    11, 29, 13
+       vsld    12, 30, 13
+       vaddudm 0, 9, 27
+       vaddudm 1, 10, 28
+       vaddudm 2, 11, 29
+       vaddudm 3, 12, 30
+.endm
+
+do_mul:
+       mul_odd
+
+       # do reduction ( h %= p )
+       # carry reduction
+       vspltisb 9, 2
+       vsrd    10, 14, 31
+       vsrd    11, 17, 31
+       vand    7, 17, 25
+       vand    4, 14, 25
+       vaddudm 18, 18, 11
+       vsrd    12, 18, 31
+       vaddudm 15, 15, 10
+
+       vsrd    11, 15, 31
+       vand    8, 18, 25
+       vand    5, 15, 25
+       vaddudm 4, 4, 12
+       vsld    10, 12, 9
+       vaddudm 6, 16, 11
+
+       vsrd    13, 6, 31
+       vand    6, 6, 25
+       vaddudm 4, 4, 10
+       vsrd    10, 4, 31
+       vaddudm 7, 7, 13
+
+       vsrd    11, 7, 31
+       vand    7, 7, 25
+       vand    4, 4, 25
+       vaddudm 5, 5, 10
+       vaddudm 8, 8, 11
+       blr
+
+#
+# init key
+#
+do_poly1305_init:
+       ld      10, rmask@got(2)
+       ld      11, 0(10)
+       ld      12, 8(10)
+
+       li      14, 16
+       li      15, 32
+       ld      10, cnum@got(2)
+       lvx     25, 0, 10       # v25 - mask
+       lvx     31, 14, 10      # v31 = 1a
+       lvx     19, 15, 10      # v19 = 1 << 24
+       lxv     24, 48(10)      # vs24
+       lxv     25, 64(10)      # vs25
+
+       # initialize
+       # load key from r3 to vectors
+       ld      9, 16(3)
+       ld      10, 24(3)
+       ld      11, 0(3)
+       ld      12, 8(3)
+
+       # break 26 bits
+       extrdi  14, 9, 26, 38
+       extrdi  15, 9, 26, 12
+       extrdi  16, 9, 12, 0
+       mtvsrdd 58, 0, 14
+       insrdi  16, 10, 14, 38
+       mtvsrdd 59, 0, 15
+       extrdi  17, 10, 26, 24
+       mtvsrdd 60, 0, 16
+       extrdi  18, 10, 24, 0
+       mtvsrdd 61, 0, 17
+       mtvsrdd 62, 0, 18
+
+       # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+       li      9, 5
+       mtvsrdd 36, 0, 9
+       vmulouw 0, 27, 4                # v0 = rr0
+       vmulouw 1, 28, 4                # v1 = rr1
+       vmulouw 2, 29, 4                # v2 = rr2
+       vmulouw 3, 30, 4                # v3 = rr3
+       blr
+
+#
+# gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+.global gcry_poly1305_p10le_4blocks
+.align 5
+gcry_poly1305_p10le_4blocks:
+_gcry_poly1305_p10le_4blocks:
+       cmpdi   5, 128
+       blt     Out_no_poly1305
+
+       stdu 1,-1024(1)
+       mflr 0
+
+       std     14,112(1)
+       std     15,120(1)
+       std     16,128(1)
+       std     17,136(1)
+       std     18,144(1)
+       std     19,152(1)
+       std     20,160(1)
+       std     21,168(1)
+       std     31,248(1)
+       li      14, 256
+       stvx    20, 14, 1
+       addi    14, 14, 16
+       stvx    21, 14, 1
+       addi    14, 14, 16
+       stvx    22, 14, 1
+       addi    14, 14, 16
+       stvx    23, 14, 1
+       addi    14, 14, 16
+       stvx    24, 14, 1
+       addi    14, 14, 16
+       stvx    25, 14, 1
+       addi    14, 14, 16
+       stvx    26, 14, 1
+       addi    14, 14, 16
+       stvx    27, 14, 1
+       addi    14, 14, 16
+       stvx    28, 14, 1
+       addi    14, 14, 16
+       stvx    29, 14, 1
+       addi    14, 14, 16
+       stvx    30, 14, 1
+       addi    14, 14, 16
+       stvx    31, 14, 1
+
+       addi    14, 14, 16
+       stxvx   14, 14, 1
+       addi    14, 14, 16
+       stxvx   15, 14, 1
+       addi    14, 14, 16
+       stxvx   16, 14, 1
+       addi    14, 14, 16
+       stxvx   17, 14, 1
+       addi    14, 14, 16
+       stxvx   18, 14, 1
+       addi    14, 14, 16
+       stxvx   19, 14, 1
+       addi    14, 14, 16
+       stxvx   20, 14, 1
+       addi    14, 14, 16
+       stxvx   21, 14, 1
+       addi    14, 14, 16
+       stxvx   22, 14, 1
+       addi    14, 14, 16
+       stxvx   23, 14, 1
+       addi    14, 14, 16
+       stxvx   24, 14, 1
+       addi    14, 14, 16
+       stxvx   25, 14, 1
+       addi    14, 14, 16
+       stxvx   26, 14, 1
+       addi    14, 14, 16
+       stxvx   27, 14, 1
+       addi    14, 14, 16
+       stxvx   28, 14, 1
+       addi    14, 14, 16
+       stxvx   29, 14, 1
+       addi    14, 14, 16
+       stxvx   30, 14, 1
+       addi    14, 14, 16
+       stxvx   31, 14, 1
+       std     0, 1040(1)
+
+       bl do_poly1305_init
+
+       li      21, 0   # counter to message
+
+       poly1305_setup_r
+
+       # load previous state
+       # break/convert r6 to 26 bits
+       ld      9, 32(3)
+       ld      10, 40(3)
+       lwz     19, 48(3)
+       sldi    19, 19, 24
+       mtvsrdd 41, 0, 19
+       extrdi  14, 9, 26, 38
+       extrdi  15, 9, 26, 12
+       extrdi  16, 9, 12, 0
+       mtvsrdd 36, 0, 14
+       insrdi  16, 10, 14, 38
+       mtvsrdd 37, 0, 15
+       extrdi  17, 10, 26, 24
+       mtvsrdd 38, 0, 16
+       extrdi  18, 10, 24, 0
+       mtvsrdd 39, 0, 17
+       mtvsrdd 40, 0, 18
+       vor     8, 8, 9
+
+       # input m1 m2
+       add     20, 4, 21
+       xxlor   49, 24, 24
+       xxlor   50, 25, 25
+       lxvw4x  43, 0, 20
+       addi    17, 20, 16
+       lxvw4x  44, 0, 17
+       vperm   14, 11, 12, 17
+       vperm   15, 11, 12, 18
+       vand    9, 14, 25       # a0
+       vsrd    10, 14, 31      # >> 26
+       vsrd    11, 10, 31      # 12 bits left
+       vand    10, 10, 25      # a1
+       vspltisb 13, 12
+       vand    16, 15, 25
+       vsld    12, 16, 13
+       vor     11, 11, 12
+       vand    11, 11, 25      # a2
+       vspltisb 13, 14
+       vsrd    12, 15, 13      # >> 14
+       vsrd    13, 12, 31      # >> 26, a4
+       vand    12, 12, 25      # a3
+
+       vaddudm 20, 4, 9
+       vaddudm 21, 5, 10
+       vaddudm 22, 6, 11
+       vaddudm 23, 7, 12
+       vaddudm 24, 8, 13
+
+       # m3 m4
+       addi    17, 17, 16
+       lxvw4x  43, 0, 17
+       addi    17, 17, 16
+       lxvw4x  44, 0, 17
+       vperm   14, 11, 12, 17
+       vperm   15, 11, 12, 18
+       vand    9, 14, 25       # a0
+       vsrd    10, 14, 31      # >> 26
+       vsrd    11, 10, 31      # 12 bits left
+       vand    10, 10, 25      # a1
+       vspltisb 13, 12
+       vand    16, 15, 25
+       vsld    12, 16, 13
+       vspltisb 13, 14
+       vor     11, 11, 12
+       vand    11, 11, 25      # a2
+       vsrd    12, 15, 13      # >> 14
+       vsrd    13, 12, 31      # >> 26, a4
+       vand    12, 12, 25      # a3
+
+       # Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
+       vmrgow  4, 9, 20
+       vmrgow  5, 10, 21
+       vmrgow  6, 11, 22
+       vmrgow  7, 12, 23
+       vmrgow  8, 13, 24
+       vaddudm 8, 8, 19
+
+       addi    5, 5, -64
+       addi    21, 21, 64
+
+       li      9, 64
+       divdu   31, 5, 9
+
+       mtctr   31
+
+# h4 =   m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+# Rewrite the polynomial sum of products as follows,
+# h1 = (h0 + m1) * r^2,        h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2,        h4 = (h2 + m4) * r^2  --> (h0 + m1) r^4 + (h1 + m3) r^2, (h0 + m2) r^4 + (h2 + m4) r^2
+#  .... Repeat
+# h5 = (h3 + m5) * r^2,        h6 = (h4 + m6) * r^2  -->
+# h7 = (h5 + m7) * r^2,        h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
+loop_4blocks:
+
+       # Multiply odd words and even words
+       mul_odd
+       mul_even
+       # carry reduction
+       vspltisb 9, 2
+       vsrd    10, 14, 31
+       vsrd    11, 17, 31
+       vand    7, 17, 25
+       vand    4, 14, 25
+       vaddudm 18, 18, 11
+       vsrd    12, 18, 31
+       vaddudm 15, 15, 10
+
+       vsrd    11, 15, 31
+       vand    8, 18, 25
+       vand    5, 15, 25
+       vaddudm 4, 4, 12
+       vsld    10, 12, 9
+       vaddudm 6, 16, 11
+
+       vsrd    13, 6, 31
+       vand    6, 6, 25
+       vaddudm 4, 4, 10
+       vsrd    10, 4, 31
+       vaddudm 7, 7, 13
+
+       vsrd    11, 7, 31
+       vand    7, 7, 25
+       vand    4, 4, 25
+       vaddudm 5, 5, 10
+       vaddudm 8, 8, 11
+
+       # input m1  m2  m3  m4
+       add     20, 4, 21
+       xxlor   49, 24, 24
+       xxlor   50, 25, 25
+       lxvw4x  43, 0, 20
+       addi    17, 20, 16
+       lxvw4x  44, 0, 17
+       vperm   14, 11, 12, 17
+       vperm   15, 11, 12, 18
+       addi    17, 17, 16
+       lxvw4x  43, 0, 17
+       addi    17, 17, 16
+       lxvw4x  44, 0, 17
+       vperm   17, 11, 12, 17
+       vperm   18, 11, 12, 18
+
+       vand    20, 14, 25      # a0
+       vand    9, 17, 25       # a0
+       vsrd    21, 14, 31      # >> 26
+       vsrd    22, 21, 31      # 12 bits left
+       vsrd    10, 17, 31      # >> 26
+       vsrd    11, 10, 31      # 12 bits left
+
+       vand    21, 21, 25      # a1
+       vand    10, 10, 25      # a1
+
+       vspltisb 13, 12
+       vand    16, 15, 25
+       vsld    23, 16, 13
+       vor     22, 22, 23
+       vand    22, 22, 25      # a2
+       vand    16, 18, 25
+       vsld    12, 16, 13
+       vor     11, 11, 12
+       vand    11, 11, 25      # a2
+       vspltisb 13, 14
+       vsrd    23, 15, 13      # >> 14
+       vsrd    24, 23, 31      # >> 26, a4
+       vand    23, 23, 25      # a3
+       vsrd    12, 18, 13      # >> 14
+       vsrd    13, 12, 31      # >> 26, a4
+       vand    12, 12, 25      # a3
+
+       vaddudm 4, 4, 20
+       vaddudm 5, 5, 21
+       vaddudm 6, 6, 22
+       vaddudm 7, 7, 23
+       vaddudm 8, 8, 24
+
+       # Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
+       vmrgow  4, 9, 4
+       vmrgow  5, 10, 5
+       vmrgow  6, 11, 6
+       vmrgow  7, 12, 7
+       vmrgow  8, 13, 8
+       vaddudm 8, 8, 19
+
+       addi    5, 5, -64
+       addi    21, 21, 64
+
+       bdnz    loop_4blocks
+
+       xxlor   58, 0, 0
+       xxlor   59, 1, 1
+       xxlor   60, 2, 2
+       xxlor   61, 3, 3
+       xxlor   62, 4, 4
+       xxlor   32, 5, 5
+       xxlor   33, 6, 6
+       xxlor   34, 7, 7
+       xxlor   35, 8, 8
+
+       # Multiply odd words and even words
+       mul_odd
+       mul_even
+
+       # Sum the products.
+       xxpermdi 41, 31, 46, 0
+       xxpermdi 42, 31, 47, 0
+       vaddudm 4, 14, 9
+       xxpermdi 36, 31, 36, 3
+       vaddudm 5, 15, 10
+       xxpermdi 37, 31, 37, 3
+       xxpermdi 43, 31, 48, 0
+       vaddudm 6, 16, 11
+       xxpermdi 38, 31, 38, 3
+       xxpermdi 44, 31, 49, 0
+       vaddudm 7, 17, 12
+       xxpermdi 39, 31, 39, 3
+       xxpermdi 45, 31, 50, 0
+       vaddudm 8, 18, 13
+       xxpermdi 40, 31, 40, 3
+
+       # carry reduction
+       vspltisb 9, 2
+       vsrd    10, 4, 31
+       vsrd    11, 7, 31
+       vand    7, 7, 25
+       vand    4, 4, 25
+       vaddudm 8, 8, 11
+       vsrd    12, 8, 31
+       vaddudm 5, 5, 10
+
+       vsrd    11, 5, 31
+       vand    8, 8, 25
+       vand    5, 5, 25
+       vaddudm 4, 4, 12
+       vsld    10, 12, 9
+       vaddudm 6, 6, 11
+
+       vsrd    13, 6, 31
+       vand    6, 6, 25
+       vaddudm 4, 4, 10
+       vsrd    10, 4, 31
+       vaddudm 7, 7, 13
+
+       vsrd    11, 7, 31
+       vand    7, 7, 25
+       vand    4, 4, 25
+       vaddudm 5, 5, 10
+       vaddudm 8, 8, 11
+
+       b       do_final_update
+
+do_final_update:
+       # v4, v5, v6, v7 and v8 are 26 bit vectors
+       vsld    5, 5, 31
+       vor     20, 4, 5
+       vspltisb 11, 12
+       vsrd    12, 6, 11
+       vsld    6, 6, 31
+       vsld    6, 6, 31
+       vor     20, 20, 6
+       vspltisb 11, 14
+       vsld    7, 7, 11
+       vor     21, 7, 12
+       mfvsrld 16, 40          # save last 2 bytes
+       vsld    8, 8, 11
+       vsld    8, 8, 31
+       vor     21, 21, 8
+       mfvsrld 17, 52
+       mfvsrld 19, 53
+       srdi    16, 16, 24
+
+       std     17, 32(3)
+       std     19, 40(3)
+       stw     16, 48(3)
+
+Out_loop:
+       li      3, 0
+
+       li      14, 256
+       lvx     20, 14, 1
+       addi    14, 14, 16
+       lvx     21, 14, 1
+       addi    14, 14, 16
+       lvx     22, 14, 1
+       addi    14, 14, 16
+       lvx     23, 14, 1
+       addi    14, 14, 16
+       lvx     24, 14, 1
+       addi    14, 14, 16
+       lvx     25, 14, 1
+       addi    14, 14, 16
+       lvx     26, 14, 1
+       addi    14, 14, 16
+       lvx     27, 14, 1
+       addi    14, 14, 16
+       lvx     28, 14, 1
+       addi    14, 14, 16
+       lvx     29, 14, 1
+       addi    14, 14, 16
+       lvx     30, 14, 1
+       addi    14, 14, 16
+       lvx     31, 14, 1
+
+       addi    14, 14, 16
+       lxvx    14, 14, 1
+       addi    14, 14, 16
+       lxvx    15, 14, 1
+       addi    14, 14, 16
+       lxvx    16, 14, 1
+       addi    14, 14, 16
+       lxvx    17, 14, 1
+       addi    14, 14, 16
+       lxvx    18, 14, 1
+       addi    14, 14, 16
+       lxvx    19, 14, 1
+       addi    14, 14, 16
+       lxvx    20, 14, 1
+       addi    14, 14, 16
+       lxvx    21, 14, 1
+       addi    14, 14, 16
+       lxvx    22, 14, 1
+       addi    14, 14, 16
+       lxvx    23, 14, 1
+       addi    14, 14, 16
+       lxvx    24, 14, 1
+       addi    14, 14, 16
+       lxvx    25, 14, 1
+       addi    14, 14, 16
+       lxvx    26, 14, 1
+       addi    14, 14, 16
+       lxvx    27, 14, 1
+       addi    14, 14, 16
+       lxvx    28, 14, 1
+       addi    14, 14, 16
+       lxvx    29, 14, 1
+       addi    14, 14, 16
+       lxvx    30, 14, 1
+       addi    14, 14, 16
+       lxvx    31, 14, 1
+
+       ld      0, 1040(1)
+       ld      14,112(1)
+       ld      15,120(1)
+       ld      16,128(1)
+       ld      17,136(1)
+       ld      18,144(1)
+       ld      19,152(1)
+       ld      20,160(1)
+       ld      21,168(1)
+       ld      31,248(1)
+
+       mtlr    0
+       addi    1, 1, 1024
+       blr
+
+Out_no_poly1305:
+       li      3, 0
+       blr
+
+.data
+.align 5
+rmask:
+.byte  0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long  0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
+.long  0x1a, 0x00, 0x1a, 0x00
+.long  0x01000000, 0x01000000, 0x01000000, 0x01000000
+.long  0x00010203, 0x04050607, 0x10111213, 0x14151617
+.long  0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
+.long  0x05, 0x00, 0x00, 0x00
+.long  0x02020202, 0x02020202, 0x02020202, 0x02020202
+.long  0xffffffff, 0xffffffff, 0x00000000, 0x00000000
index 28bed5600bc9885e06dd283f22c48bd5ea35c7f8..5ba424e4bd239175d8bc45e55ad645ab7467d429 100644 (file)
@@ -26,7 +26,7 @@
 
 .text
 
-.balign 8
+.balign 16
 .globl _gcry_poly1305_s390x_blocks1
 ELF(.type _gcry_poly1305_s390x_blocks1,@function;)
 
index e57e64f33819223d8904fd6ecab713a8c039f29c..8bc656992e1df4fd2e0747b1838d26be56034231 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -40,7 +40,7 @@ static const char *selftest (void);
 
 #undef USE_MPI_64BIT
 #undef USE_MPI_32BIT
-#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_TYPE_U64)
+#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64)
 # define USE_MPI_64BIT 1
 #elif BYTES_PER_MPI_LIMB == 4
 # define USE_MPI_32BIT 1
@@ -60,6 +60,19 @@ static const char *selftest (void);
 #endif
 
 
+/* AMD64 Assembly implementations use SystemV ABI, ABI conversion and
+ * additional stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_FUNC_WRAPPER_ATTR
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_FUNC_WRAPPER_ATTR __attribute__((noinline))
+#else
+# define ASM_FUNC_ABI
+# define ASM_FUNC_WRAPPER_ATTR
+#endif
+
+
 #ifdef USE_S390X_ASM
 
 #define HAVE_ASM_POLY1305_BLOCKS 1
@@ -78,10 +91,51 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
 #endif /* USE_S390X_ASM */
 
 
+#ifdef POLY1305_USE_AVX512
+
+extern unsigned int
+_gcry_poly1305_amd64_avx512_blocks(const void *msg, const u64 msg_len,
+                                  void *hash, const void *key) ASM_FUNC_ABI;
+
+ASM_FUNC_WRAPPER_ATTR static unsigned int
+poly1305_amd64_avx512_blocks(poly1305_context_t *ctx, const byte *buf,
+                            size_t len)
+{
+  POLY1305_STATE *st = &ctx->state;
+  return _gcry_poly1305_amd64_avx512_blocks(buf, len, st->h, st->r);
+}
+
+#endif /* POLY1305_USE_AVX512 */
+
+
+#ifdef POLY1305_USE_PPC_VEC
+
+extern unsigned int
+gcry_poly1305_p10le_4blocks(unsigned char *key, const byte *m, size_t len);
+
+#endif /* POLY1305_USE_PPC_VEC */
+
+
 static void poly1305_init (poly1305_context_t *ctx,
                           const byte key[POLY1305_KEYLEN])
 {
   POLY1305_STATE *st = &ctx->state;
+  unsigned int features = _gcry_get_hw_features ();
+
+#ifdef POLY1305_USE_AVX512
+  ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
+#endif
+
+#ifdef POLY1305_USE_PPC_VEC
+  ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
+# ifdef ENABLE_FORCE_SOFT_HWFEATURES
+  /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10.
+   * Actual implementation works with HWF_PPC_ARCH_3_00 also. */
+  ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0;
+# endif
+#endif
+
+  (void)features;
 
   ctx->leftover = 0;
 
@@ -181,8 +235,8 @@ static void poly1305_init (poly1305_context_t *ctx,
 #ifndef HAVE_ASM_POLY1305_BLOCKS
 
 static unsigned int
-poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
-                byte high_pad)
+poly1305_blocks_generic (poly1305_context_t *ctx, const byte *buf, size_t len,
+                        byte high_pad)
 {
   POLY1305_STATE *st = &ctx->state;
   u64 r0, r1, r1_mult5;
@@ -235,6 +289,18 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
   return 6 * sizeof (void *) + 18 * sizeof (u64);
 }
 
+static unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+                byte high_pad)
+{
+#ifdef POLY1305_USE_AVX512
+  if ((high_pad & ctx->use_avx512) != 0)
+    return poly1305_amd64_avx512_blocks(ctx, buf, len);
+#endif
+
+  return poly1305_blocks_generic(ctx, buf, len, high_pad);
+}
+
 #endif /* !HAVE_ASM_POLY1305_BLOCKS */
 
 static unsigned int poly1305_final (poly1305_context_t *ctx,
@@ -533,6 +599,7 @@ _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
                            size_t bytes)
 {
   unsigned int burn = 0;
+  unsigned int nburn;
 
   /* handle leftover */
   if (ctx->leftover)
@@ -546,15 +613,31 @@ _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
       ctx->leftover += want;
       if (ctx->leftover < POLY1305_BLOCKSIZE)
        return 0;
-      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
+      nburn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
+      burn = nburn > burn ? nburn : burn;
       ctx->leftover = 0;
     }
 
+#ifdef POLY1305_USE_PPC_VEC
+  /* PPC-P10/little-endian: bulk process multiples of eight blocks */
+  if (ctx->use_p10 && bytes >= POLY1305_BLOCKSIZE * 8)
+    {
+      size_t nblks = bytes / (POLY1305_BLOCKSIZE * 8);
+      size_t len = nblks * (POLY1305_BLOCKSIZE * 8);
+      POLY1305_STATE *st = &ctx->state;
+      nburn = gcry_poly1305_p10le_4blocks ((unsigned char *) st, m, len);
+      burn = nburn > burn ? nburn : burn;
+      m += len;
+      bytes -= len;
+    }
+#endif /* POLY1305_USE_PPC_VEC */
+
   /* process full blocks */
   if (bytes >= POLY1305_BLOCKSIZE)
     {
       size_t nblks = bytes / POLY1305_BLOCKSIZE;
-      burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
+      nburn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
+      burn = nburn > burn ? nburn : burn;
       m += nblks * POLY1305_BLOCKSIZE;
       bytes -= nblks * POLY1305_BLOCKSIZE;
     }
index e24de4dc7c162d28ec67319715af209769af6503..2f32d95049503e4e1b6c450c6bd8ec35e39a3a5b 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
@@ -752,6 +752,8 @@ gen_prime (unsigned int nbits, int secret, int randomlevel,
   unsigned int count1, count2;
   int *mods;
 
+  (void)count1; /* The value is not used, actually.  */
+
 /*   if (  DBG_CIPHER ) */
 /*     log_debug ("generate a prime of %u bits ", nbits ); */
 
index 0ca77099179bbe4e16b2d788d1bbdac0abcd65cd..acf4510501a8c37936c9170b28d5d35cef9360a3 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 244dd5d40db19ced8e1cb045ee4b5ebbe35ba197..68defea6661fb8ee0cdccecc442b4983eb74512a 100644 (file)
@@ -957,10 +957,7 @@ _gcry_pk_util_data_to_mpi (gcry_sexp_t input, gcry_mpi_t *ret_mpi,
       void *random_override = NULL;
       size_t random_override_len = 0;
 
-      /* The RSA PKCS#1.5 encryption is no longer supported by FIPS */
-      if (fips_mode ())
-        rc = GPG_ERR_INV_FLAG;
-      else if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
+      if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
         rc = GPG_ERR_INV_OBJ;
       else
         {
@@ -1092,10 +1089,7 @@ _gcry_pk_util_data_to_mpi (gcry_sexp_t input, gcry_mpi_t *ret_mpi,
       const void * value;
       size_t valuelen;
 
-      /* The RSA OAEP encryption requires some more assurances in FIPS */
-      if (fips_mode ())
-        rc = GPG_ERR_INV_FLAG;
-      else if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
+      if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
        rc = GPG_ERR_INV_OBJ;
       else
        {
index 4612f64d6511f8d3d58ee4935a17aacc99cc29c6..214bd6117d2f308171f17493b111742304ce467f 100644 (file)
@@ -6,7 +6,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -48,6 +48,7 @@ static gcry_pk_spec_t * const pubkey_list[] =
 #if USE_ELGAMAL
     &_gcry_pubkey_spec_elg,
 #endif
+    &_gcry_pubkey_spec_kem,
     NULL
   };
 
@@ -452,23 +453,51 @@ _gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, gcry_sexp_t s_skey)
 }
 
 
-gcry_err_code_t
-_gcry_pk_sign_md (gcry_sexp_t *r_sig, const char *tmpl, gcry_md_hd_t hd_orig,
-                  gcry_sexp_t s_skey, gcry_ctx_t ctx)
+#define MAX_CONTEXTS 2 /* Currently, input data and random_override */
+
+static gcry_err_code_t
+prepare_datasexp_to_be_signed (const char *tmpl, gcry_md_hd_t hd,
+                               gcry_ctx_t ctx, gcry_sexp_t *s_data_p)
 {
-  gcry_err_code_t rc;
-  gcry_pk_spec_t *spec;
-  gcry_sexp_t keyparms = NULL;
-  gcry_sexp_t s_hash = NULL;
-  int algo;
+  const char *s;
+  const char *digest_name = NULL;
   const unsigned char *digest;
   int digest_size;
-  gcry_error_t err;
-  gcry_md_hd_t hd;
-  const char *s;
-  char *hash_name;
+  int algo;
+  gcry_err_code_t rc;
 
-  *r_sig = NULL;
+  if (hd == NULL)
+    {
+      const unsigned char *data[MAX_CONTEXTS];
+      int data_size[MAX_CONTEXTS];
+      int i = 0;
+      void *argv[MAX_CONTEXTS*2];
+
+      while (1)
+        {
+          size_t len;
+
+          rc = _gcry_pk_get_single_data (&ctx, &data[i/2], &len);
+          if (rc)
+            return rc;
+
+          data_size[i/2] = (int)len;
+
+          argv[i] = (void *)&data_size[i/2];
+          argv[i+1] = (void *)&data[i/2];
+
+          i += 2;
+
+          if (!ctx)
+            break;
+
+          if (i >= MAX_CONTEXTS*2)
+            return GPG_ERR_EINVAL;
+        }
+
+      rc = _gcry_sexp_build_array (s_data_p, NULL, tmpl, argv);
+      return rc;
+    }
 
   /* Check if it has fixed hash name or %s */
   s = strstr (tmpl, "(hash ");
@@ -477,72 +506,59 @@ _gcry_pk_sign_md (gcry_sexp_t *r_sig, const char *tmpl, gcry_md_hd_t hd_orig,
 
   s += 6;
   if (!strncmp (s, "%s", 2))
-    hash_name = NULL;
+    {
+      algo = _gcry_md_get_algo (hd);
+
+      if (fips_mode () && algo == GCRY_MD_SHA1)
+        {
+          _gcry_md_close (hd);
+          return GPG_ERR_DIGEST_ALGO;
+        }
+
+      digest_name = _gcry_md_algo_name (algo);
+      digest_size = (int)_gcry_md_get_algo_dlen (algo);
+      digest = _gcry_md_read (hd, 0);
+    }
   else
     {
       const char *p;
+      char *digest_name_supplied;
 
       for (p = s; *p && *p != ' '; p++)
        ;
 
-      hash_name = xtrymalloc (p - s + 1);
-      if (!hash_name)
+      digest_name_supplied = xtrymalloc (p - s + 1);
+      if (!digest_name_supplied)
        return gpg_error_from_syserror ();
-      memcpy (hash_name, s, p - s);
-      hash_name[p - s] = 0;
-    }
-
-  err = _gcry_md_copy (&hd, hd_orig);
-  if (err)
-    {
-      xfree (hash_name);
-      return gpg_err_code (err);
-    }
+      memcpy (digest_name_supplied, s, p - s);
+      digest_name_supplied[p - s] = 0;
 
-  if (hash_name)
-    {
-      algo = _gcry_md_map_name (hash_name);
-      digest_size = (int) _gcry_md_get_algo_dlen (algo);
-
-      if (algo == 0 || digest_size == 0
+      algo = _gcry_md_map_name (digest_name_supplied);
+      xfree (digest_name_supplied);
+      if (algo == 0
           || (fips_mode () && algo == GCRY_MD_SHA1))
        {
-         xfree (hash_name);
          _gcry_md_close (hd);
          return GPG_ERR_DIGEST_ALGO;
        }
 
+      digest_size = (int)_gcry_md_get_algo_dlen (algo);
       digest = _gcry_md_read (hd, algo);
     }
-  else
-    {
-      algo = _gcry_md_get_algo (hd);
-      digest_size = (int) _gcry_md_get_algo_dlen (algo);
-
-      if (digest_size == 0 || (fips_mode () && algo == GCRY_MD_SHA1))
-        {
-          _gcry_md_close (hd);
-          return GPG_ERR_DIGEST_ALGO;
-        }
-
-      digest = _gcry_md_read (hd, 0);
-    }
 
   if (!digest)
     {
-      xfree (hash_name);
       _gcry_md_close (hd);
       return GPG_ERR_NOT_IMPLEMENTED;
     }
 
   if (!ctx)
     {
-      if (hash_name)
-       rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
+      if (!digest_name)
+       rc = _gcry_sexp_build (s_data_p, NULL, tmpl,
                               digest_size, digest);
       else
-       rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                              _gcry_md_algo_name (algo),
+       rc = _gcry_sexp_build (s_data_p, NULL, tmpl, digest_name,
                               digest_size, digest);
     }
   else
@@ -550,26 +566,46 @@ _gcry_pk_sign_md (gcry_sexp_t *r_sig, const char *tmpl, gcry_md_hd_t hd_orig,
       const unsigned char *p;
       size_t len;
 
-      rc = _gcry_pk_get_random_override (ctx, &p, &len);
+      rc = _gcry_pk_get_single_data (&ctx, &p, &len);
       if (rc)
-        {
-          _gcry_md_close (hd);
-          return rc;
-        }
+        return rc;
 
-      if (hash_name)
-       rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                              digest_size, digest,
-                              (int) len, p);
+      if (!digest_name)
+       rc = _gcry_sexp_build (s_data_p, NULL, tmpl,
+                               digest_size, digest, (int) len, p);
       else
-       rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                              _gcry_md_algo_name (algo),
-                              digest_size, digest,
-                              (int) len, p);
+       rc = _gcry_sexp_build (s_data_p, NULL, tmpl, digest_name,
+                               digest_size, digest, (int) len, p);
     }
 
-  xfree (hash_name);
   _gcry_md_close (hd);
+  return rc;
+}
+
+
+gcry_err_code_t
+_gcry_pk_sign_md (gcry_sexp_t *r_sig, const char *tmpl, gcry_md_hd_t hd_orig,
+                  gcry_sexp_t s_skey, gcry_ctx_t ctx)
+{
+  gcry_err_code_t rc;
+  gcry_pk_spec_t *spec;
+  gcry_sexp_t keyparms = NULL;
+  gcry_sexp_t s_data = NULL;
+  gcry_error_t err;
+  gcry_md_hd_t hd;
+
+  *r_sig = NULL;
+
+  if (!hd_orig)
+    hd = NULL;
+  else
+    {
+      err = _gcry_md_copy (&hd, hd_orig);
+      if (err)
+        return gpg_err_code (err);
+    }
+
+  rc = prepare_datasexp_to_be_signed (tmpl, hd, ctx, &s_data);
   if (rc)
     return rc;
 
@@ -582,12 +618,12 @@ _gcry_pk_sign_md (gcry_sexp_t *r_sig, const char *tmpl, gcry_md_hd_t hd_orig,
   else if (!spec->flags.fips && fips_mode ())
     rc = GPG_ERR_PUBKEY_ALGO;
   else if (spec->sign)
-    rc = spec->sign (r_sig, s_hash, keyparms);
+    rc = spec->sign (r_sig, s_data, keyparms);
   else
     rc = GPG_ERR_NOT_IMPLEMENTED;
 
  leave:
-  sexp_release (s_hash);
+  sexp_release (s_data);
   sexp_release (keyparms);
   return rc;
 }
@@ -633,115 +669,20 @@ _gcry_pk_verify_md (gcry_sexp_t s_sig, const char *tmpl, gcry_md_hd_t hd_orig,
   gcry_err_code_t rc;
   gcry_pk_spec_t *spec;
   gcry_sexp_t keyparms = NULL;
-  gcry_sexp_t s_hash = NULL;
-  int algo;
-  const unsigned char *digest;
-  int digest_size;
+  gcry_sexp_t s_data = NULL;
   gcry_error_t err;
   gcry_md_hd_t hd;
-  const char *s;
-  char *hash_name;
-
-  /* Check if it has fixed hash name or %s */
-  s = strstr (tmpl, "(hash ");
-  if (s == NULL)
-    return GPG_ERR_DIGEST_ALGO;
-
-  s += 6;
-  if (!strncmp (s, "%s", 2))
-    hash_name = NULL;
-  else
-    {
-      const char *p;
-
-      for (p = s; *p && *p != ' '; p++)
-        ;
-
-      hash_name = xtrymalloc (p - s + 1);
-      if (!hash_name)
-        return gpg_error_from_syserror ();
-      memcpy (hash_name, s, p - s);
-      hash_name[p - s] = 0;
-    }
-
-  err = _gcry_md_copy (&hd, hd_orig);
-  if (err)
-    {
-      xfree (hash_name);
-      return gpg_err_code (err);
-    }
-
-  if (hash_name)
-    {
-      algo = _gcry_md_map_name (hash_name);
-      digest_size = (int) _gcry_md_get_algo_dlen (algo);
-
-      if (algo == 0 || digest_size == 0
-          || (fips_mode () && algo == GCRY_MD_SHA1))
-        {
-          xfree (hash_name);
-          _gcry_md_close (hd);
-          return GPG_ERR_DIGEST_ALGO;
-        }
-
-      digest = _gcry_md_read (hd, algo);
-    }
-  else
-    {
-      algo = _gcry_md_get_algo (hd);
-      digest_size = (int) _gcry_md_get_algo_dlen (algo);
-
-      if (digest_size == 0 || (fips_mode () && algo == GCRY_MD_SHA1))
-        {
-          _gcry_md_close (hd);
-          return GPG_ERR_DIGEST_ALGO;
-        }
 
-      digest = _gcry_md_read (hd, 0);
-    }
-
-  if (!digest)
-    {
-      xfree (hash_name);
-      _gcry_md_close (hd);
-      return GPG_ERR_DIGEST_ALGO;
-    }
-
-  if (!ctx)
-    {
-      if (hash_name)
-        rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                               digest_size, digest);
-      else
-        rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                               _gcry_md_algo_name (algo),
-                               digest_size, digest);
-    }
+  if (!hd_orig)
+    hd = NULL;
   else
     {
-      const unsigned char *p;
-      size_t len;
-
-      rc = _gcry_pk_get_random_override (ctx, &p, &len);
-      if (rc)
-        {
-          _gcry_md_close (hd);
-          return rc;
-        }
-
-      if (hash_name)
-        rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                               digest_size, digest,
-                               (int) len, p);
-      else
-        rc = _gcry_sexp_build (&s_hash, NULL, tmpl,
-                               _gcry_md_algo_name (algo),
-                               digest_size, digest,
-                               (int) len, p);
+      err = _gcry_md_copy (&hd, hd_orig);
+      if (err)
+        return gpg_err_code (err);
     }
 
-  xfree (hash_name);
-  _gcry_md_close (hd);
+  rc = prepare_datasexp_to_be_signed (tmpl, hd, ctx, &s_data);
   if (rc)
     return rc;
 
@@ -754,12 +695,12 @@ _gcry_pk_verify_md (gcry_sexp_t s_sig, const char *tmpl, gcry_md_hd_t hd_orig,
   else if (!spec->flags.fips && fips_mode ())
     rc = GPG_ERR_PUBKEY_ALGO;
   else if (spec->verify)
-    rc = spec->verify (s_sig, s_hash, keyparms);
+    rc = spec->verify (s_sig, s_data, keyparms);
   else
     rc = GPG_ERR_NOT_IMPLEMENTED;
 
  leave:
-  sexp_release (s_hash);
+  sexp_release (s_data);
   sexp_release (keyparms);
   return rc;
 }
@@ -800,38 +741,34 @@ _gcry_pk_testkey (gcry_sexp_t s_key)
 }
 
 
-/*
-  Create a public key pair and return it in r_key.
-  How the key is created depends on s_parms:
-  (genkey
-   (algo
-     (parameter_name_1 ....)
-      ....
-     (parameter_name_n ....)
-  ))
-  The key is returned in a format depending on the
-  algorithm. Both, private and secret keys are returned
-  and optionally some additional informatin.
-  For elgamal we return this structure:
-  (key-data
-   (public-key
-     (elg
-       (p <mpi>)
-       (g <mpi>)
-       (y <mpi>)
-     )
-   )
-   (private-key
-     (elg
-       (p <mpi>)
-       (g <mpi>)
-       (y <mpi>)
-       (x <mpi>)
-     )
-   )
-   (misc-key-info
-      (pm1-factors n1 n2 ... nn)
-   ))
+
+/* Create a public key pair and return it as R_KEY.
+ * How the key is created depends on s_parms:
+ *
+ *   (genkey
+ *     (algo
+ *       (parameter_name_1 ....)
+ *       ....
+ *       (parameter_name_n ....)))
+ *
+ * The key is returned in a format depending on the algorithm. Both,
+ * private and secret keys are returned and optionally some additional
+ * information.  For example for Elgamal this structure is returned:
+ *
+ *   (key-data
+ *     (public-key
+ *       (elg
+ *         (p <mpi>)
+ *         (g <mpi>)
+ *         (y <mpi>)))
+ *     (private-key
+ *       (elg
+ *         (p <mpi>)
+ *         (g <mpi>)
+ *         (y <mpi>)
+ *         (x <mpi>)))
+ *     (misc-key-info
+ *        (pm1-factors n1 n2 ... nn)))
  */
 gcry_err_code_t
 _gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms)
@@ -1273,47 +1210,50 @@ _gcry_pk_selftest (int algo, int extended, selftest_report_func_t report)
 }
 
 
-struct pk_random_override {
+struct pk_single_data {
   size_t len;
   unsigned char area[1];  /* In future, we may use flexible array member.  */
 };
 
 gpg_err_code_t
-_gcry_pk_random_override_new (gcry_ctx_t *r_ctx,
-                              const unsigned char *p, size_t len)
+_gcry_pk_single_data_push (gcry_ctx_t *r_ctx,
+                          const unsigned char *p, size_t len)
 {
   gcry_ctx_t ctx;
-  struct pk_random_override *pro;
+  struct pk_single_data *psd;
+  int data_type = CONTEXT_TYPE_SINGLE_DATA;
 
-  *r_ctx = NULL;
   if (!p)
     return GPG_ERR_EINVAL;
 
-  ctx = _gcry_ctx_alloc (CONTEXT_TYPE_RANDOM_OVERRIDE,
-                         offsetof (struct pk_random_override, area) + len,
-                         NULL);
+  ctx = _gcry_ctx_alloc (data_type,
+                        offsetof (struct pk_single_data, area) + len,
+                         NULL, *r_ctx);
   if (!ctx)
     return gpg_err_code_from_syserror ();
-  pro = _gcry_ctx_get_pointer (ctx, CONTEXT_TYPE_RANDOM_OVERRIDE);
-  pro->len = len;
-  memcpy (pro->area, p, len);
+  psd = _gcry_ctx_get_pointer (ctx, data_type);
+  psd->len = len;
+  memcpy (psd->area, p, len);
 
   *r_ctx = ctx;
   return 0;
 }
 
 gpg_err_code_t
-_gcry_pk_get_random_override (gcry_ctx_t ctx,
-                              const unsigned char **r_p, size_t *r_len)
+_gcry_pk_get_single_data (gcry_ctx_t *r_ctx,
+                          const unsigned char **r_p, size_t *r_len)
 {
-  struct pk_random_override *pro;
+  struct pk_single_data *psd;
+  int data_type = CONTEXT_TYPE_SINGLE_DATA;
+  gcry_ctx_t ctx = *r_ctx;
 
-  pro = _gcry_ctx_find_pointer (ctx, CONTEXT_TYPE_RANDOM_OVERRIDE);
-  if (!pro)
+  psd = _gcry_ctx_find_pointer (ctx, data_type);
+  if (!psd)
     return GPG_ERR_EINVAL;
 
-  *r_p = pro->area;
-  *r_len = pro->len;
+  *r_p = psd->area;
+  *r_len = psd->len;
+  *r_ctx = _gcry_ctx_get_pointer (ctx, 0);
 
   return 0;
 }
index c270ce9b3e772320f6cb9a034f55911fff5bd0ff..30efa7ae9fcb2730988715f796a004b8b2276fb4 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 /* This implementation was written by Nikos Mavroyanopoulos for GNUTLS
index 184fcd20a720bfc7013f27be136e4d248cfe3c1a..90998dedea6138289a5b02511249780d481ceb18 100644 (file)
 .globl _gcry_aes_arm_encrypt_block
 ELF(.type   _gcry_aes_arm_encrypt_block,%function;)
 
+.align 4
 _gcry_aes_arm_encrypt_block:
        /* input:
         *      %x0: keysched, CTX
@@ -265,7 +266,6 @@ _gcry_aes_arm_encrypt_block:
        mov     x0, #(0);
        ret_spec_stop;
 
-.ltorg
 .Lenc_not_128:
        beq .Lenc_192
 
@@ -278,7 +278,6 @@ _gcry_aes_arm_encrypt_block:
 
        b .Lenc_done;
 
-.ltorg
 .Lenc_192:
        encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
        encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
@@ -433,6 +432,7 @@ ELF(.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;)
 .globl _gcry_aes_arm_decrypt_block
 ELF(.type   _gcry_aes_arm_decrypt_block,%function;)
 
+.align 4
 _gcry_aes_arm_decrypt_block:
        /* input:
         *      %x0: keysched, CTX
@@ -488,7 +488,6 @@ _gcry_aes_arm_decrypt_block:
        mov     x0, #(0);
        ret_spec_stop;
 
-.ltorg
 .Ldec_256:
        beq .Ldec_192;
 
@@ -500,7 +499,6 @@ _gcry_aes_arm_decrypt_block:
 
        b .Ldec_tail;
 
-.ltorg
 .Ldec_192:
        firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
        decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
index ff6b0b264c2d568737d0e09220084c7565c48455..906737a663bc9a3096a65a5f47c8230d5f40e566 100644 (file)
@@ -27,7 +27,6 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
-#include "cipher-selftest.h"
 #include "rijndael-internal.h"
 #include "./cipher-internal.h"
 
@@ -871,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm10\n\t"
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0xa0(%[key]), %%xmm0\n\t"
-                "jb .Ldeclast%=\n\t"
+                "jb .Lenclast%=\n\t"
                 "aesenc %%xmm0, %%xmm1\n\t"
                 "aesenc %%xmm0, %%xmm2\n\t"
                 "aesenc %%xmm0, %%xmm3\n\t"
@@ -890,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm10\n\t"
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0xc0(%[key]), %%xmm0\n\t"
-                "je .Ldeclast%=\n\t"
+                "je .Lenclast%=\n\t"
                 "aesenc %%xmm0, %%xmm1\n\t"
                 "aesenc %%xmm0, %%xmm2\n\t"
                 "aesenc %%xmm0, %%xmm3\n\t"
@@ -910,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0xe0(%[key]), %%xmm0\n"
 
-                ".Ldeclast%=:\n\t"
+                ".Lenclast%=:\n\t"
                 : /* no output */
                 : [key] "r" (ctx->keyschenc),
                   [rounds] "r" (ctx->rounds)
@@ -1718,6 +1717,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
 }
 
 
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst,
+                          const unsigned char *src, size_t nblocks,
+                          int encrypt)
+{
+  aesni_prepare_2_7_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_7();
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      do_aesni_prepare_decryption ( ctx );
+      ctx->decryption_prepared = 1;
+    }
+
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      const void *key = encrypt ? ctx->keyschenc : ctx->keyschdec;
+      aesni_prepare_8_15_variable;
+
+      aesni_prepare_8_15();
+
+      for (; nblocks >= 8; nblocks -= 8)
+       {
+         asm volatile
+           ("movdqa (%[key]), %%xmm0\n\t"
+            "movdqu 0*16(%[src]), %%xmm1\n\t"
+            "movdqu 1*16(%[src]), %%xmm2\n\t"
+            "movdqu 2*16(%[src]), %%xmm3\n\t"
+            "movdqu 3*16(%[src]), %%xmm4\n\t"
+            "movdqu 4*16(%[src]), %%xmm8\n\t"
+            "movdqu 5*16(%[src]), %%xmm9\n\t"
+            "movdqu 6*16(%[src]), %%xmm10\n\t"
+            "movdqu 7*16(%[src]), %%xmm11\n\t"
+            "pxor   %%xmm0, %%xmm1\n\t"
+            "pxor   %%xmm0, %%xmm2\n\t"
+            "pxor   %%xmm0, %%xmm3\n\t"
+            "pxor   %%xmm0, %%xmm4\n\t"
+            "pxor   %%xmm0, %%xmm8\n\t"
+            "pxor   %%xmm0, %%xmm9\n\t"
+            "pxor   %%xmm0, %%xmm10\n\t"
+            "pxor   %%xmm0, %%xmm11\n\t"
+            : /* No output */
+            : [src] "r" (src),
+              [key] "r" (key)
+            : "memory");
+
+         if (encrypt)
+           {
+             do_aesni_enc_vec8 (ctx);
+             asm volatile
+               ("aesenclast %%xmm0, %%xmm1\n\t"
+                "aesenclast %%xmm0, %%xmm2\n\t"
+                "aesenclast %%xmm0, %%xmm3\n\t"
+                "aesenclast %%xmm0, %%xmm4\n\t"
+                "aesenclast %%xmm0, %%xmm8\n\t"
+                "aesenclast %%xmm0, %%xmm9\n\t"
+                "aesenclast %%xmm0, %%xmm10\n\t"
+                "aesenclast %%xmm0, %%xmm11\n\t"
+                ::: "memory" );
+           }
+         else
+           {
+             do_aesni_dec_vec8 (ctx);
+             asm volatile
+               ("aesdeclast %%xmm0, %%xmm1\n\t"
+                "aesdeclast %%xmm0, %%xmm2\n\t"
+                "aesdeclast %%xmm0, %%xmm3\n\t"
+                "aesdeclast %%xmm0, %%xmm4\n\t"
+                "aesdeclast %%xmm0, %%xmm8\n\t"
+                "aesdeclast %%xmm0, %%xmm9\n\t"
+                "aesdeclast %%xmm0, %%xmm10\n\t"
+                "aesdeclast %%xmm0, %%xmm11\n\t"
+                ::: "memory" );
+           }
+
+         asm volatile
+           ("movdqu %%xmm1, 0*16(%[dst])\n\t"
+            "movdqu %%xmm2, 1*16(%[dst])\n\t"
+            "movdqu %%xmm3, 2*16(%[dst])\n\t"
+            "movdqu %%xmm4, 3*16(%[dst])\n\t"
+            "movdqu %%xmm8, 4*16(%[dst])\n\t"
+            "movdqu %%xmm9, 5*16(%[dst])\n\t"
+            "movdqu %%xmm10, 6*16(%[dst])\n\t"
+            "movdqu %%xmm11, 7*16(%[dst])\n\t"
+            : /* No output */
+            : [dst] "r" (dst)
+            : "memory");
+
+         dst += 8*BLOCKSIZE;
+         src += 8*BLOCKSIZE;
+       }
+
+      aesni_cleanup_8_15();
+    }
+#endif
+
+  for (; nblocks >= 4; nblocks -= 4)
+    {
+      asm volatile
+       ("movdqu 0*16(%[src]), %%xmm1\n\t"
+        "movdqu 1*16(%[src]), %%xmm2\n\t"
+        "movdqu 2*16(%[src]), %%xmm3\n\t"
+        "movdqu 3*16(%[src]), %%xmm4\n\t"
+        : /* No output */
+        : [src] "r" (src)
+        : "memory");
+
+      if (encrypt)
+       do_aesni_enc_vec4 (ctx);
+      else
+       do_aesni_dec_vec4 (ctx);
+
+      asm volatile
+       ("movdqu %%xmm1, 0*16(%[dst])\n\t"
+        "movdqu %%xmm2, 1*16(%[dst])\n\t"
+        "movdqu %%xmm3, 2*16(%[dst])\n\t"
+        "movdqu %%xmm4, 3*16(%[dst])\n\t"
+        : /* No output */
+        : [dst] "r" (dst)
+        : "memory");
+
+      dst += 4*BLOCKSIZE;
+      src += 4*BLOCKSIZE;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      asm volatile ("movdqu %[src], %%xmm0\n\t"
+                    :
+                    : [src] "m" (*src)
+                    : "memory" );
+
+      if (encrypt)
+       do_aesni_enc (ctx);
+      else
+       do_aesni_dec (ctx);
+
+      asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+                    : [dst] "=m" (*dst)
+                    :
+                    : "memory" );
+
+      dst += BLOCKSIZE;
+      src += BLOCKSIZE;
+    }
+
+  aesni_cleanup ();
+  aesni_cleanup_2_7 ();
+}
+
+
 void ASM_FUNC_ATTR
 _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
index 6e3cc81932b05d387efdec09b6eb03fe48e6681a..526c2b7b74c63521a8bc5324d4b5bf0fa623a8e4 100644 (file)
 #define lastencround(round) \
        do_lastencround((round) + 1);
 
-.align 8
+.align 16
 .globl _gcry_aes_amd64_encrypt_block
 ELF(.type   _gcry_aes_amd64_encrypt_block,@function;)
 
@@ -377,7 +377,7 @@ ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;)
 #define lastdecround(round) \
        do_lastdecround(round);
 
-.align 8
+.align 16
 .globl _gcry_aes_amd64_decrypt_block
 ELF(.type   _gcry_aes_amd64_decrypt_block,@function;)
 
index e680c817b22300708053bcd3622280167883ebf0..632daac2aea717b53933c0634db8d99ff2d995bc 100644 (file)
 .arm
 
 /* register macros */
-#define CTX    %r0
-#define RTAB   %lr
-#define RMASK  %ip
+#define CTX    r0
+#define RTAB   lr
+#define RMASK  ip
 
-#define RA     %r4
-#define RB     %r5
-#define RC     %r6
-#define RD     %r7
+#define RA     r4
+#define RB     r5
+#define RC     r6
+#define RD     r7
 
-#define RNA    %r8
-#define RNB    %r9
-#define RNC    %r10
-#define RND    %r11
+#define RNA    r8
+#define RNB    r9
+#define RNC    r10
+#define RND    r11
 
-#define RT0    %r1
-#define RT1    %r2
-#define RT2    %r3
+#define RT0    r1
+#define RT1    r2
+#define RT2    r3
 
 /* helper macros */
 #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
 
 _gcry_aes_arm_encrypt_block:
        /* input:
-        *      %r0: keysched, CTX
-        *      %r1: dst
-        *      %r2: src
-        *      %r3: number of rounds.. 10, 12 or 14
-        *      %st+0: encryption table
+        *      r0: keysched, CTX
+        *      r1: dst
+        *      r2: src
+        *      r3: number of rounds.. 10, 12 or 14
+        *      st+0: encryption table
         */
-       push {%r4-%r11, %ip, %lr};
+       push {r4-r11, ip, lr};
 
        /* read input block */
 
        /* test if src is unaligned */
-       tst     %r2, #3;
+       tst     r2, #3;
        beq     1f;
 
        /* unaligned load */
-       ldr_unaligned_le(RA, %r2, 0, RNA);
-       ldr_unaligned_le(RB, %r2, 4, RNB);
-       ldr_unaligned_le(RC, %r2, 8, RNA);
-       ldr_unaligned_le(RD, %r2, 12, RNB);
+       ldr_unaligned_le(RA, r2, 0, RNA);
+       ldr_unaligned_le(RB, r2, 4, RNB);
+       ldr_unaligned_le(RC, r2, 8, RNA);
+       ldr_unaligned_le(RD, r2, 12, RNB);
        b       2f;
 .ltorg
 1:
        /* aligned load */
-       ldm     %r2, {RA, RB, RC, RD};
+       ldm     r2, {RA, RB, RC, RD};
 #ifndef __ARMEL__
        rev     RA, RA;
        rev     RB, RB;
@@ -247,12 +247,12 @@ _gcry_aes_arm_encrypt_block:
        rev     RD, RD;
 #endif
 2:
-       ldr     RTAB, [%sp, #40];
-       sub     %sp, #16;
+       ldr     RTAB, [sp, #40];
+       sub     sp, #16;
 
-       str     %r1, [%sp, #4];         /* dst */
+       str     r1, [sp, #4];           /* dst */
        mov     RMASK, #0xff;
-       str     %r3, [%sp, #8];         /* nrounds */
+       str     r3, [sp, #8];           /* nrounds */
        mov     RMASK, RMASK, lsl#2;    /* byte mask */
 
        firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
@@ -264,7 +264,7 @@ _gcry_aes_arm_encrypt_block:
        encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
        encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
 
-       ldr     RT0, [%sp, #8];         /* nrounds */
+       ldr     RT0, [sp, #8];          /* nrounds */
        cmp     RT0, #12;
        bge     .Lenc_not_128;
 
@@ -272,8 +272,8 @@ _gcry_aes_arm_encrypt_block:
        lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
 
 .Lenc_done:
-       ldr     RT0, [%sp, #4];         /* dst */
-       add     %sp, #16;
+       ldr     RT0, [sp, #4];          /* dst */
+       add     sp, #16;
 
        /* store output block */
 
@@ -301,7 +301,7 @@ _gcry_aes_arm_encrypt_block:
 2:
 
        mov     r0, #(10 * 4);
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 
 .ltorg
 .Lenc_not_128:
@@ -473,30 +473,30 @@ _gcry_aes_arm_encrypt_block:
 
 _gcry_aes_arm_decrypt_block:
        /* input:
-        *      %r0: keysched, CTX
-        *      %r1: dst
-        *      %r2: src
-        *      %r3: number of rounds.. 10, 12 or 14
-        *      %st+0: decryption table
+        *      r0: keysched, CTX
+        *      r1: dst
+        *      r2: src
+        *      r3: number of rounds.. 10, 12 or 14
+        *      st+0: decryption table
         */
-       push {%r4-%r11, %ip, %lr};
+       push {r4-r11, ip, lr};
 
        /* read input block */
 
        /* test if src is unaligned */
-       tst     %r2, #3;
+       tst     r2, #3;
        beq     1f;
 
        /* unaligned load */
-       ldr_unaligned_le(RA, %r2, 0, RNA);
-       ldr_unaligned_le(RB, %r2, 4, RNB);
-       ldr_unaligned_le(RC, %r2, 8, RNA);
-       ldr_unaligned_le(RD, %r2, 12, RNB);
+       ldr_unaligned_le(RA, r2, 0, RNA);
+       ldr_unaligned_le(RB, r2, 4, RNB);
+       ldr_unaligned_le(RC, r2, 8, RNA);
+       ldr_unaligned_le(RD, r2, 12, RNB);
        b       2f;
 .ltorg
 1:
        /* aligned load */
-       ldm     %r2, {RA, RB, RC, RD};
+       ldm     r2, {RA, RB, RC, RD};
 #ifndef __ARMEL__
        rev     RA, RA;
        rev     RB, RB;
@@ -504,14 +504,14 @@ _gcry_aes_arm_decrypt_block:
        rev     RD, RD;
 #endif
 2:
-       ldr     RTAB, [%sp, #40];
-       sub     %sp, #16;
+       ldr     RTAB, [sp, #40];
+       sub     sp, #16;
 
        mov     RMASK, #0xff;
-       str     %r1, [%sp, #4];         /* dst */
+       str     r1, [sp, #4];           /* dst */
        mov     RMASK, RMASK, lsl#2;    /* byte mask */
 
-       cmp     %r3, #12;
+       cmp     r3, #12;
        bge     .Ldec_256;
 
        firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
@@ -526,8 +526,8 @@ _gcry_aes_arm_decrypt_block:
        decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask);
        lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
 
-       ldr     RT0, [%sp, #4];         /* dst */
-       add     %sp, #16;
+       ldr     RT0, [sp, #4];          /* dst */
+       add     sp, #16;
 
        /* store output block */
 
@@ -554,7 +554,7 @@ _gcry_aes_arm_decrypt_block:
        stm     RT0, {RA, RB, RC, RD};
 2:
        mov     r0, #(10 * 4);
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 
 .ltorg
 .Ldec_256:
index 1eafa93ed3352e1c6ba5fecfe75061db81e237ef..3c4149b38fbda6c4be8c112358f48465ffe0ad72 100644 (file)
@@ -483,9 +483,9 @@ _gcry_aes_cbc_enc_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: cbc_mac => r5
-   *    %st+8: nrounds => r6
+   *    st+0: nblocks => r4
+   *    st+4: cbc_mac => r5
+   *    st+8: nrounds => r6
    */
 
   push {r4-r6,lr} /* 4*4 = 16b */
@@ -563,8 +563,8 @@ _gcry_aes_cbc_dec_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   push {r4-r6,lr} /* 4*4 = 16b */
@@ -653,6 +653,149 @@ _gcry_aes_cbc_dec_armv8_ce:
 .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;
 
 
+/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_enc_armv8_ce
+.type  _gcry_aes_ecb_enc_armv8_ce,%function;
+_gcry_aes_ecb_enc_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: nblocks
+   *    st+0: nrounds => r4
+   */
+
+  push {r4-r6,lr} /* 4*4 = 16b */
+  cmp r3, #0
+  beq .Lecb_enc_skip
+  ldr r4, [sp, #(16+0)]
+  vpush {q4-q7}
+
+  cmp r4, #12
+  aes_preload_keys(r0, lr);
+
+  beq .Lecb_entry_192e
+  bhi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc, ...) \
+  .Lecb_entry_##bits##e_d: \
+    cmp r3, #4; \
+    blo .Lecb_loop_##bits##e_d; \
+    \
+  .Lecb_loop4_##bits##e_d: \
+    vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \
+    sub r3, r3, #4; \
+    vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \
+    cmp r3, #4; \
+    \
+    do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+    \
+    vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \
+    vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \
+    \
+    bhs .Lecb_loop4_##bits##e_d; \
+    cmp r3, #0; \
+    beq .Lecb_done_##e_d; \
+    \
+  .Lecb_loop_##bits##e_d: \
+    vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+    subs r3, r3, #1; \
+    \
+    do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \
+    \
+    vst1.8 {q1}, [r1]!; /* store plaintext */ \
+    bne .Lecb_loop_##bits##e_d; \
+    b .Lecb_done_##e_d;
+
+  ECB_CRYPT(128, e, mc)
+  ECB_CRYPT(192, e, mc, r0, lr)
+  ECB_CRYPT(256, e, mc, r0, lr)
+
+.Lecb_done_e:
+  CLEAR_REG(q0)
+  CLEAR_REG(q1)
+  CLEAR_REG(q2)
+  CLEAR_REG(q3)
+  CLEAR_REG(q8)
+  CLEAR_REG(q9)
+  vpop {q4-q7}
+  CLEAR_REG(q10)
+  CLEAR_REG(q11)
+  CLEAR_REG(q12)
+  CLEAR_REG(q13)
+  CLEAR_REG(q14)
+
+.Lecb_enc_skip:
+  pop {r4-r6,pc}
+.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks,
+ *                                  unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_dec_armv8_ce
+.type  _gcry_aes_ecb_dec_armv8_ce,%function;
+_gcry_aes_ecb_dec_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: nblocks
+   *    st+0: nrounds => r4
+   */
+
+  push {r4-r6,lr} /* 4*4 = 16b */
+  cmp r3, #0
+  beq .Lecb_enc_skip
+  ldr r4, [sp, #(16+0)]
+  vpush {q4-q7}
+
+  cmp r4, #12
+
+  aes_preload_keys(r0, lr);
+
+  beq .Lecb_entry_192d
+  bhi .Lecb_entry_256d
+
+  ECB_CRYPT(128, d, imc)
+  ECB_CRYPT(192, d, imc, r0, lr)
+  ECB_CRYPT(256, d, imc, r0, lr)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+  CLEAR_REG(q0)
+  CLEAR_REG(q1)
+  CLEAR_REG(q2)
+  CLEAR_REG(q3)
+  CLEAR_REG(q8)
+  CLEAR_REG(q9)
+  vpop {q4-q7}
+  CLEAR_REG(q10)
+  CLEAR_REG(q11)
+  CLEAR_REG(q12)
+  CLEAR_REG(q13)
+  CLEAR_REG(q14)
+
+.Lecb_dec_skip:
+  pop {r4-r6,pc}
+.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;
+
+
 /*
  * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
@@ -669,8 +812,8 @@ _gcry_aes_cfb_enc_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   push {r4-r6,lr} /* 4*4 = 16b */
@@ -745,8 +888,8 @@ _gcry_aes_cfb_dec_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   push {r4-r6,lr} /* 4*4 = 16b */
@@ -853,8 +996,8 @@ _gcry_aes_ctr_enc_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   vpush {q4-q7}
@@ -1033,8 +1176,8 @@ _gcry_aes_ctr32le_enc_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   vpush {q4-q7}
@@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce:
 
 
 /*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1158,11 +1301,11 @@ _gcry_aes_ocb_enc_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: offset
-   *    %st+0: checksum => r4
-   *    %st+4: Ls => r5
-   *    %st+8: nblocks => r6  (0 < nblocks <= 32)
-   *    %st+12: nrounds => r7
-   *    %st+16: blkn => lr
+   *    st+0: checksum => r4
+   *    st+4: Ls => r5
+   *    st+8: nblocks => r6  (0 < nblocks <= 32)
+   *    st+12: nrounds => r7
+   *    st+16: blkn => lr
    */
 
   vpush {q4-q7}
@@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce:
   CLEAR_REG(q13)
   CLEAR_REG(q14)
 
+  mov r0, #0
   pop {r4-r12,lr}
   vpop {q4-q7}
   bx lr
@@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce:
 
 
 /*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1332,11 +1476,11 @@ _gcry_aes_ocb_dec_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: offset
-   *    %st+0: checksum => r4
-   *    %st+4: Ls => r5
-   *    %st+8: nblocks => r6  (0 < nblocks <= 32)
-   *    %st+12: nrounds => r7
-   *    %st+16: blkn => lr
+   *    st+0: checksum => r4
+   *    st+4: Ls => r5
+   *    st+8: nblocks => r6  (0 < nblocks <= 32)
+   *    st+12: nrounds => r7
+   *    st+16: blkn => lr
    */
 
   vpush {q4-q7}
@@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce:
   CLEAR_REG(q13)
   CLEAR_REG(q14)
 
+  mov r0, #0
   pop {r4-r12,lr}
   vpop {q4-q7}
   bx lr
@@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce:
 
 
 /*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
  *                                   const unsigned char *abuf,
  *                                   unsigned char *offset,
  *                                   unsigned char *checksum,
@@ -1505,10 +1650,10 @@ _gcry_aes_ocb_auth_armv8_ce:
    *    r1: abuf
    *    r2: offset
    *    r3: checksum
-   *    %st+0: Ls => r5
-   *    %st+4: nblocks => r6  (0 < nblocks <= 32)
-   *    %st+8: nrounds => r7
-   *    %st+12: blkn => lr
+   *    st+0: Ls => r5
+   *    st+4: nblocks => r6  (0 < nblocks <= 32)
+   *    st+8: nrounds => r7
+   *    st+12: blkn => lr
    */
 
   vpush {q4-q7}
@@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce:
   CLEAR_REG(q13)
   CLEAR_REG(q14)
 
+  mov r0, #0
   pop {r4-r12,lr}
   vpop {q4-q7}
   bx lr
@@ -1655,8 +1801,8 @@ _gcry_aes_xts_enc_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   vpush {q4-q7}
@@ -1810,8 +1956,8 @@ _gcry_aes_xts_dec_armv8_ce:
    *    r1: outbuf
    *    r2: inbuf
    *    r3: iv
-   *    %st+0: nblocks => r4
-   *    %st+4: nrounds => r5
+   *    st+0: nblocks => r4
+   *    st+4: nrounds => r5
    */
 
   vpush {q4-q7}
index 4fef03454df40efb96119c5f2dd843ae2233f7cb..64f67fbe6f1a5f8705539b815604d71da95f854b 100644 (file)
  *                                     const byte *src,
  *                                     unsigned int nrounds);
  */
-.align 3
+.align 4
 .globl _gcry_aes_enc_armv8_ce
 ELF(.type  _gcry_aes_enc_armv8_ce,%function;)
 _gcry_aes_enc_armv8_ce:
@@ -326,7 +326,7 @@ ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;)
  *                                     const byte *src,
  *                                     unsigned int nrounds);
  */
-.align 3
+.align 4
 .globl _gcry_aes_dec_armv8_ce
 ELF(.type  _gcry_aes_dec_armv8_ce,%function;)
 _gcry_aes_dec_armv8_ce:
@@ -385,6 +385,119 @@ _gcry_aes_dec_armv8_ce:
 ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;)
 
 
+/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks, unsigned int nrounds);
+ */
+
+.align 4
+.globl _gcry_aes_ecb_enc_armv8_ce
+ELF(.type  _gcry_aes_ecb_enc_armv8_ce,%function;)
+_gcry_aes_ecb_enc_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: nblocks
+   *    w4: nrounds
+   */
+  CFI_STARTPROC();
+
+  cbz x3, .Lecb_enc_skip
+
+  aes_preload_keys(x0, w4);
+
+  b.eq .Lecb_entry_192e
+  b.hi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc) \
+  .Lecb_entry_##bits##e_d: \
+    cmp x3, #4; \
+    b.lo .Lecb_loop_##bits##e_d; \
+    \
+  .Lecb_loop4_##bits##e_d: \
+    sub x3, x3, #4; \
+    ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \
+    cmp x3, #4; \
+    do_aes_4_##bits(e_d, mc_imc, v0, v1, v2, v3); \
+    st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \
+    \
+    b.hs .Lecb_loop4_##bits##e_d; \
+    CLEAR_REG(v1); \
+    CLEAR_REG(v2); \
+    CLEAR_REG(v3); \
+    cbz x3, .Lecb_done_##e_d; \
+    \
+  .Lecb_loop_##bits##e_d: \
+    ld1 {v0.16b}, [x2], #16; /* load ciphertext */ \
+    sub x3, x3, #1; \
+    do_aes_one##bits(e_d, mc_imc, v0, v0, vk0); \
+    st1 {v0.16b}, [x1], #16; /* store plaintext */ \
+    \
+    cbnz x3, .Lecb_loop_##bits##e_d; \
+    b .Lecb_done_##e_d;
+
+  ECB_CRYPT(128, e, mc)
+  ECB_CRYPT(192, e, mc)
+  ECB_CRYPT(256, e, mc)
+
+.Lecb_done_e:
+  aes_clear_keys(w4)
+
+  CLEAR_REG(v0)
+
+.Lecb_enc_skip:
+  ret_spec_stop
+  CFI_ENDPROC();
+ELF(.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ *                                  unsigned char *outbuf,
+ *                                  const unsigned char *inbuf,
+ *                                  size_t nblocks, unsigned int nrounds);
+ */
+
+.align 4
+.globl _gcry_aes_ecb_dec_armv8_ce
+ELF(.type  _gcry_aes_ecb_dec_armv8_ce,%function;)
+_gcry_aes_ecb_dec_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: nblocks
+   *    w4: nrounds
+   */
+  CFI_STARTPROC();
+
+  cbz x3, .Lecb_enc_skip
+
+  aes_preload_keys(x0, w4);
+
+  b.eq .Lecb_entry_192d
+  b.hi .Lecb_entry_256d
+
+  ECB_CRYPT(128, d, imc)
+  ECB_CRYPT(192, d, imc)
+  ECB_CRYPT(256, d, imc)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+  aes_clear_keys(w4)
+
+  CLEAR_REG(v0)
+
+.Lecb_dec_skip:
+  ret_spec_stop
+  CFI_ENDPROC();
+ELF(.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;)
+
+
 /*
  * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
@@ -393,7 +506,7 @@ ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;)
  *                                  int cbc_mac, unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_cbc_enc_armv8_ce
 ELF(.type  _gcry_aes_cbc_enc_armv8_ce,%function;)
 _gcry_aes_cbc_enc_armv8_ce:
@@ -471,10 +584,11 @@ ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;)
  * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
- *                                  unsigned char *iv, unsigned int nrounds);
+ *                                  unsigned char *iv,
+ *                                  size_t nblocks, unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_cbc_dec_armv8_ce
 ELF(.type  _gcry_aes_cbc_dec_armv8_ce,%function;)
 _gcry_aes_cbc_dec_armv8_ce:
@@ -596,7 +710,7 @@ ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;)
  *                                  unsigned char *iv, unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_ctr_enc_armv8_ce
 ELF(.type  _gcry_aes_ctr_enc_armv8_ce,%function;)
 _gcry_aes_ctr_enc_armv8_ce:
@@ -790,7 +904,7 @@ ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;)
  *                                      unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_ctr32le_enc_armv8_ce
 ELF(.type  _gcry_aes_ctr32le_enc_armv8_ce,%function;)
 _gcry_aes_ctr32le_enc_armv8_ce:
@@ -936,7 +1050,7 @@ ELF(.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;)
  *                                  unsigned char *iv, unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_cfb_enc_armv8_ce
 ELF(.type  _gcry_aes_cfb_enc_armv8_ce,%function;)
 _gcry_aes_cfb_enc_armv8_ce:
@@ -1018,7 +1132,7 @@ ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;)
  *                                  unsigned char *iv, unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_cfb_dec_armv8_ce
 ELF(.type  _gcry_aes_cfb_dec_armv8_ce,%function;)
 _gcry_aes_cfb_dec_armv8_ce:
@@ -1136,7 +1250,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;)
 
 
 /*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1147,7 +1261,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;)
  *                                  unsigned int blkn);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_ocb_enc_armv8_ce
 ELF(.type  _gcry_aes_ocb_enc_armv8_ce,%function;)
 _gcry_aes_ocb_enc_armv8_ce:
@@ -1379,13 +1493,14 @@ _gcry_aes_ocb_enc_armv8_ce:
   add sp, sp, #128;
   CFI_ADJUST_CFA_OFFSET(-128);
 
+  mov x0, #0
   ret_spec_stop
   CFI_ENDPROC();
 ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;)
 
 
 /*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
  *                                  const unsigned char *inbuf,
  *                                  unsigned char *offset,
@@ -1396,7 +1511,7 @@ ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;)
  *                                  unsigned int blkn);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_ocb_dec_armv8_ce
 ELF(.type  _gcry_aes_ocb_dec_armv8_ce,%function;)
 _gcry_aes_ocb_dec_armv8_ce:
@@ -1458,13 +1573,14 @@ _gcry_aes_ocb_dec_armv8_ce:
   add sp, sp, #128;
   CFI_ADJUST_CFA_OFFSET(-128);
 
+  mov x0, #0
   ret_spec_stop
   CFI_ENDPROC();
 ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;)
 
 
 /*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
  *                                   const unsigned char *abuf,
  *                                   unsigned char *offset,
  *                                   unsigned char *checksum,
@@ -1474,7 +1590,7 @@ ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;)
  *                                   unsigned int blkn);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_ocb_auth_armv8_ce
 ELF(.type  _gcry_aes_ocb_auth_armv8_ce,%function;)
 _gcry_aes_ocb_auth_armv8_ce:
@@ -1605,6 +1721,7 @@ _gcry_aes_ocb_auth_armv8_ce:
   CLEAR_REG(v2)
   CLEAR_REG(v16)
 
+  mov x0, #0
   ret_spec_stop
   CFI_ENDPROC();
 ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;)
@@ -1619,7 +1736,7 @@ ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;)
  *                                  unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_xts_enc_armv8_ce
 ELF(.type  _gcry_aes_xts_enc_armv8_ce,%function;)
 _gcry_aes_xts_enc_armv8_ce:
@@ -1820,7 +1937,7 @@ ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;)
  *                                  unsigned int nrounds);
  */
 
-.align 3
+.align 4
 .globl _gcry_aes_xts_dec_armv8_ce
 ELF(.type  _gcry_aes_xts_dec_armv8_ce,%function;)
 _gcry_aes_xts_dec_armv8_ce:
@@ -1882,7 +1999,7 @@ ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;)
 /*
  * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
  */
-.align 3
+.align 4
 .globl _gcry_aes_sbox4_armv8_ce
 ELF(.type  _gcry_aes_sbox4_armv8_ce,%function;)
 _gcry_aes_sbox4_armv8_ce:
@@ -1905,7 +2022,7 @@ ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;)
 /*
  * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
  */
-.align 3
+.align 4
 .globl _gcry_aes_invmixcol_armv8_ce
 ELF(.type  _gcry_aes_invmixcol_armv8_ce,%function;)
 _gcry_aes_invmixcol_armv8_ce:
index b24ae3e9a92c299c6725041b5e84bb6ab191b22d..042b7d428431161700a1d749a29644b535a24dcd 100644 (file)
@@ -1,5 +1,5 @@
 /* ARMv8 Crypto Extension AES for Libgcrypt
- * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2016, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -27,7 +27,6 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
-#include "cipher-selftest.h"
 #include "rijndael-internal.h"
 #include "./cipher-internal.h"
 
@@ -81,32 +80,32 @@ extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
                                             unsigned char *iv, size_t nblocks,
                                             unsigned int nrounds);
 
-extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
-                                        unsigned char *outbuf,
-                                        const unsigned char *inbuf,
-                                        unsigned char *offset,
-                                        unsigned char *checksum,
-                                        unsigned char *L_table,
-                                        size_t nblocks,
-                                        unsigned int nrounds,
-                                        unsigned int blkn);
-extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
-                                        unsigned char *outbuf,
-                                        const unsigned char *inbuf,
-                                        unsigned char *offset,
-                                        unsigned char *checksum,
-                                        unsigned char *L_table,
-                                        size_t nblocks,
-                                        unsigned int nrounds,
-                                        unsigned int blkn);
-extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
-                                         const unsigned char *abuf,
-                                         unsigned char *offset,
-                                         unsigned char *checksum,
-                                         unsigned char *L_table,
-                                         size_t nblocks,
-                                         unsigned int nrounds,
-                                         unsigned int blkn);
+extern size_t _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+                                          unsigned char *outbuf,
+                                          const unsigned char *inbuf,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          unsigned char *L_table,
+                                          size_t nblocks,
+                                          unsigned int nrounds,
+                                          unsigned int blkn);
+extern size_t _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+                                          unsigned char *outbuf,
+                                          const unsigned char *inbuf,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          unsigned char *L_table,
+                                          size_t nblocks,
+                                          unsigned int nrounds,
+                                          unsigned int blkn);
+extern size_t _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+                                           const unsigned char *abuf,
+                                           unsigned char *offset,
+                                           unsigned char *checksum,
+                                           unsigned char *L_table,
+                                           size_t nblocks,
+                                           unsigned int nrounds,
+                                           unsigned int blkn);
 extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
                                         unsigned char *outbuf,
                                         const unsigned char *inbuf,
@@ -117,115 +116,56 @@ extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
                                         const unsigned char *inbuf,
                                         unsigned char *tweak,
                                         size_t nblocks, unsigned int nrounds);
+extern void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+                                        unsigned char *outbuf,
+                                        const unsigned char *inbuf,
+                                        size_t nblocks, unsigned int nrounds);
+extern void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+                                        unsigned char *outbuf,
+                                        const unsigned char *inbuf,
+                                        size_t nblocks, unsigned int nrounds);
 
-typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
-                                const unsigned char *inbuf,
-                                unsigned char *offset, unsigned char *checksum,
-                                unsigned char *L_table, size_t nblocks,
-                                unsigned int nrounds, unsigned int blkn);
-
-typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
-                                const unsigned char *inbuf,
-                                unsigned char *tweak, size_t nblocks,
-                                unsigned int nrounds);
 
 void
 _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
 {
-  union
-    {
-      PROPERLY_ALIGNED_TYPE dummy;
-      byte data[MAXKC][4];
-      u32 data32[MAXKC];
-    } tkk[2];
   unsigned int rounds = ctx->rounds;
-  int KC = rounds - 6;
-  unsigned int keylen = KC * 4;
-  unsigned int i, r, t;
+  unsigned int KC = rounds - 6;
+  u32 *W_u32 = ctx->keyschenc32b;
+  unsigned int i, j;
+  u32 W_prev;
   byte rcon = 1;
-  int j;
-#define k      tkk[0].data
-#define k_u32  tkk[0].data32
-#define tk     tkk[1].data
-#define tk_u32 tkk[1].data32
-#define W      (ctx->keyschenc)
-#define W_u32  (ctx->keyschenc32)
-
-  for (i = 0; i < keylen; i++)
-    {
-      k[i >> 2][i & 3] = key[i];
-    }
 
-  for (j = KC-1; j >= 0; j--)
-    {
-      tk_u32[j] = k_u32[j];
-    }
-  r = 0;
-  t = 0;
-  /* Copy values into round key array.  */
-  for (j = 0; (j < KC) && (r < rounds + 1); )
+  for (i = 0; i < KC; i += 2)
     {
-      for (; (j < KC) && (t < 4); j++, t++)
-        {
-          W_u32[r][t] = le_bswap32(tk_u32[j]);
-        }
-      if (t == 4)
-        {
-          r++;
-          t = 0;
-        }
+      W_u32[i + 0] = buf_get_le32(key + i * 4 + 0);
+      W_u32[i + 1] = buf_get_le32(key + i * 4 + 4);
     }
 
-  while (r < rounds + 1)
+  for (i = KC, j = KC, W_prev = W_u32[KC - 1];
+       i < 4 * (rounds + 1);
+       i += 2, j += 2)
     {
-      tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon;
+      u32 temp0 = W_prev;
+      u32 temp1;
 
-      if (KC != 8)
+      if (j == KC)
         {
-          for (j = 1; j < KC; j++)
-            {
-              tk_u32[j] ^= tk_u32[j-1];
-            }
+          j = 0;
+          temp0 = _gcry_aes_sbox4_armv8_ce(rol(temp0, 24)) ^ rcon;
+          rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff;
         }
-      else
+      else if (KC == 8 && j == 4)
         {
-          for (j = 1; j < KC/2; j++)
-            {
-              tk_u32[j] ^= tk_u32[j-1];
-            }
-
-          tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]);
-
-          for (j = KC/2 + 1; j < KC; j++)
-            {
-              tk_u32[j] ^= tk_u32[j-1];
-            }
+          temp0 = _gcry_aes_sbox4_armv8_ce(temp0);
         }
 
-      /* Copy values into round key array.  */
-      for (j = 0; (j < KC) && (r < rounds + 1); )
-        {
-          for (; (j < KC) && (t < 4); j++, t++)
-            {
-              W_u32[r][t] = le_bswap32(tk_u32[j]);
-            }
-          if (t == 4)
-            {
-              r++;
-              t = 0;
-            }
-        }
+      temp1 = W_u32[i - KC + 0];
 
-      rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b);
+      W_u32[i + 0] = temp0 ^ temp1;
+      W_u32[i + 1] = W_u32[i - KC + 1] ^ temp0 ^ temp1;
+      W_prev = W_u32[i + 1];
     }
-
-#undef W
-#undef tk
-#undef k
-#undef W_u32
-#undef tk_u32
-#undef k_u32
-  wipememory(&tkk, sizeof(tkk));
 }
 
 /* Make a decryption key from an encryption key. */
@@ -369,8 +309,6 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
-  ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce
-                                    : _gcry_aes_ocb_dec_armv8_ce;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned int nrounds = ctx->rounds;
@@ -384,10 +322,16 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = blkn + nblocks;
 
-  crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
-           c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn);
-
-  return 0;
+  if (encrypt)
+    return _gcry_aes_ocb_enc_armv8_ce (keysched, outbuf, inbuf,
+                                      c->u_iv.iv, c->u_ctr.ctr,
+                                      c->u_mode.ocb.L[0], nblocks, nrounds,
+                                      (unsigned int)blkn);
+  else
+    return _gcry_aes_ocb_dec_armv8_ce (keysched, outbuf, inbuf,
+                                      c->u_iv.iv, c->u_ctr.ctr,
+                                      c->u_mode.ocb.L[0], nblocks, nrounds,
+                                      (unsigned int)blkn);
 }
 
 size_t
@@ -402,11 +346,9 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
 
   c->u_mode.ocb.aad_nblocks = blkn + nblocks;
 
-  _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
-                             c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
-                             nblocks, nrounds, (unsigned int)blkn);
-
-  return 0;
+  return _gcry_aes_ocb_auth_armv8_ce (keysched, abuf, c->u_mode.ocb.aad_offset,
+                                     c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
+                                     nblocks, nrounds, (unsigned int)blkn);
 }
 
 void
@@ -415,8 +357,6 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
                              size_t nblocks, int encrypt)
 {
   const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
-  xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce
-                                    : _gcry_aes_xts_dec_armv8_ce;
   unsigned int nrounds = ctx->rounds;
 
   if ( !encrypt && !ctx->decryption_prepared )
@@ -425,7 +365,32 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
       ctx->decryption_prepared = 1;
     }
 
-  crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds);
+  if (encrypt)
+    _gcry_aes_xts_enc_armv8_ce (keysched, outbuf, inbuf, tweak,
+                               nblocks, nrounds);
+  else
+    _gcry_aes_xts_dec_armv8_ce (keysched, outbuf, inbuf, tweak,
+                               nblocks, nrounds);
 }
 
+void
+_gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf,
+                             const void *inbuf, size_t nblocks,
+                             int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if ( !encrypt && !ctx->decryption_prepared )
+    {
+      _gcry_aes_armv8_ce_prepare_decryption ( ctx );
+      ctx->decryption_prepared = 1;
+    }
+
+  if (encrypt)
+    _gcry_aes_ecb_enc_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds);
+  else
+    _gcry_aes_ecb_dec_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds);
+}
 #endif /* USE_ARM_CE */
index 3060408834cb0954ff9d205ccc6fe45b0616b547..166f2415b952aaa1394b9ea67ef384e3140f54d6 100644 (file)
@@ -89,7 +89,7 @@
 # endif
 #endif /* ENABLE_AESNI_SUPPORT */
 
-/* USE_VAES inidicates whether to compile with Intel VAES code.  */
+/* USE_VAES inidicates whether to compile with AMD64 VAES code.  */
 #undef USE_VAES
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
 # define USE_VAES 1
 #endif
 
+/* USE_VAES_I386 inidicates whether to compile with i386 VAES code.  */
+#undef USE_VAES_I386
+#if (defined(HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS)) && \
+     defined(__i386__) && defined(ENABLE_AVX2_SUPPORT) && \
+     defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) && \
+     defined(USE_AESNI)
+# define USE_VAES_I386 1
+#endif
+
 /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
  * code. */
 #undef USE_ARM_CE
@@ -160,6 +170,7 @@ typedef struct RIJNDAEL_context_s
     PROPERLY_ALIGNED_TYPE dummy;
     byte keyschedule[MAXROUNDS+1][4][4];
     u32 keyschedule32[MAXROUNDS+1][4];
+    u32 keyschedule32b[(MAXROUNDS+1)*4];
 #ifdef USE_PADLOCK
     /* The key as passed to the padlock engine.  It is only used if
        the padlock engine is used (USE_PADLOCK, below).  */
@@ -195,10 +206,11 @@ typedef struct RIJNDAEL_context_s
 } RIJNDAEL_context ATTR_ALIGNED_16;
 
 /* Macros defining alias for the keyschedules.  */
-#define keyschenc   u1.keyschedule
-#define keyschenc32 u1.keyschedule32
-#define keyschdec   u2.keyschedule
-#define keyschdec32 u2.keyschedule32
-#define padlockkey  u1.padlock_key
+#define keyschenc     u1.keyschedule
+#define keyschenc32   u1.keyschedule32
+#define keyschenc32b  u1.keyschedule32b
+#define keyschdec     u2.keyschedule
+#define keyschdec32   u2.keyschedule32
+#define padlockkey    u1.padlock_key
 
 #endif /* G10_RIJNDAEL_INTERNAL_H */
index 3af214d74e9a8c92091dd2d09fb583d146d836a1..2583b834c1f6baeb351db70a5a4f942788e53421 100644 (file)
@@ -27,7 +27,6 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
-#include "cipher-selftest.h"
 #include "rijndael-internal.h"
 
 #ifdef USE_PADLOCK
index bbbeaac035b96961dd03e861182dc9447e8f5d36..fc8ee52694a3f974a7a06de596ea32072a499c33 100644 (file)
@@ -30,6 +30,7 @@
 
 
 typedef vector unsigned char block;
+typedef vector unsigned int vec_u32;
 
 typedef union
 {
@@ -158,31 +159,6 @@ typedef union
     rkeylast = ALIGNED_LOAD (rk, nrounds); \
   } while (0)
 
-#define AES_ENCRYPT_ALL(blk, nrounds) \
-  do { \
-    blk ^= rkey0; \
-    blk = asm_cipher_be (blk, rkey1); \
-    blk = asm_cipher_be (blk, rkey2); \
-    blk = asm_cipher_be (blk, rkey3); \
-    blk = asm_cipher_be (blk, rkey4); \
-    blk = asm_cipher_be (blk, rkey5); \
-    blk = asm_cipher_be (blk, rkey6); \
-    blk = asm_cipher_be (blk, rkey7); \
-    blk = asm_cipher_be (blk, rkey8); \
-    blk = asm_cipher_be (blk, rkey9); \
-    if (nrounds >= 12) \
-      { \
-       blk = asm_cipher_be (blk, rkey10); \
-       blk = asm_cipher_be (blk, rkey11); \
-       if (rounds > 12) \
-         { \
-           blk = asm_cipher_be (blk, rkey12); \
-           blk = asm_cipher_be (blk, rkey13); \
-         } \
-      } \
-    blk = asm_cipherlast_be (blk, rkeylast); \
-  } while (0)
-
 
 static ASM_FUNC_ATTR_INLINE block
 asm_aligned_ld(unsigned long offset, const void *ptr)
@@ -280,6 +256,16 @@ asm_xor(block a, block b)
   return res;
 }
 
+static ASM_FUNC_ATTR_INLINE block
+asm_sbox_be(block b)
+{
+  block o;
+  __asm__ volatile ("vsbox %0, %1\n\t"
+                   : "=v" (o)
+                   : "v" (b));
+  return o;
+}
+
 static ASM_FUNC_ATTR_INLINE block
 asm_cipher_be(block b, block rk)
 {
index 72f31852b418beabb41a80d0c3b07a4ba64c21e8..ec5cda731bdcfcf06494d650f39f664aada3cfff 100644 (file)
@@ -1,6 +1,6 @@
 /* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
  * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
- * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2019-2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -23,9 +23,9 @@
  * is released under.
  */
 
-unsigned int ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
-                                unsigned char *out,
-                                const unsigned char *in)
+unsigned int PPC_OPT_ATTR
+ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, unsigned char *out,
+                   const unsigned char *in)
 {
   const block bige_const = asm_load_be_const();
   const u128_t *rk = (u128_t *)&ctx->keyschenc;
@@ -44,9 +44,9 @@ unsigned int ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
 }
 
 
-unsigned int DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
-                                unsigned char *out,
-                                const unsigned char *in)
+unsigned int PPC_OPT_ATTR
+DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, unsigned char *out,
+                   const unsigned char *in)
 {
   const block bige_const = asm_load_be_const();
   const u128_t *rk = (u128_t *)&ctx->keyschdec;
@@ -65,9 +65,9 @@ unsigned int DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
 }
 
 
-void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
-                  void *outbuf_arg, const void *inbuf_arg,
-                  size_t nblocks)
+void PPC_OPT_ATTR
+CFB_ENC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg,
+             const void *inbuf_arg, size_t nblocks)
 {
   const block bige_const = asm_load_be_const();
   RIJNDAEL_context *ctx = context;
@@ -76,48 +76,309 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
   u128_t *out = (u128_t *)outbuf_arg;
   int rounds = ctx->rounds;
   ROUND_KEY_VARIABLES_ALL;
-  block rkeylast_orig;
-  block iv;
+  block key0_xor_keylast;
+  block iv, outiv;
 
   iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+  outiv = iv;
 
   PRELOAD_ROUND_KEYS_ALL (rounds);
-  rkeylast_orig = rkeylast;
+  key0_xor_keylast = rkey0 ^ rkeylast;
+  iv ^= rkey0;
 
-  for (; nblocks >= 2; nblocks -= 2)
+  for (; nblocks; nblocks--)
     {
-      block in2, iv1;
+      rkeylast = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+      iv = asm_cipher_be (iv, rkey1);
+      iv = asm_cipher_be (iv, rkey2);
+      iv = asm_cipher_be (iv, rkey3);
+      iv = asm_cipher_be (iv, rkey4);
+      iv = asm_cipher_be (iv, rkey5);
+      iv = asm_cipher_be (iv, rkey6);
+      iv = asm_cipher_be (iv, rkey7);
+      iv = asm_cipher_be (iv, rkey8);
+      iv = asm_cipher_be (iv, rkey9);
+      if (rounds >= 12)
+       {
+         iv = asm_cipher_be (iv, rkey10);
+         iv = asm_cipher_be (iv, rkey11);
+         if (rounds > 12)
+           {
+             iv = asm_cipher_be (iv, rkey12);
+             iv = asm_cipher_be (iv, rkey13);
+           }
+       }
+      iv = asm_cipherlast_be (iv, rkeylast);
 
-      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
-      in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
-      in += 2;
+      outiv = rkey0 ^ iv;
+      VEC_STORE_BE (out++, 0, outiv, bige_const);
+    }
 
-      AES_ENCRYPT_ALL (iv, rounds);
+  VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
+}
 
-      iv1 = iv;
-      rkeylast = rkeylast_orig ^ in2;
 
-      AES_ENCRYPT_ALL (iv, rounds);
+void PPC_OPT_ATTR
+ECB_CRYPT_FUNC (void *context, void *outbuf_arg, const void *inbuf_arg,
+               size_t nblocks, int encrypt)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = encrypt ? (u128_t *)&ctx->keyschenc
+                            : (u128_t *)&ctx->keyschdec;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block b0, b1, b2, b3, b4, b5, b6, b7;
+  block rkey;
 
-      VEC_STORE_BE (out++, 0, iv1, bige_const);
-      VEC_STORE_BE (out++, 0, iv, bige_const);
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      internal_aes_ppc_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
     }
 
-  for (; nblocks; nblocks--)
+  PRELOAD_ROUND_KEYS (rounds);
+
+  for (; nblocks >= 8; nblocks -= 8)
     {
-      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in++, 0, bige_const);
+      b0 = VEC_LOAD_BE (in, 0, bige_const);
+      b1 = VEC_LOAD_BE (in, 1, bige_const);
+      b2 = VEC_LOAD_BE (in, 2, bige_const);
+      b3 = VEC_LOAD_BE (in, 3, bige_const);
+      b0 = asm_xor (rkey0, b0);
+      b1 = asm_xor (rkey0, b1);
+      b4 = VEC_LOAD_BE (in, 4, bige_const);
+      b5 = VEC_LOAD_BE (in, 5, bige_const);
+      b2 = asm_xor (rkey0, b2);
+      b3 = asm_xor (rkey0, b3);
+      b6 = VEC_LOAD_BE (in, 6, bige_const);
+      b7 = VEC_LOAD_BE (in, 7, bige_const);
+      in += 8;
+      b4 = asm_xor (rkey0, b4);
+      b5 = asm_xor (rkey0, b5);
+      b6 = asm_xor (rkey0, b6);
+      b7 = asm_xor (rkey0, b7);
 
-      AES_ENCRYPT_ALL (iv, rounds);
+      if (encrypt)
+       {
+#define DO_ROUND(r) \
+             rkey = ALIGNED_LOAD (rk, r); \
+             b0 = asm_cipher_be (b0, rkey); \
+             b1 = asm_cipher_be (b1, rkey); \
+             b2 = asm_cipher_be (b2, rkey); \
+             b3 = asm_cipher_be (b3, rkey); \
+             b4 = asm_cipher_be (b4, rkey); \
+             b5 = asm_cipher_be (b5, rkey); \
+             b6 = asm_cipher_be (b6, rkey); \
+             b7 = asm_cipher_be (b7, rkey);
 
-      VEC_STORE_BE (out++, 0, iv, bige_const);
+         DO_ROUND(1);
+         DO_ROUND(2);
+         DO_ROUND(3);
+         DO_ROUND(4);
+         DO_ROUND(5);
+         DO_ROUND(6);
+         DO_ROUND(7);
+         DO_ROUND(8);
+         DO_ROUND(9);
+         if (rounds >= 12)
+           {
+             DO_ROUND(10);
+             DO_ROUND(11);
+             if (rounds > 12)
+               {
+                 DO_ROUND(12);
+                 DO_ROUND(13);
+               }
+           }
+
+#undef DO_ROUND
+
+         b0 = asm_cipherlast_be (b0, rkeylast);
+         b1 = asm_cipherlast_be (b1, rkeylast);
+         b2 = asm_cipherlast_be (b2, rkeylast);
+         b3 = asm_cipherlast_be (b3, rkeylast);
+         b4 = asm_cipherlast_be (b4, rkeylast);
+         b5 = asm_cipherlast_be (b5, rkeylast);
+         b6 = asm_cipherlast_be (b6, rkeylast);
+         b7 = asm_cipherlast_be (b7, rkeylast);
+       }
+      else
+       {
+#define DO_ROUND(r) \
+             rkey = ALIGNED_LOAD (rk, r); \
+             b0 = asm_ncipher_be (b0, rkey); \
+             b1 = asm_ncipher_be (b1, rkey); \
+             b2 = asm_ncipher_be (b2, rkey); \
+             b3 = asm_ncipher_be (b3, rkey); \
+             b4 = asm_ncipher_be (b4, rkey); \
+             b5 = asm_ncipher_be (b5, rkey); \
+             b6 = asm_ncipher_be (b6, rkey); \
+             b7 = asm_ncipher_be (b7, rkey);
+
+         DO_ROUND(1);
+         DO_ROUND(2);
+         DO_ROUND(3);
+         DO_ROUND(4);
+         DO_ROUND(5);
+         DO_ROUND(6);
+         DO_ROUND(7);
+         DO_ROUND(8);
+         DO_ROUND(9);
+         if (rounds >= 12)
+           {
+             DO_ROUND(10);
+             DO_ROUND(11);
+             if (rounds > 12)
+               {
+                 DO_ROUND(12);
+                 DO_ROUND(13);
+               }
+           }
+
+#undef DO_ROUND
+
+         b0 = asm_ncipherlast_be (b0, rkeylast);
+         b1 = asm_ncipherlast_be (b1, rkeylast);
+         b2 = asm_ncipherlast_be (b2, rkeylast);
+         b3 = asm_ncipherlast_be (b3, rkeylast);
+         b4 = asm_ncipherlast_be (b4, rkeylast);
+         b5 = asm_ncipherlast_be (b5, rkeylast);
+         b6 = asm_ncipherlast_be (b6, rkeylast);
+         b7 = asm_ncipherlast_be (b7, rkeylast);
+       }
+
+      VEC_STORE_BE (out, 0, b0, bige_const);
+      VEC_STORE_BE (out, 1, b1, bige_const);
+      VEC_STORE_BE (out, 2, b2, bige_const);
+      VEC_STORE_BE (out, 3, b3, bige_const);
+      VEC_STORE_BE (out, 4, b4, bige_const);
+      VEC_STORE_BE (out, 5, b5, bige_const);
+      VEC_STORE_BE (out, 6, b6, bige_const);
+      VEC_STORE_BE (out, 7, b7, bige_const);
+      out += 8;
     }
 
-  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+  if (nblocks >= 4)
+    {
+      b0 = VEC_LOAD_BE (in, 0, bige_const);
+      b1 = VEC_LOAD_BE (in, 1, bige_const);
+      b2 = VEC_LOAD_BE (in, 2, bige_const);
+      b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+      b0 = asm_xor (rkey0, b0);
+      b1 = asm_xor (rkey0, b1);
+      b2 = asm_xor (rkey0, b2);
+      b3 = asm_xor (rkey0, b3);
+
+      if (encrypt)
+       {
+#define DO_ROUND(r) \
+             rkey = ALIGNED_LOAD (rk, r); \
+             b0 = asm_cipher_be (b0, rkey); \
+             b1 = asm_cipher_be (b1, rkey); \
+             b2 = asm_cipher_be (b2, rkey); \
+             b3 = asm_cipher_be (b3, rkey);
+
+         DO_ROUND(1);
+         DO_ROUND(2);
+         DO_ROUND(3);
+         DO_ROUND(4);
+         DO_ROUND(5);
+         DO_ROUND(6);
+         DO_ROUND(7);
+         DO_ROUND(8);
+         DO_ROUND(9);
+         if (rounds >= 12)
+           {
+             DO_ROUND(10);
+             DO_ROUND(11);
+             if (rounds > 12)
+               {
+                 DO_ROUND(12);
+                 DO_ROUND(13);
+               }
+           }
+#undef DO_ROUND
+
+         b0 = asm_cipherlast_be (b0, rkeylast);
+         b1 = asm_cipherlast_be (b1, rkeylast);
+         b2 = asm_cipherlast_be (b2, rkeylast);
+         b3 = asm_cipherlast_be (b3, rkeylast);
+       }
+      else
+        {
+#define DO_ROUND(r) \
+             rkey = ALIGNED_LOAD (rk, r); \
+             b0 = asm_ncipher_be (b0, rkey); \
+             b1 = asm_ncipher_be (b1, rkey); \
+             b2 = asm_ncipher_be (b2, rkey); \
+             b3 = asm_ncipher_be (b3, rkey);
+
+         DO_ROUND(1);
+         DO_ROUND(2);
+         DO_ROUND(3);
+         DO_ROUND(4);
+         DO_ROUND(5);
+         DO_ROUND(6);
+         DO_ROUND(7);
+         DO_ROUND(8);
+         DO_ROUND(9);
+         if (rounds >= 12)
+           {
+             DO_ROUND(10);
+             DO_ROUND(11);
+             if (rounds > 12)
+               {
+                 DO_ROUND(12);
+                 DO_ROUND(13);
+               }
+           }
+#undef DO_ROUND
+
+         b0 = asm_ncipherlast_be (b0, rkeylast);
+         b1 = asm_ncipherlast_be (b1, rkeylast);
+         b2 = asm_ncipherlast_be (b2, rkeylast);
+         b3 = asm_ncipherlast_be (b3, rkeylast);
+       }
+
+      VEC_STORE_BE (out, 0, b0, bige_const);
+      VEC_STORE_BE (out, 1, b1, bige_const);
+      VEC_STORE_BE (out, 2, b2, bige_const);
+      VEC_STORE_BE (out, 3, b3, bige_const);
+
+      in += 4;
+      out += 4;
+      nblocks -= 4;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      b0 = VEC_LOAD_BE (in, 0, bige_const);
+
+      if (encrypt)
+       {
+         AES_ENCRYPT (b0, rounds);
+       }
+      else
+       {
+         AES_DECRYPT (b0, rounds);
+       }
+
+      VEC_STORE_BE (out, 0, b0, bige_const);
+
+      out++;
+      in++;
+    }
 }
 
-void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
-                  void *outbuf_arg, const void *inbuf_arg,
-                  size_t nblocks)
+
+void PPC_OPT_ATTR
+CFB_DEC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg,
+             const void *inbuf_arg, size_t nblocks)
 {
   const block bige_const = asm_load_be_const();
   RIJNDAEL_context *ctx = context;
@@ -313,9 +574,9 @@ void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
 }
 
 
-void CBC_ENC_FUNC (void *context, unsigned char *iv_arg,
-                  void *outbuf_arg, const void *inbuf_arg,
-                  size_t nblocks, int cbc_mac)
+void PPC_OPT_ATTR
+CBC_ENC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg,
+             const void *inbuf_arg, size_t nblocks, int cbc_mac)
 {
   const block bige_const = asm_load_be_const();
   RIJNDAEL_context *ctx = context;
@@ -324,52 +585,67 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg,
   byte *out = (byte *)outbuf_arg;
   int rounds = ctx->rounds;
   ROUND_KEY_VARIABLES_ALL;
-  block lastiv, b;
+  block iv, key0_xor_keylast, nextiv, outiv;
   unsigned int outadd = -(!cbc_mac) & 16;
 
-  lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+  if (nblocks == 0) /* CMAC may call with nblocks 0. */
+    return;
 
-  PRELOAD_ROUND_KEYS_ALL (rounds);
-
-  for (; nblocks >= 2; nblocks -= 2)
-    {
-      block in2, lastiv1;
-
-      b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);
-      in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
-      in += 2;
-
-      AES_ENCRYPT_ALL (b, rounds);
-
-      lastiv1 = b;
-      b = lastiv1 ^ in2;
+  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
 
-      AES_ENCRYPT_ALL (b, rounds);
+  PRELOAD_ROUND_KEYS_ALL (rounds);
+  key0_xor_keylast = rkey0 ^ rkeylast;
 
-      lastiv = b;
-      VEC_STORE_BE ((u128_t *)out, 0, lastiv1, bige_const);
-      out += outadd;
-      VEC_STORE_BE ((u128_t *)out, 0, lastiv, bige_const);
-      out += outadd;
-    }
+  nextiv = VEC_LOAD_BE (in++, 0, bige_const);
+  iv ^= rkey0 ^ nextiv;
 
-  for (; nblocks; nblocks--)
+  do
     {
-      b = lastiv ^ VEC_LOAD_BE (in++, 0, bige_const);
-
-      AES_ENCRYPT_ALL (b, rounds);
+      if (--nblocks)
+       {
+         nextiv = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const);
+       }
 
-      lastiv = b;
-      VEC_STORE_BE ((u128_t *)out, 0, b, bige_const);
+      iv = asm_cipher_be (iv, rkey1);
+      iv = asm_cipher_be (iv, rkey2);
+      iv = asm_cipher_be (iv, rkey3);
+      iv = asm_cipher_be (iv, rkey4);
+      iv = asm_cipher_be (iv, rkey5);
+      iv = asm_cipher_be (iv, rkey6);
+      iv = asm_cipher_be (iv, rkey7);
+      iv = asm_cipher_be (iv, rkey8);
+      iv = asm_cipher_be (iv, rkey9);
+      if (rounds >= 12)
+       {
+         iv = asm_cipher_be (iv, rkey10);
+         iv = asm_cipher_be (iv, rkey11);
+         if (rounds > 12)
+           {
+             iv = asm_cipher_be (iv, rkey12);
+             iv = asm_cipher_be (iv, rkey13);
+           }
+       }
+      outiv = iv;
+      /* Proper order for following instructions is important for best
+       * performance on POWER8: the output path vcipherlast needs to be
+       * last one. */
+      __asm__ volatile ("vcipherlast %0, %0, %2\n\t"
+                       "vcipherlast %1, %1, %3\n\t"
+                       : "+v" (iv), "+v" (outiv)
+                       : "v" (nextiv), "v" (rkeylast));
+
+      VEC_STORE_BE ((u128_t *)out, 0, outiv, bige_const);
       out += outadd;
     }
+  while (nblocks);
 
-  VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
+  VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
 }
 
-void CBC_DEC_FUNC (void *context, unsigned char *iv_arg,
-                  void *outbuf_arg, const void *inbuf_arg,
-                  size_t nblocks)
+
+void PPC_OPT_ATTR
+CBC_DEC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg,
+             const void *inbuf_arg, size_t nblocks)
 {
   const block bige_const = asm_load_be_const();
   RIJNDAEL_context *ctx = context;
@@ -572,9 +848,9 @@ void CBC_DEC_FUNC (void *context, unsigned char *iv_arg,
 }
 
 
-void CTR_ENC_FUNC (void *context, unsigned char *ctr_arg,
-                  void *outbuf_arg, const void *inbuf_arg,
-                  size_t nblocks)
+void PPC_OPT_ATTR
+CTR_ENC_FUNC (void *context, unsigned char *ctr_arg, void *outbuf_arg,
+             const void *inbuf_arg, size_t nblocks)
 {
   static const unsigned char vec_one_const[16] =
     { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
@@ -805,9 +1081,9 @@ void CTR_ENC_FUNC (void *context, unsigned char *ctr_arg,
 }
 
 
-size_t OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg,
-                      const void *inbuf_arg, size_t nblocks,
-                      int encrypt)
+size_t PPC_OPT_ATTR
+OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg,
+               size_t nblocks, int encrypt)
 {
   const block bige_const = asm_load_be_const();
   RIJNDAEL_context *ctx = (void *)&c->context.c;
@@ -1311,7 +1587,9 @@ size_t OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg,
   return 0;
 }
 
-size_t OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks)
+
+size_t PPC_OPT_ATTR
+OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks)
 {
   const block bige_const = asm_load_be_const();
   RIJNDAEL_context *ctx = (void *)&c->context.c;
@@ -1520,9 +1798,9 @@ size_t OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks)
 }
 
 
-void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
-                    void *outbuf_arg, const void *inbuf_arg,
-                    size_t nblocks, int encrypt)
+void PPC_OPT_ATTR
+XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg, void *outbuf_arg,
+               const void *inbuf_arg, size_t nblocks, int encrypt)
 {
 #ifdef WORDS_BIGENDIAN
   static const block vec_bswap128_const =
@@ -2018,3 +2296,249 @@ void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
 
 #undef GEN_TWEAK
 }
+
+
+void PPC_OPT_ATTR
+CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg,
+                const void *inbuf_arg, size_t nblocks)
+{
+#ifndef WORDS_BIGENDIAN
+  static const vec_u32 vec_u32_one = { 1, 0, 0, 0 };
+#else
+  static const vec_u32 vec_u32_one = { 0, 0, 0, 1 };
+#endif
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block rkeylast_orig;
+  block b;
+  vec_u32 ctr, one;
+
+  ctr = (vec_u32)vec_reve (VEC_LOAD_BE (ctr_arg, 0, bige_const));
+  one = vec_u32_one;
+
+  PRELOAD_ROUND_KEYS (rounds);
+  rkeylast_orig = rkeylast;
+
+#define VEC_ADD_CTRLE32(ctrv_u32, addv_u32) \
+      vec_reve((block)((ctrv_u32) + (addv_u32)))
+
+  if (nblocks >= 4)
+    {
+      block in0, in1, in2, in3, in4, in5, in6, in7;
+      block b0, b1, b2, b3, b4, b5, b6, b7;
+      vec_u32 two, three, four, five, six, seven, eight;
+      block rkey;
+
+      two   = one + one;
+      three = two + one;
+      four  = two + two;
+      five  = three + two;
+      six   = three + three;
+      seven = four + three;
+      eight = four + four;
+
+      for (; nblocks >= 8; nblocks -= 8)
+       {
+         b1 = VEC_ADD_CTRLE32 (ctr, one);
+         b2 = VEC_ADD_CTRLE32 (ctr, two);
+         b3 = VEC_ADD_CTRLE32 (ctr, three);
+         b4 = VEC_ADD_CTRLE32 (ctr, four);
+         b5 = VEC_ADD_CTRLE32 (ctr, five);
+         b6 = VEC_ADD_CTRLE32 (ctr, six);
+         b7 = VEC_ADD_CTRLE32 (ctr, seven);
+         b0 = asm_xor (rkey0, vec_reve((block)ctr));
+         rkey = ALIGNED_LOAD (rk, 1);
+         ctr = ctr + eight;
+         b1 = asm_xor (rkey0, b1);
+         b2 = asm_xor (rkey0, b2);
+         b3 = asm_xor (rkey0, b3);
+         b0 = asm_cipher_be (b0, rkey);
+         b1 = asm_cipher_be (b1, rkey);
+         b2 = asm_cipher_be (b2, rkey);
+         b3 = asm_cipher_be (b3, rkey);
+         b4 = asm_xor (rkey0, b4);
+         b5 = asm_xor (rkey0, b5);
+         b6 = asm_xor (rkey0, b6);
+         b7 = asm_xor (rkey0, b7);
+         b4 = asm_cipher_be (b4, rkey);
+         b5 = asm_cipher_be (b5, rkey);
+         b6 = asm_cipher_be (b6, rkey);
+         b7 = asm_cipher_be (b7, rkey);
+
+#define DO_ROUND(r) \
+             rkey = ALIGNED_LOAD (rk, r); \
+             b0 = asm_cipher_be (b0, rkey); \
+             b1 = asm_cipher_be (b1, rkey); \
+             b2 = asm_cipher_be (b2, rkey); \
+             b3 = asm_cipher_be (b3, rkey); \
+             b4 = asm_cipher_be (b4, rkey); \
+             b5 = asm_cipher_be (b5, rkey); \
+             b6 = asm_cipher_be (b6, rkey); \
+             b7 = asm_cipher_be (b7, rkey);
+
+         in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+         DO_ROUND(2);
+         in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+         DO_ROUND(3);
+         in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+         DO_ROUND(4);
+         in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+         DO_ROUND(5);
+         in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+         DO_ROUND(6);
+         in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+         DO_ROUND(7);
+         in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+         DO_ROUND(8);
+         in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+         in += 8;
+         DO_ROUND(9);
+
+         if (rounds >= 12)
+           {
+             DO_ROUND(10);
+             DO_ROUND(11);
+             if (rounds > 12)
+               {
+                 DO_ROUND(12);
+                 DO_ROUND(13);
+               }
+           }
+
+#undef DO_ROUND
+
+         in0 = VEC_BE_SWAP (in0, bige_const);
+         in1 = VEC_BE_SWAP (in1, bige_const);
+         in2 = VEC_BE_SWAP (in2, bige_const);
+         in3 = VEC_BE_SWAP (in3, bige_const);
+         in4 = VEC_BE_SWAP (in4, bige_const);
+         in5 = VEC_BE_SWAP (in5, bige_const);
+         in6 = VEC_BE_SWAP (in6, bige_const);
+         in7 = VEC_BE_SWAP (in7, bige_const);
+
+         in0 = asm_xor (rkeylast, in0);
+         in1 = asm_xor (rkeylast, in1);
+         in2 = asm_xor (rkeylast, in2);
+         in3 = asm_xor (rkeylast, in3);
+         b0 = asm_cipherlast_be (b0, in0);
+         b1 = asm_cipherlast_be (b1, in1);
+         in4 = asm_xor (rkeylast, in4);
+         in5 = asm_xor (rkeylast, in5);
+         b2 = asm_cipherlast_be (b2, in2);
+         b3 = asm_cipherlast_be (b3, in3);
+         in6 = asm_xor (rkeylast, in6);
+         in7 = asm_xor (rkeylast, in7);
+         b4 = asm_cipherlast_be (b4, in4);
+         b5 = asm_cipherlast_be (b5, in5);
+         b6 = asm_cipherlast_be (b6, in6);
+         b7 = asm_cipherlast_be (b7, in7);
+
+         b0 = VEC_BE_SWAP (b0, bige_const);
+         b1 = VEC_BE_SWAP (b1, bige_const);
+         b2 = VEC_BE_SWAP (b2, bige_const);
+         b3 = VEC_BE_SWAP (b3, bige_const);
+         b4 = VEC_BE_SWAP (b4, bige_const);
+         b5 = VEC_BE_SWAP (b5, bige_const);
+         b6 = VEC_BE_SWAP (b6, bige_const);
+         b7 = VEC_BE_SWAP (b7, bige_const);
+         VEC_STORE_BE_NOSWAP (out, 0, b0);
+         VEC_STORE_BE_NOSWAP (out, 1, b1);
+         VEC_STORE_BE_NOSWAP (out, 2, b2);
+         VEC_STORE_BE_NOSWAP (out, 3, b3);
+         VEC_STORE_BE_NOSWAP (out, 4, b4);
+         VEC_STORE_BE_NOSWAP (out, 5, b5);
+         VEC_STORE_BE_NOSWAP (out, 6, b6);
+         VEC_STORE_BE_NOSWAP (out, 7, b7);
+         out += 8;
+       }
+
+      if (nblocks >= 4)
+       {
+         b1 = VEC_ADD_CTRLE32 (ctr, one);
+         b2 = VEC_ADD_CTRLE32 (ctr, two);
+         b3 = VEC_ADD_CTRLE32 (ctr, three);
+         b0 = asm_xor (rkey0, vec_reve((block)ctr));
+         ctr = ctr + four;
+         b1 = asm_xor (rkey0, b1);
+         b2 = asm_xor (rkey0, b2);
+         b3 = asm_xor (rkey0, b3);
+
+#define DO_ROUND(r) \
+             rkey = ALIGNED_LOAD (rk, r); \
+             b0 = asm_cipher_be (b0, rkey); \
+             b1 = asm_cipher_be (b1, rkey); \
+             b2 = asm_cipher_be (b2, rkey); \
+             b3 = asm_cipher_be (b3, rkey);
+
+         DO_ROUND(1);
+         DO_ROUND(2);
+         DO_ROUND(3);
+         DO_ROUND(4);
+         DO_ROUND(5);
+         DO_ROUND(6);
+         DO_ROUND(7);
+         DO_ROUND(8);
+
+         in0 = VEC_LOAD_BE (in, 0, bige_const);
+         in1 = VEC_LOAD_BE (in, 1, bige_const);
+         in2 = VEC_LOAD_BE (in, 2, bige_const);
+         in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+         DO_ROUND(9);
+         if (rounds >= 12)
+           {
+             DO_ROUND(10);
+             DO_ROUND(11);
+             if (rounds > 12)
+               {
+                 DO_ROUND(12);
+                 DO_ROUND(13);
+               }
+           }
+
+#undef DO_ROUND
+
+         in0 = asm_xor (rkeylast, in0);
+         in1 = asm_xor (rkeylast, in1);
+         in2 = asm_xor (rkeylast, in2);
+         in3 = asm_xor (rkeylast, in3);
+
+         b0 = asm_cipherlast_be (b0, in0);
+         b1 = asm_cipherlast_be (b1, in1);
+         b2 = asm_cipherlast_be (b2, in2);
+         b3 = asm_cipherlast_be (b3, in3);
+
+         VEC_STORE_BE (out, 0, b0, bige_const);
+         VEC_STORE_BE (out, 1, b1, bige_const);
+         VEC_STORE_BE (out, 2, b2, bige_const);
+         VEC_STORE_BE (out, 3, b3, bige_const);
+
+         in += 4;
+         out += 4;
+         nblocks -= 4;
+       }
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      b = vec_reve((block)ctr);
+      ctr = ctr + one;
+      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+
+      AES_ENCRYPT (b, rounds);
+
+      VEC_STORE_BE (out, 0, b, bige_const);
+
+      out++;
+      in++;
+    }
+
+#undef VEC_ADD_CTRLE32
+
+  VEC_STORE_BE (ctr_arg, 0, vec_reve((block)ctr), bige_const);
+}
index f5c3236111f829277b99f007f9678e356d2d25f1..055b00c02cd09143efbe7285da310a6dfd8af60d 100644 (file)
@@ -1,6 +1,6 @@
 /* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
  * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
- * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2019-2020, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
 #include "rijndael-ppc-common.h"
 
 
-#ifdef WORDS_BIGENDIAN
-static const block vec_bswap32_const =
-  { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define PPC_OPT_ATTR __attribute__((target("arch=pwr8"))) FUNC_ATTR_OPT
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define PPC_OPT_ATTR __attribute__((target("cpu=power8"))) FUNC_ATTR_OPT
 #else
+# define PPC_OPT_ATTR FUNC_ATTR_OPT
+#endif
+
+
+#ifndef WORDS_BIGENDIAN
 static const block vec_bswap32_const_neg =
   { ~3, ~2, ~1, ~0, ~7, ~6, ~5, ~4, ~11, ~10, ~9, ~8, ~15, ~14, ~13, ~12 };
 #endif
@@ -104,138 +116,95 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
 }
 
 
+static ASM_FUNC_ATTR_INLINE unsigned int
+keysched_idx(unsigned int in)
+{
+#ifdef WORDS_BIGENDIAN
+  return in;
+#else
+  return (in & ~3U) | (3U - (in & 3U));
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE vec_u32
+bcast_u32_to_vec(u32 x)
+{
+  vec_u32 v = { x, x, x, x };
+  return v;
+}
+
+
 static ASM_FUNC_ATTR_INLINE u32
-_gcry_aes_sbox4_ppc8(u32 fourbytes)
+u32_from_vec(vec_u32 x)
 {
-  union
-    {
-      PROPERLY_ALIGNED_TYPE dummy;
-      block data_vec;
-      u32 data32[4];
-    } u;
-
-  u.data32[0] = fourbytes;
-  u.data_vec = vec_sbox_be(u.data_vec);
-  return u.data32[0];
+#ifdef WORDS_BIGENDIAN
+  return x[1];
+#else
+  return x[2];
+#endif
 }
 
-void
+
+void PPC_OPT_ATTR
 _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
 {
-  const block bige_const = asm_load_be_const();
-  union
-    {
-      PROPERLY_ALIGNED_TYPE dummy;
-      byte data[MAXKC][4];
-      u32 data32[MAXKC];
-    } tkk[2];
+  static const vec_u32 rotate24 = { 24, 24, 24, 24 };
+  static const vec_u32 rcon_const = { 0x1b, 0x1b, 0x1b, 0x1b };
+  vec_u32 tk_vu32[MAXKC];
   unsigned int rounds = ctx->rounds;
-  int KC = rounds - 6;
-  unsigned int keylen = KC * 4;
-  u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
-  unsigned int i, r, t;
-  byte rcon = 1;
-  int j;
-#define k      tkk[0].data
-#define k_u32  tkk[0].data32
-#define tk     tkk[1].data
-#define tk_u32 tkk[1].data32
-#define W      (ctx->keyschenc)
-#define W_u32  (ctx->keyschenc32)
-
-  for (i = 0; i < keylen; i++)
-    {
-      k[i >> 2][i & 3] = key[i];
-    }
+  unsigned int KC = rounds - 6;
+  u32 *W_u32 = ctx->keyschenc32b;
+  unsigned int i, j;
+  vec_u32 tk_prev;
+  vec_u32 rcon = { 1, 1, 1, 1 };
 
-  for (j = KC-1; j >= 0; j--)
+  for (i = 0; i < KC; i += 2)
     {
-      tk_u32[j] = k_u32[j];
+      unsigned int idx0 = keysched_idx(i + 0);
+      unsigned int idx1 = keysched_idx(i + 1);
+      tk_vu32[i + 0] = bcast_u32_to_vec(buf_get_le32(key + i * 4 + 0));
+      tk_vu32[i + 1] = bcast_u32_to_vec(buf_get_le32(key + i * 4 + 4));
+      W_u32[idx0] = u32_from_vec(vec_revb(tk_vu32[i + 0]));
+      W_u32[idx1] = u32_from_vec(vec_revb(tk_vu32[i + 1]));
     }
-  r = 0;
-  t = 0;
-  /* Copy values into round key array.  */
-  for (j = 0; (j < KC) && (r < rounds + 1); )
-    {
-      for (; (j < KC) && (t < 4); j++, t++)
-        {
-          W_u32[r][t] = le_bswap32(tk_u32[j]);
-        }
-      if (t == 4)
-        {
-          r++;
-          t = 0;
-        }
-    }
-  while (r < rounds + 1)
+
+  for (i = KC, j = KC, tk_prev = tk_vu32[KC - 1];
+       i < 4 * (rounds + 1);
+       i += 2, j += 2)
     {
-      tk_u32[0] ^=
-       le_bswap32(
-         _gcry_aes_sbox4_ppc8(rol(le_bswap32(tk_u32[KC - 1]), 24)) ^ rcon);
+      unsigned int idx0 = keysched_idx(i + 0);
+      unsigned int idx1 = keysched_idx(i + 1);
+      vec_u32 temp0 = tk_prev;
+      vec_u32 temp1;
 
-      if (KC != 8)
+      if (j == KC)
         {
-          for (j = 1; j < KC; j++)
-            {
-              tk_u32[j] ^= tk_u32[j-1];
-            }
+          j = 0;
+          temp0 = (vec_u32)(asm_sbox_be((block)vec_rl(temp0, rotate24))) ^ rcon;
+          rcon = (vec_u32)(((block)rcon << 1)
+                           ^ (-((block)rcon >> 7) & (block)rcon_const));
         }
-      else
+      else if (KC == 8 && j == 4)
         {
-          for (j = 1; j < KC/2; j++)
-            {
-              tk_u32[j] ^= tk_u32[j-1];
-            }
-
-          tk_u32[KC/2] ^=
-           le_bswap32(_gcry_aes_sbox4_ppc8(le_bswap32(tk_u32[KC/2 - 1])));
-
-          for (j = KC/2 + 1; j < KC; j++)
-            {
-              tk_u32[j] ^= tk_u32[j-1];
-            }
+          temp0 = (vec_u32)asm_sbox_be((block)temp0);
         }
 
-      /* Copy values into round key array.  */
-      for (j = 0; (j < KC) && (r < rounds + 1); )
-        {
-          for (; (j < KC) && (t < 4); j++, t++)
-            {
-              W_u32[r][t] = le_bswap32(tk_u32[j]);
-            }
-          if (t == 4)
-            {
-              r++;
-              t = 0;
-            }
-        }
+      temp1 = tk_vu32[j + 0];
 
-      rcon = (rcon << 1) ^ (-(rcon >> 7) & 0x1b);
-    }
+      tk_vu32[j + 0] = temp0 ^ temp1;
+      tk_vu32[j + 1] ^= temp0 ^ temp1;
+      tk_prev = tk_vu32[j + 1];
 
-  /* Store in big-endian order. */
-  for (r = 0; r <= rounds; r++)
-    {
-#ifndef WORDS_BIGENDIAN
-      VEC_STORE_BE(ekey, r, ALIGNED_LOAD (ekey, r), bige_const);
-#else
-      block rvec = ALIGNED_LOAD (ekey, r);
-      ALIGNED_STORE (ekey, r,
-                     vec_perm(rvec, rvec, vec_bswap32_const));
-      (void)bige_const;
-#endif
+      W_u32[idx0] = u32_from_vec(vec_revb(tk_vu32[j + 0]));
+      W_u32[idx1] = u32_from_vec(vec_revb(tk_vu32[j + 1]));
     }
 
-#undef W
-#undef tk
-#undef k
-#undef W_u32
-#undef tk_u32
-#undef k_u32
-  wipememory(&tkk, sizeof(tkk));
+  wipememory(tk_vu32, sizeof(tk_vu32));
 }
 
-void
+
+void PPC_OPT_ATTR
 _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 {
   internal_aes_ppc_prepare_decryption (ctx);
@@ -245,6 +214,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 #define GCRY_AES_PPC8 1
 #define ENCRYPT_BLOCK_FUNC     _gcry_aes_ppc8_encrypt
 #define DECRYPT_BLOCK_FUNC     _gcry_aes_ppc8_decrypt
+#define ECB_CRYPT_FUNC         _gcry_aes_ppc8_ecb_crypt
 #define CFB_ENC_FUNC           _gcry_aes_ppc8_cfb_enc
 #define CFB_DEC_FUNC           _gcry_aes_ppc8_cfb_dec
 #define CBC_ENC_FUNC           _gcry_aes_ppc8_cbc_enc
@@ -253,6 +223,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 #define OCB_CRYPT_FUNC         _gcry_aes_ppc8_ocb_crypt
 #define OCB_AUTH_FUNC          _gcry_aes_ppc8_ocb_auth
 #define XTS_CRYPT_FUNC         _gcry_aes_ppc8_xts_crypt
+#define CTR32LE_ENC_FUNC       _gcry_aes_ppc8_ctr32le_enc
 
 #include <rijndael-ppc-functions.h>
 
index facdedd4f24d3867d2dcaa03ddd354834b398bb5..6a44bcf33bfec0050441bfc69485090fc2e56c37 100644 (file)
 #include "rijndael-ppc-common.h"
 
 
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define PPC_OPT_ATTR __attribute__((target("arch=pwr9"))) FUNC_ATTR_OPT
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define PPC_OPT_ATTR __attribute__((target("cpu=power9"))) FUNC_ATTR_OPT
+#else
+# define PPC_OPT_ATTR FUNC_ATTR_OPT
+#endif
+
+
 static ASM_FUNC_ATTR_INLINE block
 asm_load_be_const(void)
 {
@@ -88,6 +103,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
 #define GCRY_AES_PPC9LE 1
 #define ENCRYPT_BLOCK_FUNC     _gcry_aes_ppc9le_encrypt
 #define DECRYPT_BLOCK_FUNC     _gcry_aes_ppc9le_decrypt
+#define ECB_CRYPT_FUNC         _gcry_aes_ppc9le_ecb_crypt
 #define CFB_ENC_FUNC           _gcry_aes_ppc9le_cfb_enc
 #define CFB_DEC_FUNC           _gcry_aes_ppc9le_cfb_dec
 #define CBC_ENC_FUNC           _gcry_aes_ppc9le_cbc_enc
@@ -96,6 +112,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
 #define OCB_CRYPT_FUNC         _gcry_aes_ppc9le_ocb_crypt
 #define OCB_AUTH_FUNC          _gcry_aes_ppc9le_ocb_auth
 #define XTS_CRYPT_FUNC         _gcry_aes_ppc9le_xts_crypt
+#define CTR32LE_ENC_FUNC       _gcry_aes_ppc9le_ctr32le_enc
 
 #include <rijndael-ppc-functions.h>
 
index e50537ed9922dbee7d7a378b401b3c24e26b8062..0a26020e22fbb2b74132186fea33069633af067c 100644 (file)
@@ -1058,7 +1058,7 @@ int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx,
       func = KM_FUNCTION_AES_256;
       func_xts = KM_FUNCTION_XTS_AES_256;
       func_mask = km_function_to_mask(KM_FUNCTION_AES_256);
-      func_xts_mask = km_function_to_mask(KM_FUNCTION_AES_256);
+      func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_256);
       break;
     }
 
index b98dca26ed954645abeb4d93cd1984928ed0e947..5153cb282b05c0836106c3f96db6d0adf4b36ec3 100644 (file)
@@ -47,6 +47,7 @@
 ##
 ##  _gcry_aes_ssse3_enc_preload
 ##
+.align 16
 ELF(.type _gcry_aes_ssse3_enc_preload,@function)
 .globl _gcry_aes_ssse3_enc_preload
 _gcry_aes_ssse3_enc_preload:
@@ -68,6 +69,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
 ##
 ##  _gcry_aes_ssse3_dec_preload
 ##
+.align 16
 ELF(.type _gcry_aes_ssse3_dec_preload,@function)
 .globl _gcry_aes_ssse3_dec_preload
 _gcry_aes_ssse3_dec_preload:
@@ -689,8 +691,11 @@ ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core)
 ##                                                    ##
 ########################################################
 
+SECTION_RODATA
+
 .align 16
-ELF(.type _aes_consts,@object)
+ELF(.type _aes_ssse3_consts,@object)
+_aes_ssse3_consts:
 .Laes_consts:
 _aes_consts:
        # s0F
index b07238531c815621f6173d1dacf53a08ca1ed011..0f0abf6286be5b2c21dee645751be930ea59aa94 100644 (file)
@@ -43,7 +43,6 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
-#include "cipher-selftest.h"
 #include "rijndael-internal.h"
 #include "./cipher-internal.h"
 
index b54d95939353ee760ee2d69d0c268998162f8ba3..e46ce08c03310b8bf3fdaf87ed3ef778274453b5 100644 (file)
@@ -218,10 +218,3 @@ static struct
 
 #define decT dec_tables.T
 #define inv_sbox dec_tables.inv_sbox
-
-static const u32 rcon[30] =
-  {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c,
-    0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35,
-    0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91
-  };
index f94b58dbcffb6e6a2759e6710392dfa6c22c9894..51ccf932ef739bd43fd027aab830fc1f8d85c039 100644 (file)
@@ -1,5 +1,5 @@
 /* VAES/AVX2 AMD64 accelerated AES for Libgcrypt
- * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2021,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -86,6 +86,7 @@
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_cbc_dec_amd64,@function)
 .globl _gcry_vaes_avx2_cbc_dec_amd64
+.align 16
 _gcry_vaes_avx2_cbc_dec_amd64:
        /* input:
         *      %rdi: round keys
@@ -118,6 +119,7 @@ _gcry_vaes_avx2_cbc_dec_amd64:
        vmovdqu (10 * 16)(%rcx), %ymm5;
        vmovdqu (12 * 16)(%rcx), %ymm6;
        vmovdqu (14 * 16)(%rcx), %ymm7;
+       vinserti128 $1, %xmm0, %ymm15, %ymm9;
        vpxor %ymm8, %ymm0, %ymm0;
        vpxor %ymm8, %ymm1, %ymm1;
        vpxor %ymm8, %ymm2, %ymm2;
@@ -127,7 +129,6 @@ _gcry_vaes_avx2_cbc_dec_amd64:
        vpxor %ymm8, %ymm6, %ymm6;
        vpxor %ymm8, %ymm7, %ymm7;
        vbroadcasti128 (1 * 16)(%rdi), %ymm8;
-       vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9;
        vmovdqu (1 * 16)(%rcx), %ymm10;
        vmovdqu (3 * 16)(%rcx), %ymm11;
        vmovdqu (5 * 16)(%rcx), %ymm12;
@@ -211,12 +212,12 @@ _gcry_vaes_avx2_cbc_dec_amd64:
        vmovdqu (2 * 16)(%rcx), %ymm1;
        vmovdqu (4 * 16)(%rcx), %ymm2;
        vmovdqu (6 * 16)(%rcx), %ymm3;
+       vinserti128 $1, %xmm0, %ymm15, %ymm10;
        vpxor %ymm4, %ymm0, %ymm0;
        vpxor %ymm4, %ymm1, %ymm1;
        vpxor %ymm4, %ymm2, %ymm2;
        vpxor %ymm4, %ymm3, %ymm3;
        vbroadcasti128 (1 * 16)(%rdi), %ymm4;
-       vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
        vmovdqu (1 * 16)(%rcx), %ymm11;
        vmovdqu (3 * 16)(%rcx), %ymm12;
        vmovdqu (5 * 16)(%rcx), %ymm13;
@@ -282,10 +283,10 @@ _gcry_vaes_avx2_cbc_dec_amd64:
        vbroadcasti128 (0 * 16)(%rdi), %ymm4;
        vmovdqu (0 * 16)(%rcx), %ymm0;
        vmovdqu (2 * 16)(%rcx), %ymm1;
+       vinserti128 $1, %xmm0, %ymm15, %ymm10;
        vpxor %ymm4, %ymm0, %ymm0;
        vpxor %ymm4, %ymm1, %ymm1;
        vbroadcasti128 (1 * 16)(%rdi), %ymm4;
-       vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
        vmovdqu (1 * 16)(%rcx), %ymm11;
        vmovdqu (3 * 16)(%rcx), %xmm15;
        leaq (4 * 16)(%rcx), %rcx;
@@ -392,6 +393,7 @@ ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64)
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_cfb_dec_amd64,@function)
 .globl _gcry_vaes_avx2_cfb_dec_amd64
+.align 16
 _gcry_vaes_avx2_cfb_dec_amd64:
        /* input:
         *      %rdi: round keys
@@ -416,7 +418,8 @@ _gcry_vaes_avx2_cfb_dec_amd64:
 
        /* Load input and xor first key. Update IV. */
        vbroadcasti128 (0 * 16)(%rdi), %ymm8;
-       vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+       vmovdqu (0 * 16)(%rcx), %ymm9;
+       vinserti128 $1, %xmm9, %ymm15, %ymm0;
        vmovdqu (1 * 16)(%rcx), %ymm1;
        vmovdqu (3 * 16)(%rcx), %ymm2;
        vmovdqu (5 * 16)(%rcx), %ymm3;
@@ -434,7 +437,6 @@ _gcry_vaes_avx2_cfb_dec_amd64:
        vpxor %ymm8, %ymm6, %ymm6;
        vpxor %ymm8, %ymm7, %ymm7;
        vbroadcasti128 (1 * 16)(%rdi), %ymm8;
-       vmovdqu (0 * 16)(%rcx), %ymm9;
        vmovdqu (2 * 16)(%rcx), %ymm10;
        vmovdqu (4 * 16)(%rcx), %ymm11;
        vmovdqu (6 * 16)(%rcx), %ymm12;
@@ -514,7 +516,8 @@ _gcry_vaes_avx2_cfb_dec_amd64:
 
        /* Load input and xor first key. Update IV. */
        vbroadcasti128 (0 * 16)(%rdi), %ymm4;
-       vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+       vmovdqu (0 * 16)(%rcx), %ymm10;
+       vinserti128 $1, %xmm10, %ymm15, %ymm0;
        vmovdqu (1 * 16)(%rcx), %ymm1;
        vmovdqu (3 * 16)(%rcx), %ymm2;
        vmovdqu (5 * 16)(%rcx), %ymm3;
@@ -524,7 +527,6 @@ _gcry_vaes_avx2_cfb_dec_amd64:
        vpxor %ymm4, %ymm2, %ymm2;
        vpxor %ymm4, %ymm3, %ymm3;
        vbroadcasti128 (1 * 16)(%rdi), %ymm4;
-       vmovdqu (0 * 16)(%rcx), %ymm10;
        vmovdqu (2 * 16)(%rcx), %ymm11;
        vmovdqu (4 * 16)(%rcx), %ymm12;
        vmovdqu (6 * 16)(%rcx), %ymm13;
@@ -588,13 +590,13 @@ _gcry_vaes_avx2_cfb_dec_amd64:
 
        /* Load input and xor first key. Update IV. */
        vbroadcasti128 (0 * 16)(%rdi), %ymm4;
-       vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+       vmovdqu (0 * 16)(%rcx), %ymm10;
+       vinserti128 $1, %xmm10, %ymm15, %ymm0;
        vmovdqu (1 * 16)(%rcx), %ymm1;
        vmovdqu (3 * 16)(%rcx), %xmm15;
        vpxor %ymm4, %ymm0, %ymm0;
        vpxor %ymm4, %ymm1, %ymm1;
        vbroadcasti128 (1 * 16)(%rdi), %ymm4;
-       vmovdqu (0 * 16)(%rcx), %ymm10;
        vmovdqu (2 * 16)(%rcx), %ymm11;
 
        leaq (4 * 16)(%rcx), %rcx;
@@ -700,6 +702,7 @@ ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64)
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_ctr_enc_amd64,@function)
 .globl _gcry_vaes_avx2_ctr_enc_amd64
+.align 16
 _gcry_vaes_avx2_ctr_enc_amd64:
        /* input:
         *      %rdi: round keys
@@ -735,6 +738,16 @@ _gcry_vaes_avx2_ctr_enc_amd64:
        vpslldq $8, tmp2, tmp2; \
        vpsubq tmp2, x, x;
 
+#define handle_ctr_128bit_add(nblks) \
+       addq $(nblks), %r10; \
+       adcq $0, %r11; \
+       bswapq %r10; \
+       bswapq %r11; \
+       movq %r10, 8(%rsi); \
+       movq %r11, 0(%rsi); \
+       bswapq %r10; \
+       bswapq %r11;
+
        /* Process 16 blocks per loop. */
 .align 8
 .Lctr_enc_blk16:
@@ -750,6 +763,9 @@ _gcry_vaes_avx2_ctr_enc_amd64:
        addb $16, 15(%rsi);
        jc .Lctr_enc_blk16_handle_carry;
 
+       leaq 16(%r10), %r10;
+
+  .Lctr_enc_blk16_byte_bige_add:
        /* Increment counters. */
        vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0;
        vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1;
@@ -759,7 +775,6 @@ _gcry_vaes_avx2_ctr_enc_amd64:
        vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5;
        vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6;
        vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7;
-       leaq 16(%r10), %r10;
 
   .Lctr_enc_blk16_rounds:
        /* AES rounds */
@@ -826,22 +841,21 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
        jmp .Lctr_enc_blk16;
 
+  .align 8
+  .Lctr_enc_blk16_handle_only_ctr_carry:
+       handle_ctr_128bit_add(16);
+       jmp .Lctr_enc_blk16_byte_bige_add;
+
   .align 8
   .Lctr_enc_blk16_handle_carry:
+       jz .Lctr_enc_blk16_handle_only_ctr_carry;
        /* Increment counters (handle carry). */
        vpshufb %xmm13, %xmm7, %xmm1; /* be => le */
        vmovdqa %xmm1, %xmm0;
        inc_le128(%xmm1, %xmm15, %xmm5);
        vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */
        vpshufb %ymm13, %ymm7, %ymm0;
-       addq $16, %r10;
-       adcq $0, %r11;
-       bswapq %r10;
-       bswapq %r11;
-       movq %r10, 8(%rsi);
-       movq %r11, 0(%rsi);
-       bswapq %r10;
-       bswapq %r11;
+       handle_ctr_128bit_add(16);
        add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */
        vpshufb %ymm13, %ymm7, %ymm1;
        add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */
@@ -874,12 +888,14 @@ _gcry_vaes_avx2_ctr_enc_amd64:
        addb $8, 15(%rsi);
        jc .Lctr_enc_blk8_handle_carry;
 
+       leaq 8(%r10), %r10;
+
+  .Lctr_enc_blk8_byte_bige_add:
        /* Increment counters. */
        vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
        vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
        vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2;
        vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3;
-       leaq 8(%r10), %r10;
 
   .Lctr_enc_blk8_rounds:
        /* AES rounds */
@@ -934,22 +950,21 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
        jmp .Lctr_enc_blk4;
 
+  .align 8
+  .Lctr_enc_blk8_handle_only_ctr_carry:
+       handle_ctr_128bit_add(8);
+       jmp .Lctr_enc_blk8_byte_bige_add;
+
   .align 8
   .Lctr_enc_blk8_handle_carry:
+       jz .Lctr_enc_blk8_handle_only_ctr_carry;
        /* Increment counters (handle carry). */
        vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
        vmovdqa %xmm1, %xmm0;
        inc_le128(%xmm1, %xmm15, %xmm5);
        vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
        vpshufb %ymm13, %ymm3, %ymm0;
-       addq $8, %r10;
-       adcq $0, %r11;
-       bswapq %r10;
-       bswapq %r11;
-       movq %r10, 8(%rsi);
-       movq %r11, 0(%rsi);
-       bswapq %r10;
-       bswapq %r11;
+       handle_ctr_128bit_add(8);
        add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
        vpshufb %ymm13, %ymm3, %ymm1;
        add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */
@@ -974,10 +989,12 @@ _gcry_vaes_avx2_ctr_enc_amd64:
        addb $4, 15(%rsi);
        jc .Lctr_enc_blk4_handle_carry;
 
+       leaq 4(%r10), %r10;
+
+  .Lctr_enc_blk4_byte_bige_add:
        /* Increment counters. */
        vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
        vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
-       leaq 4(%r10), %r10;
 
   .Lctr_enc_blk4_rounds:
        /* AES rounds */
@@ -1026,22 +1043,21 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
        jmp .Lctr_enc_blk1;
 
+  .align 8
+  .Lctr_enc_blk4_handle_only_ctr_carry:
+       handle_ctr_128bit_add(4);
+       jmp .Lctr_enc_blk4_byte_bige_add;
+
   .align 8
   .Lctr_enc_blk4_handle_carry:
+       jz .Lctr_enc_blk4_handle_only_ctr_carry;
        /* Increment counters (handle carry). */
        vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
        vmovdqa %xmm1, %xmm0;
        inc_le128(%xmm1, %xmm15, %xmm5);
        vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
        vpshufb %ymm13, %ymm3, %ymm0;
-       addq $4, %r10;
-       adcq $0, %r11;
-       bswapq %r10;
-       bswapq %r11;
-       movq %r10, 8(%rsi);
-       movq %r11, 0(%rsi);
-       bswapq %r10;
-       bswapq %r11;
+       handle_ctr_128bit_add(4);
        add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
        vpshufb %ymm13, %ymm3, %ymm1;
 
@@ -1057,14 +1073,7 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
        /* Load and increament counter. */
        vmovdqu (%rsi), %xmm0;
-       addq $1, %r10;
-       adcq $0, %r11;
-       bswapq %r10;
-       bswapq %r11;
-       movq %r10, 8(%rsi);
-       movq %r11, 0(%rsi);
-       bswapq %r10;
-       bswapq %r11;
+       handle_ctr_128bit_add(1);
 
        /* AES rounds. */
        vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
@@ -1112,6 +1121,7 @@ ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64)
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_ctr32le_enc_amd64,@function)
 .globl _gcry_vaes_avx2_ctr32le_enc_amd64
+.align 16
 _gcry_vaes_avx2_ctr32le_enc_amd64:
        /* input:
         *      %rdi: round keys
@@ -1392,155 +1402,11 @@ _gcry_vaes_avx2_ctr32le_enc_amd64:
 ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)
 
 /**********************************************************************
-  OCB-mode encryption/decryption
+  OCB-mode encryption/decryption/authentication
  **********************************************************************/
-ELF(.type _gcry_vaes_avx2_ocb_checksum,@function)
-_gcry_vaes_avx2_ocb_checksum:
-       /* input:
-        *      %rax:     offset pointer
-        *      %r10:     plaintext pointer
-        *      %r11:     nblocks
-        */
-       CFI_STARTPROC();
-
-       vpxor %xmm0, %xmm0, %xmm0;
-       cmpq $4, %r11;
-       jb .Locb_checksum_blk1;
-       vpxor %xmm1, %xmm1, %xmm1;
-       vpxor %xmm2, %xmm2, %xmm2;
-       vpxor %xmm3, %xmm3, %xmm3;
-       cmpq $16, %r11;
-       jb .Locb_checksum_blk4;
-       vpxor %xmm4, %xmm4, %xmm4;
-       vpxor %xmm5, %xmm5, %xmm5;
-       vpxor %xmm6, %xmm6, %xmm6;
-       vpxor %xmm7, %xmm7, %xmm7;
-       cmpq $32, %r11;
-       jb .Locb_checksum_blk16;
-       vpxor %xmm8, %xmm8, %xmm8;
-       vpxor %xmm9, %xmm9, %xmm9;
-       vpxor %xmm10, %xmm10, %xmm10;
-       vpxor %xmm11, %xmm11, %xmm11;
-       vpxor %xmm12, %xmm12, %xmm12;
-       vpxor %xmm13, %xmm13, %xmm13;
-       vpxor %xmm14, %xmm14, %xmm14;
-       vpxor %xmm15, %xmm15, %xmm15;
-
-.align 8
-.Locb_checksum_blk32:
-       cmpq $32, %r11;
-       jb .Locb_checksum_blk32_done;
-
-       leaq -32(%r11), %r11;
-
-       vpxor (0 * 16)(%r10), %ymm0, %ymm0;
-       vpxor (2 * 16)(%r10), %ymm1, %ymm1;
-       vpxor (4 * 16)(%r10), %ymm2, %ymm2;
-       vpxor (6 * 16)(%r10), %ymm3, %ymm3;
-       vpxor (8 * 16)(%r10), %ymm4, %ymm4;
-       vpxor (10 * 16)(%r10), %ymm5, %ymm5;
-       vpxor (12 * 16)(%r10), %ymm6, %ymm6;
-       vpxor (14 * 16)(%r10), %ymm7, %ymm7;
-       vpxor (16 * 16)(%r10), %ymm8, %ymm8;
-       vpxor (18 * 16)(%r10), %ymm9, %ymm9;
-       vpxor (20 * 16)(%r10), %ymm10, %ymm10;
-       vpxor (22 * 16)(%r10), %ymm11, %ymm11;
-       vpxor (24 * 16)(%r10), %ymm12, %ymm12;
-       vpxor (26 * 16)(%r10), %ymm13, %ymm13;
-       vpxor (28 * 16)(%r10), %ymm14, %ymm14;
-       vpxor (30 * 16)(%r10), %ymm15, %ymm15;
-       leaq (32 * 16)(%r10), %r10;
-
-       jmp .Locb_checksum_blk32;
-
-.align 8
-.Locb_checksum_blk32_done:
-       vpxor %ymm8, %ymm0, %ymm0;
-       vpxor %ymm9, %ymm1, %ymm1;
-       vpxor %ymm10, %ymm2, %ymm2;
-       vpxor %ymm11, %ymm3, %ymm3;
-       vpxor %ymm12, %ymm4, %ymm4;
-       vpxor %ymm13, %ymm5, %ymm5;
-       vpxor %ymm14, %ymm6, %ymm6;
-       vpxor %ymm15, %ymm7, %ymm7;
-
-.align 8
-.Locb_checksum_blk16:
-       cmpq $16, %r11;
-       jb .Locb_checksum_blk16_done;
-
-       leaq -16(%r11), %r11;
-
-       vpxor (0 * 16)(%r10), %ymm0, %ymm0;
-       vpxor (2 * 16)(%r10), %ymm1, %ymm1;
-       vpxor (4 * 16)(%r10), %ymm2, %ymm2;
-       vpxor (6 * 16)(%r10), %ymm3, %ymm3;
-       vpxor (8 * 16)(%r10), %ymm4, %ymm4;
-       vpxor (10 * 16)(%r10), %ymm5, %ymm5;
-       vpxor (12 * 16)(%r10), %ymm6, %ymm6;
-       vpxor (14 * 16)(%r10), %ymm7, %ymm7;
-       leaq (16 * 16)(%r10), %r10;
-
-       jmp .Locb_checksum_blk16;
-
-.align 8
-.Locb_checksum_blk16_done:
-       vpxor %ymm4, %ymm0, %ymm0;
-       vpxor %ymm5, %ymm1, %ymm1;
-       vpxor %ymm6, %ymm2, %ymm2;
-       vpxor %ymm7, %ymm3, %ymm3;
-       vextracti128 $1, %ymm0, %xmm4;
-       vextracti128 $1, %ymm1, %xmm5;
-       vextracti128 $1, %ymm2, %xmm6;
-       vextracti128 $1, %ymm3, %xmm7;
-       vpxor %xmm4, %xmm0, %xmm0;
-       vpxor %xmm5, %xmm1, %xmm1;
-       vpxor %xmm6, %xmm2, %xmm2;
-       vpxor %xmm7, %xmm3, %xmm3;
-
-.align 8
-.Locb_checksum_blk4:
-       cmpq $4, %r11;
-       jb .Locb_checksum_blk4_done;
-
-       leaq -4(%r11), %r11;
-
-       vpxor (0 * 16)(%r10), %xmm0, %xmm0;
-       vpxor (1 * 16)(%r10), %xmm1, %xmm1;
-       vpxor (2 * 16)(%r10), %xmm2, %xmm2;
-       vpxor (3 * 16)(%r10), %xmm3, %xmm3;
-       leaq (4 * 16)(%r10), %r10;
-
-       jmp .Locb_checksum_blk4;
-
-.align 8
-.Locb_checksum_blk4_done:
-       vpxor %xmm1, %xmm0, %xmm0;
-       vpxor %xmm3, %xmm2, %xmm2;
-       vpxor %xmm2, %xmm0, %xmm0;
-
-.align 8
-.Locb_checksum_blk1:
-       cmpq $1, %r11;
-       jb .Locb_checksum_done;
-
-       leaq -1(%r11), %r11;
-
-       vpxor (%r10), %xmm0, %xmm0;
-       leaq 16(%r10), %r10;
-
-       jmp .Locb_checksum_blk1;
-
-.align 8
-.Locb_checksum_done:
-       vpxor (%rax), %xmm0, %xmm0;
-       vmovdqu %xmm0, (%rax);
-       ret_spec_stop;
-       CFI_ENDPROC();
-ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum)
-
 ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function)
 .globl _gcry_vaes_avx2_ocb_crypt_amd64
+.align 16
 _gcry_vaes_avx2_ocb_crypt_amd64:
        /* input:
         *      %rdi:     round keys
@@ -1552,12 +1418,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         *      16(%rbp): offset
         *      24(%rbp): checksum
         *      32(%rbp): L-array
-        *      40(%rbp): encrypt (%r15d)
+        *      40(%rbp): decrypt/encrypt/auth (%r15d)
         */
        CFI_STARTPROC();
 
-#define STACK_REGS_POS (16 * 16 + 4 * 16)
-#define STACK_ALLOC (STACK_REGS_POS + 6 * 8)
+#define STACK_REGS_POS (16 * 16 + 4 * 16 + 2 * 16)
+#define STACK_ALLOC    (STACK_REGS_POS + 5 * 8)
+#define OFFSET_PTR_Q   16(%rbp)
+#define CHECKSUM_PTR_Q 24(%rbp)
+#define L_ARRAY_PTR_L  32(%rbp)
+#define OPER_MODE_L    40(%rbp)
 
        pushq %rbp;
        CFI_PUSH(%rbp);
@@ -1575,37 +1445,20 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8);
        movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp);
        CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8);
+       movq %rbx, (STACK_REGS_POS + 4 * 8)(%rsp);
+       CFI_REG_ON_STACK(rbx, STACK_REGS_POS + 4 * 8);
 
-       movl 40(%rbp), %r15d; /* encrypt-flag. */
-       movq 16(%rbp), %r14; /* offset ptr. */
-
-       /* Handle encryption checksumming. */
-       testl %r15d, %r15d;
-       jz .Locb_dec_checksum_prepare;
-       movq 24(%rbp), %rax; /* checksum ptr. */
-       movq %rcx, %r10;
-       movq %r8, %r11;
-       call _gcry_vaes_avx2_ocb_checksum;
-       jmp .Locb_enc_checksum_done;
-.Locb_dec_checksum_prepare:
-       /* Store plaintext address and number of blocks for decryption
-        * checksumming. */
-       movq %rdx, (STACK_REGS_POS + 4 * 8)(%rsp);
-       movq %r8, (STACK_REGS_POS + 5 * 8)(%rsp);
-.Locb_enc_checksum_done:
+       movl OPER_MODE_L, %r15d; /* decrypt/encrypt/auth-mode. */
+       movq OFFSET_PTR_Q, %r14; /* offset ptr. */
+       movq CHECKSUM_PTR_Q, %rbx; /* checksum ptr. */
 
+       leal (, %r9d, 4), %eax;
        vmovdqu (%r14), %xmm15; /* Load offset. */
-       movq 32(%rbp), %r14; /* L-array ptr. */
+       movq L_ARRAY_PTR_L, %r14; /* L-array ptr. */
        vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */
-       movl $(10 * 16), %eax;
-       cmpl $12, %r9d;
-       jb .Llast_key_ptr;
-       movl $(12 * 16), %eax;
-       je .Llast_key_ptr;
-       movl $(14 * 16), %eax;
-  .align 8
-  .Llast_key_ptr:
-       vpxor (%rdi, %rax), %xmm0, %xmm0; /* first key ^ last key */
+       vpxor %xmm14, %xmm14, %xmm14;
+       vpxor %xmm13, %xmm13, %xmm13;
+       vpxor (%rdi, %rax, 4), %xmm0, %xmm0; /* first key ^ last key */
        vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */
        vmovdqa %xmm0, (14 * 16)(%rsp);
        vmovdqa %xmm0, (15 * 16)(%rsp);
@@ -1678,16 +1531,25 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        vinserti128 $1, %xmm10, %ymm9, %ymm7;
        vinserti128 $1, %xmm15, %ymm11, %ymm8;
 
-       vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
-       vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
-       vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
-       vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
-       leaq (8 * 16)(%rcx), %rcx;
-
-       vmovdqa (14 * 16)(%rsp), %ymm9;
+       cmpl $1, %r15d;
+       jb .Locb_unaligned_blk8_dec;
+       ja .Locb_unaligned_blk8_auth;
+               vmovdqu (0 * 16)(%rcx), %ymm0;
+               vmovdqu (2 * 16)(%rcx), %ymm1;
+               vmovdqu (4 * 16)(%rcx), %ymm2;
+               vmovdqu (6 * 16)(%rcx), %ymm3;
+               leaq (8 * 16)(%rcx), %rcx;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %ymm1, %ymm13, %ymm13;
+               vpxor %ymm2, %ymm14, %ymm14;
+               vpxor %ymm3, %ymm13, %ymm13;
+               vpxor %ymm5, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm1, %ymm1;
+               vpxor %ymm7, %ymm2, %ymm2;
+               vpxor %ymm8, %ymm3, %ymm3;
+
+               vmovdqa (14 * 16)(%rsp), %ymm9;
 
-       testl %r15d, %r15d;
-       jz .Locb_unaligned_blk8_dec;
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm4;
                VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -1737,8 +1599,69 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
                jmp .Locb_unaligned_blk8;
 
+       .align 8
+       .Locb_unaligned_blk8_auth:
+               vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+               vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+               vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+               leaq (8 * 16)(%rcx), %rcx;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Locb_unaligned_blk8_auth_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Locb_unaligned_blk8_auth_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+               /* Last round and output handling. */
+       .Locb_unaligned_blk8_auth_last:
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vaesenclast %ymm4, %ymm2, %ymm2;
+               vaesenclast %ymm4, %ymm3, %ymm3;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %ymm1, %ymm13, %ymm13;
+               vpxor %ymm2, %ymm14, %ymm14;
+               vpxor %ymm3, %ymm13, %ymm13;
+
+               jmp .Locb_unaligned_blk8;
+
        .align 8
        .Locb_unaligned_blk8_dec:
+               vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+               vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+               vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+               leaq (8 * 16)(%rcx), %rcx;
+
+               vmovdqa (14 * 16)(%rsp), %ymm9;
+
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm4;
                VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -1780,6 +1703,10 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
                vaesdeclast %ymm6, %ymm1, %ymm1;
                vaesdeclast %ymm7, %ymm2, %ymm2;
                vaesdeclast %ymm4, %ymm3, %ymm3;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %ymm1, %ymm13, %ymm13;
+               vpxor %ymm2, %ymm14, %ymm14;
+               vpxor %ymm3, %ymm13, %ymm13;
                vmovdqu %ymm0, (0 * 16)(%rdx);
                vmovdqu %ymm1, (2 * 16)(%rdx);
                vmovdqu %ymm2, (4 * 16)(%rdx);
@@ -1817,12 +1744,17 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        vpxor (%r14, %rax), %xmm7, %xmm15;
        vinserti128 $1, %xmm15, %ymm7, %ymm6;
 
-       vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
-       vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
-       leaq (4 * 16)(%rcx), %rcx;
+       cmpl $1, %r15d;
+       jb .Locb_unaligned_blk4_dec;
+       ja .Locb_unaligned_blk4_auth;
+               vmovdqu (0 * 16)(%rcx), %ymm0;
+               vmovdqu (2 * 16)(%rcx), %ymm1;
+               leaq (4 * 16)(%rcx), %rcx;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %ymm1, %ymm13, %ymm13;
+               vpxor %ymm5, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm1, %ymm1;
 
-       testl %r15d, %r15d;
-       jz .Locb_unaligned_blk4_dec;
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm4;
                VAESENC2(%ymm4, %ymm0, %ymm1);
@@ -1867,8 +1799,59 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
                jmp .Locb_unaligned_blk1;
 
+       .align 8
+       .Locb_unaligned_blk4_auth:
+               vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+               leaq (4 * 16)(%rcx), %rcx;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Locb_unaligned_blk4_auth_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Locb_unaligned_blk4_auth_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+             /* Last round and output handling. */
+       .Locb_unaligned_blk4_auth_last:
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %ymm1, %ymm13, %ymm13;
+
+               jmp .Locb_unaligned_blk1;
+
        .align 8
        .Locb_unaligned_blk4_dec:
+               vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+               leaq (4 * 16)(%rcx), %rcx;
+
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm4;
                VAESDEC2(%ymm4, %ymm0, %ymm1);
@@ -1907,6 +1890,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
                vpxor %ymm6, %ymm8, %ymm6;
                vaesdeclast %ymm5, %ymm0, %ymm0;
                vaesdeclast %ymm6, %ymm1, %ymm1;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %ymm1, %ymm13, %ymm13;
                vmovdqu %ymm0, (0 * 16)(%rdx);
                vmovdqu %ymm1, (2 * 16)(%rdx);
                leaq (4 * 16)(%rdx), %rdx;
@@ -1924,11 +1909,15 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        tzcntl %esi, %r11d;
        shll $4, %r11d;
        vpxor (%r14, %r11), %xmm15, %xmm15;
-       vpxor (%rcx), %xmm15, %xmm0;
-       leaq 16(%rcx), %rcx;
 
-       testl %r15d, %r15d;
-       jz .Locb_unaligned_blk1_dec;
+       cmpl $1, %r15d;
+       jb .Locb_unaligned_blk1_dec;
+       ja .Locb_unaligned_blk1_auth;
+               vmovdqu (%rcx), %xmm0;
+               vpxor %ymm0, %ymm14, %ymm14;
+               vpxor %xmm15, %xmm0, %xmm0;
+               leaq 16(%rcx), %rcx;
+
                /* AES rounds. */
                vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
                vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
@@ -1956,8 +1945,44 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
                jmp .Locb_unaligned_blk1;
 
+       .align 8
+       .Locb_unaligned_blk1_auth:
+               vpxor (%rcx), %xmm15, %xmm0;
+               leaq 16(%rcx), %rcx;
+
+               /* AES rounds. */
+               vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%rdi), %xmm1;
+               cmpl $12, %r9d;
+               jb .Locb_unaligned_blk1_auth_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%rdi), %xmm1;
+               jz .Locb_unaligned_blk1_auth_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%rdi), %xmm1;
+
+               /* Last round and output handling. */
+       .Locb_unaligned_blk1_auth_last:
+               vaesenclast %xmm1, %xmm0, %xmm0;
+               vpxor %ymm0, %ymm14, %ymm14;
+
+               jmp .Locb_unaligned_blk1;
+
        .align 8
        .Locb_unaligned_blk1_dec:
+               vpxor (%rcx), %xmm15, %xmm0;
+               leaq 16(%rcx), %rcx;
+
                /* AES rounds. */
                vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
                vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
@@ -1980,6 +2005,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        .Locb_unaligned_blk1_dec_last:
                vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
                vaesdeclast %xmm1, %xmm0, %xmm0;
+               vpxor %ymm0, %ymm14, %ymm14;
                vmovdqu %xmm0, (%rdx);
                leaq 16(%rdx), %rdx;
 
@@ -2021,6 +2047,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        vmovdqu (1 * 16)(%r14), %xmm1;
        vmovdqu (2 * 16)(%r14), %xmm2;
        vmovdqu (3 * 16)(%r14), %xmm3;
+       vpxor %ymm13, %ymm14, %ymm14;
+       vmovdqa %ymm14, (20 * 16)(%rsp);
        vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */
        vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */
        vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */
@@ -2069,26 +2097,41 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
        vinserti128 $1, %xmm14, %ymm13, %ymm14;
 
-       vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
-       vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
-
-       vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
-       vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
-       vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
-       vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
-       vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
-       vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
-       vmovdqa %ymm13, (16 * 16)(%rsp);
-       vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
-       vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
-       vmovdqa %ymm13, (18 * 16)(%rsp);
-
-       leaq (16 * 16)(%rcx), %rcx;
-
-       vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+       cmpl $1, %r15d;
+       jb .Locb_aligned_blk16_dec;
+       ja .Locb_aligned_blk16_auth;
+               vmovdqu (0 * 16)(%rcx), %ymm0;
+               vmovdqu (2 * 16)(%rcx), %ymm1;
+               vmovdqu (4 * 16)(%rcx), %ymm2;
+               vmovdqu (6 * 16)(%rcx), %ymm3;
+               vpxor (8 * 16)(%rcx), %ymm0, %ymm4;
+               vpxor (10 * 16)(%rcx), %ymm1, %ymm5;
+               vpxor (12 * 16)(%rcx), %ymm2, %ymm6;
+               vpxor (14 * 16)(%rcx), %ymm3, %ymm7;
+               vpxor %ymm4, %ymm5, %ymm5;
+               vpxor %ymm6, %ymm7, %ymm7;
+               vpxor %ymm5, %ymm7, %ymm7;
+               vpxor (20 * 16)(%rsp), %ymm7, %ymm7;
+               vmovdqa %ymm7, (20 * 16)(%rsp);
+
+               vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+               vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+               vpxor %ymm8, %ymm0, %ymm0;
+               vpxor %ymm9, %ymm1, %ymm1;
+               vpxor %ymm10, %ymm2, %ymm2;
+               vpxor %ymm11, %ymm3, %ymm3;
+               vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+               vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+               vmovdqa %ymm13, (16 * 16)(%rsp);
+               vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+               vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+               vmovdqa %ymm13, (18 * 16)(%rsp);
+
+               leaq (16 * 16)(%rcx), %rcx;
+
+               vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
 
-       testl %r15d, %r15d;
-       jz .Locb_aligned_blk16_dec;
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm13;
                VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
@@ -2151,8 +2194,101 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
                jmp .Locb_aligned_blk16;
 
+       .align 8
+       .Locb_aligned_blk16_auth:
+               vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+               vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+               vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+               vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+               vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+               vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+               vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+               vmovdqa %ymm13, (16 * 16)(%rsp);
+               vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+               vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+               vmovdqa %ymm13, (18 * 16)(%rsp);
+
+               leaq (16 * 16)(%rcx), %rcx;
+
+               vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm13;
+               cmpl $12, %r9d;
+               jb .Locb_aligned_blk16_auth_last;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm13;
+               jz .Locb_aligned_blk16_auth_last;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm13;
+               VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm13;
+
+               /* Last round and output handling. */
+       .Locb_aligned_blk16_auth_last:
+               vaesenclast %ymm13, %ymm0, %ymm0;
+               vaesenclast %ymm13, %ymm1, %ymm1;
+               vaesenclast %ymm13, %ymm2, %ymm2;
+               vaesenclast %ymm13, %ymm3, %ymm3;
+               vaesenclast %ymm13, %ymm4, %ymm4;
+               vaesenclast %ymm13, %ymm5, %ymm5;
+               vaesenclast %ymm13, %ymm6, %ymm6;
+               vaesenclast %ymm13, %ymm7, %ymm7;
+
+               vpxor %ymm1, %ymm0, %ymm0;
+               vpxor %ymm3, %ymm2, %ymm2;
+               vpxor %ymm5, %ymm4, %ymm4;
+               vpxor %ymm7, %ymm6, %ymm6;
+               vpxor %ymm2, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm4, %ymm4;
+               vpxor %ymm4, %ymm0, %ymm0;
+               vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+               vmovdqa %ymm0, (20 * 16)(%rsp);
+
+               jmp .Locb_aligned_blk16;
+
        .align 8
        .Locb_aligned_blk16_dec:
+               vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+               vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+               vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+               vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+               vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+               vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+               vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+               vmovdqa %ymm13, (16 * 16)(%rsp);
+               vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+               vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+               vmovdqa %ymm13, (18 * 16)(%rsp);
+
+               leaq (16 * 16)(%rcx), %rcx;
+
+               vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm13;
                VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
@@ -2207,12 +2343,22 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
                vmovdqu %ymm1, (2 * 16)(%rdx);
                vmovdqu %ymm2, (4 * 16)(%rdx);
                vmovdqu %ymm3, (6 * 16)(%rdx);
+               vpxor %ymm1, %ymm0, %ymm0;
+               vpxor %ymm3, %ymm2, %ymm2;
                vmovdqu %ymm4, (8 * 16)(%rdx);
                vmovdqu %ymm5, (10 * 16)(%rdx);
                vmovdqu %ymm6, (12 * 16)(%rdx);
                vmovdqu %ymm7, (14 * 16)(%rdx);
+               vpxor %ymm5, %ymm4, %ymm4;
+               vpxor %ymm7, %ymm6, %ymm6;
                leaq (16 * 16)(%rdx), %rdx;
 
+               vpxor %ymm4, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm2, %ymm2;
+               vpxor %ymm2, %ymm0, %ymm0;
+               vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+               vmovdqa %ymm0, (20 * 16)(%rsp);
+
                jmp .Locb_aligned_blk16;
 
        /* Aligned: Process trailing eight blocks. */
@@ -2235,18 +2381,29 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
        vinserti128 $1, %xmm14, %ymm13, %ymm14;
 
-       vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
-       vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
-       vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
-       vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
-       leaq (8 * 16)(%rcx), %rcx;
-
-       vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+       cmpl $1, %r15d;
+       jb .Locb_aligned_blk8_dec;
+       ja .Locb_aligned_blk8_auth;
+               vmovdqu (0 * 16)(%rcx), %ymm0;
+               vmovdqu (2 * 16)(%rcx), %ymm1;
+               vmovdqu (4 * 16)(%rcx), %ymm2;
+               vmovdqu (6 * 16)(%rcx), %ymm3;
+               vpxor %ymm2, %ymm0, %ymm10;
+               vpxor %ymm3, %ymm1, %ymm11;
+               vpxor %ymm11, %ymm10, %ymm10;
+               vpxor (20 * 16)(%rsp), %ymm10, %ymm10;
+               vmovdqa %ymm10, (20 * 16)(%rsp);
+
+               vpxor %ymm5, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm1, %ymm1;
+               vpxor %ymm7, %ymm2, %ymm2;
+               vpxor %ymm14, %ymm3, %ymm3;
+               leaq (8 * 16)(%rcx), %rcx;
+
+               vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
 
-       vmovdqa (14 * 16)(%rsp), %ymm8;
+               vmovdqa (14 * 16)(%rsp), %ymm8;
 
-       testl %r15d, %r15d;
-       jz .Locb_aligned_blk8_dec;
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm4;
                VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -2297,7 +2454,74 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
                jmp .Locb_aligned_done;
 
        .align 8
-       .Locb_aligned_blk8_dec:
+       .Locb_aligned_blk8_auth:
+               vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+               vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+               vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+               leaq (8 * 16)(%rcx), %rcx;
+
+               vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Locb_aligned_blk8_auth_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Locb_aligned_blk8_auth_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+               /* Last round and output handling. */
+       .Locb_aligned_blk8_auth_last:
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vaesenclast %ymm4, %ymm2, %ymm2;
+               vaesenclast %ymm4, %ymm3, %ymm3;
+
+               vpxor %ymm1, %ymm0, %ymm0;
+               vpxor %ymm3, %ymm2, %ymm2;
+               vpxor %ymm2, %ymm0, %ymm0;
+               vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+               vmovdqa %ymm0, (20 * 16)(%rsp);
+
+               jmp .Locb_aligned_done;
+
+       .align 8
+       .Locb_aligned_blk8_dec:
+               vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+               vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+               vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+               vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+               leaq (8 * 16)(%rcx), %rcx;
+
+               vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+               vmovdqa (14 * 16)(%rsp), %ymm8;
+
                /* AES rounds */
                vbroadcasti128 (1 * 16)(%rdi), %ymm4;
                VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -2346,19 +2570,28 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
                vmovdqu %ymm3, (6 * 16)(%rdx);
                leaq (8 * 16)(%rdx), %rdx;
 
+               vpxor %ymm1, %ymm0, %ymm0;
+               vpxor %ymm3, %ymm2, %ymm2;
+               vpxor %ymm2, %ymm0, %ymm0;
+               vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+               vmovdqa %ymm0, (20 * 16)(%rsp);
+
 .align 8
 .Locb_aligned_done:
+       vmovdqa (20 * 16)(%rsp), %ymm14;
+       vpxor %xmm13, %xmm13, %xmm13;
+
        /* Burn stack. */
-       vpxor %ymm0, %ymm0, %ymm0;
-       vmovdqa %ymm0, (0 * 16)(%rsp);
-       vmovdqa %ymm0, (2 * 16)(%rsp);
-       vmovdqa %ymm0, (4 * 16)(%rsp);
-       vmovdqa %ymm0, (6 * 16)(%rsp);
-       vmovdqa %ymm0, (8 * 16)(%rsp);
-       vmovdqa %ymm0, (10 * 16)(%rsp);
-       vmovdqa %ymm0, (12 * 16)(%rsp);
-       vmovdqa %ymm0, (16 * 16)(%rsp);
-       vmovdqa %ymm0, (18 * 16)(%rsp);
+       vmovdqa %ymm13, (0 * 16)(%rsp);
+       vmovdqa %ymm13, (2 * 16)(%rsp);
+       vmovdqa %ymm13, (4 * 16)(%rsp);
+       vmovdqa %ymm13, (6 * 16)(%rsp);
+       vmovdqa %ymm13, (8 * 16)(%rsp);
+       vmovdqa %ymm13, (10 * 16)(%rsp);
+       vmovdqa %ymm13, (12 * 16)(%rsp);
+       vmovdqa %ymm13, (16 * 16)(%rsp);
+       vmovdqa %ymm13, (18 * 16)(%rsp);
+       vmovdqa %ymm13, (20 * 16)(%rsp);
 
        /* Handle tailing 1…7 blocks in nblk-unaligned loop. */
        movq %r8, %r10;
@@ -2367,20 +2600,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Ldone_ocb:
-       movq 16(%rbp), %r14; /* offset ptr. */
+       vpxor %ymm13, %ymm14, %ymm14;
+       vextracti128 $1, %ymm14, %xmm13;
+       vpxor (%rbx), %xmm14, %xmm14;
+       vpxor %xmm13, %xmm14, %xmm14;
+       vmovdqu %xmm14, (%rbx);
+
+       movq OFFSET_PTR_Q, %r14; /* offset ptr. */
        vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */
        vmovdqu %xmm15, (%r14); /* Store offset. */
 
-       /* Handle decryption checksumming. */
-
-       testl %r15d, %r15d;
-       jnz .Locb_dec_checksum_done;
-       movq 24(%rbp), %rax; /* checksum ptr. */
-       movq (STACK_REGS_POS + 4 * 8)(%rsp), %r10;
-       movq (STACK_REGS_POS + 5 * 8)(%rsp), %r11;
-       call _gcry_vaes_avx2_ocb_checksum;
-.Locb_dec_checksum_done:
-
        /* Burn stack. */
        vpxor %ymm0, %ymm0, %ymm0;
        vmovdqa %ymm0, (14 * 16)(%rsp);
@@ -2395,6 +2624,10 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
        CFI_RESTORE(%r14);
        movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15;
        CFI_RESTORE(%r15);
+       movq (STACK_REGS_POS + 4 * 8)(%rsp), %rbx;
+       CFI_RESTORE(%rbx);
+
+       xorl %eax, %eax;
 
        leave;
        CFI_LEAVE();
@@ -2407,10 +2640,11 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64)
 
 /**********************************************************************
-  CTR-mode encryption
+  XTS-mode encryption
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function)
 .globl _gcry_vaes_avx2_xts_crypt_amd64
+.align 16
 _gcry_vaes_avx2_xts_crypt_amd64:
        /* input:
         *      %rdi: round keys
@@ -2720,7 +2954,7 @@ _gcry_vaes_avx2_xts_crypt_amd64:
 .align 8
 .Lxts_crypt_blk4:
        /* Try exit early as typically input length is large power of 2. */
-       cmpq $0, %r8;
+       cmpq $1, %r8;
        jb .Ldone_xts_crypt;
        cmpq $4, %r8;
        jb .Lxts_crypt_blk1;
@@ -2923,9 +3157,442 @@ _gcry_vaes_avx2_xts_crypt_amd64:
        CFI_ENDPROC();
 ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64)
 
+/**********************************************************************
+  ECB-mode encryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64,@function)
+.globl _gcry_vaes_avx2_ecb_crypt_amd64
+.align 16
+_gcry_vaes_avx2_ecb_crypt_amd64:
+       /* input:
+        *      %rdi: round keys
+        *      %esi: encrypt
+        *      %rdx: dst
+        *      %rcx: src
+        *      %r8:  nblocks
+        *      %r9:  nrounds
+        */
+       CFI_STARTPROC();
+
+       /* Process 16 blocks per loop. */
+.align 8
+.Lecb_blk16:
+       cmpq $16, %r8;
+       jb .Lecb_blk8;
+
+       leaq -16(%r8), %r8;
+
+       /* Load input and xor first key. */
+       vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+       vmovdqu (0 * 16)(%rcx), %ymm0;
+       vmovdqu (2 * 16)(%rcx), %ymm1;
+       vmovdqu (4 * 16)(%rcx), %ymm2;
+       vmovdqu (6 * 16)(%rcx), %ymm3;
+       vmovdqu (8 * 16)(%rcx), %ymm4;
+       vmovdqu (10 * 16)(%rcx), %ymm5;
+       vmovdqu (12 * 16)(%rcx), %ymm6;
+       vmovdqu (14 * 16)(%rcx), %ymm7;
+       vpxor %ymm8, %ymm0, %ymm0;
+       vpxor %ymm8, %ymm1, %ymm1;
+       vpxor %ymm8, %ymm2, %ymm2;
+       vpxor %ymm8, %ymm3, %ymm3;
+       vpxor %ymm8, %ymm4, %ymm4;
+       vpxor %ymm8, %ymm5, %ymm5;
+       vpxor %ymm8, %ymm6, %ymm6;
+       vpxor %ymm8, %ymm7, %ymm7;
+       vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+       leaq (16 * 16)(%rcx), %rcx;
+
+       testl %esi, %esi;
+       jz .Lecb_dec_blk16;
+               /* AES rounds */
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+               cmpl $12, %r9d;
+               jb .Lecb_enc_blk16_last;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+               jz .Lecb_enc_blk16_last;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+               VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+         .Lecb_enc_blk16_last:
+               vaesenclast %ymm8, %ymm0, %ymm0;
+               vaesenclast %ymm8, %ymm1, %ymm1;
+               vaesenclast %ymm8, %ymm2, %ymm2;
+               vaesenclast %ymm8, %ymm3, %ymm3;
+               vaesenclast %ymm8, %ymm4, %ymm4;
+               vaesenclast %ymm8, %ymm5, %ymm5;
+               vaesenclast %ymm8, %ymm6, %ymm6;
+               vaesenclast %ymm8, %ymm7, %ymm7;
+               jmp .Lecb_blk16_end;
+
+         .align 8
+         .Lecb_dec_blk16:
+               /* AES rounds */
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+               cmpl $12, %r9d;
+               jb .Lecb_dec_blk16_last;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+               jz .Lecb_dec_blk16_last;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+               VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+         .Lecb_dec_blk16_last:
+               vaesdeclast %ymm8, %ymm0, %ymm0;
+               vaesdeclast %ymm8, %ymm1, %ymm1;
+               vaesdeclast %ymm8, %ymm2, %ymm2;
+               vaesdeclast %ymm8, %ymm3, %ymm3;
+               vaesdeclast %ymm8, %ymm4, %ymm4;
+               vaesdeclast %ymm8, %ymm5, %ymm5;
+               vaesdeclast %ymm8, %ymm6, %ymm6;
+               vaesdeclast %ymm8, %ymm7, %ymm7;
+               jmp .Lecb_blk16_end;
+
+  .align 8
+  .Lecb_blk16_end:
+       vmovdqu %ymm0, (0 * 16)(%rdx);
+       vmovdqu %ymm1, (2 * 16)(%rdx);
+       vmovdqu %ymm2, (4 * 16)(%rdx);
+       vmovdqu %ymm3, (6 * 16)(%rdx);
+       vmovdqu %ymm4, (8 * 16)(%rdx);
+       vmovdqu %ymm5, (10 * 16)(%rdx);
+       vmovdqu %ymm6, (12 * 16)(%rdx);
+       vmovdqu %ymm7, (14 * 16)(%rdx);
+       leaq (16 * 16)(%rdx), %rdx;
+
+       jmp .Lecb_blk16;
+
+       /* Handle trailing eight blocks. */
+.align 8
+.Lecb_blk8:
+       cmpq $8, %r8;
+       jb .Lecb_blk4;
+
+       leaq -8(%r8), %r8;
+
+       /* Load input and xor first key. */
+       vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+       vmovdqu (0 * 16)(%rcx), %ymm0;
+       vmovdqu (2 * 16)(%rcx), %ymm1;
+       vmovdqu (4 * 16)(%rcx), %ymm2;
+       vmovdqu (6 * 16)(%rcx), %ymm3;
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vpxor %ymm4, %ymm2, %ymm2;
+       vpxor %ymm4, %ymm3, %ymm3;
+       vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+       leaq (8 * 16)(%rcx), %rcx;
+
+       testl %esi, %esi;
+       jz .Lecb_dec_blk8;
+               /* AES rounds */
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Lecb_enc_blk8_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Lecb_enc_blk8_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+         .Lecb_enc_blk8_last:
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vaesenclast %ymm4, %ymm2, %ymm2;
+               vaesenclast %ymm4, %ymm3, %ymm3;
+               vmovdqu %ymm0, (0 * 16)(%rdx);
+               vmovdqu %ymm1, (2 * 16)(%rdx);
+               vmovdqu %ymm2, (4 * 16)(%rdx);
+               vmovdqu %ymm3, (6 * 16)(%rdx);
+               leaq (8 * 16)(%rdx), %rdx;
+               jmp .Lecb_blk4;
+
+         .align 8
+         .Lecb_dec_blk8:
+               /* AES rounds */
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Lecb_dec_blk8_last;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Lecb_dec_blk8_last;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+         .Lecb_dec_blk8_last:
+               vaesdeclast %ymm4, %ymm0, %ymm0;
+               vaesdeclast %ymm4, %ymm1, %ymm1;
+               vaesdeclast %ymm4, %ymm2, %ymm2;
+               vaesdeclast %ymm4, %ymm3, %ymm3;
+               vmovdqu %ymm0, (0 * 16)(%rdx);
+               vmovdqu %ymm1, (2 * 16)(%rdx);
+               vmovdqu %ymm2, (4 * 16)(%rdx);
+               vmovdqu %ymm3, (6 * 16)(%rdx);
+               leaq (8 * 16)(%rdx), %rdx;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lecb_blk4:
+       cmpq $4, %r8;
+       jb .Lecb_blk1;
+
+       leaq -4(%r8), %r8;
+
+       /* Load input and xor first key. */
+       vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+       vmovdqu (0 * 16)(%rcx), %ymm0;
+       vmovdqu (2 * 16)(%rcx), %ymm1;
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+       leaq (4 * 16)(%rcx), %rcx;
+
+       testl %esi, %esi;
+       jz .Lecb_dec_blk4;
+               /* AES rounds */
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Lecb_enc_blk4_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Lecb_enc_blk4_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+         .Lecb_enc_blk4_last:
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vmovdqu %ymm0, (0 * 16)(%rdx);
+               vmovdqu %ymm1, (2 * 16)(%rdx);
+               leaq (4 * 16)(%rdx), %rdx;
+               jmp .Lecb_blk1;
+
+         .align 8
+         .Lecb_dec_blk4:
+               /* AES rounds */
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+               cmpl $12, %r9d;
+               jb .Lecb_dec_blk4_last;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+               jz .Lecb_dec_blk4_last;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+         .Lecb_dec_blk4_last:
+               vaesdeclast %ymm4, %ymm0, %ymm0;
+               vaesdeclast %ymm4, %ymm1, %ymm1;
+               vmovdqu %ymm0, (0 * 16)(%rdx);
+               vmovdqu %ymm1, (2 * 16)(%rdx);
+               leaq (4 * 16)(%rdx), %rdx;
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lecb_blk1:
+       cmpq $1, %r8;
+       jb .Ldone_ecb;
+
+       leaq -1(%r8), %r8;
+
+       /* Load input. */
+       vmovdqu (%rcx), %xmm2;
+       leaq 16(%rcx), %rcx;
+
+       /* Xor first key. */
+       vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
+
+       testl %esi, %esi;
+       jz .Lecb_dec_blk1;
+               /* AES rounds. */
+               vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+               vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%rdi), %xmm1;
+               cmpl $12, %r9d;
+               jb .Lecb_enc_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%rdi), %xmm1;
+               jz .Lecb_enc_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%rdi), %xmm1;
+         .Lecb_enc_blk1_last:
+               vaesenclast %xmm1, %xmm0, %xmm0;
+               jmp .Lecb_blk1_end;
+
+         .align 8
+         .Lecb_dec_blk1:
+               /* AES rounds. */
+               vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+               vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%rdi), %xmm1;
+               cmpl $12, %r9d;
+               jb .Lecb_dec_blk1_last;
+               vaesdec %xmm1, %xmm0, %xmm0;
+               vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%rdi), %xmm1;
+               jz .Lecb_dec_blk1_last;
+               vaesdec %xmm1, %xmm0, %xmm0;
+               vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%rdi), %xmm1;
+         .Lecb_dec_blk1_last:
+               vaesdeclast %xmm1, %xmm0, %xmm0;
+               jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_blk1_end:
+       vmovdqu %xmm0, (%rdx);
+       leaq 16(%rdx), %rdx;
+
+       jmp .Lecb_blk1;
+
+.align 8
+.Ldone_ecb:
+       vzeroall;
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
+
 /**********************************************************************
   constants
  **********************************************************************/
+SECTION_RODATA
+
 ELF(.type _gcry_vaes_consts,@object)
 _gcry_vaes_consts:
 .align 32
diff --git a/cipher/rijndael-vaes-avx2-i386.S b/cipher/rijndael-vaes-avx2-i386.S
new file mode 100644 (file)
index 0000000..245e844
--- /dev/null
@@ -0,0 +1,2804 @@
+/* VAES/AVX2 i386 accelerated AES for Libgcrypt
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined(__i386__)
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS)) && \
+    defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \
+    defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+
+#include "asm-common-i386.h"
+
+.text
+
+DECL_GET_PC_THUNK(eax);
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+#define AES_OP4(op, key, b0, b1, b2, b3) \
+       op key, b0, b0; \
+       op key, b1, b1; \
+       op key, b2, b2; \
+       op key, b3, b3; /* apply one AES op with a single round key to four block registers */
+
+#define VAESENC4(key, b0, b1, b2, b3) \
+       AES_OP4(vaesenc, key, b0, b1, b2, b3) /* one AES encryption round, 4 regs */
+
+#define VAESDEC4(key, b0, b1, b2, b3) \
+       AES_OP4(vaesdec, key, b0, b1, b2, b3) /* one AES decryption round, 4 regs */
+
+#define XOR4(key, b0, b1, b2, b3) \
+       AES_OP4(vpxor, key, b0, b1, b2, b3) /* whitening xor, 4 regs */
+
+#define AES_OP2(op, key, b0, b1) \
+       op key, b0, b0; \
+       op key, b1, b1; /* same as AES_OP4 but for two block registers */
+
+#define VAESENC2(key, b0, b1) \
+       AES_OP2(vaesenc, key, b0, b1)
+
+#define VAESDEC2(key, b0, b1) \
+       AES_OP2(vaesdec, key, b0, b1)
+
+#define XOR2(key, b0, b1) \
+       AES_OP2(vpxor, key, b0, b1)
+
+#define VAESENC6(key, b0, b1, b2, b3, b4, b5) \
+       AES_OP4(vaesenc, key, b0, b1, b2, b3); \
+       AES_OP2(vaesenc, key, b4, b5) /* one AES encryption round, 6 regs */
+
+#define VAESDEC6(key, b0, b1, b2, b3, b4, b5) \
+       AES_OP4(vaesdec, key, b0, b1, b2, b3); \
+       AES_OP2(vaesdec, key, b4, b5) /* one AES decryption round, 6 regs */
+
+#define XOR6(key, b0, b1, b2, b3, b4, b5) \
+       AES_OP4(vpxor, key, b0, b1, b2, b3); \
+       AES_OP2(vpxor, key, b4, b5) /* whitening xor, 6 regs */
+
+#define CADDR(name, reg) \
+       (name - SYM_NAME(_gcry_vaes_consts))(reg) /* PIC-safe address of constant, reg = consts base */
+
+/**********************************************************************
+  CBC-mode decryption
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386):
+       /* input:
+        *      (esp + 4): round keys
+        *      (esp + 8): iv
+        *      (esp + 12): dst
+        *      (esp + 16): src
+        *      (esp + 20): nblocks
+        *      (esp + 24): nrounds
+        */
+       CFI_STARTPROC();
+       pushl %edi;
+       CFI_PUSH(%edi);
+       pushl %esi;
+       CFI_PUSH(%esi);
+
+       movl 8+4(%esp), %edi; /* edi = round keys */
+       movl 8+8(%esp), %esi; /* esi = iv */
+       movl 8+12(%esp), %edx; /* edx = dst */
+       movl 8+16(%esp), %ecx; /* ecx = src */
+       movl 8+20(%esp), %eax; /* eax = nblocks */
+
+       /* Process 8 blocks per loop. */
+.align 8
+.Lcbc_dec_blk8:
+       cmpl $8, %eax;
+       jb .Lcbc_dec_blk4;
+
+       leal -8(%eax), %eax;
+
+       /* Load input and xor first key. Update IV. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4; /* ymm4 = first round key in both lanes */
+       vmovdqu (0 * 16)(%ecx), %ymm0;
+       vmovdqu (2 * 16)(%ecx), %ymm1;
+       vmovdqu (4 * 16)(%ecx), %ymm2;
+       vmovdqu (6 * 16)(%ecx), %ymm3;
+       vmovdqu (%esi), %xmm6; /* Load IV. */
+       vinserti128 $1, %xmm0, %ymm6, %ymm5; /* ymm5 = IV:blk0 = previous ciphertexts for blk0/blk1 */
+       vextracti128 $1, %ymm3, (%esi); /* Store IV. */
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vpxor %ymm4, %ymm2, %ymm2;
+       vpxor %ymm4, %ymm3, %ymm3;
+       vmovdqu (1 * 16)(%ecx), %ymm6; /* ymm6/ymm7 = previous ciphertexts for remaining blocks */
+       vmovdqu (3 * 16)(%ecx), %ymm7;
+
+       /* AES rounds */
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       cmpl $12, 8+24(%esp); /* nrounds: below 12 => AES-128, round 10 key is last */
+       jb .Lcbc_dec_blk8_last;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       jz .Lcbc_dec_blk8_last; /* nrounds == 12 => AES-192 */
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lcbc_dec_blk8_last:
+       vpxor %ymm4, %ymm5, %ymm5; /* last round key ^ previous ciphertexts (CBC xor) */
+       vpxor %ymm4, %ymm6, %ymm6;
+       vpxor %ymm4, %ymm7, %ymm7;
+       vpxor (5 * 16)(%ecx), %ymm4, %ymm4;
+       leal (8 * 16)(%ecx), %ecx;
+       vaesdeclast %ymm5, %ymm0, %ymm0;
+       vaesdeclast %ymm6, %ymm1, %ymm1;
+       vaesdeclast %ymm7, %ymm2, %ymm2;
+       vaesdeclast %ymm4, %ymm3, %ymm3;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       vmovdqu %ymm2, (4 * 16)(%edx);
+       vmovdqu %ymm3, (6 * 16)(%edx);
+       leal (8 * 16)(%edx), %edx;
+
+       jmp .Lcbc_dec_blk8;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lcbc_dec_blk4:
+       cmpl $4, %eax;
+       jb .Lcbc_dec_blk1;
+
+       leal -4(%eax), %eax;
+
+       /* Load input and xor first key. Update IV. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vmovdqu (0 * 16)(%ecx), %ymm0;
+       vmovdqu (2 * 16)(%ecx), %ymm1;
+       vmovdqu (%esi), %xmm6; /* Load IV. */
+       vinserti128 $1, %xmm0, %ymm6, %ymm5; /* ymm5 = IV:blk0 (previous ciphertexts) */
+       vextracti128 $1, %ymm1, (%esi); /* Store IV. */
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vmovdqu (1 * 16)(%ecx), %ymm6;
+       leal (4 * 16)(%ecx), %ecx;
+
+       /* AES rounds */
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       cmpl $12, 8+24(%esp); /* nrounds: below 12 => AES-128 */
+       jb .Lcbc_dec_blk4_last;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       jz .Lcbc_dec_blk4_last; /* nrounds == 12 => AES-192 */
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESDEC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lcbc_dec_blk4_last:
+       vpxor %ymm4, %ymm5, %ymm5; /* last round key ^ previous ciphertexts */
+       vpxor %ymm4, %ymm6, %ymm6;
+       vaesdeclast %ymm5, %ymm0, %ymm0;
+       vaesdeclast %ymm6, %ymm1, %ymm1;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       leal (4 * 16)(%edx), %edx;
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lcbc_dec_blk1:
+       cmpl $1, %eax;
+       jb .Ldone_cbc_dec;
+
+       leal -1(%eax), %eax;
+
+       /* Load input. */
+       vmovdqu (%ecx), %xmm2; /* xmm2 kept: becomes the next IV below */
+       leal 16(%ecx), %ecx;
+
+       /* Xor first key. */
+       vpxor (0 * 16)(%edi), %xmm2, %xmm0;
+
+       /* AES rounds. */
+       vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
+       vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (10 * 16)(%edi), %xmm1;
+       cmpl $12, 8+24(%esp); /* nrounds: below 12 => AES-128 */
+       jb .Lcbc_dec_blk1_last;
+       vaesdec %xmm1, %xmm0, %xmm0;
+       vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (12 * 16)(%edi), %xmm1;
+       jz .Lcbc_dec_blk1_last; /* nrounds == 12 => AES-192 */
+       vaesdec %xmm1, %xmm0, %xmm0;
+       vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (14 * 16)(%edi), %xmm1;
+
+       /* Last round and output handling. */
+  .Lcbc_dec_blk1_last:
+       vpxor (%esi), %xmm1, %xmm1; /* last round key ^ IV */
+       vaesdeclast %xmm1, %xmm0, %xmm0;
+       vmovdqu %xmm2, (%esi); /* Store this ciphertext block as next IV. */
+       vmovdqu %xmm0, (%edx);
+       leal 16(%edx), %edx;
+
+       jmp .Lcbc_dec_blk1;
+
+.align 8
+.Ldone_cbc_dec:
+       popl %esi;
+       CFI_POP(%esi);
+       popl %edi;
+       CFI_POP(%edi);
+       vzeroall; /* clear key/plaintext material from vector registers */
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386))
+
+/**********************************************************************
+  CFB-mode decryption
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386):
+       /* input:
+        *      (esp + 4): round keys
+        *      (esp + 8): iv
+        *      (esp + 12): dst
+        *      (esp + 16): src
+        *      (esp + 20): nblocks
+        *      (esp + 24): nrounds
+        */
+       CFI_STARTPROC();
+       pushl %edi;
+       CFI_PUSH(%edi);
+       pushl %esi;
+       CFI_PUSH(%esi);
+
+       movl 8+4(%esp), %edi; /* edi = round keys */
+       movl 8+8(%esp), %esi; /* esi = iv */
+       movl 8+12(%esp), %edx; /* edx = dst */
+       movl 8+16(%esp), %ecx; /* ecx = src */
+       movl 8+20(%esp), %eax; /* eax = nblocks */
+
+       /* Process 8 blocks per loop. */
+.align 8
+.Lcfb_dec_blk8:
+       cmpl $8, %eax;
+       jb .Lcfb_dec_blk4;
+
+       leal -8(%eax), %eax;
+
+       /* Load IV. */
+       vmovdqu (%esi), %xmm0;
+
+       /* Load input and xor first key. Update IV. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vmovdqu (0 * 16)(%ecx), %ymm5;
+       vinserti128 $1, %xmm5, %ymm0, %ymm0; /* ymm0 = IV:blk0 = CFB keystream inputs */
+       vmovdqu (1 * 16)(%ecx), %ymm1;
+       vmovdqu (3 * 16)(%ecx), %ymm2;
+       vmovdqu (5 * 16)(%ecx), %ymm3;
+       vmovdqu (7 * 16)(%ecx), %xmm6; /* last ciphertext block = next IV */
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vpxor %ymm4, %ymm2, %ymm2;
+       vpxor %ymm4, %ymm3, %ymm3;
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       vmovdqu %xmm6, (%esi); /* Store IV. */
+       vmovdqu (2 * 16)(%ecx), %ymm6; /* ymm6/ymm7 = ciphertexts for final xor */
+       vmovdqu (4 * 16)(%ecx), %ymm7;
+
+       /* AES rounds */
+       vbroadcasti128 (1 * 16)(%edi), %ymm4; /* NOTE(review): ymm4 already holds round key 1 (loaded above); reload looks redundant */
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       cmpl $12, 8+24(%esp); /* nrounds: below 12 => AES-128, round 10 key is last */
+       jb .Lcfb_dec_blk8_last;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       jz .Lcfb_dec_blk8_last; /* nrounds == 12 => AES-192 */
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lcfb_dec_blk8_last:
+       vpxor %ymm4, %ymm5, %ymm5; /* last round key ^ ciphertexts (CFB xor) */
+       vpxor %ymm4, %ymm6, %ymm6;
+       vpxor %ymm4, %ymm7, %ymm7;
+       vpxor (6 * 16)(%ecx), %ymm4, %ymm4;
+       leal (8 * 16)(%ecx), %ecx;
+       vaesenclast %ymm5, %ymm0, %ymm0;
+       vaesenclast %ymm6, %ymm1, %ymm1;
+       vaesenclast %ymm7, %ymm2, %ymm2;
+       vaesenclast %ymm4, %ymm3, %ymm3;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       vmovdqu %ymm2, (4 * 16)(%edx);
+       vmovdqu %ymm3, (6 * 16)(%edx);
+       leal (8 * 16)(%edx), %edx;
+
+       jmp .Lcfb_dec_blk8;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lcfb_dec_blk4:
+       cmpl $4, %eax;
+       jb .Lcfb_dec_blk1;
+
+       leal -4(%eax), %eax;
+
+       /* Load IV. */
+       vmovdqu (%esi), %xmm0;
+
+       /* Load input and xor first key. Update IV. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vmovdqu (0 * 16)(%ecx), %ymm5;
+       vinserti128 $1, %xmm5, %ymm0, %ymm0; /* ymm0 = IV:blk0 (keystream inputs) */
+       vmovdqu (1 * 16)(%ecx), %ymm1;
+       vmovdqu (3 * 16)(%ecx), %xmm6; /* last ciphertext block = next IV */
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       vmovdqu %xmm6, (%esi); /* Store IV. */
+       vmovdqu (2 * 16)(%ecx), %ymm6;
+
+       leal (4 * 16)(%ecx), %ecx;
+
+       /* AES rounds */
+       vbroadcasti128 (1 * 16)(%edi), %ymm4; /* NOTE(review): ymm4 already holds round key 1 (loaded above); reload looks redundant */
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       cmpl $12, 8+24(%esp); /* nrounds: below 12 => AES-128 */
+       jb .Lcfb_dec_blk4_last;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       jz .Lcfb_dec_blk4_last; /* nrounds == 12 => AES-192 */
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lcfb_dec_blk4_last:
+       vpxor %ymm4, %ymm5, %ymm5; /* last round key ^ ciphertexts */
+       vpxor %ymm4, %ymm6, %ymm6;
+       vaesenclast %ymm5, %ymm0, %ymm0;
+       vaesenclast %ymm6, %ymm1, %ymm1;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       leal (4 * 16)(%edx), %edx;
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lcfb_dec_blk1:
+       cmpl $1, %eax;
+       jb .Ldone_cfb_dec;
+
+       leal -1(%eax), %eax;
+
+       /* Load IV. */
+       vmovdqu (%esi), %xmm0;
+
+       /* Xor first key. */
+       vpxor (0 * 16)(%edi), %xmm0, %xmm0;
+
+       /* Load input as next IV. */
+       vmovdqu (%ecx), %xmm2;
+       leal 16(%ecx), %ecx;
+
+       /* AES rounds. */
+       vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (10 * 16)(%edi), %xmm1;
+       vmovdqu %xmm2, (%esi); /* Store IV. */
+       cmpl $12, 8+24(%esp); /* nrounds: below 12 => AES-128 */
+       jb .Lcfb_dec_blk1_last;
+       vaesenc %xmm1, %xmm0, %xmm0;
+       vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (12 * 16)(%edi), %xmm1;
+       jz .Lcfb_dec_blk1_last; /* nrounds == 12 => AES-192 */
+       vaesenc %xmm1, %xmm0, %xmm0;
+       vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (14 * 16)(%edi), %xmm1;
+
+       /* Last round and output handling. */
+  .Lcfb_dec_blk1_last:
+       vpxor %xmm2, %xmm1, %xmm1; /* last round key ^ ciphertext */
+       vaesenclast %xmm1, %xmm0, %xmm0;
+       vmovdqu %xmm0, (%edx);
+       leal 16(%edx), %edx;
+
+       jmp .Lcfb_dec_blk1;
+
+.align 8
+.Ldone_cfb_dec:
+       popl %esi;
+       CFI_POP(%esi);
+       popl %edi;
+       CFI_POP(%edi);
+       vzeroall; /* clear key/plaintext material from vector registers */
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386))
+
+/**********************************************************************
+  CTR-mode encryption
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386):
+       /* input:
+        *      (esp + 4): round keys
+        *      (esp + 8): iv
+        *      (esp + 12): dst
+        *      (esp + 16): src
+        *      (esp + 20): nblocks
+        *      (esp + 24): nrounds
+        */
+       CFI_STARTPROC();
+
+       GET_DATA_POINTER(SYM_NAME(_gcry_vaes_consts), eax);
+
+       pushl %ebp;
+       CFI_PUSH(%ebp);
+       movl %esp, %ebp;
+       CFI_DEF_CFA_REGISTER(%ebp);
+
+       subl $(3 * 32 + 3 * 4), %esp;
+       andl $-32, %esp;
+
+       movl %edi, (3 * 32 + 0 * 4)(%esp);
+       CFI_REG_ON_STACK(edi, 3 * 32 + 0 * 4);
+       movl %esi, (3 * 32 + 1 * 4)(%esp);
+       CFI_REG_ON_STACK(esi, 3 * 32 + 1 * 4);
+       movl %ebx, (3 * 32 + 2 * 4)(%esp);
+       CFI_REG_ON_STACK(ebx, 3 * 32 + 2 * 4);
+
+       movl %eax, %ebx;
+       movl 4+4(%ebp), %edi;
+       movl 4+8(%ebp), %esi;
+       movl 4+12(%ebp), %edx;
+       movl 4+16(%ebp), %ecx;
+
+#define prepare_ctr_const(minus_one, minus_two) \
+       vpcmpeqd minus_one, minus_one, minus_one; \
+       vpsrldq $8, minus_one, minus_one;       /* 0:-1 */ \
+       vpaddq minus_one, minus_one, minus_two; /* 0:-2 */
+
+#define inc_le128(x, minus_one, tmp) \
+       vpcmpeqq minus_one, x, tmp; \
+       vpsubq minus_one, x, x; \
+       vpslldq $8, tmp, tmp; \
+       vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+       vpcmpeqq minus_one, x, tmp1; \
+       vpcmpeqq minus_two, x, tmp2; \
+       vpor tmp1, tmp2, tmp2; \
+       vpsubq minus_two, x, x; \
+       vpslldq $8, tmp2, tmp2; \
+       vpsubq tmp2, x, x;
+
+#define handle_ctr_128bit_add(nblks) \
+       movl 12(%esi), %eax; \
+       bswapl %eax; \
+       addl $nblks, %eax; \
+       bswapl %eax; \
+       movl %eax, 12(%esi); \
+       jnc 1f; \
+       \
+       movl 8(%esi), %eax; \
+       bswapl %eax; \
+       adcl $0, %eax; \
+       bswapl %eax; \
+       movl %eax, 8(%esi); \
+       \
+       movl 4(%esi), %eax; \
+       bswapl %eax; \
+       adcl $0, %eax; \
+       bswapl %eax; \
+       movl %eax, 4(%esi); \
+       \
+       movl 0(%esi), %eax; \
+       bswapl %eax; \
+       adcl $0, %eax; \
+       bswapl %eax; \
+       movl %eax, 0(%esi); \
+       .align 8; \
+       1:;
+
+       cmpl $12, 4+20(%ebp);
+       jae .Lctr_enc_blk12_loop;
+       jmp .Lctr_enc_blk4;
+
+       /* Process 12 blocks per loop. */
+.align 16
+.Lctr_enc_blk12_loop:
+       subl $12, 4+20(%ebp);
+
+       vbroadcasti128 (%esi), %ymm6;
+
+       /* detect if carry handling is needed */
+       movl 12(%esi), %eax;
+       addl $(12 << 24), %eax;
+       jc .Lctr_enc_blk12_handle_carry;
+       movl %eax, 12(%esi);
+
+  .Lctr_enc_blk12_byte_bige_add:
+       /* Increment counters. */
+       vpaddb CADDR(.Lbige_addb_0, %ebx), %ymm6, %ymm0;
+       vpaddb CADDR(.Lbige_addb_2, %ebx), %ymm6, %ymm1;
+       vpaddb CADDR(.Lbige_addb_4, %ebx), %ymm6, %ymm2;
+       vpaddb CADDR(.Lbige_addb_6, %ebx), %ymm6, %ymm3;
+       vpaddb CADDR(.Lbige_addb_8, %ebx), %ymm6, %ymm5;
+       vpaddb CADDR(.Lbige_addb_10, %ebx), %ymm6, %ymm6;
+
+  .Lctr_enc_blk12_rounds:
+       /* AES rounds */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       XOR6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       cmpl $12, 4+24(%ebp);
+       jb .Lctr_enc_blk12_last;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       jz .Lctr_enc_blk12_last;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lctr_enc_blk12_last:
+       vpxor (0 * 16)(%ecx), %ymm4, %ymm7; /* Xor src to last round key. */
+       vaesenclast %ymm7, %ymm0, %ymm0;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vpxor (2 * 16)(%ecx), %ymm4, %ymm7;
+       vpxor (4 * 16)(%ecx), %ymm4, %ymm0;
+       vaesenclast %ymm7, %ymm1, %ymm1;
+       vaesenclast %ymm0, %ymm2, %ymm2;
+       vpxor (6 * 16)(%ecx), %ymm4, %ymm7;
+       vpxor (8 * 16)(%ecx), %ymm4, %ymm0;
+       vpxor (10 * 16)(%ecx), %ymm4, %ymm4;
+       leal (12 * 16)(%ecx), %ecx;
+       vaesenclast %ymm7, %ymm3, %ymm3;
+       vaesenclast %ymm0, %ymm5, %ymm5;
+       vaesenclast %ymm4, %ymm6, %ymm6;
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       vmovdqu %ymm2, (4 * 16)(%edx);
+       vmovdqu %ymm3, (6 * 16)(%edx);
+       vmovdqu %ymm5, (8 * 16)(%edx);
+       vmovdqu %ymm6, (10 * 16)(%edx);
+       leal (12 * 16)(%edx), %edx;
+
+       cmpl $12, 4+20(%ebp);
+       jae .Lctr_enc_blk12_loop;
+       jmp .Lctr_enc_blk4;
+
+  .align 8
+  .Lctr_enc_blk12_handle_only_ctr_carry:
+       handle_ctr_128bit_add(12);
+       jmp .Lctr_enc_blk12_byte_bige_add;
+
+  .align 8
+  .Lctr_enc_blk12_handle_carry:
+       jz .Lctr_enc_blk12_handle_only_ctr_carry;
+       /* Increment counters (handle carry). */
+       prepare_ctr_const(%ymm4, %ymm7);
+       vmovdqa CADDR(.Lbswap128_mask, %ebx), %ymm2;
+       vpshufb %xmm2, %xmm6, %xmm1; /* be => le */
+       vmovdqa %xmm1, %xmm0;
+       inc_le128(%xmm1, %xmm4, %xmm5);
+       vinserti128 $1, %xmm1, %ymm0, %ymm6; /* ctr: +1:+0 */
+       handle_ctr_128bit_add(12);
+       vpshufb %ymm2, %ymm6, %ymm0;
+       vmovdqa %ymm0, (0 * 32)(%esp);
+       add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +3:+2 */
+       vpshufb %ymm2, %ymm6, %ymm0;
+       vmovdqa %ymm0, (1 * 32)(%esp);
+       add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +5:+4 */
+       vpshufb %ymm2, %ymm6, %ymm0;
+       vmovdqa %ymm0, (2 * 32)(%esp);
+       add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +7:+6 */
+       vpshufb %ymm2, %ymm6, %ymm3;
+       add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +9:+8 */
+       vpshufb %ymm2, %ymm6, %ymm5;
+       add2_le128(%ymm6, %ymm4, %ymm7, %ymm2, %ymm1); /* ctr: +11:+10 */
+       vmovdqa (0 * 32)(%esp), %ymm0;
+       vmovdqa (1 * 32)(%esp), %ymm1;
+       vmovdqa (2 * 32)(%esp), %ymm2;
+       vpshufb CADDR(.Lbswap128_mask, %ebx), %ymm6, %ymm6;
+
+       jmp .Lctr_enc_blk12_rounds;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lctr_enc_blk4:
+       cmpl $4, 4+20(%ebp);
+       jb .Lctr_enc_blk1;
+
+       subl $4, 4+20(%ebp);
+
+       vbroadcasti128 (%esi), %ymm3;
+
+       /* detect if carry handling is needed */
+       movl 12(%esi), %eax;
+       addl $(4 << 24), %eax;
+       jc .Lctr_enc_blk4_handle_carry;
+       movl %eax, 12(%esi);
+
+  .Lctr_enc_blk4_byte_bige_add:
+       /* Increment counters. */
+       vpaddb CADDR(.Lbige_addb_0, %ebx), %ymm3, %ymm0;
+       vpaddb CADDR(.Lbige_addb_2, %ebx), %ymm3, %ymm1;
+
+  .Lctr_enc_blk4_rounds:
+       /* AES rounds */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       XOR2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       cmpl $12, 4+24(%ebp);
+       jb .Lctr_enc_blk4_last;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       jz .Lctr_enc_blk4_last;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lctr_enc_blk4_last:
+       vpxor (0 * 16)(%ecx), %ymm4, %ymm5; /* Xor src to last round key. */
+       vpxor (2 * 16)(%ecx), %ymm4, %ymm6;
+       leal (4 * 16)(%ecx), %ecx;
+       vaesenclast %ymm5, %ymm0, %ymm0;
+       vaesenclast %ymm6, %ymm1, %ymm1;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       leal (4 * 16)(%edx), %edx;
+
+       jmp .Lctr_enc_blk1;
+
+  .align 8
+  .Lctr_enc_blk4_handle_only_ctr_carry:
+       handle_ctr_128bit_add(4);
+       jmp .Lctr_enc_blk4_byte_bige_add;
+
+  .align 8
+  .Lctr_enc_blk4_handle_carry:
+       jz .Lctr_enc_blk4_handle_only_ctr_carry;
+       /* Increment counters (handle carry). */
+       prepare_ctr_const(%ymm4, %ymm7);
+       vpshufb CADDR(.Lbswap128_mask, %ebx), %xmm3, %xmm1; /* be => le */
+       vmovdqa %xmm1, %xmm0;
+       inc_le128(%xmm1, %xmm4, %xmm5);
+       vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
+       vpshufb CADDR(.Lbswap128_mask, %ebx), %ymm3, %ymm0;
+       handle_ctr_128bit_add(4);
+       add2_le128(%ymm3, %ymm4, %ymm7, %ymm5, %ymm6); /* ctr: +3:+2 */
+       vpshufb CADDR(.Lbswap128_mask, %ebx), %ymm3, %ymm1;
+
+       jmp .Lctr_enc_blk4_rounds;
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lctr_enc_blk1:
+       cmpl $1, 4+20(%ebp);
+       jb .Ldone_ctr_enc;
+
+       subl $1, 4+20(%ebp);
+
+       /* Load and increament counter. */
+       vmovdqu (%esi), %xmm0;
+       handle_ctr_128bit_add(1);
+
+       /* AES rounds. */
+       vpxor (0 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (10 * 16)(%edi), %xmm1;
+       cmpl $12, 4+24(%ebp);
+       jb .Lctr_enc_blk1_last;
+       vaesenc %xmm1, %xmm0, %xmm0;
+       vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (12 * 16)(%edi), %xmm1;
+       jz .Lctr_enc_blk1_last;
+       vaesenc %xmm1, %xmm0, %xmm0;
+       vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (14 * 16)(%edi), %xmm1;
+
+       /* Last round and output handling. */
+  .Lctr_enc_blk1_last:
+       vpxor (%ecx), %xmm1, %xmm1; /* Xor src to last round key. */
+       leal 16(%ecx), %ecx;
+       vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
+       vmovdqu %xmm0, (%edx);
+       leal 16(%edx), %edx;
+
+       jmp .Lctr_enc_blk1;
+
+.align 8
+.Ldone_ctr_enc:
+       vpxor %ymm0, %ymm0, %ymm0;
+       movl (3 * 32 + 0 * 4)(%esp), %edi;
+       CFI_RESTORE(edi);
+       movl (3 * 32 + 1 * 4)(%esp), %esi;
+       CFI_RESTORE(esi);
+       movl (3 * 32 + 2 * 4)(%esp), %ebx;
+       CFI_RESTORE(ebx);
+       vmovdqa %ymm0, (0 * 32)(%esp);
+       vmovdqa %ymm0, (1 * 32)(%esp);
+       vmovdqa %ymm0, (2 * 32)(%esp);
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386))
+
+/**********************************************************************
+  Little-endian 32-bit CTR-mode encryption (GCM-SIV)
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386):
+       /* input:
+        *      (esp + 4): round keys
+        *      (esp + 8): counter
+        *      (esp + 12): dst
+        *      (esp + 16): src
+        *      (esp + 20): nblocks
+        *      (esp + 24): nrounds
+        */
+       CFI_STARTPROC();
+
+       /* Load the address of the constant pool into %eax (PIC-safe;
+        * CADDR() below addresses constants relative to this base). */
+       GET_DATA_POINTER(SYM_NAME(_gcry_vaes_consts), eax);
+
+       pushl %ebp;
+       CFI_PUSH(%ebp);
+       movl %esp, %ebp;
+       CFI_DEF_CFA_REGISTER(%ebp);
+
+       /* Reserve stack slots for the three callee-saved GPRs below. */
+       subl $(3 * 4), %esp;
+
+       movl %edi, (0 * 4)(%esp);
+       CFI_REG_ON_STACK(edi, 0 * 4);
+       movl %esi, (1 * 4)(%esp);
+       CFI_REG_ON_STACK(esi, 1 * 4);
+       movl %ebx, (2 * 4)(%esp);
+       CFI_REG_ON_STACK(ebx, 2 * 4);
+
+       /* Register allocation for the rest of the function:
+        *   %ebx = constant pool base, %edi = round keys,
+        *   %esi = counter pointer,    %edx = dst,
+        *   %ecx = src,                %eax = remaining nblocks. */
+       movl %eax, %ebx;
+       movl 4+4(%ebp), %edi;
+       movl 4+8(%ebp), %esi;
+       movl 4+12(%ebp), %edx;
+       movl 4+16(%ebp), %ecx;
+       movl 4+20(%ebp), %eax;
+
+       vbroadcasti128 (%esi), %ymm7; /* Load CTR (both 128-bit lanes). */
+
+       /* Process 12 blocks per loop. */
+.align 8
+.Lctr32le_enc_blk12:
+       cmpl $12, %eax;
+       jb .Lctr32le_enc_blk4;
+
+       leal -12(%eax), %eax;
+
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+
+       /* Increment counters: plain 32-bit little-endian adds (no carry
+        * propagation across words), yielding counters +0 .. +11. */
+       vpaddd CADDR(.Lle_addd_0, %ebx), %ymm7, %ymm0;
+       vpaddd CADDR(.Lle_addd_2, %ebx), %ymm7, %ymm1;
+       vpaddd CADDR(.Lle_addd_4, %ebx), %ymm7, %ymm2;
+       vpaddd CADDR(.Lle_addd_6, %ebx), %ymm7, %ymm3;
+       vpaddd CADDR(.Lle_addd_8, %ebx), %ymm7, %ymm5;
+       vpaddd CADDR(.Lle_addd_10, %ebx), %ymm7, %ymm6;
+
+       /* Advance CTR by 12 and spill it, freeing %ymm7 for the rounds. */
+       vpaddd CADDR(.Lle_addd_12_2, %ebx), %ymm7, %ymm7;
+       vmovdqu %xmm7, (%esi); /* Store CTR. */
+
+       /* AES rounds */
+       XOR6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       /* nrounds < 12 => AES-128 (10 rounds): key 10 is the last key. */
+       cmpl $12, 4+24(%ebp);
+       jb .Lctr32le_enc_blk8_last;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       /* nrounds == 12 => AES-192; otherwise fall through for AES-256. */
+       jz .Lctr32le_enc_blk8_last;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling.
+        * NOTE(review): label says "blk8" but this is the 12-block path's
+        * last round; the name appears historical — confirm upstream. */
+  .Lctr32le_enc_blk8_last:
+       vpxor (0 * 16)(%ecx), %ymm4, %ymm7; /* Xor src to last round key. */
+       vaesenclast %ymm7, %ymm0, %ymm0;
+       vpxor (2 * 16)(%ecx), %ymm4, %ymm7;
+       vaesenclast %ymm7, %ymm1, %ymm1;
+       vpxor (4 * 16)(%ecx), %ymm4, %ymm7;
+       vaesenclast %ymm7, %ymm2, %ymm2;
+       vpxor (6 * 16)(%ecx), %ymm4, %ymm7;
+       vaesenclast %ymm7, %ymm3, %ymm3;
+       vpxor (8 * 16)(%ecx), %ymm4, %ymm7;
+       vpxor (10 * 16)(%ecx), %ymm4, %ymm4;
+       vaesenclast %ymm7, %ymm5, %ymm5;
+       vbroadcasti128 (%esi), %ymm7; /* Reload CTR. */
+       vaesenclast %ymm4, %ymm6, %ymm6;
+       leal (12 * 16)(%ecx), %ecx;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       vmovdqu %ymm2, (4 * 16)(%edx);
+       vmovdqu %ymm3, (6 * 16)(%edx);
+       vmovdqu %ymm5, (8 * 16)(%edx);
+       vmovdqu %ymm6, (10 * 16)(%edx);
+       leal (12 * 16)(%edx), %edx;
+
+       jmp .Lctr32le_enc_blk12;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lctr32le_enc_blk4:
+       cmpl $4, %eax;
+       jb .Lctr32le_enc_blk1;
+
+       leal -4(%eax), %eax;
+
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+
+       /* Increment counters (LE 32-bit adds; counters +0 .. +3).
+        * CTR itself stays in %ymm7, advanced by 4. */
+       vpaddd CADDR(.Lle_addd_0, %ebx), %ymm7, %ymm0;
+       vpaddd CADDR(.Lle_addd_2, %ebx), %ymm7, %ymm1;
+
+       vpaddd CADDR(.Lle_addd_4_2, %ebx), %ymm7, %ymm7;
+
+       /* AES rounds */
+       XOR2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (2 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (3 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (4 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (5 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (6 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (7 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (8 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (9 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (10 * 16)(%edi), %ymm4;
+       /* nrounds < 12 => AES-128: key 10 is the last key. */
+       cmpl $12, 4+24(%ebp);
+       jb .Lctr32le_enc_blk4_last;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (11 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (12 * 16)(%edi), %ymm4;
+       /* nrounds == 12 => AES-192; otherwise fall through for AES-256. */
+       jz .Lctr32le_enc_blk4_last;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (13 * 16)(%edi), %ymm4;
+       VAESENC2(%ymm4, %ymm0, %ymm1);
+       vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+       /* Last round and output handling. */
+  .Lctr32le_enc_blk4_last:
+       vpxor (0 * 16)(%ecx), %ymm4, %ymm5; /* Xor src to last round key. */
+       vpxor (2 * 16)(%ecx), %ymm4, %ymm6;
+       leal (4 * 16)(%ecx), %ecx;
+       vaesenclast %ymm5, %ymm0, %ymm0;
+       vaesenclast %ymm6, %ymm1, %ymm1;
+       vmovdqu %ymm0, (0 * 16)(%edx);
+       vmovdqu %ymm1, (2 * 16)(%edx);
+       leal (4 * 16)(%edx), %edx;
+       /* Fall through to the single-block loop for the remainder. */
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lctr32le_enc_blk1:
+       cmpl $1, %eax;
+       jb .Ldone_ctr32le_enc;
+
+       leal -1(%eax), %eax;
+
+       /* Copy the current counter and increment it (LE 32-bit add)
+        * for the next block. */
+       vmovdqu %xmm7, %xmm0;
+       vpaddd CADDR(.Lle_addd_1, %ebx), %xmm7, %xmm7;
+
+       /* AES rounds. */
+       vpxor (0 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+       vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (10 * 16)(%edi), %xmm1;
+       /* nrounds < 12 => AES-128: key 10 is the last key. */
+       cmpl $12, 4+24(%ebp);
+       jb .Lctr32le_enc_blk1_last;
+       vaesenc %xmm1, %xmm0, %xmm0;
+       vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (12 * 16)(%edi), %xmm1;
+       /* nrounds == 12 => AES-192; otherwise fall through for AES-256. */
+       jz .Lctr32le_enc_blk1_last;
+       vaesenc %xmm1, %xmm0, %xmm0;
+       vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+       vmovdqa (14 * 16)(%edi), %xmm1;
+
+       /* Last round and output handling. */
+  .Lctr32le_enc_blk1_last:
+       vpxor (%ecx), %xmm1, %xmm1; /* Xor src to last round key. */
+       leal 16(%ecx), %ecx;
+       vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
+       vmovdqu %xmm0, (%edx);
+       leal 16(%edx), %edx;
+
+       jmp .Lctr32le_enc_blk1;
+
+.align 8
+.Ldone_ctr32le_enc:
+       vmovdqu %xmm7, (%esi); /* Store CTR. */
+       movl (0 * 4)(%esp), %edi;
+       CFI_RESTORE(edi);
+       movl (1 * 4)(%esp), %esi;
+       CFI_RESTORE(esi);
+       movl (2 * 4)(%esp), %ebx;
+       CFI_RESTORE(ebx);
+       leave;
+       CFI_LEAVE();
+       vzeroall; /* Zero all YMM registers: wipes round-key/counter state. */
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386))
+
+/**********************************************************************
+  OCB-mode encryption/decryption/authentication
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386):
+       /* input:
+        *      (esp + 4): round keys
+        *      (esp + 8): dst
+        *      (esp + 12): src
+        *      (esp + 16): nblocks
+        *      (esp + 20): nrounds
+        *      (esp + 24): offset
+        *      (esp + 28): checksum
+        *      (esp + 32): blkn
+        *      (esp + 36): L table
+        *      (esp + 44): encrypt/decrypt/auth mode
+        */
+       CFI_STARTPROC();
+
+       pushl %ebp;
+       CFI_PUSH(%ebp);
+       movl %esp, %ebp;
+       CFI_DEF_CFA_REGISTER(%ebp);
+
+#define STACK_VEC_POS           0
+#define STACK_TMP_Y0            (STACK_VEC_POS + 0 * 32)
+#define STACK_TMP_Y1            (STACK_VEC_POS + 1 * 32)
+#define STACK_TMP_Y2            (STACK_VEC_POS + 2 * 32)
+#define STACK_TMP_Y3            (STACK_VEC_POS + 3 * 32)
+#define STACK_TMP_Y4            (STACK_VEC_POS + 4 * 32)
+#define STACK_TMP_Y5            (STACK_VEC_POS + 5 * 32)
+#define STACK_FXL_KEY           (STACK_VEC_POS + 6 * 32)
+#define STACK_OFFSET_AND_F_KEY  (STACK_VEC_POS + 7 * 32)
+#define STACK_CHECKSUM          (STACK_VEC_POS + 8 * 32)
+#define STACK_GPR_POS           (9 * 32)
+#define STACK_END_POS           (STACK_GPR_POS + 3 * 4)
+
+       subl $STACK_END_POS, %esp;
+       andl $-32, %esp;
+
+       movl %edi, (STACK_GPR_POS + 0 * 4)(%esp);
+       CFI_REG_ON_STACK(edi, STACK_GPR_POS + 0 * 4);
+       movl %esi, (STACK_GPR_POS + 1 * 4)(%esp);
+       CFI_REG_ON_STACK(esi, STACK_GPR_POS + 1 * 4);
+       movl %ebx, (STACK_GPR_POS + 2 * 4)(%esp);
+       CFI_REG_ON_STACK(ebx, STACK_GPR_POS + 2 * 4);
+
+       movl 4+4(%ebp), %edi;
+       movl 4+8(%ebp), %esi;
+       movl 4+12(%ebp), %edx;
+       movl 4+32(%ebp), %ebx;
+
+       movl 4+24(%ebp), %eax;
+       movl 4+20(%ebp), %ecx;
+       leal (, %ecx, 4), %ecx;
+       vmovdqu (%eax), %xmm1; /* offset */
+       vmovdqa (%edi), %xmm0; /* first key */
+       vpxor %xmm0, %xmm1, %xmm1; /* offset ^ first key */
+       vpxor (%edi, %ecx, 4), %xmm0, %xmm0; /* first key ^ last key */
+       vinserti128 $1, %xmm0, %ymm0, %ymm0;
+       vpxor %ymm2, %ymm2, %ymm2;
+       vmovdqa %xmm1, (STACK_OFFSET_AND_F_KEY)(%esp);
+       vmovdqa %ymm2, (STACK_CHECKSUM)(%esp);
+       vmovdqa %ymm0, (STACK_FXL_KEY)(%esp);
+
+       cmpl $12, 4+16(%ebp);
+       jae .Locb_crypt_blk12_loop;
+       jmp .Locb_crypt_blk4;
+
+       /* Process 12 blocks per loop. */
+.align 16
+.Locb_crypt_blk12_loop:
+       subl $12, 4+16(%ebp);
+
+       movl 4+36(%ebp), %ecx;
+       vmovdqa (%ecx), %xmm7; /* Preload L[0] */
+
+       testl $1, %ebx;
+       jz .Locb_crypt_blk12_nblk_even;
+               /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+               leal 1(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+1)
+               shll $4, %eax;
+               vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
+               vpxor (%ecx, %eax), %xmm1, %xmm1;
+
+               vpxor %xmm7, %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm1;
+               vmovdqa %ymm1, (STACK_TMP_Y0)(%esp);
+
+               leal 3(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+3)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm0, %xmm1;
+
+               vpxor %xmm7, %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm2;
+
+               leal 5(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+5)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm0, %xmm1;
+
+               vpxor %xmm7, %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm3;
+
+               leal 7(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+7)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm0, %xmm1;
+
+               vpxor %xmm7, %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm4;
+
+               leal 9(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+9)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm0, %xmm1;
+
+               vpxor %xmm7, %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm5;
+
+               leal 11(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+11)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm0, %xmm1;
+
+               leal 12(%ebx), %ebx;
+               vpxor %xmm7, %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm6;
+
+               cmpl $1, 4+40(%ebp);
+               jb .Locb_dec_blk12;
+               ja .Locb_auth_blk12;
+               jmp .Locb_enc_blk12;
+
+       .align 8
+       .Locb_crypt_blk12_nblk_even:
+               /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+               vpxor (STACK_OFFSET_AND_F_KEY)(%esp), %xmm7, %xmm1;
+
+               leal 2(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+2)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm1;
+               vmovdqa %ymm1, (STACK_TMP_Y0)(%esp);
+
+               vpxor %xmm7, %xmm0, %xmm1;
+
+               leal 4(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+4)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm2;
+
+               vpxor %xmm7, %xmm0, %xmm1;
+
+               leal 6(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+6)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm3;
+
+               vpxor %xmm7, %xmm0, %xmm1;
+
+               leal 8(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+8)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm4;
+
+               vpxor %xmm7, %xmm0, %xmm1;
+
+               leal 10(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+10)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm5;
+
+               vpxor %xmm7, %xmm0, %xmm1;
+
+               leal 12(%ebx), %ebx;
+               tzcntl %ebx, %eax; // ntz(blkn+12)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm0;
+               vinserti128 $1, %xmm0, %ymm1, %ymm6;
+
+               cmpl $1, 4+40(%ebp);
+               jb .Locb_dec_blk12;
+               ja .Locb_auth_blk12;
+
+       .align 8
+       .Locb_enc_blk12:
+               vmovdqa %ymm2, (STACK_TMP_Y1)(%esp);
+               vmovdqa %ymm3, (STACK_TMP_Y2)(%esp);
+               vmovdqa %ymm4, (STACK_TMP_Y3)(%esp);
+               vmovdqa %ymm5, (STACK_TMP_Y4)(%esp);
+               vmovdqa %ymm6, (STACK_TMP_Y5)(%esp);
+               vmovdqa %xmm0, (STACK_OFFSET_AND_F_KEY)(%esp);
+
+               vmovdqu 0*16(%edx), %ymm1;
+               vmovdqu 2*16(%edx), %ymm2;
+               vmovdqu 4*16(%edx), %ymm3;
+               vmovdqu 6*16(%edx), %ymm4;
+               vmovdqu 8*16(%edx), %ymm5;
+               vmovdqu 10*16(%edx), %ymm6;
+               leal 12*16(%edx), %edx;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor %ymm1, %ymm2, %ymm0;
+               vpxor %ymm3, %ymm4, %ymm7;
+               vpxor %ymm5, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm7, %ymm7;
+               vpxor %ymm0, %ymm7, %ymm7;
+               vbroadcasti128 (1 * 16)(%edi), %ymm0;
+               vpxor (STACK_CHECKSUM)(%esp), %ymm7, %ymm7;
+
+               /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+               vpxor (STACK_TMP_Y0)(%esp), %ymm1, %ymm1;
+               vpxor (STACK_TMP_Y1)(%esp), %ymm2, %ymm2;
+               vpxor (STACK_TMP_Y2)(%esp), %ymm3, %ymm3;
+               vpxor (STACK_TMP_Y3)(%esp), %ymm4, %ymm4;
+               vpxor (STACK_TMP_Y4)(%esp), %ymm5, %ymm5;
+               vpxor (STACK_TMP_Y5)(%esp), %ymm6, %ymm6;
+
+               vmovdqa %ymm7, (STACK_CHECKSUM)(%esp);
+
+               /* AES rounds */
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (2 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (3 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (4 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (5 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (6 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (7 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (8 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (9 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_enc_blk12_last;
+               vbroadcasti128 (10 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (11 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               jz .Locb_enc_blk12_last;
+               vbroadcasti128 (12 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (13 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+
+               /* Last round and output handling. */
+         .Locb_enc_blk12_last:
+               vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
+               vpxor (STACK_TMP_Y0)(%esp), %ymm0, %ymm7;
+               vaesenclast %ymm7, %ymm1, %ymm1;
+               vpxor (STACK_TMP_Y1)(%esp), %ymm0, %ymm7;
+               vmovdqu %ymm1, 0*16(%esi);
+               vpxor (STACK_TMP_Y2)(%esp), %ymm0, %ymm1;
+               vaesenclast %ymm7, %ymm2, %ymm2;
+               vpxor (STACK_TMP_Y3)(%esp), %ymm0, %ymm7;
+               vaesenclast %ymm1, %ymm3, %ymm3;
+               vpxor (STACK_TMP_Y4)(%esp), %ymm0, %ymm1;
+               vaesenclast %ymm7, %ymm4, %ymm4;
+               vpxor (STACK_TMP_Y5)(%esp), %ymm0, %ymm7;
+               vaesenclast %ymm1, %ymm5, %ymm5;
+               vaesenclast %ymm7, %ymm6, %ymm6;
+               vmovdqu %ymm2, 2*16(%esi);
+               vmovdqu %ymm3, 4*16(%esi);
+               vmovdqu %ymm4, 6*16(%esi);
+               vmovdqu %ymm5, 8*16(%esi);
+               vmovdqu %ymm6, 10*16(%esi);
+               leal 12*16(%esi), %esi;
+
+               cmpl $12, 4+16(%ebp);
+               jae .Locb_crypt_blk12_loop;
+               jmp .Locb_crypt_blk12_cleanup;
+
+       .align 8
+       .Locb_auth_blk12:
+               vmovdqa %xmm0, (STACK_OFFSET_AND_F_KEY)(%esp);
+               vbroadcasti128 (1 * 16)(%edi), %ymm0;
+
+               /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+               vmovdqa (STACK_TMP_Y0)(%esp), %ymm1;
+               vpxor 0*16(%edx), %ymm1, %ymm1;
+               vpxor 2*16(%edx), %ymm2, %ymm2;
+               vpxor 4*16(%edx), %ymm3, %ymm3;
+               vpxor 6*16(%edx), %ymm4, %ymm4;
+               vpxor 8*16(%edx), %ymm5, %ymm5;
+               vpxor 10*16(%edx), %ymm6, %ymm6;
+               leal 12*16(%edx), %edx;
+
+               /* AES rounds */
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (2 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (3 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (4 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (5 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (6 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (7 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (8 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (9 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (10 * 16)(%edi), %ymm0;
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_auth_blk12_last;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (11 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (12 * 16)(%edi), %ymm0;
+               jz .Locb_auth_blk12_last;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (13 * 16)(%edi), %ymm0;
+               VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (14 * 16)(%edi), %ymm0;
+
+               /* Last round and output handling. */
+         .Locb_auth_blk12_last:
+               vaesenclast %ymm0, %ymm1, %ymm1;
+               vaesenclast %ymm0, %ymm2, %ymm2;
+               vaesenclast %ymm0, %ymm3, %ymm3;
+               vaesenclast %ymm0, %ymm4, %ymm4;
+               vaesenclast %ymm0, %ymm5, %ymm5;
+               vaesenclast %ymm0, %ymm6, %ymm6;
+
+               vpxor %ymm1, %ymm2, %ymm0;
+               vpxor %ymm3, %ymm4, %ymm4;
+               vpxor %ymm5, %ymm0, %ymm0;
+               vpxor %ymm6, %ymm4, %ymm4;
+               vpxor %ymm0, %ymm4, %ymm4;
+               vpxor (STACK_CHECKSUM)(%esp), %ymm4, %ymm4;
+               vmovdqa %ymm4, (STACK_CHECKSUM)(%esp);
+
+               cmpl $12, 4+16(%ebp);
+               jae .Locb_crypt_blk12_loop;
+               jmp .Locb_crypt_blk12_cleanup;
+
+       .align 8
+       .Locb_dec_blk12:
+               vmovdqa %xmm0, (STACK_OFFSET_AND_F_KEY)(%esp);
+
+               /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+               vmovdqa (STACK_TMP_Y0)(%esp), %ymm1;
+               vmovdqu 0*16(%edx), %ymm0;
+               vmovdqu 2*16(%edx), %ymm7;
+               vpxor %ymm0, %ymm1, %ymm1;
+               vmovdqa %ymm2, (STACK_TMP_Y1)(%esp);
+               vpxor %ymm7, %ymm2, %ymm2;
+               vmovdqu 4*16(%edx), %ymm0;
+               vmovdqu 6*16(%edx), %ymm7;
+               vmovdqa %ymm3, (STACK_TMP_Y2)(%esp);
+               vmovdqa %ymm4, (STACK_TMP_Y3)(%esp);
+               vpxor %ymm0, %ymm3, %ymm3;
+               vpxor %ymm7, %ymm4, %ymm4;
+               vmovdqu 8*16(%edx), %ymm0;
+               vmovdqu 10*16(%edx), %ymm7;
+               leal 12*16(%edx), %edx;
+               vmovdqa %ymm5, (STACK_TMP_Y4)(%esp);
+               vmovdqa %ymm6, (STACK_TMP_Y5)(%esp);
+               vpxor %ymm0, %ymm5, %ymm5;
+               vbroadcasti128 (1 * 16)(%edi), %ymm0;
+               vpxor %ymm7, %ymm6, %ymm6;
+
+               /* AES rounds */
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (2 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (3 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (4 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (5 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (6 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (7 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (8 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (9 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_dec_blk12_last;
+               vbroadcasti128 (10 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (11 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               jz .Locb_dec_blk12_last;
+               vbroadcasti128 (12 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+               vbroadcasti128 (13 * 16)(%edi), %ymm0;
+               VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
+
+               /* Last round and output handling. */
+         .Locb_dec_blk12_last:
+               vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
+               vpxor (STACK_TMP_Y0)(%esp), %ymm0, %ymm7;
+               vaesdeclast %ymm7, %ymm1, %ymm1;
+               vmovdqu %ymm1, 0*16(%esi);
+               vpxor (STACK_TMP_Y1)(%esp), %ymm0, %ymm1;
+               vpxor (STACK_TMP_Y2)(%esp), %ymm0, %ymm7;
+               vaesdeclast %ymm1, %ymm2, %ymm2;
+               vpxor (STACK_TMP_Y3)(%esp), %ymm0, %ymm1;
+               vaesdeclast %ymm7, %ymm3, %ymm3;
+               vpxor (STACK_TMP_Y4)(%esp), %ymm0, %ymm7;
+               vaesdeclast %ymm1, %ymm4, %ymm4;
+               vpxor (STACK_TMP_Y5)(%esp), %ymm0, %ymm0;
+               vaesdeclast %ymm7, %ymm5, %ymm5;
+               vaesdeclast %ymm0, %ymm6, %ymm6;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor %ymm2, %ymm3, %ymm0;
+               vpxor %ymm4, %ymm5, %ymm7;
+               vpxor %ymm6, %ymm0, %ymm0;
+               vpxor 0*16(%esi), %ymm7, %ymm7;
+               vpxor %ymm0, %ymm7, %ymm7;
+               vpxor (STACK_CHECKSUM)(%esp), %ymm7, %ymm7;
+
+               vmovdqu %ymm2, 2*16(%esi);
+               vmovdqu %ymm3, 4*16(%esi);
+               vmovdqu %ymm4, 6*16(%esi);
+               vmovdqu %ymm5, 8*16(%esi);
+               vmovdqu %ymm6, 10*16(%esi);
+               leal 12*16(%esi), %esi;
+
+               vmovdqa %ymm7, (STACK_CHECKSUM)(%esp);
+
+               cmpl $12, 4+16(%ebp);
+               jae .Locb_crypt_blk12_loop;
+
+.align 8
+.Locb_crypt_blk12_cleanup:
+       vpxor %ymm0, %ymm0, %ymm0;
+       vmovdqa %ymm0, (STACK_TMP_Y0)(%esp);
+       vmovdqa %ymm0, (STACK_TMP_Y1)(%esp);
+       vmovdqa %ymm0, (STACK_TMP_Y2)(%esp);
+       vmovdqa %ymm0, (STACK_TMP_Y3)(%esp);
+       vmovdqa %ymm0, (STACK_TMP_Y4)(%esp);
+       vmovdqa %ymm0, (STACK_TMP_Y5)(%esp);
+
+       /* Process trailing four blocks. */
+.align 8
+.Locb_crypt_blk4:
+       cmpl $4, 4+16(%ebp);
+       jb .Locb_crypt_blk1;
+
+       subl $4, 4+16(%ebp);
+
+       movl 4+36(%ebp), %ecx;
+       vmovdqa (%ecx), %xmm7; /* Preload L[0] */
+
+       testl $1, %ebx;
+       jz .Locb_crypt_blk4_nblk_even;
+               /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+               leal 1(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+1)
+               shll $4, %eax;
+               vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
+               vpxor (%ecx, %eax), %xmm1, %xmm1;
+
+               vpxor %xmm7, %xmm1, %xmm2;
+               vinserti128 $1, %xmm2, %ymm1, %ymm6;
+
+               leal 3(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+3)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm2, %xmm3;
+
+               leal 4(%ebx), %ebx;
+               vpxor %xmm7, %xmm3, %xmm4;
+               vinserti128 $1, %xmm4, %ymm3, %ymm7;
+               vmovdqa %xmm4, (STACK_OFFSET_AND_F_KEY)(%esp);
+
+               cmpl $1, 4+40(%ebp);
+               jb .Locb_dec_blk4;
+               ja .Locb_auth_blk4;
+               jmp .Locb_enc_blk4;
+
+       .align 8
+       .Locb_crypt_blk4_nblk_even:
+               /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+               vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
+               vpxor %xmm7, %xmm1, %xmm1;
+
+               leal 2(%ebx), %eax;
+               tzcntl %eax, %eax; // ntz(blkn+2)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm1, %xmm2;
+               vinserti128 $1, %xmm2, %ymm1, %ymm6;
+
+               vpxor %xmm7, %xmm2, %xmm3;
+
+               leal 4(%ebx), %ebx;
+               tzcntl %ebx, %eax; // ntz(blkn+4)
+               shll $4, %eax;
+               vpxor (%ecx, %eax), %xmm3, %xmm4;
+               vinserti128 $1, %xmm4, %ymm3, %ymm7;
+               vmovdqa %xmm4, (STACK_OFFSET_AND_F_KEY)(%esp);
+
+               cmpl $1, 4+40(%ebp);
+               jb .Locb_dec_blk4;
+               ja .Locb_auth_blk4;
+
+       .align 8
+       .Locb_enc_blk4:
+               vmovdqu 0*16(%edx), %ymm1;
+               vmovdqu 2*16(%edx), %ymm2;
+               leal 4*16(%edx), %edx;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor %ymm1, %ymm2, %ymm5;
+               vpxor (STACK_CHECKSUM)(%esp), %ymm5, %ymm5;
+               vmovdqa %ymm5, (STACK_CHECKSUM)(%esp);
+
+               /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+               vpxor %ymm6, %ymm1, %ymm1;
+               vpxor %ymm7, %ymm2, %ymm2;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (2 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (3 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (4 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (5 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (6 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (7 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (8 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (9 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_enc_blk4_last;
+               vbroadcasti128 (10 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (11 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               jz .Locb_enc_blk4_last;
+               vbroadcasti128 (12 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (13 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+
+               /* Last round and output handling. */
+         .Locb_enc_blk4_last:
+               vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
+               vpxor %ymm0, %ymm6, %ymm6; /* Xor offset to last round key. */
+               vpxor %ymm0, %ymm7, %ymm7;
+               vaesenclast %ymm6, %ymm1, %ymm1;
+               vaesenclast %ymm7, %ymm2, %ymm2;
+               vmovdqu %ymm1, 0*16(%esi);
+               vmovdqu %ymm2, 2*16(%esi);
+               leal 4*16(%esi), %esi;
+
+               jmp .Locb_crypt_blk1;
+
+       .align 8
+       .Locb_auth_blk4:
+               /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+               vpxor 0*16(%edx), %ymm6, %ymm1;
+               vpxor 2*16(%edx), %ymm7, %ymm2;
+               leal 4*16(%edx), %edx;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (2 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (3 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (4 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (5 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (6 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (7 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (8 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (9 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (10 * 16)(%edi), %ymm0;
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_auth_blk4_last;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (11 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (12 * 16)(%edi), %ymm0;
+               jz .Locb_auth_blk4_last;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (13 * 16)(%edi), %ymm0;
+               VAESENC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (14 * 16)(%edi), %ymm0;
+
+               /* Last round and output handling. */
+         .Locb_auth_blk4_last:
+               vaesenclast %ymm0, %ymm1, %ymm1;
+               vaesenclast %ymm0, %ymm2, %ymm2;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor %ymm1, %ymm2, %ymm5;
+               vpxor (STACK_CHECKSUM)(%esp), %ymm5, %ymm5;
+               vmovdqa %ymm5, (STACK_CHECKSUM)(%esp);
+
+               jmp .Locb_crypt_blk1;
+
+       .align 8
+       .Locb_dec_blk4:
+               /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+               vpxor 0*16(%edx), %ymm6, %ymm1;
+               vpxor 2*16(%edx), %ymm7, %ymm2;
+               leal 4*16(%edx), %edx;
+
+               /* AES rounds */
+               vbroadcasti128 (1 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (2 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (3 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (4 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (5 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (6 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (7 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (8 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (9 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_dec_blk4_last;
+               vbroadcasti128 (10 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (11 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               jz .Locb_dec_blk4_last;
+               vbroadcasti128 (12 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+               vbroadcasti128 (13 * 16)(%edi), %ymm0;
+               VAESDEC2(%ymm0, %ymm1, %ymm2);
+
+               /* Last round and output handling. */
+         .Locb_dec_blk4_last:
+               vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
+               vpxor %ymm0, %ymm6, %ymm6; /* Xor offset to last round key. */
+               vpxor %ymm0, %ymm7, %ymm7;
+               vaesdeclast %ymm6, %ymm1, %ymm1;
+               vaesdeclast %ymm7, %ymm2, %ymm2;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor %ymm1, %ymm2, %ymm5;
+               vpxor (STACK_CHECKSUM)(%esp), %ymm5, %ymm5;
+
+               vmovdqu %ymm1, 0*16(%esi);
+               vmovdqu %ymm2, 2*16(%esi);
+               leal 4*16(%esi), %esi;
+
+               vmovdqa %ymm5, (STACK_CHECKSUM)(%esp);
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Locb_crypt_blk1:
+       cmpl $1, 4+16(%ebp);
+       jb .Locb_crypt_done;
+
+       subl $1, 4+16(%ebp);
+
+       movl 4+36(%ebp), %ecx;
+       leal 1(%ebx), %ebx;
+       tzcntl %ebx, %eax; // ntz(blkn+1)
+       shll $4, %eax;
+       vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm7;
+       vpxor (%ecx, %eax), %xmm7, %xmm7;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       vmovdqa %xmm7, (STACK_OFFSET_AND_F_KEY)(%esp);
+
+       cmpl $1, 4+40(%ebp);
+       jb .Locb_dec_blk1;
+       ja .Locb_auth_blk1;
+               vmovdqu (%edx), %xmm0;
+               leal 16(%edx), %edx;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor (STACK_CHECKSUM)(%esp), %xmm0, %xmm1;
+               vmovdqa %xmm1, (STACK_CHECKSUM)(%esp);
+
+               /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+               vpxor %xmm7, %xmm0, %xmm0;
+
+               /* AES rounds. */
+               vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_enc_blk1_last;
+               vaesenc (10 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+               jz .Locb_enc_blk1_last;
+               vaesenc (12 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+
+               /* Last round and output handling. */
+         .Locb_enc_blk1_last:
+               vpxor (STACK_FXL_KEY)(%esp), %xmm7, %xmm1;
+               vaesenclast %xmm1, %xmm0, %xmm0;
+               vmovdqu %xmm0, (%esi);
+               leal 16(%esi), %esi;
+
+               jmp .Locb_crypt_blk1;
+
+       .align 8
+       .Locb_auth_blk1:
+               /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+               vpxor (%edx), %xmm7, %xmm0;
+               leal 16(%edx), %edx;
+
+               /* AES rounds. */
+               vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%edi), %xmm1;
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_auth_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%edi), %xmm1;
+               jz .Locb_auth_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%edi), %xmm1;
+
+               /* Last round and output handling. */
+         .Locb_auth_blk1_last:
+               vpxor (STACK_CHECKSUM)(%esp), %xmm1, %xmm1;
+               vaesenclast %xmm1, %xmm0, %xmm0;
+               vmovdqa %xmm0, (STACK_CHECKSUM)(%esp);
+
+               jmp .Locb_crypt_blk1;
+
+       .align 8
+       .Locb_dec_blk1:
+               /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+               vpxor (%edx), %xmm7, %xmm0;
+               leal 16(%edx), %edx;
+
+               /* AES rounds. */
+               vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
+               cmpl $12, 4+20(%ebp);
+               jb .Locb_dec_blk1_last;
+               vaesdec (10 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
+               jz .Locb_dec_blk1_last;
+               vaesdec (12 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
+
+               /* Last round and output handling. */
+         .Locb_dec_blk1_last:
+               vpxor (STACK_FXL_KEY)(%esp), %xmm7, %xmm1;
+               vaesdeclast %xmm1, %xmm0, %xmm0;
+
+               /* Checksum_i = Checksum_{i-1} xor P_i  */
+               vpxor (STACK_CHECKSUM)(%esp), %xmm0, %xmm1;
+
+               vmovdqu %xmm0, (%esi);
+               leal 16(%esi), %esi;
+
+               vmovdqa %xmm1, (STACK_CHECKSUM)(%esp);
+
+               jmp .Locb_crypt_blk1;
+
+.align 8
+.Locb_crypt_done:
+       movl 4+24(%ebp), %ecx;
+       vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
+       vpxor (%edi), %xmm1, %xmm1;
+       vmovdqu %xmm1, (%ecx);
+
+       movl 4+28(%ebp), %eax;
+       vmovdqa (STACK_CHECKSUM)(%esp), %xmm2;
+       vpxor (STACK_CHECKSUM + 16)(%esp), %xmm2, %xmm2;
+       vpxor (%eax), %xmm2, %xmm2;
+       vmovdqu %xmm2, (%eax);
+
+       movl (STACK_GPR_POS + 0 * 4)(%esp), %edi;
+       CFI_RESTORE(edi);
+       movl (STACK_GPR_POS + 1 * 4)(%esp), %esi;
+       CFI_RESTORE(esi);
+       movl (STACK_GPR_POS + 2 * 4)(%esp), %ebx;
+       CFI_RESTORE(ebx);
+
+       vpxor %ymm0, %ymm0, %ymm0;
+       vmovdqa %ymm0, (STACK_OFFSET_AND_F_KEY)(%esp);
+       vmovdqa %ymm0, (STACK_CHECKSUM)(%esp);
+
+       xorl %eax, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386))
+
+/**********************************************************************
+  XTS-mode encryption
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386):
+       /* XTS-mode bulk encryption/decryption (AES-NI + AVX2, i386 ABI).
+        * Processes 8/4/1 blocks per iteration; tweaks are advanced in-register
+        * with carry-less multiplication instead of per-block doubling.
+        *
+        * input:
+        *      (esp + 4): round keys
+        *      (esp + 8): tweak
+        *      (esp + 12): dst
+        *      (esp + 16): src
+        *      (esp + 20): nblocks
+        *      (esp + 24): nrounds
+        *      (esp + 28): encrypt
+        */
+       CFI_STARTPROC();
+
+       GET_DATA_POINTER(SYM_NAME(_gcry_vaes_consts), eax);
+
+       pushl %ebp;
+       CFI_PUSH(%ebp);
+       movl %esp, %ebp;
+       CFI_DEF_CFA_REGISTER(%ebp);
+
+       /* Frame: four 32-byte tweak slots plus three GPR save slots,
+        * aligned to 32 bytes so vmovdqa on the slots is valid. */
+       subl $(4 * 32 + 3 * 4), %esp;
+       andl $-32, %esp;
+
+       movl %edi, (4 * 32 + 0 * 4)(%esp);
+       CFI_REG_ON_STACK(edi, 4 * 32 + 0 * 4);
+       movl %esi, (4 * 32 + 1 * 4)(%esp);
+       CFI_REG_ON_STACK(esi, 4 * 32 + 1 * 4);
+       movl %ebx, (4 * 32 + 2 * 4)(%esp);
+       CFI_REG_ON_STACK(ebx, 4 * 32 + 2 * 4);
+
+       /* Register roles for the rest of the function:
+        * ebx = constants base (_gcry_vaes_consts), edi = round keys,
+        * esi = tweak pointer, edx = dst, ecx = src, eax = nblocks. */
+       movl %eax, %ebx;
+       movl 4+4(%ebp), %edi;
+       movl 4+8(%ebp), %esi;
+       movl 4+12(%ebp), %edx;
+       movl 4+16(%ebp), %ecx;
+       movl 4+20(%ebp), %eax;
+
+/* Multiply tweak(s) by x^(shift) in GF(2^128): shift the 128-bit lanes left
+ * by 'shift' bits, then fold the bits carried out of each lane back in via a
+ * carry-less multiply with the XTS reduction constant (.Lxts_gfmul_clmul).
+ * 'hi_tweak' must be the .Lxts_high_bit_shuf-shuffled copy of 'tweak' so the
+ * lane-crossing high bits are in place for the vpsrld/vpclmulqdq step. */
+#define tweak_clmul(shift, out, tweak, hi_tweak, tmp1, tmp2) \
+       vpsrld $(32-(shift)), hi_tweak, tmp2; \
+       vpsllq $(shift), tweak, out; \
+       vpclmulqdq $0, CADDR(.Lxts_gfmul_clmul, %ebx), tmp2, tmp1; \
+       vpunpckhqdq tmp2, tmp1, tmp1; \
+       vpxor tmp1, out, out;
+
+       /* Prepare tweak: ymm7 = tweak:tweak*x (tweaks for two consecutive
+        * blocks per 256-bit register), ymm6 = shuffled copy for tweak_clmul. */
+       vmovdqu (%esi), %xmm7;
+       vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %xmm7, %xmm6;
+       tweak_clmul(1, %xmm5, %xmm7, %xmm6, %xmm0, %xmm1);
+       vinserti128 $1, %xmm5, %ymm7, %ymm7; /* tweak:tweak1 */
+       vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %ymm7, %ymm6;
+
+       /* Process eight blocks per loop. */
+.align 8
+.Lxts_crypt_blk8:
+       cmpl $8, %eax;
+       jb .Lxts_crypt_blk4;
+
+       leal -8(%eax), %eax;
+
+       /* Stash tweaks for blocks 0..7 into the four stack slots (two tweaks
+        * per ymm slot) and advance the master tweak by x^8 for the next
+        * iteration. */
+       vmovdqa %ymm7, (0 * 32)(%esp);
+       tweak_clmul(2, %ymm2, %ymm7, %ymm6, %ymm0, %ymm1);
+       vmovdqa %ymm2, (1 * 32)(%esp);
+       tweak_clmul(4, %ymm2, %ymm7, %ymm6, %ymm0, %ymm1);
+       vmovdqa %ymm2, (2 * 32)(%esp);
+       tweak_clmul(6, %ymm2, %ymm7, %ymm6, %ymm0, %ymm1);
+       vmovdqa %ymm2, (3 * 32)(%esp);
+       tweak_clmul(8, %ymm7, %ymm7, %ymm6, %ymm0, %ymm1);
+       vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %ymm7, %ymm6;
+
+       /* Xor tweaks into the eight input blocks. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vmovdqa (0 * 32)(%esp), %ymm0;
+       vmovdqa (1 * 32)(%esp), %ymm1;
+       vmovdqa (2 * 32)(%esp), %ymm2;
+       vmovdqa (3 * 32)(%esp), %ymm3;
+       vpxor (0 * 16)(%ecx), %ymm0, %ymm0;
+       vpxor (2 * 16)(%ecx), %ymm1, %ymm1;
+       vpxor (4 * 16)(%ecx), %ymm2, %ymm2;
+       vpxor (6 * 16)(%ecx), %ymm3, %ymm3;
+
+       leal (8 * 16)(%ecx), %ecx;
+
+       cmpl $1, 4+28(%ebp);
+       jne .Lxts_dec_blk8;
+               /* AES rounds */
+               XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (1 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               /* Round-count dispatch: nrounds < 12 -> AES-128 (last key is
+                * #10); nrounds == 12 -> AES-192 (the jz below reuses these
+                * flags); otherwise AES-256 (14 rounds). */
+               cmpl $12, 4+24(%ebp);
+               jb .Lxts_enc_blk8_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lxts_enc_blk8_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+               /* Last round and output handling. */
+       .Lxts_enc_blk8_last:
+               vpxor (0 * 32)(%esp), %ymm4, %ymm5; /* Xor tweak to last round key. */
+               vaesenclast %ymm5, %ymm0, %ymm0;
+               vpxor (1 * 32)(%esp), %ymm4, %ymm5;
+               vaesenclast %ymm5, %ymm1, %ymm1;
+               vpxor (2 * 32)(%esp), %ymm4, %ymm5;
+               vpxor (3 * 32)(%esp), %ymm4, %ymm4;
+               vaesenclast %ymm5, %ymm2, %ymm2;
+               vaesenclast %ymm4, %ymm3, %ymm3;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               vmovdqu %ymm2, (4 * 16)(%edx);
+               vmovdqu %ymm3, (6 * 16)(%edx);
+               leal (8 * 16)(%edx), %edx;
+
+               jmp .Lxts_crypt_blk8;
+
+       .align 8
+       .Lxts_dec_blk8:
+               /* AES rounds */
+               XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (1 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               /* Same AES-128/192/256 round dispatch as the encrypt path. */
+               cmpl $12, 4+24(%ebp);
+               jb .Lxts_dec_blk8_last;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lxts_dec_blk8_last;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+               /* Last round and output handling. */
+       .Lxts_dec_blk8_last:
+               vpxor (0 * 32)(%esp), %ymm4, %ymm5; /* Xor tweak to last round key. */
+               vaesdeclast %ymm5, %ymm0, %ymm0;
+               vpxor (1 * 32)(%esp), %ymm4, %ymm5;
+               vaesdeclast %ymm5, %ymm1, %ymm1;
+               vpxor (2 * 32)(%esp), %ymm4, %ymm5;
+               vpxor (3 * 32)(%esp), %ymm4, %ymm4;
+               vaesdeclast %ymm5, %ymm2, %ymm2;
+               vaesdeclast %ymm4, %ymm3, %ymm3;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               vmovdqu %ymm2, (4 * 16)(%edx);
+               vmovdqu %ymm3, (6 * 16)(%edx);
+               leal (8 * 16)(%edx), %edx;
+
+               jmp .Lxts_crypt_blk8;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lxts_crypt_blk4:
+       /* Try exit early as typically input length is large power of 2. */
+       cmpl $1, %eax;
+       jb .Ldone_xts_crypt;
+       cmpl $4, %eax;
+       jb .Lxts_crypt_blk1;
+
+       leal -4(%eax), %eax;
+
+       /* ymm2 = tweaks for blocks 0-1, ymm3 = tweaks for blocks 2-3;
+        * master tweak advances by x^4. */
+       vmovdqa %ymm7, %ymm2;
+       tweak_clmul(2, %ymm3, %ymm7, %ymm6, %ymm0, %ymm1);
+       tweak_clmul(4, %ymm7, %ymm7, %ymm6, %ymm0, %ymm1);
+       vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %ymm7, %ymm6;
+
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vpxor (0 * 16)(%ecx), %ymm2, %ymm0;
+       vpxor (2 * 16)(%ecx), %ymm3, %ymm1;
+
+       leal (4 * 16)(%ecx), %ecx;
+
+       cmpl $1, 4+28(%ebp);
+       jne .Lxts_dec_blk4;
+               /* AES rounds */
+               XOR2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (1 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               /* AES-128/192/256 round dispatch (see blk8 path). */
+               cmpl $12, 4+24(%ebp);
+               jb .Lxts_enc_blk4_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lxts_enc_blk4_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+               /* Last round and output handling. */
+       .Lxts_enc_blk4_last:
+               vpxor %ymm4, %ymm2, %ymm2; /* Xor tweak to last round key. */
+               vpxor %ymm4, %ymm3, %ymm3;
+               vaesenclast %ymm2, %ymm0, %ymm0;
+               vaesenclast %ymm3, %ymm1, %ymm1;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               leal (4 * 16)(%edx), %edx;
+
+               jmp .Lxts_crypt_blk1;
+
+       .align 8
+       .Lxts_dec_blk4:
+               /* AES rounds */
+               XOR2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (1 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               /* AES-128/192/256 round dispatch (see blk8 path). */
+               cmpl $12, 4+24(%ebp);
+               jb .Lxts_dec_blk4_last;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lxts_dec_blk4_last;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+
+               /* Last round and output handling. */
+       .Lxts_dec_blk4_last:
+               vpxor %ymm4, %ymm2, %ymm2; /* Xor tweak to last round key. */
+               vpxor %ymm4, %ymm3, %ymm3;
+               vaesdeclast %ymm2, %ymm0, %ymm0;
+               vaesdeclast %ymm3, %ymm1, %ymm1;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               leal (4 * 16)(%edx), %edx;
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lxts_crypt_blk1:
+       cmpl $1, %eax;
+       jb .Ldone_xts_crypt;
+
+       leal -1(%eax), %eax;
+
+       /* xmm5 keeps this block's tweak; xmm7 is advanced by x for the next. */
+       vpxor (%ecx), %xmm7, %xmm0;
+       vmovdqa %xmm7, %xmm5;
+       tweak_clmul(1, %xmm7, %xmm7, %xmm6, %xmm2, %xmm3);
+       vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %xmm7, %xmm6;
+
+       leal 16(%ecx), %ecx;
+
+       cmpl $1, 4+28(%ebp);
+       jne .Lxts_dec_blk1;
+               /* AES rounds. */
+               vpxor (0 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%edi), %xmm1;
+               /* AES-128/192/256 round dispatch (see blk8 path). */
+               cmpl $12, 4+24(%ebp);
+               jb .Lxts_enc_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%edi), %xmm1;
+               jz .Lxts_enc_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%edi), %xmm1;
+
+               /* Last round and output handling. */
+       .Lxts_enc_blk1_last:
+               vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */
+               vaesenclast %xmm5, %xmm0, %xmm0;
+               vmovdqu %xmm0, (%edx);
+               leal 16(%edx), %edx;
+
+               jmp .Lxts_crypt_blk1;
+
+       .align 8
+       .Lxts_dec_blk1:
+               /* AES rounds. */
+               vpxor (0 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%edi), %xmm1;
+               /* AES-128/192/256 round dispatch (see blk8 path). */
+               cmpl $12, 4+24(%ebp);
+               jb .Lxts_dec_blk1_last;
+               vaesdec %xmm1, %xmm0, %xmm0;
+               vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%edi), %xmm1;
+               jz .Lxts_dec_blk1_last;
+               vaesdec %xmm1, %xmm0, %xmm0;
+               vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%edi), %xmm1;
+
+               /* Last round and output handling. */
+       .Lxts_dec_blk1_last:
+               vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */
+               vaesdeclast %xmm5, %xmm0, %xmm0;
+               vmovdqu %xmm0, (%edx);
+               leal 16(%edx), %edx;
+
+               jmp .Lxts_crypt_blk1;
+
+.align 8
+.Ldone_xts_crypt:
+       /* Store IV. */
+       vmovdqu %xmm7, (%esi);
+
+       /* Epilogue: restore callee-saved GPRs, wipe the stack tweak slots and
+        * SIMD registers (vzeroall) so no tweak material is left behind. */
+       vpxor %ymm0, %ymm0, %ymm0;
+       movl (4 * 32 + 0 * 4)(%esp), %edi;
+       CFI_RESTORE(edi);
+       movl (4 * 32 + 1 * 4)(%esp), %esi;
+       CFI_RESTORE(esi);
+       movl (4 * 32 + 2 * 4)(%esp), %ebx;
+       CFI_RESTORE(ebx);
+       vmovdqa %ymm0, (0 * 32)(%esp);
+       vmovdqa %ymm0, (1 * 32)(%esp);
+       vmovdqa %ymm0, (2 * 32)(%esp);
+       vmovdqa %ymm0, (3 * 32)(%esp);
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       xorl %eax, %eax;
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386))
+
+/**********************************************************************
+  ECB-mode encryption and decryption
+ **********************************************************************/
+ELF(.type SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386),@function)
+.globl SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386)
+.align 16
+SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386):
+       /* input:
+        *      (esp + 4): round keys
+        *      (esp + 8): encrypt (nonzero = encrypt, zero = decrypt)
+        *      (esp + 12): dst
+        *      (esp + 16): src
+        *      (esp + 20): nblocks
+        *      (esp + 24): nrounds
+        */
+       CFI_STARTPROC();
+       pushl %edi;
+       CFI_PUSH(%edi);
+       pushl %esi;
+       CFI_PUSH(%esi);
+
+       movl 8+4(%esp), %edi; /* %edi = round keys */
+       movl 8+8(%esp), %esi; /* %esi = encrypt flag */
+       movl 8+12(%esp), %edx; /* %edx = dst */
+       movl 8+16(%esp), %ecx; /* %ecx = src */
+       movl 8+20(%esp), %eax; /* %eax = remaining block count */
+
+       /* Process 8 blocks per loop. */
+.align 8
+.Lecb_blk8:
+       cmpl $8, %eax;
+       jb .Lecb_blk4;
+
+       leal -8(%eax), %eax;
+
+       /* Load input and xor first key. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vmovdqu (0 * 16)(%ecx), %ymm0;
+       vmovdqu (2 * 16)(%ecx), %ymm1;
+       vmovdqu (4 * 16)(%ecx), %ymm2;
+       vmovdqu (6 * 16)(%ecx), %ymm3;
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vpxor %ymm4, %ymm2, %ymm2;
+       vpxor %ymm4, %ymm3, %ymm3;
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       leal (8 * 16)(%ecx), %ecx;
+
+       testl %esi, %esi; /* encrypt or decrypt? */
+       jz .Lecb_dec_blk8;
+               /* AES rounds */
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               cmpl $12, 8+24(%esp); /* nrounds < 12 => AES-128 */
+               jb .Lecb_enc_blk8_last;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lecb_enc_blk8_last; /* nrounds == 12 => AES-192 */
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+         .Lecb_enc_blk8_last: /* %ymm4 holds the last round key */
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vaesenclast %ymm4, %ymm2, %ymm2;
+               vaesenclast %ymm4, %ymm3, %ymm3;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               vmovdqu %ymm2, (4 * 16)(%edx);
+               vmovdqu %ymm3, (6 * 16)(%edx);
+               leal (8 * 16)(%edx), %edx;
+               jmp .Lecb_blk8;
+
+         .align 8
+         .Lecb_dec_blk8: /* decryption mirror of the 8-block path */
+               /* AES rounds */
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               cmpl $12, 8+24(%esp); /* nrounds < 12 => AES-128 */
+               jb .Lecb_dec_blk8_last;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lecb_dec_blk8_last; /* nrounds == 12 => AES-192 */
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+         .Lecb_dec_blk8_last: /* %ymm4 holds the last round key */
+               vaesdeclast %ymm4, %ymm0, %ymm0;
+               vaesdeclast %ymm4, %ymm1, %ymm1;
+               vaesdeclast %ymm4, %ymm2, %ymm2;
+               vaesdeclast %ymm4, %ymm3, %ymm3;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               vmovdqu %ymm2, (4 * 16)(%edx);
+               vmovdqu %ymm3, (6 * 16)(%edx);
+               leal (8 * 16)(%edx), %edx;
+               jmp .Lecb_blk8;
+
+       /* Handle trailing four blocks. */
+.align 8
+.Lecb_blk4:
+       cmpl $4, %eax;
+       jb .Lecb_blk1;
+
+       leal -4(%eax), %eax;
+
+       /* Load input and xor first key. */
+       vbroadcasti128 (0 * 16)(%edi), %ymm4;
+       vmovdqu (0 * 16)(%ecx), %ymm0;
+       vmovdqu (2 * 16)(%ecx), %ymm1;
+       vpxor %ymm4, %ymm0, %ymm0;
+       vpxor %ymm4, %ymm1, %ymm1;
+       vbroadcasti128 (1 * 16)(%edi), %ymm4;
+       leal (4 * 16)(%ecx), %ecx;
+
+       testl %esi, %esi; /* encrypt or decrypt? */
+       jz .Lecb_dec_blk4;
+               /* AES rounds */
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               cmpl $12, 8+24(%esp); /* nrounds < 12 => AES-128 */
+               jb .Lecb_enc_blk4_last;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lecb_enc_blk4_last; /* nrounds == 12 => AES-192 */
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESENC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+         .Lecb_enc_blk4_last: /* %ymm4 holds the last round key */
+               vaesenclast %ymm4, %ymm0, %ymm0;
+               vaesenclast %ymm4, %ymm1, %ymm1;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               leal (4 * 16)(%edx), %edx;
+               jmp .Lecb_blk1;
+
+         .align 8
+         .Lecb_dec_blk4: /* decryption mirror of the 4-block path */
+               /* AES rounds */
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (2 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (3 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (4 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (5 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (6 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (7 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (8 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (9 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (10 * 16)(%edi), %ymm4;
+               cmpl $12, 8+24(%esp); /* nrounds < 12 => AES-128 */
+               jb .Lecb_dec_blk4_last;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (11 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (12 * 16)(%edi), %ymm4;
+               jz .Lecb_dec_blk4_last; /* nrounds == 12 => AES-192 */
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (13 * 16)(%edi), %ymm4;
+               VAESDEC2(%ymm4, %ymm0, %ymm1);
+               vbroadcasti128 (14 * 16)(%edi), %ymm4;
+         .Lecb_dec_blk4_last: /* %ymm4 holds the last round key */
+               vaesdeclast %ymm4, %ymm0, %ymm0;
+               vaesdeclast %ymm4, %ymm1, %ymm1;
+               vmovdqu %ymm0, (0 * 16)(%edx);
+               vmovdqu %ymm1, (2 * 16)(%edx);
+               leal (4 * 16)(%edx), %edx;
+
+       /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lecb_blk1:
+       cmpl $1, %eax;
+       jb .Ldone_ecb;
+
+       leal -1(%eax), %eax;
+
+       /* Load input. */
+       vmovdqu (%ecx), %xmm2;
+       leal 16(%ecx), %ecx;
+
+       /* Xor first key. */
+       vpxor (0 * 16)(%edi), %xmm2, %xmm0;
+
+       testl %esi, %esi; /* encrypt or decrypt? */
+       jz .Lecb_dec_blk1;
+               /* AES rounds. */
+               vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%edi), %xmm1;
+               cmpl $12, 8+24(%esp); /* nrounds < 12 => AES-128 */
+               jb .Lecb_enc_blk1_last;
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%edi), %xmm1;
+               jz .Lecb_enc_blk1_last; /* nrounds == 12 => AES-192 */
+               vaesenc %xmm1, %xmm0, %xmm0;
+               vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%edi), %xmm1;
+         .Lecb_enc_blk1_last: /* %xmm1 holds the last round key */
+               vaesenclast %xmm1, %xmm0, %xmm0;
+               jmp .Lecb_blk1_end;
+
+         .align 8
+         .Lecb_dec_blk1: /* decryption mirror of the 1-block path */
+               /* AES rounds. */
+               vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
+               vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (10 * 16)(%edi), %xmm1;
+               cmpl $12, 8+24(%esp); /* nrounds < 12 => AES-128 */
+               jb .Lecb_dec_blk1_last;
+               vaesdec %xmm1, %xmm0, %xmm0;
+               vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (12 * 16)(%edi), %xmm1;
+               jz .Lecb_dec_blk1_last; /* nrounds == 12 => AES-192 */
+               vaesdec %xmm1, %xmm0, %xmm0;
+               vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
+               vmovdqa (14 * 16)(%edi), %xmm1;
+         .Lecb_dec_blk1_last: /* %xmm1 holds the last round key */
+               vaesdeclast %xmm1, %xmm0, %xmm0;
+               jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_blk1_end:
+       vmovdqu %xmm0, (%edx);
+       leal 16(%edx), %edx;
+
+       jmp .Lecb_blk1;
+
+.align 8
+.Ldone_ecb:
+       popl %esi;
+       CFI_POP(%esi);
+       popl %edi;
+       CFI_POP(%edi);
+       vzeroall; /* zeroise all YMM registers before returning */
+       ret_spec_stop
+       CFI_ENDPROC();
+ELF(.size SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386),
+         .-SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386))
+
+/**********************************************************************
+  constants
+ **********************************************************************/
+SECTION_RODATA
+
+ELF(.type SYM_NAME(_gcry_vaes_consts),@object)
+.align 32
+SYM_NAME(_gcry_vaes_consts):
+.Lbige_addb_0: /* add-N vectors for a big-endian counter (N in the last byte); presumably used for CTR increments -- confirm at use sites */
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lbige_addb_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+
+.Lle_addd_0: /* add-N vectors for a little-endian 32-bit counter (N in the first byte) */
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_1:
+       .byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_2:
+       .byte 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_3:
+       .byte 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_4:
+       .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_5:
+       .byte 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_6:
+       .byte 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_7:
+       .byte 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_8:
+       .byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_9:
+       .byte 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_10:
+       .byte 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_11:
+       .byte 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+.Lle_addd_4_2: /* add-4, replicated into both 128-bit lanes of a YMM register */
+       .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lle_addd_12_2: /* add-12, replicated into both 128-bit lanes */
+       .byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+.Lxts_gfmul_clmul: /* 0x87 = XTS GF(2^128) reduction constant (IEEE P1619) */
+       .long 0x00, 0x87, 0x00, 0x00
+       .long 0x00, 0x87, 0x00, 0x00
+.Lxts_high_bit_shuf: /* vpshufb mask; -1 entries yield zero bytes */
+       .byte -1, -1, -1, -1, 12, 13, 14, 15
+       .byte 4, 5, 6, 7, -1, -1, -1, -1
+       .byte -1, -1, -1, -1, 12, 13, 14, 15
+       .byte 4, 5, 6, 7, -1, -1, -1, -1
+.Lbswap128_mask: /* full byte reversal of each 128-bit lane */
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+ELF(.size SYM_NAME(_gcry_vaes_consts),.-SYM_NAME(_gcry_vaes_consts))
+
+#endif /* HAVE_GCC_INLINE_ASM_VAES */
+#endif /* __i386__ */
diff --git a/cipher/rijndael-vaes-i386.c b/cipher/rijndael-vaes-i386.c
new file mode 100644 (file)
index 0000000..e10d3ac
--- /dev/null
@@ -0,0 +1,231 @@
+/* VAES/AVX2 i386 accelerated AES for Libgcrypt
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "types.h"  /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_VAES_I386
+
+
+extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+
+
+extern void _gcry_vaes_avx2_cbc_dec_i386 (const void *keysched,
+                                         unsigned char *iv,
+                                         void *outbuf_arg,
+                                         const void *inbuf_arg,
+                                         size_t nblocks,
+                                         unsigned int nrounds);
+
+extern void _gcry_vaes_avx2_cfb_dec_i386 (const void *keysched,
+                                         unsigned char *iv,
+                                         void *outbuf_arg,
+                                         const void *inbuf_arg,
+                                         size_t nblocks,
+                                         unsigned int nrounds);
+
+extern void _gcry_vaes_avx2_ctr_enc_i386 (const void *keysched,
+                                         unsigned char *ctr,
+                                         void *outbuf_arg,
+                                         const void *inbuf_arg,
+                                         size_t nblocks,
+                                         unsigned int nrounds);
+
+extern void _gcry_vaes_avx2_ctr32le_enc_i386 (const void *keysched,
+                                             unsigned char *ctr,
+                                             void *outbuf_arg,
+                                             const void *inbuf_arg,
+                                             size_t nblocks,
+                                             unsigned int nrounds);
+
+extern size_t _gcry_vaes_avx2_ocb_crypt_i386 (const void *keysched,
+                                             void *outbuf_arg,
+                                             const void *inbuf_arg,
+                                             size_t nblocks,
+                                             unsigned int nrounds,
+                                             unsigned char *offset,
+                                             unsigned char *checksum,
+                                             unsigned int blkn,
+                                             const void *L_table,
+                                             int enc_dec_or_auth);
+
+extern void _gcry_vaes_avx2_xts_crypt_i386 (const void *keysched,
+                                           unsigned char *tweak,
+                                           void *outbuf_arg,
+                                           const void *inbuf_arg,
+                                           size_t nblocks,
+                                           unsigned int nrounds,
+                                           int encrypt);
+
+extern void _gcry_vaes_avx2_ecb_crypt_i386 (const void *keysched,
+                                           int encrypt,
+                                           void *outbuf_arg,
+                                           const void *inbuf_arg,
+                                           size_t nblocks,
+                                           unsigned int nrounds);
+
+
+void
+_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf,
+                         const void *inbuf, size_t nblocks,
+                         int encrypt)
+{ /* Bulk ECB en/decryption of nblocks 16-byte blocks via VAES/AVX2 asm. */
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    { /* Lazily derive the decryption key schedule on first decrypt. */
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_ecb_crypt_i386 (keysched, encrypt, outbuf, inbuf,
+                                  nblocks, nrounds);
+}
+
+void
+_gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
+                       void *outbuf, const void *inbuf,
+                       size_t nblocks)
+{ /* Bulk CBC decryption of nblocks 16-byte blocks. */
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = ctx->keyschdec32; /* CBC decryption uses the decryption key schedule. */
+  unsigned int nrounds = ctx->rounds;
+
+  if (!ctx->decryption_prepared)
+    { /* Lazily derive the decryption key schedule. */
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_cbc_dec_i386 (keysched, iv, outbuf, inbuf, nblocks, nrounds);
+}
+
+void
+_gcry_aes_vaes_cfb_dec (void *context, unsigned char *iv,
+                       void *outbuf, const void *inbuf,
+                       size_t nblocks)
+{ /* Bulk CFB decryption of nblocks 16-byte blocks. */
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = ctx->keyschenc32; /* CFB uses the encryption schedule in both directions. */
+  unsigned int nrounds = ctx->rounds;
+
+  _gcry_vaes_avx2_cfb_dec_i386 (keysched, iv, outbuf, inbuf, nblocks, nrounds);
+}
+
+void
+_gcry_aes_vaes_ctr_enc (void *context, unsigned char *iv,
+                       void *outbuf, const void *inbuf,
+                       size_t nblocks)
+{ /* Bulk CTR-mode processing; iv holds the counter block. */
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = ctx->keyschenc32; /* CTR needs only the encryption schedule. */
+  unsigned int nrounds = ctx->rounds;
+
+  _gcry_vaes_avx2_ctr_enc_i386 (keysched, iv, outbuf, inbuf, nblocks, nrounds);
+}
+
+void
+_gcry_aes_vaes_ctr32le_enc (void *context, unsigned char *iv,
+                           void *outbuf, const void *inbuf,
+                           size_t nblocks)
+{ /* Bulk CTR mode with a 32-bit little-endian counter (per name; format handled in asm). */
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = ctx->keyschenc32; /* Counter modes need only the encryption schedule. */
+  unsigned int nrounds = ctx->rounds;
+
+  _gcry_vaes_avx2_ctr32le_enc_i386 (keysched, iv, outbuf, inbuf, nblocks,
+                                    nrounds);
+}
+
+size_t
+_gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                         const void *inbuf_arg, size_t nblocks,
+                         int encrypt)
+{ /* Bulk OCB en/decryption; return value propagated from the asm helper. */
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 blkn;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    { /* Lazily derive the decryption key schedule on first decrypt. */
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  blkn = c->u_mode.ocb.data_nblocks; /* Starting block index for offset/L-table selection. */
+  c->u_mode.ocb.data_nblocks = blkn + nblocks;
+
+  return _gcry_vaes_avx2_ocb_crypt_i386 (keysched, outbuf, inbuf, nblocks,
+                                        ctx->rounds, c->u_iv.iv, c->u_ctr.ctr,
+                                        (unsigned int)blkn,
+                                        &c->u_mode.ocb.L[0], encrypt); /* last arg: 1=enc, 0=dec, 2=auth */
+}
+
+size_t
+_gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c, const void *inbuf_arg,
+                        size_t nblocks)
+{ /* Bulk OCB AAD authentication (no output buffer; updates aad offset/sum). */
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const void *keysched = ctx->keyschenc32; /* Authentication only needs the encryption schedule. */
+  const unsigned char *inbuf = inbuf_arg;
+  u64 blkn = c->u_mode.ocb.aad_nblocks; /* Starting AAD block index. */
+
+  c->u_mode.ocb.aad_nblocks = blkn + nblocks;
+
+  return _gcry_vaes_avx2_ocb_crypt_i386 (keysched, NULL, inbuf, nblocks,
+                                        ctx->rounds, c->u_mode.ocb.aad_offset,
+                                        c->u_mode.ocb.aad_sum,
+                                        (unsigned int)blkn,
+                                        &c->u_mode.ocb.L[0], 2); /* 2 selects the auth-only path. */
+}
+
+void
+_gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
+                         void *outbuf, const void *inbuf,
+                         size_t nblocks, int encrypt)
+{ /* Bulk XTS en/decryption of nblocks blocks using the given 16-byte tweak. */
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    { /* Lazily derive the decryption key schedule on first decrypt. */
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_xts_crypt_i386 (keysched, tweak, outbuf, inbuf, nblocks,
+                                  nrounds, encrypt);
+}
+
+#endif /* USE_VAES_I386 */
index 0d7d136728fb3e47b3bfa1ad9d4f738f70e255de..478904d070ef36b6246afe6a98cf4acad667de2a 100644 (file)
@@ -1,4 +1,4 @@
-/* VAES/AVX2 accelerated AES for Libgcrypt
+/* VAES/AVX2 AMD64 accelerated AES for Libgcrypt
  * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
@@ -26,7 +26,6 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
-#include "cipher-selftest.h"
 #include "rijndael-internal.h"
 #include "./cipher-internal.h"
 
@@ -41,7 +40,7 @@
 # endif
 
 
-extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+extern void _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx);
 
 
 extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *keysched,
@@ -73,25 +72,51 @@ extern void _gcry_vaes_avx2_ctr32le_enc_amd64 (const void *keysched,
                                               unsigned int nrounds)
                                                ASM_FUNC_ABI;
 
-extern void _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
-                                            unsigned int blkn,
+extern size_t _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
+                                              unsigned int blkn,
+                                              void *outbuf_arg,
+                                              const void *inbuf_arg,
+                                              size_t nblocks,
+                                              unsigned int nrounds,
+                                              unsigned char *offset,
+                                              unsigned char *checksum,
+                                              unsigned char *L_table,
+                                              int enc_dec_auth) ASM_FUNC_ABI;
+
+extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
+                                            unsigned char *tweak,
                                             void *outbuf_arg,
                                             const void *inbuf_arg,
                                             size_t nblocks,
                                             unsigned int nrounds,
-                                            unsigned char *offset,
-                                            unsigned char *checksum,
-                                            unsigned char *L_table,
                                             int encrypt) ASM_FUNC_ABI;
 
-extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
-                                            unsigned char *tweak,
+extern void _gcry_vaes_avx2_ecb_crypt_amd64 (const void *keysched,
+                                            int encrypt,
                                             void *outbuf_arg,
                                             const void *inbuf_arg,
                                             size_t nblocks,
-                                            unsigned int nrounds,
-                                            int encrypt) ASM_FUNC_ABI;
+                                            unsigned int nrounds) ASM_FUNC_ABI;
+
+
+void
+_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf,
+                         const void *inbuf, size_t nblocks,
+                         int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
 
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf,
+                                  nblocks, nrounds);
+}
 
 void
 _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
@@ -168,11 +193,29 @@ _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = blkn + nblocks;
 
-  _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf, inbuf,
-                                  nblocks, nrounds, c->u_iv.iv, c->u_ctr.ctr,
-                                  c->u_mode.ocb.L[0], encrypt);
+  return _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf,
+                                         inbuf, nblocks, nrounds, c->u_iv.iv,
+                                         c->u_ctr.ctr, c->u_mode.ocb.L[0],
+                                         encrypt);
+}
+
+size_t
+_gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c, const void *inbuf_arg,
+                        size_t nblocks)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const void *keysched = ctx->keyschenc32;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int nrounds = ctx->rounds;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+  c->u_mode.ocb.aad_nblocks = blkn + nblocks;
 
-  return 0;
+  return _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, NULL,
+                                         inbuf, nblocks, nrounds,
+                                         c->u_mode.ocb.aad_offset,
+                                         c->u_mode.ocb.aad_sum,
+                                         c->u_mode.ocb.L[0], 2);
 }
 
 void
index 9b96b616616f4d0471c66b8245e616bfe9f591a6..f1683007304646c4e59f3c63b04342d37d36108a 100644 (file)
@@ -46,7 +46,6 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
-#include "cipher-selftest.h"
 #include "rijndael-internal.h"
 #include "./cipher-internal.h"
 
@@ -103,10 +102,13 @@ extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg
 extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
                                        void *outbuf_arg, const void *inbuf_arg,
                                        size_t nblocks, int encrypt);
+extern void _gcry_aes_aesni_ecb_crypt (void *context, void *outbuf_arg,
+                                      const void *inbuf_arg, size_t nblocks,
+                                      int encrypt);
 #endif
 
-#ifdef USE_VAES
-/* VAES (AMD64) accelerated implementation of AES */
+#if defined(USE_VAES_I386) || defined(USE_VAES)
+/* VAES (i386/AMD64) accelerated implementation of AES */
 
 extern void _gcry_aes_vaes_cfb_dec (void *context, unsigned char *iv,
                                    void *outbuf_arg, const void *inbuf_arg,
@@ -123,9 +125,15 @@ extern void _gcry_aes_vaes_ctr32le_enc (void *context, unsigned char *ctr,
 extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                        const void *inbuf_arg, size_t nblocks,
                                        int encrypt);
+extern size_t _gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c,
+                                      const void *inbuf_arg,
+                                      size_t nblocks);
 extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
                                      void *outbuf_arg, const void *inbuf_arg,
                                      size_t nblocks, int encrypt);
+extern void _gcry_aes_vaes_ecb_crypt (void *context, void *outbuf_arg,
+                                     const void *inbuf_arg, size_t nblocks,
+                                     int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -228,6 +236,9 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
                                           void *outbuf_arg,
                                           const void *inbuf_arg,
                                           size_t nblocks, int encrypt);
+extern void _gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf_arg,
+                                          const void *inbuf_arg, size_t nblocks,
+                                          int encrypt);
 #endif /*USE_ARM_ASM*/
 
 #ifdef USE_PPC_CRYPTO
@@ -242,6 +253,10 @@ extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx,
                                           unsigned char *dst,
                                           const unsigned char *src);
 
+extern void _gcry_aes_ppc8_ecb_crypt (void *context, void *outbuf_arg,
+                                     const void *inbuf_arg, size_t nblocks,
+                                     int encrypt);
+
 extern void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv,
                                    void *outbuf_arg, const void *inbuf_arg,
                                    size_t nblocks);
@@ -268,6 +283,10 @@ extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
                                      void *outbuf_arg,
                                      const void *inbuf_arg,
                                      size_t nblocks, int encrypt);
+
+extern void _gcry_aes_ppc8_ctr32le_enc (void *context, unsigned char *ctr,
+                                       void *outbuf_arg, const void *inbuf_arg,
+                                       size_t nblocks);
 #endif /*USE_PPC_CRYPTO*/
 
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
@@ -279,6 +298,10 @@ extern unsigned int _gcry_aes_ppc9le_decrypt(const RIJNDAEL_context *ctx,
                                            unsigned char *dst,
                                            const unsigned char *src);
 
+extern void _gcry_aes_ppc9le_ecb_crypt (void *context, void *outbuf_arg,
+                                       const void *inbuf_arg, size_t nblocks,
+                                       int encrypt);
+
 extern void _gcry_aes_ppc9le_cfb_enc (void *context, unsigned char *iv,
                                      void *outbuf_arg, const void *inbuf_arg,
                                      size_t nblocks);
@@ -306,6 +329,11 @@ extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak,
                                        const void *inbuf_arg,
                                        size_t nblocks, int encrypt);
 
+extern void _gcry_aes_ppc9le_ctr32le_enc (void *context, unsigned char *ctr,
+                                         void *outbuf_arg,
+                                         const void *inbuf_arg,
+                                         size_t nblocks);
+
 extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                         const void *inbuf_arg,
                                         size_t nblocks, int encrypt);
@@ -423,6 +451,17 @@ static void prefetch_dec(void)
 
 
 \f
+static inline u32
+sbox4(u32 inb4)
+{
+  u32 out;
+  out =  (encT[(inb4 >> 0) & 0xffU] & 0xff00U) >> 8;
+  out |= (encT[(inb4 >> 8) & 0xffU] & 0xff00U) >> 0;
+  out |= (encT[(inb4 >> 16) & 0xffU] & 0xff0000U) << 0;
+  out |= (encT[(inb4 >> 24) & 0xffU] & 0xff0000U) << 8;
+  return out;
+}
+
 /* Perform the key setup.  */
 static gcry_err_code_t
 do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
@@ -432,8 +471,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
   static const char *selftest_failed = 0;
   void (*hw_setkey)(RIJNDAEL_context *ctx, const byte *key) = NULL;
   int rounds;
-  int i,j, r, t, rconpointer = 0;
-  int KC;
+  unsigned int KC;
   unsigned int hwfeatures;
 
   /* The on-the-fly self tests are only run in non-fips mode. In fips
@@ -515,6 +553,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+      bulk_ops->ecb_crypt = _gcry_aes_aesni_ecb_crypt;
 
 #ifdef USE_VAES
       if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) &&
@@ -526,7 +565,24 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
          bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc;
          bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
          bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
+         bulk_ops->ocb_auth = _gcry_aes_vaes_ocb_auth;
+         bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
+         bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
+       }
+#endif
+#ifdef USE_VAES_I386
+      if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) &&
+         (hwfeatures & HWF_INTEL_AVX2))
+       {
+         /* Setup VAES bulk encryption routines.  */
+         bulk_ops->cfb_dec = _gcry_aes_vaes_cfb_dec;
+         bulk_ops->cbc_dec = _gcry_aes_vaes_cbc_dec;
+         bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc;
+         bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
+         bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
+         bulk_ops->ocb_auth = _gcry_aes_vaes_ocb_auth;
          bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
+         bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
        }
 #endif
     }
@@ -582,6 +638,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
+      bulk_ops->ecb_crypt = _gcry_aes_armv8_ce_ecb_crypt;
     }
 #endif
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
@@ -595,6 +652,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption;
 
       /* Setup PPC9LE bulk encryption routines.  */
+      bulk_ops->ecb_crypt = _gcry_aes_ppc9le_ecb_crypt;
       bulk_ops->cfb_enc = _gcry_aes_ppc9le_cfb_enc;
       bulk_ops->cfb_dec = _gcry_aes_ppc9le_cfb_dec;
       bulk_ops->cbc_enc = _gcry_aes_ppc9le_cbc_enc;
@@ -603,8 +661,15 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+      bulk_ops->ctr32le_enc = _gcry_aes_ppc9le_ctr32le_enc;
       if (hwfeatures & HWF_PPC_ARCH_3_10)  /* for P10 */
         bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
+# ifdef ENABLE_FORCE_SOFT_HWFEATURES
+      /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10.
+       * Actual implementation works with HWF_PPC_ARCH_3_00 also. */
+      if (hwfeatures & HWF_PPC_ARCH_3_00)
+        bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
+# endif
     }
 #endif
 #ifdef USE_PPC_CRYPTO
@@ -618,6 +683,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption;
 
       /* Setup PPC8 bulk encryption routines.  */
+      bulk_ops->ecb_crypt = _gcry_aes_ppc8_ecb_crypt;
       bulk_ops->cfb_enc = _gcry_aes_ppc8_cfb_enc;
       bulk_ops->cfb_dec = _gcry_aes_ppc8_cfb_dec;
       bulk_ops->cbc_enc = _gcry_aes_ppc8_cbc_enc;
@@ -626,6 +692,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt;
+      bulk_ops->ctr32le_enc = _gcry_aes_ppc8_ctr32le_enc;
     }
 #endif
 #ifdef USE_S390X_CRYPTO
@@ -657,101 +724,43 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
     }
   else
     {
-      const byte *sbox = ((const byte *)encT) + 1;
-      union
-        {
-          PROPERLY_ALIGNED_TYPE dummy;
-          byte data[MAXKC][4];
-          u32 data32[MAXKC];
-        } tkk[2];
-#define k      tkk[0].data
-#define k_u32  tkk[0].data32
-#define tk     tkk[1].data
-#define tk_u32 tkk[1].data32
-#define W      (ctx->keyschenc)
-#define W_u32  (ctx->keyschenc32)
+      u32 W_prev;
+      u32 *W_u32 = ctx->keyschenc32b;
+      byte rcon = 1;
+      unsigned int i, j;
 
       prefetch_enc();
 
-      for (i = 0; i < keylen; i++)
+      for (i = 0; i < KC; i += 2)
         {
-          k[i >> 2][i & 3] = key[i];
+          W_u32[i + 0] = buf_get_le32(key + i * 4 + 0);
+          W_u32[i + 1] = buf_get_le32(key + i * 4 + 4);
         }
 
-      for (j = KC-1; j >= 0; j--)
-        {
-          tk_u32[j] = k_u32[j];
-        }
-      r = 0;
-      t = 0;
-      /* Copy values into round key array.  */
-      for (j = 0; (j < KC) && (r < rounds + 1); )
+      for (i = KC, j = KC, W_prev = W_u32[KC - 1];
+           i < 4 * (rounds + 1);
+           i += 2, j += 2)
         {
-          for (; (j < KC) && (t < 4); j++, t++)
-            {
-              W_u32[r][t] = le_bswap32(tk_u32[j]);
-            }
-          if (t == 4)
-            {
-              r++;
-              t = 0;
-            }
-        }
+          u32 temp0 = W_prev;
+          u32 temp1;
 
-      while (r < rounds + 1)
-        {
-          /* While not enough round key material calculated calculate
-             new values.  */
-          tk[0][0] ^= sbox[tk[KC-1][1] * 4];
-          tk[0][1] ^= sbox[tk[KC-1][2] * 4];
-          tk[0][2] ^= sbox[tk[KC-1][3] * 4];
-          tk[0][3] ^= sbox[tk[KC-1][0] * 4];
-          tk[0][0] ^= rcon[rconpointer++];
-
-          if (KC != 8)
+          if (j == KC)
             {
-              for (j = 1; j < KC; j++)
-                {
-                  tk_u32[j] ^= tk_u32[j-1];
-                }
+              j = 0;
+              temp0 = sbox4(rol(temp0, 24)) ^ rcon;
+              rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff;
             }
-          else
+          else if (KC == 8 && j == 4)
             {
-              for (j = 1; j < KC/2; j++)
-                {
-                  tk_u32[j] ^= tk_u32[j-1];
-                }
-              tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4];
-              tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4];
-              tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4];
-              tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4];
-              for (j = KC/2 + 1; j < KC; j++)
-                {
-                  tk_u32[j] ^= tk_u32[j-1];
-                }
+              temp0 = sbox4(temp0);
             }
 
-          /* Copy values into round key array.  */
-          for (j = 0; (j < KC) && (r < rounds + 1); )
-            {
-              for (; (j < KC) && (t < 4); j++, t++)
-                {
-                  W_u32[r][t] = le_bswap32(tk_u32[j]);
-                }
-              if (t == 4)
-                {
-                  r++;
-                  t = 0;
-                }
-            }
+          temp1 = W_u32[i - KC + 0];
+
+          W_u32[i + 0] = temp0 ^ temp1;
+          W_u32[i + 1] = W_u32[i - KC + 1] ^ temp0 ^ temp1;
+          W_prev = W_u32[i + 1];
         }
-#undef W
-#undef tk
-#undef k
-#undef W_u32
-#undef tk_u32
-#undef k_u32
-      wipememory(&tkk, sizeof(tkk));
     }
 
   return 0;
@@ -1535,7 +1544,7 @@ static const char*
 selftest_basic_128 (void)
 {
   RIJNDAEL_context *ctx;
-  unsigned char *ctxmem;
+  unsigned char ctxmem[sizeof(*ctx) + 16];
   unsigned char scratch[16];
   cipher_bulk_ops_t bulk_ops;
 
@@ -1579,21 +1588,15 @@ selftest_basic_128 (void)
     };
 #endif
 
-  /* Because gcc/ld can only align the CTX struct on 8 bytes on the
-     stack, we need to allocate that context on the heap.  */
-  ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
-  if (!ctx)
-    return "failed to allocate memory";
+  ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15));
 
   rijndael_setkey (ctx, key_128, sizeof (key_128), &bulk_ops);
   rijndael_encrypt (ctx, scratch, plaintext_128);
   if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128)))
     {
-      xfree (ctxmem);
       return "AES-128 test encryption failed.";
     }
   rijndael_decrypt (ctx, scratch, scratch);
-  xfree (ctxmem);
   if (memcmp (scratch, plaintext_128, sizeof (plaintext_128)))
     return "AES-128 test decryption failed.";
 
@@ -1605,7 +1608,7 @@ static const char*
 selftest_basic_192 (void)
 {
   RIJNDAEL_context *ctx;
-  unsigned char *ctxmem;
+  unsigned char ctxmem[sizeof(*ctx) + 16];
   unsigned char scratch[16];
   cipher_bulk_ops_t bulk_ops;
 
@@ -1626,18 +1629,15 @@ selftest_basic_192 (void)
       0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA
     };
 
-  ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
-  if (!ctx)
-    return "failed to allocate memory";
+  ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15));
+
   rijndael_setkey (ctx, key_192, sizeof(key_192), &bulk_ops);
   rijndael_encrypt (ctx, scratch, plaintext_192);
   if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192)))
     {
-      xfree (ctxmem);
       return "AES-192 test encryption failed.";
     }
   rijndael_decrypt (ctx, scratch, scratch);
-  xfree (ctxmem);
   if (memcmp (scratch, plaintext_192, sizeof (plaintext_192)))
     return "AES-192 test decryption failed.";
 
@@ -1650,7 +1650,7 @@ static const char*
 selftest_basic_256 (void)
 {
   RIJNDAEL_context *ctx;
-  unsigned char *ctxmem;
+  unsigned char ctxmem[sizeof(*ctx) + 16];
   unsigned char scratch[16];
   cipher_bulk_ops_t bulk_ops;
 
@@ -1672,18 +1672,15 @@ selftest_basic_256 (void)
       0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3
     };
 
-  ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
-  if (!ctx)
-    return "failed to allocate memory";
+  ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15));
+
   rijndael_setkey (ctx, key_256, sizeof(key_256), &bulk_ops);
   rijndael_encrypt (ctx, scratch, plaintext_256);
   if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256)))
     {
-      xfree (ctxmem);
       return "AES-256 test encryption failed.";
     }
   rijndael_decrypt (ctx, scratch, scratch);
-  xfree (ctxmem);
   if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
     return "AES-256 test decryption failed.";
 
@@ -1691,60 +1688,6 @@ selftest_basic_256 (void)
 }
 
 
-/* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
-#ifdef USE_VAES
-  const int nblocks = 16+1;
-#else
-  const int nblocks = 8+1;
-#endif
-  const int blocksize = BLOCKSIZE;
-  const int context_size = sizeof(RIJNDAEL_context);
-
-  return _gcry_selftest_helper_ctr("AES", &rijndael_setkey,
-           &rijndael_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
-#ifdef USE_VAES
-  const int nblocks = 16+2;
-#else
-  const int nblocks = 8+2;
-#endif
-  const int blocksize = BLOCKSIZE;
-  const int context_size = sizeof(RIJNDAEL_context);
-
-  return _gcry_selftest_helper_cbc("AES", &rijndael_setkey,
-           &rijndael_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
-#ifdef USE_VAES
-  const int nblocks = 16+2;
-#else
-  const int nblocks = 8+2;
-#endif
-  const int blocksize = BLOCKSIZE;
-  const int context_size = sizeof(RIJNDAEL_context);
-
-  return _gcry_selftest_helper_cfb("AES", &rijndael_setkey,
-           &rijndael_encrypt, nblocks, blocksize, context_size);
-}
-
-
 /* Run all the self-tests and return NULL on success.  This function
    is used for the on-the-fly self-tests. */
 static const char *
@@ -1757,15 +1700,6 @@ selftest (void)
        || (r = selftest_basic_256 ()) )
     return r;
 
-  if ( (r = selftest_ctr_128 ()) )
-    return r;
-
-  if ( (r = selftest_cbc_128 ()) )
-    return r;
-
-  if ( (r = selftest_cfb_128 ()) )
-    return r;
-
   return r;
 }
 
index 5c54fdffd826f12359f9b3575ec6092ff3f3950a..6c03d4c72ca06d9ef2a540acb61a7e74d646db53 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 9fe7b406544ad60e4b8b5d9a8353d657190f7c2d..1920eedd84cefdc9ee6c0452fe676f79f6cddeaa 100644 (file)
@@ -829,7 +829,12 @@ _gcry_rsa_pss_encode (gcry_mpi_t *r_result, unsigned int nbits, int algo,
     return rc;
 
   /* Get the length of the digest.  */
-  hlen = _gcry_md_get_algo_dlen (algo);
+  if (algo == GCRY_MD_SHAKE128)
+    hlen = 32;
+  else if (algo == GCRY_MD_SHAKE256)
+    hlen = 64;
+  else
+    hlen = _gcry_md_get_algo_dlen (algo);
   gcry_assert (hlen);  /* We expect a valid ALGO here.  */
 
   /* The FIPS 186-4 Section 5.5 allows only 0 <= sLen <= hLen */
@@ -909,7 +914,20 @@ _gcry_rsa_pss_encode (gcry_mpi_t *r_result, unsigned int nbits, int algo,
   memcpy (p, salt, saltlen);
 
   /* Step 9: dbmask = MGF(H, emlen - hlen - 1).  */
-  mgf1 (dbmask, emlen - hlen - 1, h, hlen, algo);
+  if (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256)
+    {
+      gcry_buffer_t iov;
+
+      iov.size = 0;
+      iov.data = (void *)h;
+      iov.off = 0;
+      iov.len = hlen;
+
+      _gcry_md_hash_buffers_extract (algo, 0, dbmask, emlen - hlen - 1,
+                                     &iov, 1);
+    }
+  else
+    mgf1 (dbmask, emlen - hlen - 1, h, hlen, algo);
 
   /* Step 10: maskedDB = DB ^ dbMask */
   for (n = 0, p = dbmask; n < emlen - hlen - 1; n++, p++)
@@ -977,7 +995,12 @@ _gcry_rsa_pss_verify (gcry_mpi_t value, int hashed_already,
     return rc;
 
   /* Get the length of the digest.  */
-  hlen = _gcry_md_get_algo_dlen (algo);
+  if (algo == GCRY_MD_SHAKE128)
+    hlen = 32;
+  else if (algo == GCRY_MD_SHAKE256)
+    hlen = 64;
+  else
+    hlen = _gcry_md_get_algo_dlen (algo);
   gcry_assert (hlen);  /* We expect a valid ALGO here.  */
 
   /* The FIPS 186-4 Section 5.5 allows only 0 <= sLen <= hLen */
@@ -1065,7 +1088,20 @@ _gcry_rsa_pss_verify (gcry_mpi_t value, int hashed_already,
     }
 
   /* Step 7: dbmask = MGF(H, emlen - hlen - 1).  */
-  mgf1 (dbmask, emlen - hlen - 1, h, hlen, algo);
+  if (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256)
+    {
+      gcry_buffer_t iov;
+
+      iov.size = 0;
+      iov.data = (void *)h;
+      iov.off = 0;
+      iov.len = hlen;
+
+      _gcry_md_hash_buffers_extract (algo, 0, dbmask, emlen - hlen - 1,
+                                     &iov, 1);
+    }
+  else
+    mgf1 (dbmask, emlen - hlen - 1, h, hlen, algo);
 
   /* Step 8: maskedDB = DB ^ dbMask.  */
   for (n = 0, p = dbmask; n < emlen - hlen - 1; n++, p++)
index ff66e6f8f3c66e6f118dfba18e86ab43e7e7f9eb..c7a809f49532da34ed5d23c56d31ea497b2330fe 100644 (file)
@@ -1468,12 +1468,6 @@ rsa_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       rc = GPG_ERR_INV_DATA;
       goto leave;
     }
-  if (fips_mode () && (ctx.encoding == PUBKEY_ENC_PKCS1 ||
-                       ctx.encoding == PUBKEY_ENC_OAEP))
-    {
-      rc = GPG_ERR_INV_FLAG;
-      goto leave;
-    }
 
   /* Extract the key.  */
   rc = sexp_extract_param (keyparms, NULL, "nedp?q?u?",
@@ -2191,10 +2185,13 @@ selftests_rsa (selftest_report_func_t report, int extended)
   if (errtxt)
     goto failed;
 
-  what = "encrypt";
-  errtxt = selftest_encr_2048 (pkey, skey);
-  if (errtxt)
-    goto failed;
+  if (extended)
+    {
+      what = "encrypt";
+      errtxt = selftest_encr_2048 (pkey, skey);
+      if (errtxt)
+        goto failed;
+    }
 
   sexp_release (pkey);
   sexp_release (skey);
index 646260636cd3cc32392f3b76438b9f4bdbb4eecf..6efb75e02114bd2185f933edf4bf76d850efa192 100644 (file)
@@ -32,7 +32,7 @@
 
 .text
 
-.align 8
+.align 16
 .globl _gcry_salsa20_amd64_keysetup
 ELF(.type  _gcry_salsa20_amd64_keysetup,@function;)
 _gcry_salsa20_amd64_keysetup:
@@ -86,7 +86,7 @@ _gcry_salsa20_amd64_keysetup:
        ret_spec_stop
        CFI_ENDPROC();
 
-.align 8
+.align 16
 .globl _gcry_salsa20_amd64_ivsetup
 ELF(.type  _gcry_salsa20_amd64_ivsetup,@function;)
 _gcry_salsa20_amd64_ivsetup:
@@ -102,7 +102,7 @@ _gcry_salsa20_amd64_ivsetup:
        ret_spec_stop
        CFI_ENDPROC();
 
-.align 8
+.align 16
 .globl _gcry_salsa20_amd64_encrypt_blocks
 ELF(.type  _gcry_salsa20_amd64_encrypt_blocks,@function;)
 _gcry_salsa20_amd64_encrypt_blocks:
index d8c5c81f302d131e515108dafd90fab3434017fd..c6707b7a07548503aca2068331837c399f71fc1c 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 5502bdcc6e3b0f87182715e909a272b605b0339c..a6d15a843efe4d423e5367b1d436048c9c8cb400 100644 (file)
@@ -6,7 +6,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 4fd93d7525cb0d047b40d0575ef9f163234701d3..9cc34f9e6fcea256532342c5cf60edfa1f21ba55 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * --
  * This implementation was provided for libgcrypt in public domain
index adff6394634a9b0fbb083a13247ca88aa2bce37b..4179ba2ce4fd2d121674dc7459c010d4d62c3ef8 100644 (file)
@@ -600,6 +600,62 @@ __serpent_dec_blk8:
        bx lr;
 .size __serpent_dec_blk8,.-__serpent_dec_blk8;
 
+.align 3
+.globl _gcry_serpent_neon_blk8
+.type _gcry_serpent_neon_blk8,%function;
+_gcry_serpent_neon_blk8:
+       /* input:
+        *      r0: ctx, CTX
+        *      r1: dst (8 blocks)
+        *      r2: src (8 blocks)
+        *      r3: encrypt
+        */
+
+       push {lr};
+       vpush {RA4-RB2};
+
+       cmp r3, #0
+
+       vld1.8 {RA0, RA1}, [r2]!;
+       vld1.8 {RA2, RA3}, [r2]!;
+       vld1.8 {RB0, RB1}, [r2]!;
+       vld1.8 {RB2, RB3}, [r2]!;
+
+       beq .Lblk8_dec;
+               bl __serpent_enc_blk8;
+               vst1.8 {RA4}, [r1]!;
+               vst1.8 {RA1, RA2}, [r1]!;
+               vst1.8 {RA0}, [r1]!;
+               vst1.8 {RB4}, [r1]!;
+               vst1.8 {RB1, RB2}, [r1]!;
+               vst1.8 {RB0}, [r1]!;
+               b .Lblk8_end;
+       .Lblk8_dec:
+               bl __serpent_dec_blk8;
+               vst1.8 {RA0, RA1}, [r1]!;
+               vst1.8 {RA2, RA3}, [r1]!;
+               vst1.8 {RB0, RB1}, [r1]!;
+               vst1.8 {RB2, RB3}, [r1]!;
+
+.Lblk8_end:
+       /* clear the used registers */
+       veor RA0, RA0;
+       veor RA1, RA1;
+       veor RA2, RA2;
+       veor RA3, RA3;
+
+       vpop {RA4-RB2};
+
+       veor RB3, RB3;
+       veor RB4, RB4;
+       veor RT0, RT0;
+       veor RT1, RT1;
+       veor RT2, RT2;
+       veor RT3, RT3;
+
+       pop {pc};
+.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec;
+
 .align 3
 .globl _gcry_serpent_neon_ctr_enc
 .type _gcry_serpent_neon_ctr_enc,%function;
index d3515a21d5baa5fc1de5c3318db10ef31297b1ef..7aba235fa3459b1d90032cc6722b9332242adb4d 100644 (file)
 
 .text
 
-.align 8
+.align 16
 ELF(.type   __serpent_enc_blk16,@function;)
 __serpent_enc_blk16:
        /* input:
@@ -491,7 +491,7 @@ __serpent_enc_blk16:
        CFI_ENDPROC();
 ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;)
 
-.align 8
+.align 16
 ELF(.type   __serpent_dec_blk16,@function;)
 __serpent_dec_blk16:
        /* input:
@@ -583,13 +583,63 @@ __serpent_dec_blk16:
        CFI_ENDPROC();
 ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)
 
+.align 16
+.globl _gcry_serpent_avx2_blk16
+ELF(.type   _gcry_serpent_avx2_blk16,@function;)
+_gcry_serpent_avx2_blk16:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %ecx: encrypt
+        */
+       CFI_STARTPROC();
+
+       vmovdqu (0 * 32)(%rdx), RA0;
+       vmovdqu (1 * 32)(%rdx), RA1;
+       vmovdqu (2 * 32)(%rdx), RA2;
+       vmovdqu (3 * 32)(%rdx), RA3;
+       vmovdqu (4 * 32)(%rdx), RB0;
+       vmovdqu (5 * 32)(%rdx), RB1;
+       vmovdqu (6 * 32)(%rdx), RB2;
+       vmovdqu (7 * 32)(%rdx), RB3;
+
+       testl %ecx, %ecx;
+       jz .Lblk16_dec;
+               call __serpent_enc_blk16;
+               vmovdqu RA4, (0 * 32)(%rsi);
+               vmovdqu RA1, (1 * 32)(%rsi);
+               vmovdqu RA2, (2 * 32)(%rsi);
+               vmovdqu RA0, (3 * 32)(%rsi);
+               vmovdqu RB4, (4 * 32)(%rsi);
+               vmovdqu RB1, (5 * 32)(%rsi);
+               vmovdqu RB2, (6 * 32)(%rsi);
+               vmovdqu RB0, (7 * 32)(%rsi);
+               jmp .Lblk16_end;
+       .Lblk16_dec:
+               call __serpent_dec_blk16;
+               vmovdqu RA0, (0 * 32)(%rsi);
+               vmovdqu RA1, (1 * 32)(%rsi);
+               vmovdqu RA2, (2 * 32)(%rsi);
+               vmovdqu RA3, (3 * 32)(%rsi);
+               vmovdqu RB0, (4 * 32)(%rsi);
+               vmovdqu RB1, (5 * 32)(%rsi);
+               vmovdqu RB2, (6 * 32)(%rsi);
+               vmovdqu RB3, (7 * 32)(%rsi);
+
+.Lblk16_end:
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;)
+
 #define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
 
-.align 8
+.align 16
 .globl _gcry_serpent_avx2_ctr_enc
 ELF(.type   _gcry_serpent_avx2_ctr_enc,@function;)
 _gcry_serpent_avx2_ctr_enc:
@@ -701,7 +751,7 @@ _gcry_serpent_avx2_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_avx2_cbc_dec
 ELF(.type   _gcry_serpent_avx2_cbc_dec,@function;)
 _gcry_serpent_avx2_cbc_dec:
@@ -754,7 +804,7 @@ _gcry_serpent_avx2_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_avx2_cfb_dec
 ELF(.type   _gcry_serpent_avx2_cfb_dec,@function;)
 _gcry_serpent_avx2_cfb_dec:
@@ -809,7 +859,7 @@ _gcry_serpent_avx2_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_avx2_ocb_enc
 ELF(.type _gcry_serpent_avx2_ocb_enc,@function;)
 
@@ -923,7 +973,7 @@ _gcry_serpent_avx2_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_avx2_ocb_dec
 ELF(.type _gcry_serpent_avx2_ocb_dec,@function;)
 
@@ -1047,7 +1097,7 @@ _gcry_serpent_avx2_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_avx2_ocb_auth
 ELF(.type _gcry_serpent_avx2_ocb_auth,@function;)
 
@@ -1150,9 +1200,13 @@ _gcry_serpent_avx2_ocb_auth:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;)
 
-.align 16
+
+SECTION_RODATA
+ELF(.type _serpent_avx2_consts,@object)
+_serpent_avx2_consts:
 
 /* For CTR-mode IV byteswap */
+.align 16
 .Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
diff --git a/cipher/serpent-avx512-x86.c b/cipher/serpent-avx512-x86.c
new file mode 100644 (file)
index 0000000..5b5c248
--- /dev/null
@@ -0,0 +1,994 @@
+/* serpent-avx512-x86.c  -  AVX512 implementation of Serpent cipher
+ *
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__x86_64) || defined(__i386)
+#if defined(HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS) && \
+    defined(USE_SERPENT) && defined(ENABLE_AVX512_SUPPORT)
+
+#include <immintrin.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "g10lib.h"
+#include "types.h"
+#include "cipher.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "bulkhelp.h"
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+
+/* Number of rounds per Serpent encrypt/decrypt operation.  */
+#define ROUNDS 32
+
+/* Serpent works on 128 bit blocks.  */
+typedef unsigned int serpent_block_t[4];
+
+/* The key schedule consists of 33 128 bit subkeys.  */
+typedef unsigned int serpent_subkeys_t[ROUNDS + 1][4];
+
+#define vpunpckhdq(a, b, o)  ((o) = _mm512_unpackhi_epi32((b), (a)))
+#define vpunpckldq(a, b, o)  ((o) = _mm512_unpacklo_epi32((b), (a)))
+#define vpunpckhqdq(a, b, o) ((o) = _mm512_unpackhi_epi64((b), (a)))
+#define vpunpcklqdq(a, b, o) ((o) = _mm512_unpacklo_epi64((b), (a)))
+
+#define vpbroadcastd(v) _mm512_set1_epi32(v)
+
+#define vrol(x, s) _mm512_rol_epi32((x), (s))
+#define vror(x, s) _mm512_ror_epi32((x), (s))
+#define vshl(x, s) _mm512_slli_epi32((x), (s))
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+       vpunpckhdq(x1, x0, t2); \
+       vpunpckldq(x1, x0, x0); \
+       \
+       vpunpckldq(x3, x2, t1); \
+       vpunpckhdq(x3, x2, x2); \
+       \
+       vpunpckhqdq(t1, x0, x1); \
+       vpunpcklqdq(t1, x0, x0); \
+       \
+       vpunpckhqdq(x2, t2, x3); \
+       vpunpcklqdq(x2, t2, x2);
+
+/*
+ * These are the S-Boxes of Serpent from following research paper.
+ *
+ *  D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ *   (New York, New York, USA), p. 317–329, National Institute of Standards and
+ *   Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ * --
+ *
+ * Following logic gets heavily optimized by compiler to use AVX512F
+ * 'vpternlogq' instruction. This gives higher performance increase than
+ * would be expected from simple widening of vectors from AVX2/256bit to
+ * AVX512/512bit.
+ *
+ */
+
+#define SBOX0(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r3 ^= r0; r4 =  r1; \
+    r1 &= r3; r4 ^= r2; \
+    r1 ^= r0; r0 |= r3; \
+    r0 ^= r4; r4 ^= r3; \
+    r3 ^= r2; r2 |= r1; \
+    r2 ^= r4; r4 = ~r4; \
+    r4 |= r1; r1 ^= r3; \
+    r1 ^= r4; r3 |= r0; \
+    r1 ^= r3; r4 ^= r3; \
+    \
+    w = r1; x = r4; y = r2; z = r0; \
+  }
+
+#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r2 = ~r2; r4 =  r1; \
+    r1 |= r0; r4 = ~r4; \
+    r1 ^= r2; r2 |= r4; \
+    r1 ^= r3; r0 ^= r4; \
+    r2 ^= r0; r0 &= r3; \
+    r4 ^= r0; r0 |= r1; \
+    r0 ^= r2; r3 ^= r4; \
+    r2 ^= r1; r3 ^= r0; \
+    r3 ^= r1; \
+    r2 &= r3; \
+    r4 ^= r2; \
+    \
+    w = r0; x = r4; y = r1; z = r3; \
+  }
+
+#define SBOX1(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r0 = ~r0; r2 = ~r2; \
+    r4 =  r0; r0 &= r1; \
+    r2 ^= r0; r0 |= r3; \
+    r3 ^= r2; r1 ^= r0; \
+    r0 ^= r4; r4 |= r1; \
+    r1 ^= r3; r2 |= r0; \
+    r2 &= r4; r0 ^= r1; \
+    r1 &= r2; \
+    r1 ^= r0; r0 &= r2; \
+    r0 ^= r4; \
+    \
+    w = r2; x = r0; y = r3; z = r1; \
+  }
+
+#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r1; r1 ^= r3; \
+    r3 &= r1; r4 ^= r2; \
+    r3 ^= r0; r0 |= r1; \
+    r2 ^= r3; r0 ^= r4; \
+    r0 |= r2; r1 ^= r3; \
+    r0 ^= r1; r1 |= r3; \
+    r1 ^= r0; r4 = ~r4; \
+    r4 ^= r1; r1 |= r0; \
+    r1 ^= r0; \
+    r1 |= r4; \
+    r3 ^= r1; \
+    \
+    w = r4; x = r0; y = r3; z = r2; \
+  }
+
+#define SBOX2(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r0; r0 &= r2; \
+    r0 ^= r3; r2 ^= r1; \
+    r2 ^= r0; r3 |= r4; \
+    r3 ^= r1; r4 ^= r2; \
+    r1 =  r3; r3 |= r4; \
+    r3 ^= r0; r0 &= r1; \
+    r4 ^= r0; r1 ^= r3; \
+    r1 ^= r4; r4 = ~r4; \
+    \
+    w = r2; x = r3; y = r1; z = r4; \
+  }
+
+#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r2 ^= r3; r3 ^= r0; \
+    r4 =  r3; r3 &= r2; \
+    r3 ^= r1; r1 |= r2; \
+    r1 ^= r4; r4 &= r3; \
+    r2 ^= r3; r4 &= r0; \
+    r4 ^= r2; r2 &= r1; \
+    r2 |= r0; r3 = ~r3; \
+    r2 ^= r3; r0 ^= r3; \
+    r0 &= r1; r3 ^= r4; \
+    r3 ^= r0; \
+    \
+    w = r1; x = r4; y = r2; z = r3; \
+  }
+
+#define SBOX3(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r0; r0 |= r3; \
+    r3 ^= r1; r1 &= r4; \
+    r4 ^= r2; r2 ^= r3; \
+    r3 &= r0; r4 |= r1; \
+    r3 ^= r4; r0 ^= r1; \
+    r4 &= r0; r1 ^= r3; \
+    r4 ^= r2; r1 |= r0; \
+    r1 ^= r2; r0 ^= r3; \
+    r2 =  r1; r1 |= r3; \
+    r1 ^= r0; \
+    \
+    w = r1; x = r2; y = r3; z = r4; \
+  }
+
+#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r2; r2 ^= r1; \
+    r0 ^= r2; r4 &= r2; \
+    r4 ^= r0; r0 &= r1; \
+    r1 ^= r3; r3 |= r4; \
+    r2 ^= r3; r0 ^= r3; \
+    r1 ^= r4; r3 &= r2; \
+    r3 ^= r1; r1 ^= r0; \
+    r1 |= r2; r0 ^= r3; \
+    r1 ^= r4; \
+    r0 ^= r1; \
+    \
+    w = r2; x = r1; y = r3; z = r0; \
+  }
+
+#define SBOX4(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r1 ^= r3; r3 = ~r3; \
+    r2 ^= r3; r3 ^= r0; \
+    r4 =  r1; r1 &= r3; \
+    r1 ^= r2; r4 ^= r3; \
+    r0 ^= r4; r2 &= r4; \
+    r2 ^= r0; r0 &= r1; \
+    r3 ^= r0; r4 |= r1; \
+    r4 ^= r0; r0 |= r3; \
+    r0 ^= r2; r2 &= r3; \
+    r0 = ~r0; r4 ^= r2; \
+    \
+    w = r1; x = r4; y = r0; z = r3; \
+  }
+
+#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r2; r2 &= r3; \
+    r2 ^= r1; r1 |= r3; \
+    r1 &= r0; r4 ^= r2; \
+    r4 ^= r1; r1 &= r2; \
+    r0 = ~r0; r3 ^= r4; \
+    r1 ^= r3; r3 &= r0; \
+    r3 ^= r2; r0 ^= r1; \
+    r2 &= r0; r3 ^= r0; \
+    r2 ^= r4; \
+    r2 |= r3; r3 ^= r0; \
+    r2 ^= r1; \
+    \
+    w = r0; x = r3; y = r2; z = r4; \
+  }
+
+#define SBOX5(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r0 ^= r1; r1 ^= r3; \
+    r3 = ~r3; r4 =  r1; \
+    r1 &= r0; r2 ^= r3; \
+    r1 ^= r2; r2 |= r4; \
+    r4 ^= r3; r3 &= r1; \
+    r3 ^= r0; r4 ^= r1; \
+    r4 ^= r2; r2 ^= r0; \
+    r0 &= r3; r2 = ~r2; \
+    r0 ^= r4; r4 |= r3; \
+    r2 ^= r4; \
+    \
+    w = r1; x = r3; y = r0; z = r2; \
+  }
+
+#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r1 = ~r1; r4 =  r3; \
+    r2 ^= r1; r3 |= r0; \
+    r3 ^= r2; r2 |= r1; \
+    r2 &= r0; r4 ^= r3; \
+    r2 ^= r4; r4 |= r0; \
+    r4 ^= r1; r1 &= r2; \
+    r1 ^= r3; r4 ^= r2; \
+    r3 &= r4; r4 ^= r1; \
+    r3 ^= r4; r4 = ~r4; \
+    r3 ^= r0; \
+    \
+    w = r1; x = r4; y = r3; z = r2; \
+  }
+
+#define SBOX6(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r2 = ~r2; r4 =  r3; \
+    r3 &= r0; r0 ^= r4; \
+    r3 ^= r2; r2 |= r4; \
+    r1 ^= r3; r2 ^= r0; \
+    r0 |= r1; r2 ^= r1; \
+    r4 ^= r0; r0 |= r3; \
+    r0 ^= r2; r4 ^= r3; \
+    r4 ^= r0; r3 = ~r3; \
+    r2 &= r4; \
+    r2 ^= r3; \
+    \
+    w = r0; x = r1; y = r4; z = r2; \
+  }
+
+#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r0 ^= r2; r4 =  r2; \
+    r2 &= r0; r4 ^= r3; \
+    r2 = ~r2; r3 ^= r1; \
+    r2 ^= r3; r4 |= r0; \
+    r0 ^= r2; r3 ^= r4; \
+    r4 ^= r1; r1 &= r3; \
+    r1 ^= r0; r0 ^= r3; \
+    r0 |= r2; r3 ^= r1; \
+    r4 ^= r0; \
+    \
+    w = r1; x = r2; y = r4; z = r3; \
+  }
+
+#define SBOX7(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r1; r1 |= r2; \
+    r1 ^= r3; r4 ^= r2; \
+    r2 ^= r1; r3 |= r4; \
+    r3 &= r0; r4 ^= r2; \
+    r3 ^= r1; r1 |= r4; \
+    r1 ^= r0; r0 |= r4; \
+    r0 ^= r2; r1 ^= r4; \
+    r2 ^= r1; r1 &= r0; \
+    r1 ^= r4; r2 = ~r2; \
+    r2 |= r0; \
+    r4 ^= r2; \
+    \
+    w = r4; x = r3; y = r1; z = r0; \
+  }
+
+#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+  { \
+    __m512i r4; \
+    \
+    r4 =  r2; r2 ^= r0; \
+    r0 &= r3; r4 |= r3; \
+    r2 = ~r2; r3 ^= r1; \
+    r1 |= r0; r0 ^= r2; \
+    r2 &= r4; r3 &= r4; \
+    r1 ^= r2; r2 ^= r0; \
+    r0 |= r2; r4 ^= r1; \
+    r0 ^= r3; r3 ^= r4; \
+    r4 |= r0; r3 ^= r2; \
+    r4 ^= r2; \
+    \
+    w = r3; x = r0; y = r1; z = r4; \
+  }
+
+/* XOR BLOCK1 into BLOCK0.  */
+#define BLOCK_XOR_KEY(block0, rkey)     \
+  {                                     \
+    block0[0] ^= vpbroadcastd(rkey[0]); \
+    block0[1] ^= vpbroadcastd(rkey[1]); \
+    block0[2] ^= vpbroadcastd(rkey[2]); \
+    block0[3] ^= vpbroadcastd(rkey[3]); \
+  }
+
+/* Copy BLOCK_SRC to BLOCK_DST.  */
+#define BLOCK_COPY(block_dst, block_src) \
+  {                                      \
+    block_dst[0] = block_src[0];         \
+    block_dst[1] = block_src[1];         \
+    block_dst[2] = block_src[2];         \
+    block_dst[3] = block_src[3];         \
+  }
+
+/* Apply SBOX number WHICH to the block found in ARRAY0, writing
+   the output to the block found in ARRAY1.  */
+#define SBOX(which, array0, array1)                         \
+  SBOX##which (array0[0], array0[1], array0[2], array0[3],  \
+               array1[0], array1[1], array1[2], array1[3]);
+
+/* Apply inverse SBOX number WHICH to the block found in ARRAY0, writing
+   the output to the block found in ARRAY1.  */
+#define SBOX_INVERSE(which, array0, array1)                           \
+  SBOX##which##_INVERSE (array0[0], array0[1], array0[2], array0[3],  \
+                         array1[0], array1[1], array1[2], array1[3]);
+
+/* Apply the linear transformation to BLOCK.  */
+#define LINEAR_TRANSFORMATION(block)                    \
+  {                                                     \
+    block[0] = vrol (block[0], 13);                     \
+    block[2] = vrol (block[2], 3);                      \
+    block[1] = block[1] ^ block[0] ^ block[2];          \
+    block[3] = block[3] ^ block[2] ^ vshl(block[0], 3); \
+    block[1] = vrol (block[1], 1);                      \
+    block[3] = vrol (block[3], 7);                      \
+    block[0] = block[0] ^ block[1] ^ block[3];          \
+    block[2] = block[2] ^ block[3] ^ vshl(block[1], 7); \
+    block[0] = vrol (block[0], 5);                      \
+    block[2] = vrol (block[2], 22);                     \
+  }
+
+/* Apply the inverse linear transformation to BLOCK.  */
+#define LINEAR_TRANSFORMATION_INVERSE(block)            \
+  {                                                     \
+    block[2] = vror (block[2], 22);                     \
+    block[0] = vror (block[0] , 5);                     \
+    block[2] = block[2] ^ block[3] ^ vshl(block[1], 7); \
+    block[0] = block[0] ^ block[1] ^ block[3];          \
+    block[3] = vror (block[3], 7);                      \
+    block[1] = vror (block[1], 1);                      \
+    block[3] = block[3] ^ block[2] ^ vshl(block[0], 3); \
+    block[1] = block[1] ^ block[0] ^ block[2];          \
+    block[2] = vror (block[2], 3);                      \
+    block[0] = vror (block[0], 13);                     \
+  }
+
+/* Apply a Serpent round to BLOCK, using the SBOX number WHICH and the
+   subkeys contained in SUBKEYS.  Use BLOCK_TMP as temporary storage.
+   This macro increments `round'.  */
+#define ROUND(which, subkeys, block, block_tmp) \
+  {                                             \
+    BLOCK_XOR_KEY (block, subkeys[round]);      \
+    SBOX (which, block, block_tmp);             \
+    LINEAR_TRANSFORMATION (block_tmp);          \
+    BLOCK_COPY (block, block_tmp);              \
+  }
+
+/* Apply the last Serpent round to BLOCK, using the SBOX number WHICH
+   and the subkeys contained in SUBKEYS.  Use BLOCK_TMP as temporary
+   storage.  The result will be stored in BLOCK_TMP.  This macro
+   increments `round'.  */
+#define ROUND_LAST(which, subkeys, block, block_tmp) \
+  {                                                  \
+    BLOCK_XOR_KEY (block, subkeys[round]);           \
+    SBOX (which, block, block_tmp);                  \
+    BLOCK_XOR_KEY (block_tmp, subkeys[round+1]);     \
+  }
+
+/* Apply an inverse Serpent round to BLOCK, using the SBOX number
+   WHICH and the subkeys contained in SUBKEYS.  Use BLOCK_TMP as
+   temporary storage.  This macro increments `round'.  */
+#define ROUND_INVERSE(which, subkey, block, block_tmp) \
+  {                                                    \
+    LINEAR_TRANSFORMATION_INVERSE (block);             \
+    SBOX_INVERSE (which, block, block_tmp);            \
+    BLOCK_XOR_KEY (block_tmp, subkey[round]);          \
+    BLOCK_COPY (block, block_tmp);                     \
+  }
+
+/* Apply the first Serpent round to BLOCK, using the SBOX number WHICH
+   and the subkeys contained in SUBKEYS.  Use BLOCK_TMP as temporary
+   storage.  The result will be stored in BLOCK_TMP.  This macro
+   increments `round'.  */
+#define ROUND_FIRST_INVERSE(which, subkeys, block, block_tmp) \
+  {                                                           \
+    BLOCK_XOR_KEY (block, subkeys[round]);                    \
+    SBOX_INVERSE (which, block, block_tmp);                   \
+    BLOCK_XOR_KEY (block_tmp, subkeys[round-1]);              \
+  }
+
+static ALWAYS_INLINE void
+serpent_encrypt_internal_avx512 (const serpent_subkeys_t keys,
+                                const __m512i vin[8], __m512i vout[8])
+{
+  __m512i b[4];
+  __m512i c[4];
+  __m512i b_next[4];
+  __m512i c_next[4];
+  int round = 0;
+
+  b_next[0] = vin[0];
+  b_next[1] = vin[1];
+  b_next[2] = vin[2];
+  b_next[3] = vin[3];
+  c_next[0] = vin[4];
+  c_next[1] = vin[5];
+  c_next[2] = vin[6];
+  c_next[3] = vin[7];
+  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
+  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
+
+  b[0] = b_next[0];
+  b[1] = b_next[1];
+  b[2] = b_next[2];
+  b[3] = b_next[3];
+  c[0] = c_next[0];
+  c[1] = c_next[1];
+  c[2] = c_next[2];
+  c[3] = c_next[3];
+
+  while (1)
+    {
+      ROUND (0, keys, b, b_next); ROUND (0, keys, c, c_next); round++;
+      ROUND (1, keys, b, b_next); ROUND (1, keys, c, c_next); round++;
+      ROUND (2, keys, b, b_next); ROUND (2, keys, c, c_next); round++;
+      ROUND (3, keys, b, b_next); ROUND (3, keys, c, c_next); round++;
+      ROUND (4, keys, b, b_next); ROUND (4, keys, c, c_next); round++;
+      ROUND (5, keys, b, b_next); ROUND (5, keys, c, c_next); round++;
+      ROUND (6, keys, b, b_next); ROUND (6, keys, c, c_next); round++;
+      if (round >= ROUNDS - 1)
+       break;
+      ROUND (7, keys, b, b_next); ROUND (7, keys, c, c_next); round++;
+    }
+
+  ROUND_LAST (7, keys, b, b_next); ROUND_LAST (7, keys, c, c_next);
+
+  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
+  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
+  vout[0] = b_next[0];
+  vout[1] = b_next[1];
+  vout[2] = b_next[2];
+  vout[3] = b_next[3];
+  vout[4] = c_next[0];
+  vout[5] = c_next[1];
+  vout[6] = c_next[2];
+  vout[7] = c_next[3];
+}
+
+static ALWAYS_INLINE void
+serpent_decrypt_internal_avx512 (const serpent_subkeys_t keys,
+                                const __m512i vin[8], __m512i vout[8])
+{
+  __m512i b[4];
+  __m512i c[4];
+  __m512i b_next[4];
+  __m512i c_next[4];
+  int round = ROUNDS;
+
+  b_next[0] = vin[0];
+  b_next[1] = vin[1];
+  b_next[2] = vin[2];
+  b_next[3] = vin[3];
+  c_next[0] = vin[4];
+  c_next[1] = vin[5];
+  c_next[2] = vin[6];
+  c_next[3] = vin[7];
+  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
+  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
+
+  ROUND_FIRST_INVERSE (7, keys, b_next, b); ROUND_FIRST_INVERSE (7, keys, c_next, c);
+  round -= 2;
+
+  while (1)
+    {
+      ROUND_INVERSE (6, keys, b, b_next); ROUND_INVERSE (6, keys, c, c_next); round--;
+      ROUND_INVERSE (5, keys, b, b_next); ROUND_INVERSE (5, keys, c, c_next); round--;
+      ROUND_INVERSE (4, keys, b, b_next); ROUND_INVERSE (4, keys, c, c_next); round--;
+      ROUND_INVERSE (3, keys, b, b_next); ROUND_INVERSE (3, keys, c, c_next); round--;
+      ROUND_INVERSE (2, keys, b, b_next); ROUND_INVERSE (2, keys, c, c_next); round--;
+      ROUND_INVERSE (1, keys, b, b_next); ROUND_INVERSE (1, keys, c, c_next); round--;
+      ROUND_INVERSE (0, keys, b, b_next); ROUND_INVERSE (0, keys, c, c_next); round--;
+      if (round <= 0)
+       break;
+      ROUND_INVERSE (7, keys, b, b_next); ROUND_INVERSE (7, keys, c, c_next); round--;
+    }
+
+  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
+  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
+  vout[0] = b_next[0];
+  vout[1] = b_next[1];
+  vout[2] = b_next[2];
+  vout[3] = b_next[3];
+  vout[4] = c_next[0];
+  vout[5] = c_next[1];
+  vout[6] = c_next[2];
+  vout[7] = c_next[3];
+}
+
+enum crypt_mode_e
+{
+  ECB_ENC = 0,
+  ECB_DEC,
+  CBC_DEC,
+  CFB_DEC,
+  CTR_ENC,
+  OCB_ENC,
+  OCB_DEC
+};
+
+static ALWAYS_INLINE void
+ctr_generate(unsigned char *ctr, __m512i vin[8])
+{
+  const unsigned int blocksize = 16;
+  unsigned char ctr_low = ctr[15];
+
+  if (ctr_low + 32 <= 256)
+    {
+      const __m512i add0123 = _mm512_set_epi64(3LL << 56, 0,
+                                              2LL << 56, 0,
+                                              1LL << 56, 0,
+                                              0LL << 56, 0);
+      const __m512i add4444 = _mm512_set_epi64(4LL << 56, 0,
+                                              4LL << 56, 0,
+                                              4LL << 56, 0,
+                                              4LL << 56, 0);
+      const __m512i add4567 = _mm512_add_epi32(add0123, add4444);
+      const __m512i add8888 = _mm512_add_epi32(add4444, add4444);
+
+      // Fast path without carry handling.
+      __m512i vctr =
+       _mm512_broadcast_i32x4(_mm_loadu_si128((const void *)ctr));
+
+      cipher_block_add(ctr, 32, blocksize);
+      vin[0] = _mm512_add_epi32(vctr, add0123);
+      vin[1] = _mm512_add_epi32(vctr, add4567);
+      vin[2] = _mm512_add_epi32(vin[0], add8888);
+      vin[3] = _mm512_add_epi32(vin[1], add8888);
+      vin[4] = _mm512_add_epi32(vin[2], add8888);
+      vin[5] = _mm512_add_epi32(vin[3], add8888);
+      vin[6] = _mm512_add_epi32(vin[4], add8888);
+      vin[7] = _mm512_add_epi32(vin[5], add8888);
+    }
+  else
+    {
+      // Slow path.
+      u32 blocks[4][blocksize / sizeof(u32)];
+
+      cipher_block_cpy(blocks[0], ctr, blocksize);
+      cipher_block_cpy(blocks[1], ctr, blocksize);
+      cipher_block_cpy(blocks[2], ctr, blocksize);
+      cipher_block_cpy(blocks[3], ctr, blocksize);
+      cipher_block_add(ctr, 32, blocksize);
+      cipher_block_add(blocks[1], 1, blocksize);
+      cipher_block_add(blocks[2], 2, blocksize);
+      cipher_block_add(blocks[3], 3, blocksize);
+      vin[0] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[1] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[2] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[3] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[4] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[5] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[6] = _mm512_loadu_epi32 (blocks);
+      cipher_block_add(blocks[0], 4, blocksize);
+      cipher_block_add(blocks[1], 4, blocksize);
+      cipher_block_add(blocks[2], 4, blocksize);
+      cipher_block_add(blocks[3], 4, blocksize);
+      vin[7] = _mm512_loadu_epi32 (blocks);
+
+      wipememory(blocks, sizeof(blocks));
+    }
+}
+
+static ALWAYS_INLINE __m512i
+ocb_input(__m512i *vchecksum, __m128i *voffset, const unsigned char *input,
+         unsigned char *output, const ocb_L_uintptr_t L[4])
+{
+  __m128i L0 = _mm_loadu_si128((const void *)(uintptr_t)L[0]);
+  __m128i L1 = _mm_loadu_si128((const void *)(uintptr_t)L[1]);
+  __m128i L2 = _mm_loadu_si128((const void *)(uintptr_t)L[2]);
+  __m128i L3 = _mm_loadu_si128((const void *)(uintptr_t)L[3]);
+  __m512i vin = _mm512_loadu_epi32 (input);
+  __m512i voffsets;
+
+  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+  /* Checksum_i = Checksum_{i-1} xor P_i  */
+  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+  if (vchecksum)
+    *vchecksum ^= _mm512_loadu_epi32 (input);
+
+  *voffset ^= L0;
+  voffsets = _mm512_castsi128_si512(*voffset);
+  *voffset ^= L1;
+  voffsets = _mm512_inserti32x4(voffsets, *voffset, 1);
+  *voffset ^= L2;
+  voffsets = _mm512_inserti32x4(voffsets, *voffset, 2);
+  *voffset ^= L3;
+  voffsets = _mm512_inserti32x4(voffsets, *voffset, 3);
+  _mm512_storeu_epi32 (output, voffsets);
+
+  return vin ^ voffsets;
+}
+
+static NO_INLINE void
+serpent_avx512_blk32(const void *c, unsigned char *output,
+                    const unsigned char *input, int mode,
+                    unsigned char *iv, unsigned char *checksum,
+                    const ocb_L_uintptr_t Ls[32])
+{
+  __m512i vin[8];
+  __m512i vout[8];
+  int encrypt = 1;
+
+  asm volatile ("vpxor %%ymm0, %%ymm0, %%ymm0;\n\t"
+               "vpopcntb %%zmm0, %%zmm6;\n\t" /* spec stop for old AVX512 CPUs */
+               "vpxor %%ymm6, %%ymm6, %%ymm6;\n\t"
+               :
+               : "m"(*input), "m"(*output)
+               : "xmm6", "xmm0", "memory", "cc");
+
+  // Input handling
+  switch (mode)
+    {
+      default:
+      case CBC_DEC:
+      case ECB_DEC:
+       encrypt = 0;
+       /* fall through */
+      case ECB_ENC:
+       vin[0] = _mm512_loadu_epi32 (input + 0 * 64);
+       vin[1] = _mm512_loadu_epi32 (input + 1 * 64);
+       vin[2] = _mm512_loadu_epi32 (input + 2 * 64);
+       vin[3] = _mm512_loadu_epi32 (input + 3 * 64);
+       vin[4] = _mm512_loadu_epi32 (input + 4 * 64);
+       vin[5] = _mm512_loadu_epi32 (input + 5 * 64);
+       vin[6] = _mm512_loadu_epi32 (input + 6 * 64);
+       vin[7] = _mm512_loadu_epi32 (input + 7 * 64);
+       break;
+
+      case CFB_DEC:
+      {
+       __m128i viv;
+       vin[0] = _mm512_maskz_loadu_epi32(_cvtu32_mask16(0xfff0),
+                                         input - 1 * 64 + 48)
+                 ^ _mm512_maskz_loadu_epi32(_cvtu32_mask16(0x000f), iv);
+       vin[1] = _mm512_loadu_epi32(input + 0 * 64 + 48);
+       vin[2] = _mm512_loadu_epi32(input + 1 * 64 + 48);
+       vin[3] = _mm512_loadu_epi32(input + 2 * 64 + 48);
+       vin[4] = _mm512_loadu_epi32(input + 3 * 64 + 48);
+       vin[5] = _mm512_loadu_epi32(input + 4 * 64 + 48);
+       vin[6] = _mm512_loadu_epi32(input + 5 * 64 + 48);
+       vin[7] = _mm512_loadu_epi32(input + 6 * 64 + 48);
+       viv = _mm_loadu_si128((const void *)(input + 7 * 64 + 48));
+       _mm_storeu_si128((void *)iv, viv);
+       break;
+      }
+
+      case CTR_ENC:
+       ctr_generate(iv, vin);
+       break;
+
+      case OCB_ENC:
+      {
+       const ocb_L_uintptr_t *L = Ls;
+       __m512i vchecksum = _mm512_setzero_epi32();
+       __m128i vchecksum128 = _mm_loadu_si128((const void *)checksum);
+       __m128i voffset = _mm_loadu_si128((const void *)iv);
+       vin[0] = ocb_input(&vchecksum, &voffset, input + 0 * 64, output + 0 * 64, L); L += 4;
+       vin[1] = ocb_input(&vchecksum, &voffset, input + 1 * 64, output + 1 * 64, L); L += 4;
+       vin[2] = ocb_input(&vchecksum, &voffset, input + 2 * 64, output + 2 * 64, L); L += 4;
+       vin[3] = ocb_input(&vchecksum, &voffset, input + 3 * 64, output + 3 * 64, L); L += 4;
+       vin[4] = ocb_input(&vchecksum, &voffset, input + 4 * 64, output + 4 * 64, L); L += 4;
+       vin[5] = ocb_input(&vchecksum, &voffset, input + 5 * 64, output + 5 * 64, L); L += 4;
+       vin[6] = ocb_input(&vchecksum, &voffset, input + 6 * 64, output + 6 * 64, L); L += 4;
+       vin[7] = ocb_input(&vchecksum, &voffset, input + 7 * 64, output + 7 * 64, L);
+       vchecksum128 ^= _mm512_extracti32x4_epi32(vchecksum, 0)
+                       ^ _mm512_extracti32x4_epi32(vchecksum, 1)
+                       ^ _mm512_extracti32x4_epi32(vchecksum, 2)
+                       ^ _mm512_extracti32x4_epi32(vchecksum, 3);
+       _mm_storeu_si128((void *)checksum, vchecksum128);
+       _mm_storeu_si128((void *)iv, voffset);
+       break;
+      }
+
+      case OCB_DEC:
+      {
+       const ocb_L_uintptr_t *L = Ls;
+       __m128i voffset = _mm_loadu_si128((const void *)iv);
+       encrypt = 0;
+       vin[0] = ocb_input(NULL, &voffset, input + 0 * 64, output + 0 * 64, L); L += 4;
+       vin[1] = ocb_input(NULL, &voffset, input + 1 * 64, output + 1 * 64, L); L += 4;
+       vin[2] = ocb_input(NULL, &voffset, input + 2 * 64, output + 2 * 64, L); L += 4;
+       vin[3] = ocb_input(NULL, &voffset, input + 3 * 64, output + 3 * 64, L); L += 4;
+       vin[4] = ocb_input(NULL, &voffset, input + 4 * 64, output + 4 * 64, L); L += 4;
+       vin[5] = ocb_input(NULL, &voffset, input + 5 * 64, output + 5 * 64, L); L += 4;
+       vin[6] = ocb_input(NULL, &voffset, input + 6 * 64, output + 6 * 64, L); L += 4;
+       vin[7] = ocb_input(NULL, &voffset, input + 7 * 64, output + 7 * 64, L);
+       _mm_storeu_si128((void *)iv, voffset);
+       break;
+      }
+    }
+
+  if (encrypt)
+    serpent_encrypt_internal_avx512(c, vin, vout);
+  else
+    serpent_decrypt_internal_avx512(c, vin, vout);
+
+  switch (mode)
+    {
+      case CTR_ENC:
+      case CFB_DEC:
+       vout[0] ^= _mm512_loadu_epi32 (input + 0 * 64);
+       vout[1] ^= _mm512_loadu_epi32 (input + 1 * 64);
+       vout[2] ^= _mm512_loadu_epi32 (input + 2 * 64);
+       vout[3] ^= _mm512_loadu_epi32 (input + 3 * 64);
+       vout[4] ^= _mm512_loadu_epi32 (input + 4 * 64);
+       vout[5] ^= _mm512_loadu_epi32 (input + 5 * 64);
+       vout[6] ^= _mm512_loadu_epi32 (input + 6 * 64);
+       vout[7] ^= _mm512_loadu_epi32 (input + 7 * 64);
+       /* fall through */
+      default:
+      case ECB_DEC:
+      case ECB_ENC:
+       _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
+       _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
+       _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
+       _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
+       _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
+       _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
+       _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
+       _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
+       break;
+
+      case CBC_DEC:
+      {
+       __m128i viv;
+       vout[0] ^= _mm512_maskz_loadu_epi32(_cvtu32_mask16(0xfff0),
+                                           input - 1 * 64 + 48)
+                   ^ _mm512_maskz_loadu_epi32(_cvtu32_mask16(0x000f), iv);
+       vout[1] ^= _mm512_loadu_epi32(input + 0 * 64 + 48);
+       vout[2] ^= _mm512_loadu_epi32(input + 1 * 64 + 48);
+       vout[3] ^= _mm512_loadu_epi32(input + 2 * 64 + 48);
+       vout[4] ^= _mm512_loadu_epi32(input + 3 * 64 + 48);
+       vout[5] ^= _mm512_loadu_epi32(input + 4 * 64 + 48);
+       vout[6] ^= _mm512_loadu_epi32(input + 5 * 64 + 48);
+       vout[7] ^= _mm512_loadu_epi32(input + 6 * 64 + 48);
+       viv = _mm_loadu_si128((const void *)(input + 7 * 64 + 48));
+       _mm_storeu_si128((void *)iv, viv);
+       _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
+       _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
+       _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
+       _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
+       _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
+       _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
+       _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
+       _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
+       break;
+      }
+
+      case OCB_ENC:
+       vout[0] ^= _mm512_loadu_epi32 (output + 0 * 64);
+       vout[1] ^= _mm512_loadu_epi32 (output + 1 * 64);
+       vout[2] ^= _mm512_loadu_epi32 (output + 2 * 64);
+       vout[3] ^= _mm512_loadu_epi32 (output + 3 * 64);
+       vout[4] ^= _mm512_loadu_epi32 (output + 4 * 64);
+       vout[5] ^= _mm512_loadu_epi32 (output + 5 * 64);
+       vout[6] ^= _mm512_loadu_epi32 (output + 6 * 64);
+       vout[7] ^= _mm512_loadu_epi32 (output + 7 * 64);
+       _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
+       _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
+       _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
+       _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
+       _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
+       _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
+       _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
+       _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
+       break;
+
+      case OCB_DEC:
+      {
+       __m512i vchecksum = _mm512_setzero_epi32();
+       __m128i vchecksum128 = _mm_loadu_si128((const void *)checksum);
+       vout[0] ^= _mm512_loadu_epi32 (output + 0 * 64);
+       vout[1] ^= _mm512_loadu_epi32 (output + 1 * 64);
+       vout[2] ^= _mm512_loadu_epi32 (output + 2 * 64);
+       vout[3] ^= _mm512_loadu_epi32 (output + 3 * 64);
+       vout[4] ^= _mm512_loadu_epi32 (output + 4 * 64);
+       vout[5] ^= _mm512_loadu_epi32 (output + 5 * 64);
+       vout[6] ^= _mm512_loadu_epi32 (output + 6 * 64);
+       vout[7] ^= _mm512_loadu_epi32 (output + 7 * 64);
+       vchecksum ^= vout[0];
+       vchecksum ^= vout[1];
+       vchecksum ^= vout[2];
+       vchecksum ^= vout[3];
+       vchecksum ^= vout[4];
+       vchecksum ^= vout[5];
+       vchecksum ^= vout[6];
+       vchecksum ^= vout[7];
+       _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
+       _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
+       _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
+       _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
+       _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
+       _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
+       _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
+       _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
+       vchecksum128 ^= _mm512_extracti32x4_epi32(vchecksum, 0)
+                       ^ _mm512_extracti32x4_epi32(vchecksum, 1)
+                       ^ _mm512_extracti32x4_epi32(vchecksum, 2)
+                       ^ _mm512_extracti32x4_epi32(vchecksum, 3);
+       _mm_storeu_si128((void *)checksum, vchecksum128);
+       break;
+      }
+    }
+
+  _mm256_zeroall();
+#ifdef __x86_64__
+  asm volatile (
+#define CLEAR(mm) "vpxord %%" #mm ", %%" #mm ", %%" #mm ";\n\t"
+               CLEAR(ymm16) CLEAR(ymm17) CLEAR(ymm18) CLEAR(ymm19)
+               CLEAR(ymm20) CLEAR(ymm21) CLEAR(ymm22) CLEAR(ymm23)
+               CLEAR(ymm24) CLEAR(ymm25) CLEAR(ymm26) CLEAR(ymm27)
+               CLEAR(ymm28) CLEAR(ymm29) CLEAR(ymm30) CLEAR(ymm31)
+#undef CLEAR
+               :
+               : "m"(*input), "m"(*output)
+               : "xmm16", "xmm17", "xmm18", "xmm19",
+                 "xmm20", "xmm21", "xmm22", "xmm23",
+                 "xmm24", "xmm25", "xmm26", "xmm27",
+                 "xmm28", "xmm29", "xmm30", "xmm31",
+                 "memory", "cc");
+#endif
+}
+
+void
+_gcry_serpent_avx512_blk32(const void *ctx, unsigned char *out,
+                          const unsigned char *in, int encrypt)
+{
+  serpent_avx512_blk32 (ctx, out, in, encrypt ? ECB_ENC : ECB_DEC,
+                       NULL, NULL, NULL);
+}
+
+void
+_gcry_serpent_avx512_cbc_dec(const void *ctx, unsigned char *out,
+                            const unsigned char *in, unsigned char *iv)
+{
+  serpent_avx512_blk32 (ctx, out, in, CBC_DEC, iv, NULL, NULL);
+}
+
+void
+_gcry_serpent_avx512_cfb_dec(const void *ctx, unsigned char *out,
+                            const unsigned char *in, unsigned char *iv)
+{
+  serpent_avx512_blk32 (ctx, out, in, CFB_DEC, iv, NULL, NULL);
+}
+
+void
+_gcry_serpent_avx512_ctr_enc(const void *ctx, unsigned char *out,
+                            const unsigned char *in, unsigned char *iv)
+{
+  serpent_avx512_blk32 (ctx, out, in, CTR_ENC, iv, NULL, NULL);
+}
+
+void
+_gcry_serpent_avx512_ocb_crypt(const void *ctx, unsigned char *out,
+                              const unsigned char *in, unsigned char *offset,
+                              unsigned char *checksum,
+                              const ocb_L_uintptr_t Ls[32], int encrypt)
+{
+  serpent_avx512_blk32 (ctx, out, in, encrypt ? OCB_ENC : OCB_DEC, offset,
+                       checksum, Ls);
+}
+
+#endif /*defined(USE_SERPENT) && defined(ENABLE_AVX512_SUPPORT)*/
+#endif /*__x86_64 || __i386*/
index b593509515dfeef4f92dc2d12120a18e2c3a3f47..885c2bf1dc9957cd143edff55afc82eb66ec2124 100644 (file)
 
 .text
 
-.align 8
+.align 16
 ELF(.type   __serpent_enc_blk8,@function;)
 __serpent_enc_blk8:
        /* input:
@@ -513,7 +513,7 @@ __serpent_enc_blk8:
        CFI_ENDPROC();
 ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;)
 
-.align 8
+.align 16
 ELF(.type   __serpent_dec_blk8,@function;)
 __serpent_dec_blk8:
        /* input:
@@ -605,7 +605,72 @@ __serpent_dec_blk8:
        CFI_ENDPROC();
 ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)
 
-.align 8
+.align 16
+.globl _gcry_serpent_sse2_blk8
+ELF(.type   _gcry_serpent_sse2_blk8,@function;)
+_gcry_serpent_sse2_blk8:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (8 blocks)
+        *      %rdx: src (8 blocks)
+        *      %ecx: encrypt
+        */
+       CFI_STARTPROC();
+
+       movdqu (0 * 16)(%rdx), RA0;
+       movdqu (1 * 16)(%rdx), RA1;
+       movdqu (2 * 16)(%rdx), RA2;
+       movdqu (3 * 16)(%rdx), RA3;
+       movdqu (4 * 16)(%rdx), RB0;
+       movdqu (5 * 16)(%rdx), RB1;
+       movdqu (6 * 16)(%rdx), RB2;
+       movdqu (7 * 16)(%rdx), RB3;
+
+       testl %ecx, %ecx;
+       jz .Lblk8_dec;
+               call __serpent_enc_blk8;
+               movdqu RA4, (0 * 16)(%rsi);
+               movdqu RA1, (1 * 16)(%rsi);
+               movdqu RA2, (2 * 16)(%rsi);
+               movdqu RA0, (3 * 16)(%rsi);
+               movdqu RB4, (4 * 16)(%rsi);
+               movdqu RB1, (5 * 16)(%rsi);
+               movdqu RB2, (6 * 16)(%rsi);
+               movdqu RB0, (7 * 16)(%rsi);
+               jmp .Lblk8_end;
+       .Lblk8_dec:
+               call __serpent_dec_blk8;
+               movdqu RA0, (0 * 16)(%rsi);
+               movdqu RA1, (1 * 16)(%rsi);
+               movdqu RA2, (2 * 16)(%rsi);
+               movdqu RA3, (3 * 16)(%rsi);
+               movdqu RB0, (4 * 16)(%rsi);
+               movdqu RB1, (5 * 16)(%rsi);
+               movdqu RB2, (6 * 16)(%rsi);
+               movdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_end:
+       /* clear the used registers */
+       pxor RA0, RA0;
+       pxor RA1, RA1;
+       pxor RA2, RA2;
+       pxor RA3, RA3;
+       pxor RA4, RA4;
+       pxor RB0, RB0;
+       pxor RB1, RB1;
+       pxor RB2, RB2;
+       pxor RB3, RB3;
+       pxor RB4, RB4;
+       pxor RTMP0, RTMP0;
+       pxor RTMP1, RTMP1;
+       pxor RTMP2, RTMP2;
+       pxor RNOT, RNOT;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;)
+
+.align 16
 .globl _gcry_serpent_sse2_ctr_enc
 ELF(.type   _gcry_serpent_sse2_ctr_enc,@function;)
 _gcry_serpent_sse2_ctr_enc:
@@ -737,7 +802,7 @@ _gcry_serpent_sse2_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_sse2_cbc_dec
 ELF(.type   _gcry_serpent_sse2_cbc_dec,@function;)
 _gcry_serpent_sse2_cbc_dec:
@@ -800,7 +865,7 @@ _gcry_serpent_sse2_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_sse2_cfb_dec
 ELF(.type   _gcry_serpent_sse2_cfb_dec,@function;)
 _gcry_serpent_sse2_cfb_dec:
@@ -866,7 +931,7 @@ _gcry_serpent_sse2_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_sse2_ocb_enc
 ELF(.type _gcry_serpent_sse2_ocb_enc,@function;)
 
@@ -980,7 +1045,7 @@ _gcry_serpent_sse2_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_sse2_ocb_dec
 ELF(.type _gcry_serpent_sse2_ocb_dec,@function;)
 
@@ -1104,7 +1169,7 @@ _gcry_serpent_sse2_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_serpent_sse2_ocb_auth
 ELF(.type _gcry_serpent_sse2_ocb_auth,@function;)
 
index 159d889fadb23269048300b74a6acd12e0ce9c0e..74d132ab923605bdb145ee7e7164b71b1f0139fb 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,9 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
 #include "bithelp.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
 
 
-/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
+/* USE_SSE2 indicates whether to compile with x86-64 SSE2 code. */
 #undef USE_SSE2
 #if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_SSE2 1
 #endif
 
-/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+/* USE_AVX2 indicates whether to compile with x86-64 AVX2 code. */
 #undef USE_AVX2
 #if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # endif
 #endif
 
+/* USE_AVX512 indicates whether to compile with x86 AVX512 code. */
+#undef USE_AVX512
+#if (defined(__x86_64) || defined(__i386)) && \
+    defined(HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS)
+# if defined(ENABLE_AVX512_SUPPORT)
+#  define USE_AVX512 1
+# endif
+#endif
+
 /* USE_NEON indicates whether to enable ARM NEON assembly code. */
 #undef USE_NEON
 #ifdef ENABLE_NEON_SUPPORT
@@ -83,6 +91,9 @@ typedef struct serpent_context
 #ifdef USE_AVX2
   int use_avx2;
 #endif
+#ifdef USE_AVX512
+  int use_avx512;
+#endif
 #ifdef USE_NEON
   int use_neon;
 #endif
@@ -139,6 +150,9 @@ extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx,
                                        unsigned char *offset,
                                        unsigned char *checksum,
                                        const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_blk8(const serpent_context_t *c, byte *out,
+                                   const byte *in, int encrypt) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AVX2
@@ -179,6 +193,41 @@ extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx,
                                        unsigned char *offset,
                                        unsigned char *checksum,
                                        const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_blk16(const serpent_context_t *c, byte *out,
+                                    const byte *in, int encrypt) ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_AVX512
+/* Assembler implementations of Serpent using AVX512.  Processing 32 blocks in
+   parallel.
+ */
+extern void _gcry_serpent_avx512_cbc_dec(const void *ctx,
+                                        unsigned char *out,
+                                        const unsigned char *in,
+                                        unsigned char *iv);
+
+extern void _gcry_serpent_avx512_cfb_dec(const void *ctx,
+                                        unsigned char *out,
+                                        const unsigned char *in,
+                                        unsigned char *iv);
+
+extern void _gcry_serpent_avx512_ctr_enc(const void *ctx,
+                                        unsigned char *out,
+                                        const unsigned char *in,
+                                        unsigned char *ctr);
+
+extern void _gcry_serpent_avx512_ocb_crypt(const void *ctx,
+                                          unsigned char *out,
+                                          const unsigned char *in,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          const ocb_L_uintptr_t Ls[32],
+                                          int encrypt);
+
+extern void _gcry_serpent_avx512_blk32(const void *c, byte *out,
+                                      const byte *in,
+                                      int encrypt);
 #endif
 
 #ifdef USE_NEON
@@ -219,6 +268,9 @@ extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx,
                                        unsigned char *offset,
                                        unsigned char *checksum,
                                        const void *Ls[8]);
+
+extern void _gcry_serpent_neon_blk8(const serpent_context_t *c, byte *out,
+                                   const byte *in, int encrypt);
 #endif
 
 
@@ -239,6 +291,12 @@ static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                       int encrypt);
 static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                                      size_t nblocks);
+static void _gcry_serpent_xts_crypt (void *context, unsigned char *tweak,
+                                    void *outbuf_arg, const void *inbuf_arg,
+                                    size_t nblocks, int encrypt);
+static void _gcry_serpent_ecb_crypt (void *context, void *outbuf_arg,
+                                    const void *inbuf_arg, size_t nblocks,
+                                    int encrypt);
 
 
 /*
@@ -744,6 +802,14 @@ serpent_setkey_internal (serpent_context_t *context,
   serpent_key_prepare (key, key_length, key_prepared);
   serpent_subkeys_generate (key_prepared, context->keys);
 
+#ifdef USE_AVX512
+  context->use_avx512 = 0;
+  if ((_gcry_get_hw_features () & HWF_INTEL_AVX512))
+    {
+      context->use_avx512 = 1;
+    }
+#endif
+
 #ifdef USE_AVX2
   context->use_avx2 = 0;
   if ((_gcry_get_hw_features () & HWF_INTEL_AVX2))
@@ -790,7 +856,9 @@ serpent_setkey (void *ctx,
   bulk_ops->cfb_dec = _gcry_serpent_cfb_dec;
   bulk_ops->ctr_enc = _gcry_serpent_ctr_enc;
   bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt;
-  bulk_ops->ocb_auth  = _gcry_serpent_ocb_auth;
+  bulk_ops->ocb_auth = _gcry_serpent_ocb_auth;
+  bulk_ops->xts_crypt = _gcry_serpent_xts_crypt;
+  bulk_ops->ecb_crypt = _gcry_serpent_ecb_crypt;
 
   if (serpent_test_ret)
     ret = GPG_ERR_SELFTEST_FAILED;
@@ -938,6 +1006,34 @@ _gcry_serpent_ctr_enc(void *context, unsigned char *ctr,
   unsigned char tmpbuf[sizeof(serpent_block_t)];
   int burn_stack_depth = 2 * sizeof (serpent_block_t);
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      int did_use_avx512 = 0;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_serpent_avx512_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+          nblocks -= 32;
+          outbuf += 32 * sizeof(serpent_block_t);
+          inbuf  += 32 * sizeof(serpent_block_t);
+          did_use_avx512 = 1;
+        }
+
+      if (did_use_avx512)
+        {
+          /* serpent-avx512 code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+
+      /* Use generic/avx2/sse2 code to handle smaller chunks... */
+      /* TODO: use caching instead? */
+    }
+#endif
+
 #ifdef USE_AVX2
   if (ctx->use_avx2)
     {
@@ -1050,6 +1146,33 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
   unsigned char savebuf[sizeof(serpent_block_t)];
   int burn_stack_depth = 2 * sizeof (serpent_block_t);
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      int did_use_avx512 = 0;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_serpent_avx512_cbc_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 32;
+          outbuf += 32 * sizeof(serpent_block_t);
+          inbuf  += 32 * sizeof(serpent_block_t);
+          did_use_avx512 = 1;
+        }
+
+      if (did_use_avx512)
+        {
+          /* serpent-avx512 code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+
+      /* Use generic/avx2/sse2 code to handle smaller chunks... */
+    }
+#endif
+
 #ifdef USE_AVX2
   if (ctx->use_avx2)
     {
@@ -1158,6 +1281,33 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 2 * sizeof (serpent_block_t);
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      int did_use_avx512 = 0;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_serpent_avx512_cfb_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 32;
+          outbuf += 32 * sizeof(serpent_block_t);
+          inbuf  += 32 * sizeof(serpent_block_t);
+          did_use_avx512 = 1;
+        }
+
+      if (did_use_avx512)
+        {
+          /* serpent-avx512 code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+
+      /* Use generic/avx2/sse2 code to handle smaller chunks... */
+    }
+#endif
+
 #ifdef USE_AVX2
   if (ctx->use_avx2)
     {
@@ -1254,7 +1404,8 @@ static size_t
 _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                        const void *inbuf_arg, size_t nblocks, int encrypt)
 {
-#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
+#if defined(USE_AVX512) || defined(USE_AVX2) || defined(USE_SSE2) \
+    || defined(USE_NEON)
   serpent_context_t *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
@@ -1267,32 +1418,54 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   (void)encrypt;
 #endif
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      int did_use_avx512 = 0;
+      ocb_L_uintptr_t Ls[32];
+      ocb_L_uintptr_t *l;
+
+      if (nblocks >= 32)
+       {
+          l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
+
+         /* Process data in 32 block chunks. */
+         while (nblocks >= 32)
+           {
+             blkn += 32;
+             *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
+
+             _gcry_serpent_avx512_ocb_crypt(ctx, outbuf, inbuf, c->u_iv.iv,
+                                            c->u_ctr.ctr, Ls, encrypt);
+
+             nblocks -= 32;
+             outbuf += 32 * sizeof(serpent_block_t);
+             inbuf  += 32 * sizeof(serpent_block_t);
+             did_use_avx512 = 1;
+           }
+       }
+
+      if (did_use_avx512)
+       {
+         /* serpent-avx512 code does not use stack */
+         if (nblocks == 0)
+           burn_stack_depth = 0;
+       }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
 #ifdef USE_AVX2
   if (ctx->use_avx2)
     {
       int did_use_avx2 = 0;
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
-
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -1329,21 +1502,11 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   {
     int did_use_sse2 = 0;
     u64 Ls[8];
-    unsigned int n = 8 - (blkn % 8);
     u64 *l;
 
     if (nblocks >= 8)
       {
-       /* Use u64 to store pointers for x32 support (assembly function
-         * assumes 64-bit pointers). */
-       Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-       Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-       Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-       Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       l = &Ls[(7 + n) % 8];
+        l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
        /* Process data in 8 block chunks. */
        while (nblocks >= 8)
@@ -1380,33 +1543,25 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   if (ctx->use_neon)
     {
       int did_use_neon = 0;
-      const void *Ls[8];
-      unsigned int n = 8 - (blkn % 8);
-      const void **l;
+      uintptr_t Ls[8];
+      uintptr_t *l;
 
       if (nblocks >= 8)
        {
-         Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
-         Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
-         Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
-         Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
-         Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
-         Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
-         Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
-         l = &Ls[(7 + n) % 8];
+          l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
          /* Process data in 8 block chunks. */
          while (nblocks >= 8)
            {
              blkn += 8;
-             *l = ocb_get_l(c,  blkn - blkn % 8);
+             *l = (uintptr_t)(void *)ocb_get_l(c,  blkn - blkn % 8);
 
              if (encrypt)
                _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-                                         c->u_ctr.ctr, Ls);
+                                          c->u_ctr.ctr, (const void **)Ls);
              else
                _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-                                         c->u_ctr.ctr, Ls);
+                                          c->u_ctr.ctr, (const void **)Ls);
 
              nblocks -= 8;
              outbuf += 8 * sizeof(serpent_block_t);
@@ -1426,7 +1581,8 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     }
 #endif
 
-#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
+#if defined(USE_AVX512) || defined(USE_AVX2) || defined(USE_SSE2) \
+    || defined(USE_NEON)
   c->u_mode.ocb.data_nblocks = blkn;
 
   if (burn_stack_depth)
@@ -1456,27 +1612,11 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_avx2 = 0;
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
-
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+        l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -1508,21 +1648,11 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   {
     int did_use_sse2 = 0;
     u64 Ls[8];
-    unsigned int n = 8 - (blkn % 8);
     u64 *l;
 
     if (nblocks >= 8)
       {
-       /* Use u64 to store pointers for x32 support (assembly function
-       * assumes 64-bit pointers). */
-       Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-       Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-       Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-       Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-       l = &Ls[(7 + n) % 8];
+        l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
        /* Process data in 8 block chunks. */
        while (nblocks >= 8)
@@ -1554,29 +1684,22 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   if (ctx->use_neon)
     {
       int did_use_neon = 0;
-      const void *Ls[8];
-      unsigned int n = 8 - (blkn % 8);
-      const void **l;
+      uintptr_t Ls[8];
+      uintptr_t *l;
 
       if (nblocks >= 8)
        {
-         Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
-         Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
-         Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
-         Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
-         Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
-         Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
-         Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
-         l = &Ls[(7 + n) % 8];
+          l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
          /* Process data in 8 block chunks. */
          while (nblocks >= 8)
            {
              blkn += 8;
-             *l = ocb_get_l(c, blkn - blkn % 8);
+             *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
 
              _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-                                         c->u_mode.ocb.aad_sum, Ls);
+                                         c->u_mode.ocb.aad_sum,
+                                         (const void **)Ls);
 
              nblocks -= 8;
              abuf += 8 * sizeof(serpent_block_t);
@@ -1605,49 +1728,145 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   return nblocks;
 }
 
-\f
 
-/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
+static unsigned int
+serpent_crypt_blk1_32(void *context, byte *out, const byte *in,
+                     size_t num_blks, int encrypt)
 {
-  const int nblocks = 16+8+1;
-  const int blocksize = sizeof(serpent_block_t);
-  const int context_size = sizeof(serpent_context_t);
+  serpent_context_t *ctx = context;
+  unsigned int burn, burn_stack_depth = 0;
 
-  return _gcry_selftest_helper_ctr("SERPENT", &serpent_setkey,
-           &serpent_encrypt, nblocks, blocksize, context_size);
+#ifdef USE_AVX512
+  if (num_blks == 32 && ctx->use_avx512)
+    {
+      _gcry_serpent_avx512_blk32 (ctx, out, in, encrypt);
+      return 0;
+    }
+#endif
+
+#ifdef USE_AVX2
+  while (num_blks == 16 && ctx->use_avx2)
+    {
+      _gcry_serpent_avx2_blk16 (ctx, out, in, encrypt);
+      out += 16 * sizeof(serpent_block_t);
+      in += 16 * sizeof(serpent_block_t);
+      num_blks -= 16;
+    }
+#endif
+
+#ifdef USE_SSE2
+  while (num_blks >= 8)
+    {
+      _gcry_serpent_sse2_blk8 (ctx, out, in, encrypt);
+      out += 8 * sizeof(serpent_block_t);
+      in += 8 * sizeof(serpent_block_t);
+      num_blks -= 8;
+    }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      while (num_blks >= 8)
+       {
+         _gcry_serpent_neon_blk8 (ctx, out, in, encrypt);
+         out += 8 * sizeof(serpent_block_t);
+         in += 8 * sizeof(serpent_block_t);
+         num_blks -= 8;
+       }
+    }
+#endif
+
+  while (num_blks >= 1)
+    {
+      if (encrypt)
+       serpent_encrypt_internal((void *)ctx, in, out);
+      else
+       serpent_decrypt_internal((void *)ctx, in, out);
+
+      burn = 2 * sizeof(serpent_block_t);
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += sizeof(serpent_block_t);
+      in += sizeof(serpent_block_t);
+      num_blks--;
+    }
+
+  return burn_stack_depth;
+}
+
+static unsigned int
+serpent_encrypt_blk1_32(void *ctx, byte *out, const byte *in,
+                       size_t num_blks)
+{
+  return serpent_crypt_blk1_32 (ctx, out, in, num_blks, 1);
 }
 
+static unsigned int
+serpent_decrypt_blk1_32(void *ctx, byte *out, const byte *in,
+                       size_t num_blks)
+{
+  return serpent_crypt_blk1_32 (ctx, out, in, num_blks, 0);
+}
 
-/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_serpent_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+                        const void *inbuf_arg, size_t nblocks, int encrypt)
 {
-  const int nblocks = 16+8+2;
-  const int blocksize = sizeof(serpent_block_t);
-  const int context_size = sizeof(serpent_context_t);
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
 
-  return _gcry_selftest_helper_cbc("SERPENT", &serpent_setkey,
-           &serpent_encrypt, nblocks, blocksize, context_size);
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[32 * 16];
+      unsigned int tmp_used = 16;
+      size_t nburn;
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_32
+                                              : serpent_decrypt_blk1_32,
+                                 outbuf, inbuf, nblocks,
+                                 tweak, tmpbuf, sizeof(tmpbuf) / 16,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
 
-/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg,
+                        size_t nblocks, int encrypt)
 {
-  const int nblocks = 16+8+2;
-  const int blocksize = sizeof(serpent_block_t);
-  const int context_size = sizeof(serpent_context_t);
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
 
-  return _gcry_selftest_helper_cfb("SERPENT", &serpent_setkey,
-           &serpent_encrypt, nblocks, blocksize, context_size);
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      size_t nburn;
+
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_32
+                                              : serpent_decrypt_blk1_32,
+                                 outbuf, inbuf, nblocks, 32);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
+\f
 
 /* Serpent test.  */
 
@@ -1657,7 +1876,6 @@ serpent_test (void)
   serpent_context_t context;
   unsigned char scratch[16];
   unsigned int i;
-  const char *r;
 
   static struct test
   {
@@ -1729,15 +1947,6 @@ serpent_test (void)
        }
     }
 
-  if ( (r = selftest_ctr_128 ()) )
-    return r;
-
-  if ( (r = selftest_cbc_128 ()) )
-    return r;
-
-  if ( (r = selftest_cfb_128 ()) )
-    return r;
-
   return NULL;
 }
 
index ea26564b02f34ded2e623b88a76f3b0e47efea54..e309c08cbeca785271536fb71a540ed7ee1bd509 100644 (file)
 
 .cpu generic+simd+crypto
 
-.text
-
 
 /* Constants */
 
+SECTION_RODATA
+
 #define K1  0x5A827999
 #define K2  0x6ED9EBA1
 #define K3  0x8F1BBCDC
 #define K4  0xCA62C1D6
 .align 4
+ELF(.type gcry_sha1_aarch64_ce_K_VEC,%object;)
 gcry_sha1_aarch64_ce_K_VEC:
 .LK_VEC:
 .LK1:  .long K1, K1, K1, K1
@@ -91,12 +92,14 @@ gcry_sha1_aarch64_ce_K_VEC:
 #define CLEAR_REG(reg) movi reg.16b, #0;
 
 
+.text
+
 /*
  * unsigned int
  * _gcry_sha1_transform_armv8_ce (void *ctx, const unsigned char *data,
  *                                size_t nblks)
  */
-.align 3
+.align 4
 .globl _gcry_sha1_transform_armv8_ce
 ELF(.type  _gcry_sha1_transform_armv8_ce,%function;)
 _gcry_sha1_transform_armv8_ce:
index acada9607cc7a0b348d7139ca50a1da749397b41..e5e55684eec1d8eff7caacb02fc83034de5eab92 100644 (file)
 
 /* Constants */
 
-.text
+SECTION_RODATA
+
+ELF(.type _sha1_avx_consts,@object)
+_sha1_avx_consts:
 #define K1  0x5A827999
 #define K2  0x6ED9EBA1
 #define K3  0x8F1BBCDC
        vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \
        vmovdqa tmp0, WK((i)&~3);
 
+.text
 
 /*
  * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
index 5f4b9e697b7118d7b7df525121add78ff4912265..16a01bfdd4fac460cb15d13f1f0e86d92334aa70 100644 (file)
 
 /* Constants */
 
-.text
+SECTION_RODATA
+
+ELF(.type _sha1_avx_bmi2_consts,@object)
+_sha1_avx_bmi2_consts:
+
 .align 16
 .Lbswap_shufb_ctl:
        .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
        vpaddd K, W, tmp0; \
        vmovdqa tmp0, WK((i)&~3);
 
+.text
 
 /*
  * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
index ed52761b899fce799b958ced55a7bba6eaccaaba..06ff92f058b9324384188ea98e4947e8157a4b96 100644 (file)
 
 /* Constants */
 
+SECTION_RODATA
+
 #define WK_STACK_WORDS (80 * 2)
 
-.text
+ELF(.type _sha1_avx2_bmi2_consts,@object)
+_sha1_avx2_bmi2_consts:
+
 .align 16
 .Lbswap_shufb_ctl:
        .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
        vpaddd K, W, tmp0; \
        vmovdqa tmp0, PRE_WK((i)&~3);
 
+.text
 
 /*
  * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA.
index f09b1de1250e9c9afb19c118a147e70be4fd5116..53a244311af33a83b20e179dad69e2cbdbf5124d 100644 (file)
 
 /* Constants */
 
-.text
+SECTION_RODATA
+
+ELF(.type _sha1_ssse3_consts,@object)
+_sha1_ssse3_consts:
+
 #define K1  0x5A827999
 #define K2  0x6ED9EBA1
 #define K3  0x8F1BBCDC
 
 #define CLEAR_REG(reg) pxor reg, reg;
 
+.text
 
 /*
  * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
index d0fa62857bb78c23cd1ddcf1e7117dcf468ad00e..333e792d2d87c6eced522929677899c30d96472f 100644 (file)
 
 .cpu generic+simd+crypto
 
-.text
-
 
 /* Constants */
 
+SECTION_RODATA
+
 .align 4
+ELF(.type gcry_sha256_aarch64_ce_K,%object;)
 gcry_sha256_aarch64_ce_K:
 .LK:
   .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -101,12 +102,14 @@ gcry_sha256_aarch64_ce_K:
 #define CLEAR_REG(reg) movi reg.16b, #0;
 
 
+.text
+
 /*
  * unsigned int
  * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data,
  *                                  size_t num_blks)
  */
-.align 3
+.align 4
 .globl _gcry_sha256_transform_armv8_ce
 ELF(.type  _gcry_sha256_transform_armv8_ce,%function;)
 _gcry_sha256_transform_armv8_ce:
index be8a799df349a068dcb6026d0816975cadafa21a..8cfd0880a0b94cb87fd3164c1fb1bd125dabb76d 100644 (file)
@@ -475,6 +475,11 @@ _gcry_sha256_transform_amd64_avx:
        CFI_ENDPROC()
 
 
+SECTION_RODATA
+
+ELF(.type _sha256_avx_consts,@object)
+_sha256_avx_consts:
+
 .align 16
 .LK256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
index 60ad442c0528e6a91462555f6b32905a29cd75d5..e2a5454c15b981e5dc8ddf5ea4b3f0e72bf89715 100644 (file)
@@ -477,6 +477,12 @@ _gcry_sha256_transform_amd64_avx2:
        ret_spec_stop
        CFI_ENDPROC()
 
+
+SECTION_RODATA
+
+ELF(.type _sha256_avx2_consts,@object)
+_sha256_avx2_consts:
+
 .align 64
 .LK256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
index a9b59714d29243513c7bfb7e5b4265c94196cb4c..e5839a841d1618da64049e4ecf5526322f841479 100644 (file)
@@ -1,5 +1,5 @@
 /* sha256-ppc.c - PowerPC vcrypto implementation of SHA-256 transform
- * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2019,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -42,26 +42,43 @@ typedef vector unsigned long long vector2x_u64;
 #define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
 #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
 
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+#endif
 
-static const u32 K[64] =
+
+static const vector4x_u32 K[64 / 4] =
   {
 #define TBL(v) v
-    TBL(0x428a2f98), TBL(0x71374491), TBL(0xb5c0fbcf), TBL(0xe9b5dba5),
-    TBL(0x3956c25b), TBL(0x59f111f1), TBL(0x923f82a4), TBL(0xab1c5ed5),
-    TBL(0xd807aa98), TBL(0x12835b01), TBL(0x243185be), TBL(0x550c7dc3),
-    TBL(0x72be5d74), TBL(0x80deb1fe), TBL(0x9bdc06a7), TBL(0xc19bf174),
-    TBL(0xe49b69c1), TBL(0xefbe4786), TBL(0x0fc19dc6), TBL(0x240ca1cc),
-    TBL(0x2de92c6f), TBL(0x4a7484aa), TBL(0x5cb0a9dc), TBL(0x76f988da),
-    TBL(0x983e5152), TBL(0xa831c66d), TBL(0xb00327c8), TBL(0xbf597fc7),
-    TBL(0xc6e00bf3), TBL(0xd5a79147), TBL(0x06ca6351), TBL(0x14292967),
-    TBL(0x27b70a85), TBL(0x2e1b2138), TBL(0x4d2c6dfc), TBL(0x53380d13),
-    TBL(0x650a7354), TBL(0x766a0abb), TBL(0x81c2c92e), TBL(0x92722c85),
-    TBL(0xa2bfe8a1), TBL(0xa81a664b), TBL(0xc24b8b70), TBL(0xc76c51a3),
-    TBL(0xd192e819), TBL(0xd6990624), TBL(0xf40e3585), TBL(0x106aa070),
-    TBL(0x19a4c116), TBL(0x1e376c08), TBL(0x2748774c), TBL(0x34b0bcb5),
-    TBL(0x391c0cb3), TBL(0x4ed8aa4a), TBL(0x5b9cca4f), TBL(0x682e6ff3),
-    TBL(0x748f82ee), TBL(0x78a5636f), TBL(0x84c87814), TBL(0x8cc70208),
-    TBL(0x90befffa), TBL(0xa4506ceb), TBL(0xbef9a3f7), TBL(0xc67178f2)
+    { TBL(0x428a2f98), TBL(0x71374491), TBL(0xb5c0fbcf), TBL(0xe9b5dba5) },
+    { TBL(0x3956c25b), TBL(0x59f111f1), TBL(0x923f82a4), TBL(0xab1c5ed5) },
+    { TBL(0xd807aa98), TBL(0x12835b01), TBL(0x243185be), TBL(0x550c7dc3) },
+    { TBL(0x72be5d74), TBL(0x80deb1fe), TBL(0x9bdc06a7), TBL(0xc19bf174) },
+    { TBL(0xe49b69c1), TBL(0xefbe4786), TBL(0x0fc19dc6), TBL(0x240ca1cc) },
+    { TBL(0x2de92c6f), TBL(0x4a7484aa), TBL(0x5cb0a9dc), TBL(0x76f988da) },
+    { TBL(0x983e5152), TBL(0xa831c66d), TBL(0xb00327c8), TBL(0xbf597fc7) },
+    { TBL(0xc6e00bf3), TBL(0xd5a79147), TBL(0x06ca6351), TBL(0x14292967) },
+    { TBL(0x27b70a85), TBL(0x2e1b2138), TBL(0x4d2c6dfc), TBL(0x53380d13) },
+    { TBL(0x650a7354), TBL(0x766a0abb), TBL(0x81c2c92e), TBL(0x92722c85) },
+    { TBL(0xa2bfe8a1), TBL(0xa81a664b), TBL(0xc24b8b70), TBL(0xc76c51a3) },
+    { TBL(0xd192e819), TBL(0xd6990624), TBL(0xf40e3585), TBL(0x106aa070) },
+    { TBL(0x19a4c116), TBL(0x1e376c08), TBL(0x2748774c), TBL(0x34b0bcb5) },
+    { TBL(0x391c0cb3), TBL(0x4ed8aa4a), TBL(0x5b9cca4f), TBL(0x682e6ff3) },
+    { TBL(0x748f82ee), TBL(0x78a5636f), TBL(0x84c87814), TBL(0x8cc70208) },
+    { TBL(0x90befffa), TBL(0xa4506ceb), TBL(0xbef9a3f7), TBL(0xc67178f2) }
 #undef TBL
   };
 
@@ -86,13 +103,6 @@ vec_merge_idx0_elems(vector4x_u32 v0, vector4x_u32 v1,
 }
 
 
-static ASM_FUNC_ATTR_INLINE vector4x_u32
-vec_ror_u32(vector4x_u32 v, unsigned int shift)
-{
-  return (v >> (shift & 31)) ^ (v << ((32 - shift) & 31));
-}
-
-
 static ASM_FUNC_ATTR_INLINE vector4x_u32
 vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b)
 {
@@ -104,19 +114,75 @@ vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b)
 }
 
 
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_add_u32(vector4x_u32 v, vector4x_u32 w)
+{
+  __asm__ ("vadduwm %0,%1,%2"
+          : "=v" (v)
+          : "v" (v), "v" (w)
+          : "memory");
+  return v;
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_u32_load_be(unsigned long offset, const void *ptr)
+{
+  vector4x_u32 vecu32;
+#if __GNUC__ >= 4
+  if (__builtin_constant_p (offset) && offset == 0)
+    __asm__ volatile ("lxvw4x %x0,0,%1\n\t"
+                     : "=wa" (vecu32)
+                     : "r" ((uintptr_t)ptr)
+                     : "memory");
+  else
+#endif
+    __asm__ volatile ("lxvw4x %x0,%1,%2\n\t"
+                     : "=wa" (vecu32)
+                     : "r" (offset), "r" ((uintptr_t)ptr)
+                     : "memory", "r0");
+#ifndef WORDS_BIGENDIAN
+  return (vector4x_u32)vec_reve((vector16x_u8)vecu32);
+#else
+  return vecu32;
+#endif
+}
+
+
 /* SHA2 round in vector registers */
-#define R(a,b,c,d,e,f,g,h,k,w) do                             \
+#define R(a,b,c,d,e,f,g,h,ki,w) do                            \
     {                                                         \
-      t1  = (h);                                              \
-      t1 += ((k) + (w));                                      \
-      t1 += Cho((e),(f),(g));                                 \
-      t1 += Sum1((e));                                        \
-      t2  = Sum0((a));                                        \
-      t2 += Maj((a),(b),(c));                                 \
-      d  += t1;                                               \
-      h   = t1 + t2;                                          \
+      t1 = vec_add_u32((h), (w));                             \
+      t2 = Cho((e),(f),(g));                                  \
+      t1 = vec_add_u32(t1, GETK(ki));                         \
+      t1 = vec_add_u32(t1, t2);                               \
+      t1 = Sum1add(t1, e);                                    \
+      t2 = Maj((a),(b),(c));                                  \
+      t2 = Sum0add(t2, a);                                    \
+      h  = vec_add_u32(t1, t2);                               \
+      d += t1;                                                \
     } while (0)
 
+#define GETK(kidx) \
+    ({ \
+      vector4x_u32 rk; \
+      if (((kidx) % 4) == 0) \
+       { \
+         rk = ktmp = *(kptr++); \
+         if ((kidx) < 63) \
+           asm volatile("" : "+r" (kptr) :: "memory"); \
+       } \
+      else if (((kidx) % 4) == 1) \
+       { \
+         rk = vec_mergeo(ktmp, ktmp); \
+       } \
+      else \
+       { \
+         rk = vec_rol_elems(ktmp, ((kidx) % 4)); \
+       } \
+      rk; \
+    })
+
 #define Cho(b, c, d)  (vec_sel(d, c, b))
 
 #define Maj(c, d, b)  (vec_sel(c, b, c ^ d))
@@ -125,52 +191,119 @@ vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b)
 
 #define Sum1(x)       (vec_vshasigma_u32(x, 1, 15))
 
-
-/* Message expansion on general purpose registers */
-#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3))
-#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10))
-
-#define I(i) ( w[i] = buf_get_be32(data + i * 4) )
-#define W(i) ({ w[i&0x0f] +=    w[(i-7) &0x0f];  \
-               w[i&0x0f] += S0(w[(i-15)&0x0f]); \
-               w[i&0x0f] += S1(w[(i-2) &0x0f]); \
-               w[i&0x0f]; })
-
-#define I2(i) ( w2[i] = buf_get_be32(64 + data + i * 4), I(i) )
-#define W2(i) ({ w2[i]  = w2[i-7];       \
-                w2[i] += S1(w2[i-2]);   \
-                w2[i] += S0(w2[i-15]);  \
-                w2[i] += w2[i-16];      \
-                W(i); })
-#define R2(i) ( w2[i] )
-
-
-unsigned int ASM_FUNC_ATTR
-_gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
-                           size_t nblks)
+#define S0(x)         (vec_vshasigma_u32(x, 0, 0))
+
+#define S1(x)         (vec_vshasigma_u32(x, 0, 15))
+
+#define Xadd(X, d, x) vec_add_u32(d, X(x))
+
+#define Sum0add(d, x) Xadd(Sum0, d, x)
+
+#define Sum1add(d, x) Xadd(Sum1, d, x)
+
+#define S0add(d, x)   Xadd(S0, d, x)
+
+#define S1add(d, x)   Xadd(S1, d, x)
+
+#define I(i) \
+    ({ \
+      if (((i) % 4) == 0) \
+       { \
+         w[i] = vec_u32_load_be(0, data); \
+         data += 4 * 4; \
+         if ((i) / 4 < 3) \
+           asm volatile("" : "+r"(data) :: "memory"); \
+       } \
+      else if (((i) % 4) == 1) \
+       { \
+         w[i] = vec_mergeo(w[(i) - 1], w[(i) - 1]); \
+       } \
+      else \
+       { \
+         w[i] = vec_rol_elems(w[(i) - (i) % 4], (i)); \
+       } \
+    })
+
+#define WN(i) ({ w[(i)&0x0f] += w[((i)-7) &0x0f];  \
+                w[(i)&0x0f] = S0add(w[(i)&0x0f], w[((i)-15)&0x0f]); \
+                w[(i)&0x0f] = S1add(w[(i)&0x0f], w[((i)-2) &0x0f]); })
+
+#define W(i) ({ vector4x_u32 r = w[(i)&0x0f]; WN(i); r; })
+
+#define L(i) w[(i)&0x0f]
+
+#define I2(i) \
+    ({ \
+      if ((i) % 4 == 0) \
+       { \
+         vector4x_u32 iw = vec_u32_load_be(0, data); \
+         vector4x_u32 iw2 = vec_u32_load_be(64, data); \
+         if ((i) / 4 < 3) \
+           { \
+             data += 4 * 4; \
+             asm volatile("" : "+r"(data) :: "memory"); \
+           } \
+         else \
+           { \
+             data += 4 * 4 + 64; \
+             asm volatile("" : "+r"(data) :: "memory"); \
+           } \
+         w[(i) + 0] = vec_mergeh(iw, iw2); \
+         w[(i) + 1] = vec_rol_elems(w[(i) + 0], 2); \
+         w[(i) + 2] = vec_mergel(iw, iw2); \
+         w[(i) + 3] = vec_rol_elems(w[(i) + 2], 2); \
+       } \
+    })
+
+#define W2(i) \
+    ({ \
+      vector4x_u32 wt1 = w[(i)&0x0f]; \
+      WN(i); \
+      w2[(i) / 2] = (((i) % 2) == 0) ? wt1 : vec_mergeo(w2[(i) / 2], wt1); \
+      wt1; \
+    })
+
+#define L2(i) \
+    ({ \
+      vector4x_u32 lt1 = w[(i)&0x0f]; \
+      w2[(i) / 2] = (((i) % 2) == 0) ? lt1 : vec_mergeo(w2[(i) / 2], lt1); \
+      lt1; \
+    })
+
+#define WL(i) \
+    ({ \
+      vector4x_u32 wlt1 = w2[(i) / 2]; \
+      if (((i) % 2) == 0 && (i) < 63) \
+       w2[(i) / 2] = vec_mergeo(wlt1, wlt1); \
+      wlt1; \
+    })
+
+static ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 unsigned int
+sha256_transform_ppc(u32 state[8], const unsigned char *data, size_t nblks)
 {
-  /* GPRs used for message expansion as vector intrinsics based generates
-   * slower code. */
   vector4x_u32 h0, h1, h2, h3, h4, h5, h6, h7;
   vector4x_u32 h0_h3, h4_h7;
   vector4x_u32 a, b, c, d, e, f, g, h, t1, t2;
-  u32 w[16];
-  u32 w2[64];
+  vector4x_u32 w[16];
+  vector4x_u32 w2[64 / 2];
 
   h0_h3 = vec_vsx_ld (4 * 0, state);
   h4_h7 = vec_vsx_ld (4 * 4, state);
 
   h0 = h0_h3;
-  h1 = vec_rol_elems (h0_h3, 1);
+  h1 = vec_mergeo (h0_h3, h0_h3);
   h2 = vec_rol_elems (h0_h3, 2);
   h3 = vec_rol_elems (h0_h3, 3);
   h4 = h4_h7;
-  h5 = vec_rol_elems (h4_h7, 1);
+  h5 = vec_mergeo (h4_h7, h4_h7);
   h6 = vec_rol_elems (h4_h7, 2);
   h7 = vec_rol_elems (h4_h7, 3);
 
   while (nblks >= 2)
     {
+      const vector4x_u32 *kptr = K;
+      vector4x_u32 ktmp;
+
       a = h0;
       b = h1;
       c = h2;
@@ -180,74 +313,78 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
       g = h6;
       h = h7;
 
-      R(a, b, c, d, e, f, g, h, K[0], I2(0));
-      R(h, a, b, c, d, e, f, g, K[1], I2(1));
-      R(g, h, a, b, c, d, e, f, K[2], I2(2));
-      R(f, g, h, a, b, c, d, e, K[3], I2(3));
-      R(e, f, g, h, a, b, c, d, K[4], I2(4));
-      R(d, e, f, g, h, a, b, c, K[5], I2(5));
-      R(c, d, e, f, g, h, a, b, K[6], I2(6));
-      R(b, c, d, e, f, g, h, a, K[7], I2(7));
-      R(a, b, c, d, e, f, g, h, K[8], I2(8));
-      R(h, a, b, c, d, e, f, g, K[9], I2(9));
-      R(g, h, a, b, c, d, e, f, K[10], I2(10));
-      R(f, g, h, a, b, c, d, e, K[11], I2(11));
-      R(e, f, g, h, a, b, c, d, K[12], I2(12));
-      R(d, e, f, g, h, a, b, c, K[13], I2(13));
-      R(c, d, e, f, g, h, a, b, K[14], I2(14));
-      R(b, c, d, e, f, g, h, a, K[15], I2(15));
-      data += 64 * 2;
-
-      R(a, b, c, d, e, f, g, h, K[16], W2(16));
-      R(h, a, b, c, d, e, f, g, K[17], W2(17));
-      R(g, h, a, b, c, d, e, f, K[18], W2(18));
-      R(f, g, h, a, b, c, d, e, K[19], W2(19));
-      R(e, f, g, h, a, b, c, d, K[20], W2(20));
-      R(d, e, f, g, h, a, b, c, K[21], W2(21));
-      R(c, d, e, f, g, h, a, b, K[22], W2(22));
-      R(b, c, d, e, f, g, h, a, K[23], W2(23));
-      R(a, b, c, d, e, f, g, h, K[24], W2(24));
-      R(h, a, b, c, d, e, f, g, K[25], W2(25));
-      R(g, h, a, b, c, d, e, f, K[26], W2(26));
-      R(f, g, h, a, b, c, d, e, K[27], W2(27));
-      R(e, f, g, h, a, b, c, d, K[28], W2(28));
-      R(d, e, f, g, h, a, b, c, K[29], W2(29));
-      R(c, d, e, f, g, h, a, b, K[30], W2(30));
-      R(b, c, d, e, f, g, h, a, K[31], W2(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W2(32));
-      R(h, a, b, c, d, e, f, g, K[33], W2(33));
-      R(g, h, a, b, c, d, e, f, K[34], W2(34));
-      R(f, g, h, a, b, c, d, e, K[35], W2(35));
-      R(e, f, g, h, a, b, c, d, K[36], W2(36));
-      R(d, e, f, g, h, a, b, c, K[37], W2(37));
-      R(c, d, e, f, g, h, a, b, K[38], W2(38));
-      R(b, c, d, e, f, g, h, a, K[39], W2(39));
-      R(a, b, c, d, e, f, g, h, K[40], W2(40));
-      R(h, a, b, c, d, e, f, g, K[41], W2(41));
-      R(g, h, a, b, c, d, e, f, K[42], W2(42));
-      R(f, g, h, a, b, c, d, e, K[43], W2(43));
-      R(e, f, g, h, a, b, c, d, K[44], W2(44));
-      R(d, e, f, g, h, a, b, c, K[45], W2(45));
-      R(c, d, e, f, g, h, a, b, K[46], W2(46));
-      R(b, c, d, e, f, g, h, a, K[47], W2(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W2(48));
-      R(h, a, b, c, d, e, f, g, K[49], W2(49));
-      R(g, h, a, b, c, d, e, f, K[50], W2(50));
-      R(f, g, h, a, b, c, d, e, K[51], W2(51));
-      R(e, f, g, h, a, b, c, d, K[52], W2(52));
-      R(d, e, f, g, h, a, b, c, K[53], W2(53));
-      R(c, d, e, f, g, h, a, b, K[54], W2(54));
-      R(b, c, d, e, f, g, h, a, K[55], W2(55));
-      R(a, b, c, d, e, f, g, h, K[56], W2(56));
-      R(h, a, b, c, d, e, f, g, K[57], W2(57));
-      R(g, h, a, b, c, d, e, f, K[58], W2(58));
-      R(f, g, h, a, b, c, d, e, K[59], W2(59));
-      R(e, f, g, h, a, b, c, d, K[60], W2(60));
-      R(d, e, f, g, h, a, b, c, K[61], W2(61));
-      R(c, d, e, f, g, h, a, b, K[62], W2(62));
-      R(b, c, d, e, f, g, h, a, K[63], W2(63));
+      I2(0); I2(1); I2(2); I2(3);
+      I2(4); I2(5); I2(6); I2(7);
+      I2(8); I2(9); I2(10); I2(11);
+      I2(12); I2(13); I2(14); I2(15);
+
+      R(a, b, c, d, e, f, g, h, 0, W2(0));
+      R(h, a, b, c, d, e, f, g, 1, W2(1));
+      R(g, h, a, b, c, d, e, f, 2, W2(2));
+      R(f, g, h, a, b, c, d, e, 3, W2(3));
+      R(e, f, g, h, a, b, c, d, 4, W2(4));
+      R(d, e, f, g, h, a, b, c, 5, W2(5));
+      R(c, d, e, f, g, h, a, b, 6, W2(6));
+      R(b, c, d, e, f, g, h, a, 7, W2(7));
+      R(a, b, c, d, e, f, g, h, 8, W2(8));
+      R(h, a, b, c, d, e, f, g, 9, W2(9));
+      R(g, h, a, b, c, d, e, f, 10, W2(10));
+      R(f, g, h, a, b, c, d, e, 11, W2(11));
+      R(e, f, g, h, a, b, c, d, 12, W2(12));
+      R(d, e, f, g, h, a, b, c, 13, W2(13));
+      R(c, d, e, f, g, h, a, b, 14, W2(14));
+      R(b, c, d, e, f, g, h, a, 15, W2(15));
+
+      R(a, b, c, d, e, f, g, h, 16, W2(16));
+      R(h, a, b, c, d, e, f, g, 17, W2(17));
+      R(g, h, a, b, c, d, e, f, 18, W2(18));
+      R(f, g, h, a, b, c, d, e, 19, W2(19));
+      R(e, f, g, h, a, b, c, d, 20, W2(20));
+      R(d, e, f, g, h, a, b, c, 21, W2(21));
+      R(c, d, e, f, g, h, a, b, 22, W2(22));
+      R(b, c, d, e, f, g, h, a, 23, W2(23));
+      R(a, b, c, d, e, f, g, h, 24, W2(24));
+      R(h, a, b, c, d, e, f, g, 25, W2(25));
+      R(g, h, a, b, c, d, e, f, 26, W2(26));
+      R(f, g, h, a, b, c, d, e, 27, W2(27));
+      R(e, f, g, h, a, b, c, d, 28, W2(28));
+      R(d, e, f, g, h, a, b, c, 29, W2(29));
+      R(c, d, e, f, g, h, a, b, 30, W2(30));
+      R(b, c, d, e, f, g, h, a, 31, W2(31));
+
+      R(a, b, c, d, e, f, g, h, 32, W2(32));
+      R(h, a, b, c, d, e, f, g, 33, W2(33));
+      R(g, h, a, b, c, d, e, f, 34, W2(34));
+      R(f, g, h, a, b, c, d, e, 35, W2(35));
+      R(e, f, g, h, a, b, c, d, 36, W2(36));
+      R(d, e, f, g, h, a, b, c, 37, W2(37));
+      R(c, d, e, f, g, h, a, b, 38, W2(38));
+      R(b, c, d, e, f, g, h, a, 39, W2(39));
+      R(a, b, c, d, e, f, g, h, 40, W2(40));
+      R(h, a, b, c, d, e, f, g, 41, W2(41));
+      R(g, h, a, b, c, d, e, f, 42, W2(42));
+      R(f, g, h, a, b, c, d, e, 43, W2(43));
+      R(e, f, g, h, a, b, c, d, 44, W2(44));
+      R(d, e, f, g, h, a, b, c, 45, W2(45));
+      R(c, d, e, f, g, h, a, b, 46, W2(46));
+      R(b, c, d, e, f, g, h, a, 47, W2(47));
+
+      R(a, b, c, d, e, f, g, h, 48, L2(48));
+      R(h, a, b, c, d, e, f, g, 49, L2(49));
+      R(g, h, a, b, c, d, e, f, 50, L2(50));
+      R(f, g, h, a, b, c, d, e, 51, L2(51));
+      R(e, f, g, h, a, b, c, d, 52, L2(52));
+      R(d, e, f, g, h, a, b, c, 53, L2(53));
+      R(c, d, e, f, g, h, a, b, 54, L2(54));
+      R(b, c, d, e, f, g, h, a, 55, L2(55));
+      R(a, b, c, d, e, f, g, h, 56, L2(56));
+      R(h, a, b, c, d, e, f, g, 57, L2(57));
+      R(g, h, a, b, c, d, e, f, 58, L2(58));
+      R(f, g, h, a, b, c, d, e, 59, L2(59));
+      R(e, f, g, h, a, b, c, d, 60, L2(60));
+      R(d, e, f, g, h, a, b, c, 61, L2(61));
+      R(c, d, e, f, g, h, a, b, 62, L2(62));
+      R(b, c, d, e, f, g, h, a, 63, L2(63));
 
       h0 += a;
       h1 += b;
@@ -258,6 +395,8 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
       h6 += g;
       h7 += h;
 
+      kptr = K;
+
       a = h0;
       b = h1;
       c = h2;
@@ -267,73 +406,73 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
       g = h6;
       h = h7;
 
-      R(a, b, c, d, e, f, g, h, K[0], R2(0));
-      R(h, a, b, c, d, e, f, g, K[1], R2(1));
-      R(g, h, a, b, c, d, e, f, K[2], R2(2));
-      R(f, g, h, a, b, c, d, e, K[3], R2(3));
-      R(e, f, g, h, a, b, c, d, K[4], R2(4));
-      R(d, e, f, g, h, a, b, c, K[5], R2(5));
-      R(c, d, e, f, g, h, a, b, K[6], R2(6));
-      R(b, c, d, e, f, g, h, a, K[7], R2(7));
-      R(a, b, c, d, e, f, g, h, K[8], R2(8));
-      R(h, a, b, c, d, e, f, g, K[9], R2(9));
-      R(g, h, a, b, c, d, e, f, K[10], R2(10));
-      R(f, g, h, a, b, c, d, e, K[11], R2(11));
-      R(e, f, g, h, a, b, c, d, K[12], R2(12));
-      R(d, e, f, g, h, a, b, c, K[13], R2(13));
-      R(c, d, e, f, g, h, a, b, K[14], R2(14));
-      R(b, c, d, e, f, g, h, a, K[15], R2(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], R2(16));
-      R(h, a, b, c, d, e, f, g, K[17], R2(17));
-      R(g, h, a, b, c, d, e, f, K[18], R2(18));
-      R(f, g, h, a, b, c, d, e, K[19], R2(19));
-      R(e, f, g, h, a, b, c, d, K[20], R2(20));
-      R(d, e, f, g, h, a, b, c, K[21], R2(21));
-      R(c, d, e, f, g, h, a, b, K[22], R2(22));
-      R(b, c, d, e, f, g, h, a, K[23], R2(23));
-      R(a, b, c, d, e, f, g, h, K[24], R2(24));
-      R(h, a, b, c, d, e, f, g, K[25], R2(25));
-      R(g, h, a, b, c, d, e, f, K[26], R2(26));
-      R(f, g, h, a, b, c, d, e, K[27], R2(27));
-      R(e, f, g, h, a, b, c, d, K[28], R2(28));
-      R(d, e, f, g, h, a, b, c, K[29], R2(29));
-      R(c, d, e, f, g, h, a, b, K[30], R2(30));
-      R(b, c, d, e, f, g, h, a, K[31], R2(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], R2(32));
-      R(h, a, b, c, d, e, f, g, K[33], R2(33));
-      R(g, h, a, b, c, d, e, f, K[34], R2(34));
-      R(f, g, h, a, b, c, d, e, K[35], R2(35));
-      R(e, f, g, h, a, b, c, d, K[36], R2(36));
-      R(d, e, f, g, h, a, b, c, K[37], R2(37));
-      R(c, d, e, f, g, h, a, b, K[38], R2(38));
-      R(b, c, d, e, f, g, h, a, K[39], R2(39));
-      R(a, b, c, d, e, f, g, h, K[40], R2(40));
-      R(h, a, b, c, d, e, f, g, K[41], R2(41));
-      R(g, h, a, b, c, d, e, f, K[42], R2(42));
-      R(f, g, h, a, b, c, d, e, K[43], R2(43));
-      R(e, f, g, h, a, b, c, d, K[44], R2(44));
-      R(d, e, f, g, h, a, b, c, K[45], R2(45));
-      R(c, d, e, f, g, h, a, b, K[46], R2(46));
-      R(b, c, d, e, f, g, h, a, K[47], R2(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], R2(48));
-      R(h, a, b, c, d, e, f, g, K[49], R2(49));
-      R(g, h, a, b, c, d, e, f, K[50], R2(50));
-      R(f, g, h, a, b, c, d, e, K[51], R2(51));
-      R(e, f, g, h, a, b, c, d, K[52], R2(52));
-      R(d, e, f, g, h, a, b, c, K[53], R2(53));
-      R(c, d, e, f, g, h, a, b, K[54], R2(54));
-      R(b, c, d, e, f, g, h, a, K[55], R2(55));
-      R(a, b, c, d, e, f, g, h, K[56], R2(56));
-      R(h, a, b, c, d, e, f, g, K[57], R2(57));
-      R(g, h, a, b, c, d, e, f, K[58], R2(58));
-      R(f, g, h, a, b, c, d, e, K[59], R2(59));
-      R(e, f, g, h, a, b, c, d, K[60], R2(60));
-      R(d, e, f, g, h, a, b, c, K[61], R2(61));
-      R(c, d, e, f, g, h, a, b, K[62], R2(62));
-      R(b, c, d, e, f, g, h, a, K[63], R2(63));
+      R(a, b, c, d, e, f, g, h, 0, WL(0));
+      R(h, a, b, c, d, e, f, g, 1, WL(1));
+      R(g, h, a, b, c, d, e, f, 2, WL(2));
+      R(f, g, h, a, b, c, d, e, 3, WL(3));
+      R(e, f, g, h, a, b, c, d, 4, WL(4));
+      R(d, e, f, g, h, a, b, c, 5, WL(5));
+      R(c, d, e, f, g, h, a, b, 6, WL(6));
+      R(b, c, d, e, f, g, h, a, 7, WL(7));
+      R(a, b, c, d, e, f, g, h, 8, WL(8));
+      R(h, a, b, c, d, e, f, g, 9, WL(9));
+      R(g, h, a, b, c, d, e, f, 10, WL(10));
+      R(f, g, h, a, b, c, d, e, 11, WL(11));
+      R(e, f, g, h, a, b, c, d, 12, WL(12));
+      R(d, e, f, g, h, a, b, c, 13, WL(13));
+      R(c, d, e, f, g, h, a, b, 14, WL(14));
+      R(b, c, d, e, f, g, h, a, 15, WL(15));
+
+      R(a, b, c, d, e, f, g, h, 16, WL(16));
+      R(h, a, b, c, d, e, f, g, 17, WL(17));
+      R(g, h, a, b, c, d, e, f, 18, WL(18));
+      R(f, g, h, a, b, c, d, e, 19, WL(19));
+      R(e, f, g, h, a, b, c, d, 20, WL(20));
+      R(d, e, f, g, h, a, b, c, 21, WL(21));
+      R(c, d, e, f, g, h, a, b, 22, WL(22));
+      R(b, c, d, e, f, g, h, a, 23, WL(23));
+      R(a, b, c, d, e, f, g, h, 24, WL(24));
+      R(h, a, b, c, d, e, f, g, 25, WL(25));
+      R(g, h, a, b, c, d, e, f, 26, WL(26));
+      R(f, g, h, a, b, c, d, e, 27, WL(27));
+      R(e, f, g, h, a, b, c, d, 28, WL(28));
+      R(d, e, f, g, h, a, b, c, 29, WL(29));
+      R(c, d, e, f, g, h, a, b, 30, WL(30));
+      R(b, c, d, e, f, g, h, a, 31, WL(31));
+
+      R(a, b, c, d, e, f, g, h, 32, WL(32));
+      R(h, a, b, c, d, e, f, g, 33, WL(33));
+      R(g, h, a, b, c, d, e, f, 34, WL(34));
+      R(f, g, h, a, b, c, d, e, 35, WL(35));
+      R(e, f, g, h, a, b, c, d, 36, WL(36));
+      R(d, e, f, g, h, a, b, c, 37, WL(37));
+      R(c, d, e, f, g, h, a, b, 38, WL(38));
+      R(b, c, d, e, f, g, h, a, 39, WL(39));
+      R(a, b, c, d, e, f, g, h, 40, WL(40));
+      R(h, a, b, c, d, e, f, g, 41, WL(41));
+      R(g, h, a, b, c, d, e, f, 42, WL(42));
+      R(f, g, h, a, b, c, d, e, 43, WL(43));
+      R(e, f, g, h, a, b, c, d, 44, WL(44));
+      R(d, e, f, g, h, a, b, c, 45, WL(45));
+      R(c, d, e, f, g, h, a, b, 46, WL(46));
+      R(b, c, d, e, f, g, h, a, 47, WL(47));
+
+      R(a, b, c, d, e, f, g, h, 48, WL(48));
+      R(h, a, b, c, d, e, f, g, 49, WL(49));
+      R(g, h, a, b, c, d, e, f, 50, WL(50));
+      R(f, g, h, a, b, c, d, e, 51, WL(51));
+      R(e, f, g, h, a, b, c, d, 52, WL(52));
+      R(d, e, f, g, h, a, b, c, 53, WL(53));
+      R(c, d, e, f, g, h, a, b, 54, WL(54));
+      R(b, c, d, e, f, g, h, a, 55, WL(55));
+      R(a, b, c, d, e, f, g, h, 56, WL(56));
+      R(h, a, b, c, d, e, f, g, 57, WL(57));
+      R(g, h, a, b, c, d, e, f, 58, WL(58));
+      R(f, g, h, a, b, c, d, e, 59, WL(59));
+      R(e, f, g, h, a, b, c, d, 60, WL(60));
+      R(d, e, f, g, h, a, b, c, 61, WL(61));
+      R(c, d, e, f, g, h, a, b, 62, WL(62));
+      R(b, c, d, e, f, g, h, a, 63, WL(63));
 
       h0 += a;
       h1 += b;
@@ -347,8 +486,11 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
       nblks -= 2;
     }
 
-  while (nblks)
+  if (nblks)
     {
+      const vector4x_u32 *kptr = K;
+      vector4x_u32 ktmp;
+
       a = h0;
       b = h1;
       c = h2;
@@ -358,74 +500,78 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
       g = h6;
       h = h7;
 
-      R(a, b, c, d, e, f, g, h, K[0], I(0));
-      R(h, a, b, c, d, e, f, g, K[1], I(1));
-      R(g, h, a, b, c, d, e, f, K[2], I(2));
-      R(f, g, h, a, b, c, d, e, K[3], I(3));
-      R(e, f, g, h, a, b, c, d, K[4], I(4));
-      R(d, e, f, g, h, a, b, c, K[5], I(5));
-      R(c, d, e, f, g, h, a, b, K[6], I(6));
-      R(b, c, d, e, f, g, h, a, K[7], I(7));
-      R(a, b, c, d, e, f, g, h, K[8], I(8));
-      R(h, a, b, c, d, e, f, g, K[9], I(9));
-      R(g, h, a, b, c, d, e, f, K[10], I(10));
-      R(f, g, h, a, b, c, d, e, K[11], I(11));
-      R(e, f, g, h, a, b, c, d, K[12], I(12));
-      R(d, e, f, g, h, a, b, c, K[13], I(13));
-      R(c, d, e, f, g, h, a, b, K[14], I(14));
-      R(b, c, d, e, f, g, h, a, K[15], I(15));
-      data += 64;
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
+      I(0); I(1); I(2); I(3);
+      I(4); I(5); I(6); I(7);
+      I(8); I(9); I(10); I(11);
+      I(12); I(13); I(14); I(15);
+
+      R(a, b, c, d, e, f, g, h, 0, W(0));
+      R(h, a, b, c, d, e, f, g, 1, W(1));
+      R(g, h, a, b, c, d, e, f, 2, W(2));
+      R(f, g, h, a, b, c, d, e, 3, W(3));
+      R(e, f, g, h, a, b, c, d, 4, W(4));
+      R(d, e, f, g, h, a, b, c, 5, W(5));
+      R(c, d, e, f, g, h, a, b, 6, W(6));
+      R(b, c, d, e, f, g, h, a, 7, W(7));
+      R(a, b, c, d, e, f, g, h, 8, W(8));
+      R(h, a, b, c, d, e, f, g, 9, W(9));
+      R(g, h, a, b, c, d, e, f, 10, W(10));
+      R(f, g, h, a, b, c, d, e, 11, W(11));
+      R(e, f, g, h, a, b, c, d, 12, W(12));
+      R(d, e, f, g, h, a, b, c, 13, W(13));
+      R(c, d, e, f, g, h, a, b, 14, W(14));
+      R(b, c, d, e, f, g, h, a, 15, W(15));
+
+      R(a, b, c, d, e, f, g, h, 16, W(16));
+      R(h, a, b, c, d, e, f, g, 17, W(17));
+      R(g, h, a, b, c, d, e, f, 18, W(18));
+      R(f, g, h, a, b, c, d, e, 19, W(19));
+      R(e, f, g, h, a, b, c, d, 20, W(20));
+      R(d, e, f, g, h, a, b, c, 21, W(21));
+      R(c, d, e, f, g, h, a, b, 22, W(22));
+      R(b, c, d, e, f, g, h, a, 23, W(23));
+      R(a, b, c, d, e, f, g, h, 24, W(24));
+      R(h, a, b, c, d, e, f, g, 25, W(25));
+      R(g, h, a, b, c, d, e, f, 26, W(26));
+      R(f, g, h, a, b, c, d, e, 27, W(27));
+      R(e, f, g, h, a, b, c, d, 28, W(28));
+      R(d, e, f, g, h, a, b, c, 29, W(29));
+      R(c, d, e, f, g, h, a, b, 30, W(30));
+      R(b, c, d, e, f, g, h, a, 31, W(31));
+
+      R(a, b, c, d, e, f, g, h, 32, W(32));
+      R(h, a, b, c, d, e, f, g, 33, W(33));
+      R(g, h, a, b, c, d, e, f, 34, W(34));
+      R(f, g, h, a, b, c, d, e, 35, W(35));
+      R(e, f, g, h, a, b, c, d, 36, W(36));
+      R(d, e, f, g, h, a, b, c, 37, W(37));
+      R(c, d, e, f, g, h, a, b, 38, W(38));
+      R(b, c, d, e, f, g, h, a, 39, W(39));
+      R(a, b, c, d, e, f, g, h, 40, W(40));
+      R(h, a, b, c, d, e, f, g, 41, W(41));
+      R(g, h, a, b, c, d, e, f, 42, W(42));
+      R(f, g, h, a, b, c, d, e, 43, W(43));
+      R(e, f, g, h, a, b, c, d, 44, W(44));
+      R(d, e, f, g, h, a, b, c, 45, W(45));
+      R(c, d, e, f, g, h, a, b, 46, W(46));
+      R(b, c, d, e, f, g, h, a, 47, W(47));
+
+      R(a, b, c, d, e, f, g, h, 48, L(48));
+      R(h, a, b, c, d, e, f, g, 49, L(49));
+      R(g, h, a, b, c, d, e, f, 50, L(50));
+      R(f, g, h, a, b, c, d, e, 51, L(51));
+      R(e, f, g, h, a, b, c, d, 52, L(52));
+      R(d, e, f, g, h, a, b, c, 53, L(53));
+      R(c, d, e, f, g, h, a, b, 54, L(54));
+      R(b, c, d, e, f, g, h, a, 55, L(55));
+      R(a, b, c, d, e, f, g, h, 56, L(56));
+      R(h, a, b, c, d, e, f, g, 57, L(57));
+      R(g, h, a, b, c, d, e, f, 58, L(58));
+      R(f, g, h, a, b, c, d, e, 59, L(59));
+      R(e, f, g, h, a, b, c, d, 60, L(60));
+      R(d, e, f, g, h, a, b, c, 61, L(61));
+      R(c, d, e, f, g, h, a, b, 62, L(62));
+      R(b, c, d, e, f, g, h, a, 63, L(63));
 
       h0 += a;
       h1 += b;
@@ -446,350 +592,19 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
 
   return sizeof(w2) + sizeof(w);
 }
-#undef R
-#undef Cho
-#undef Maj
-#undef Sum0
-#undef Sum1
-#undef S0
-#undef S1
-#undef I
-#undef W
-#undef I2
-#undef W2
-#undef R2
-
-
-/* SHA2 round in general purpose registers */
-#define R(a,b,c,d,e,f,g,h,k,w) do                                 \
-          {                                                       \
-            t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + ((k) + (w));\
-            t2 = Sum0((a)) + Maj((a),(b),(c));                    \
-            d += t1;                                              \
-            h  = t1 + t2;                                         \
-          } while (0)
-
-#define Cho(x, y, z)  ((x & y) + (~x & z))
-
-#define Maj(z, x, y)  ((x & y) + (z & (x ^ y)))
-
-#define Sum0(x)       (ror (x, 2) ^ ror (x ^ ror (x, 22-13), 13))
-
-#define Sum1(x)       (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25))
-
-
-/* Message expansion on general purpose registers */
-#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3))
-#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10))
-
-#define I(i) ( w[i] = buf_get_be32(data + i * 4) )
-#define WN(i) ({ w[i&0x0f] +=    w[(i-7) &0x0f];  \
-                w[i&0x0f] += S0(w[(i-15)&0x0f]); \
-                w[i&0x0f] += S1(w[(i-2) &0x0f]); \
-                w[i&0x0f]; })
-#define W(i) ({ u32 r = w[i&0x0f]; WN(i); r; })
-#define L(i) w[i&0x0f]
 
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
+                           size_t nblks)
+{
+  return sha256_transform_ppc(state, data, nblks);
+}
 
-unsigned int ASM_FUNC_ATTR
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
 _gcry_sha256_transform_ppc9(u32 state[8], const unsigned char *data,
                            size_t nblks)
 {
-  /* GPRs used for round function and message expansion as vector intrinsics
-   * based generates slower code for POWER9. */
-  u32 a, b, c, d, e, f, g, h, t1, t2;
-  u32 w[16];
-
-  a = state[0];
-  b = state[1];
-  c = state[2];
-  d = state[3];
-  e = state[4];
-  f = state[5];
-  g = state[6];
-  h = state[7];
-
-  while (nblks >= 2)
-    {
-      I(0); I(1); I(2); I(3);
-      I(4); I(5); I(6); I(7);
-      I(8); I(9); I(10); I(11);
-      I(12); I(13); I(14); I(15);
-      data += 64;
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], L(48));
-      R(h, a, b, c, d, e, f, g, K[49], L(49));
-      R(g, h, a, b, c, d, e, f, K[50], L(50));
-      R(f, g, h, a, b, c, d, e, K[51], L(51));
-      I(0); I(1); I(2); I(3);
-      R(e, f, g, h, a, b, c, d, K[52], L(52));
-      R(d, e, f, g, h, a, b, c, K[53], L(53));
-      R(c, d, e, f, g, h, a, b, K[54], L(54));
-      R(b, c, d, e, f, g, h, a, K[55], L(55));
-      I(4); I(5); I(6); I(7);
-      R(a, b, c, d, e, f, g, h, K[56], L(56));
-      R(h, a, b, c, d, e, f, g, K[57], L(57));
-      R(g, h, a, b, c, d, e, f, K[58], L(58));
-      R(f, g, h, a, b, c, d, e, K[59], L(59));
-      I(8); I(9); I(10); I(11);
-      R(e, f, g, h, a, b, c, d, K[60], L(60));
-      R(d, e, f, g, h, a, b, c, K[61], L(61));
-      R(c, d, e, f, g, h, a, b, K[62], L(62));
-      R(b, c, d, e, f, g, h, a, K[63], L(63));
-      I(12); I(13); I(14); I(15);
-      data += 64;
-
-      a += state[0];
-      b += state[1];
-      c += state[2];
-      d += state[3];
-      e += state[4];
-      f += state[5];
-      g += state[6];
-      h += state[7];
-      state[0] = a;
-      state[1] = b;
-      state[2] = c;
-      state[3] = d;
-      state[4] = e;
-      state[5] = f;
-      state[6] = g;
-      state[7] = h;
-
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], L(48));
-      R(h, a, b, c, d, e, f, g, K[49], L(49));
-      R(g, h, a, b, c, d, e, f, K[50], L(50));
-      R(f, g, h, a, b, c, d, e, K[51], L(51));
-      R(e, f, g, h, a, b, c, d, K[52], L(52));
-      R(d, e, f, g, h, a, b, c, K[53], L(53));
-      R(c, d, e, f, g, h, a, b, K[54], L(54));
-      R(b, c, d, e, f, g, h, a, K[55], L(55));
-      R(a, b, c, d, e, f, g, h, K[56], L(56));
-      R(h, a, b, c, d, e, f, g, K[57], L(57));
-      R(g, h, a, b, c, d, e, f, K[58], L(58));
-      R(f, g, h, a, b, c, d, e, K[59], L(59));
-      R(e, f, g, h, a, b, c, d, K[60], L(60));
-      R(d, e, f, g, h, a, b, c, K[61], L(61));
-      R(c, d, e, f, g, h, a, b, K[62], L(62));
-      R(b, c, d, e, f, g, h, a, K[63], L(63));
-
-      a += state[0];
-      b += state[1];
-      c += state[2];
-      d += state[3];
-      e += state[4];
-      f += state[5];
-      g += state[6];
-      h += state[7];
-      state[0] = a;
-      state[1] = b;
-      state[2] = c;
-      state[3] = d;
-      state[4] = e;
-      state[5] = f;
-      state[6] = g;
-      state[7] = h;
-
-      nblks -= 2;
-    }
-
-  while (nblks)
-    {
-      I(0); I(1); I(2); I(3);
-      I(4); I(5); I(6); I(7);
-      I(8); I(9); I(10); I(11);
-      I(12); I(13); I(14); I(15);
-      data += 64;
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], L(48));
-      R(h, a, b, c, d, e, f, g, K[49], L(49));
-      R(g, h, a, b, c, d, e, f, K[50], L(50));
-      R(f, g, h, a, b, c, d, e, K[51], L(51));
-      R(e, f, g, h, a, b, c, d, K[52], L(52));
-      R(d, e, f, g, h, a, b, c, K[53], L(53));
-      R(c, d, e, f, g, h, a, b, K[54], L(54));
-      R(b, c, d, e, f, g, h, a, K[55], L(55));
-      R(a, b, c, d, e, f, g, h, K[56], L(56));
-      R(h, a, b, c, d, e, f, g, K[57], L(57));
-      R(g, h, a, b, c, d, e, f, K[58], L(58));
-      R(f, g, h, a, b, c, d, e, K[59], L(59));
-      R(e, f, g, h, a, b, c, d, K[60], L(60));
-      R(d, e, f, g, h, a, b, c, K[61], L(61));
-      R(c, d, e, f, g, h, a, b, K[62], L(62));
-      R(b, c, d, e, f, g, h, a, K[63], L(63));
-
-      a += state[0];
-      b += state[1];
-      c += state[2];
-      d += state[3];
-      e += state[4];
-      f += state[5];
-      g += state[6];
-      h += state[7];
-      state[0] = a;
-      state[1] = b;
-      state[2] = c;
-      state[3] = d;
-      state[4] = e;
-      state[5] = f;
-      state[6] = g;
-      state[7] = h;
-
-      nblks--;
-    }
-
-  return sizeof(w);
+  return sha256_transform_ppc(state, data, nblks);
 }
 
 #endif /* ENABLE_PPC_CRYPTO_SUPPORT */
index 401ff6f44a18c3c1181fc204ee993e2fbf65af75..ab93647caf0e7dd061ee3cb438b0f9cccb461c48 100644 (file)
@@ -497,6 +497,11 @@ _gcry_sha256_transform_amd64_ssse3:
        CFI_ENDPROC()
 
 
+SECTION_RODATA
+
+ELF(.type _sha256_ssse3_consts,@object)
+_sha256_ssse3_consts:
+
 .align 16
 .LK256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
index 94ec0141e7a8daa200ce7b4e3774bdea2c98314c..1e1d296f27377d509f2f9914bcbecb83d068e4c7 100644 (file)
 #define hd_h ((hd_g) + 8)
 
 /* register macros */
-#define RK    %r2
+#define RK    r2
 
-#define RElo %r0
-#define REhi %r1
+#define RElo r0
+#define REhi r1
 
-#define RT1lo %r3
-#define RT1hi %r4
-#define RT2lo %r5
-#define RT2hi %r6
-#define RWlo  %r7
-#define RWhi  %r8
-#define RT3lo %r9
-#define RT3hi %r10
-#define RT4lo %r11
-#define RT4hi %ip
+#define RT1lo r3
+#define RT1hi r4
+#define RT2lo r5
+#define RT2hi r6
+#define RWlo  r7
+#define RWhi  r8
+#define RT3lo r9
+#define RT3hi r10
+#define RT4lo r11
+#define RT4hi ip
 
-#define RRND  %lr
+#define RRND  lr
 
 /* variable offsets in stack */
 #define ctx (0)
     mov RWhi, REhi, lsr#14; \
     eor RWlo, RWlo, RElo, lsr#18; \
     eor RWhi, RWhi, REhi, lsr#18; \
-    ldr RT3lo, [%sp, #(_f)]; \
+    ldr RT3lo, [sp, #(_f)]; \
     adds RT1lo, RT2lo; /* t1 += K */ \
-    ldr RT3hi, [%sp, #(_f) + 4]; \
+    ldr RT3hi, [sp, #(_f) + 4]; \
     adc RT1hi, RT2hi; \
-    ldr RT4lo, [%sp, #(_g)]; \
+    ldr RT4lo, [sp, #(_g)]; \
     eor RWlo, RWlo, RElo, lsl#23; \
-    ldr RT4hi, [%sp, #(_g) + 4]; \
+    ldr RT4hi, [sp, #(_g) + 4]; \
     eor RWhi, RWhi, REhi, lsl#23; \
     eor RWlo, RWlo, REhi, lsl#18; \
     eor RWhi, RWhi, RElo, lsl#18; \
     \
     /* Load D */ \
     /* t1 += Cho(_e,_f,_g) */ \
-    ldr RElo, [%sp, #(_d)]; \
+    ldr RElo, [sp, #(_d)]; \
     adds RT1lo, RT3lo; \
-    ldr REhi, [%sp, #(_d) + 4]; \
+    ldr REhi, [sp, #(_d) + 4]; \
     adc RT1hi, RT3hi; \
     \
     /* Load A */ \
-    ldr RT3lo, [%sp, #(_a)]; \
+    ldr RT3lo, [sp, #(_a)]; \
     \
     /* _d += t1 */ \
     adds RElo, RT1lo; \
-    ldr RT3hi, [%sp, #(_a) + 4]; \
+    ldr RT3hi, [sp, #(_a) + 4]; \
     adc REhi, RT1hi; \
     \
     /* Store D */ \
-    str RElo, [%sp, #(_d)]; \
+    str RElo, [sp, #(_d)]; \
     \
     /* t2 = Sum0(_a) */ \
     mov RT2lo, RT3lo, lsr#28; \
-    str REhi, [%sp, #(_d) + 4]; \
+    str REhi, [sp, #(_d) + 4]; \
     mov RT2hi, RT3hi, lsr#28; \
-    ldr RWlo, [%sp, #(_b)]; \
+    ldr RWlo, [sp, #(_b)]; \
     eor RT2lo, RT2lo, RT3lo, lsl#30; \
-    ldr RWhi, [%sp, #(_b) + 4]; \
+    ldr RWhi, [sp, #(_b) + 4]; \
     eor RT2hi, RT2hi, RT3hi, lsl#30; \
     eor RT2lo, RT2lo, RT3lo, lsl#25; \
     eor RT2hi, RT2hi, RT3hi, lsl#25; \
     \
     /* t2 += t1 */ \
     adds RT2lo, RT1lo; \
-    ldr RT1lo, [%sp, #(_c)]; \
+    ldr RT1lo, [sp, #(_c)]; \
     adc RT2hi, RT1hi; \
     \
     /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \
-    ldr RT1hi, [%sp, #(_c) + 4]; \
+    ldr RT1hi, [sp, #(_c) + 4]; \
     and RT4lo, RWlo, RT3lo; \
     and RT4hi, RWhi, RT3hi; \
     eor RWlo, RWlo, RT3lo; \
 /* Message expansion */
 
 #define W_0_63(_a,_h,i) \
-    ldr RT3lo, [%sp, #(w(i-2))]; \
+    ldr RT3lo, [sp, #(w(i-2))]; \
     adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
-    ldr RT3hi, [%sp, #(w(i-2)) + 4]; \
+    ldr RT3hi, [sp, #(w(i-2)) + 4]; \
     adc RT2hi, RWhi; \
     /* nw = S1(w[i-2]) */ \
-    ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+    ldr RT1lo, [sp, #(_h)]; /* Load H */ \
     mov RWlo, RT3lo, lsr#19; \
-    str RT2lo, [%sp, #(_a)]; \
+    str RT2lo, [sp, #(_a)]; \
     eor RWlo, RWlo, RT3lo, lsl#3; \
-    ldr RT1hi, [%sp, #(_h) + 4]; \
+    ldr RT1hi, [sp, #(_h) + 4]; \
     mov RWhi, RT3hi, lsr#19; \
-    ldr RT2lo, [%sp, #(w(i-7))]; \
+    ldr RT2lo, [sp, #(w(i-7))]; \
     eor RWhi, RWhi, RT3hi, lsl#3; \
-    str RT2hi, [%sp, #(_a) + 4]; \
+    str RT2hi, [sp, #(_a) + 4]; \
     eor RWlo, RWlo, RT3lo, lsr#6; \
-    ldr RT2hi, [%sp, #(w(i-7)) + 4]; \
+    ldr RT2hi, [sp, #(w(i-7)) + 4]; \
     eor RWhi, RWhi, RT3hi, lsr#6; \
     eor RWlo, RWlo, RT3hi, lsl#13; \
     eor RWhi, RWhi, RT3lo, lsl#13; \
     eor RWlo, RWlo, RT3hi, lsr#29; \
     eor RWhi, RWhi, RT3lo, lsr#29; \
-    ldr RT3lo, [%sp, #(w(i-15))]; \
+    ldr RT3lo, [sp, #(w(i-15))]; \
     eor RWlo, RWlo, RT3hi, lsl#26; \
-    ldr RT3hi, [%sp, #(w(i-15)) + 4]; \
+    ldr RT3hi, [sp, #(w(i-15)) + 4]; \
     \
     adds RT2lo, RWlo; /* nw += w[i-7] */ \
-    ldr RWlo, [%sp, #(w(i-16))]; \
+    ldr RWlo, [sp, #(w(i-16))]; \
     adc RT2hi, RWhi; \
     mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \
-    ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+    ldr RWhi, [sp, #(w(i-16)) + 4]; \
     mov RT4hi, RT3hi, lsr#1; \
     adds RT2lo, RWlo; /* nw += w[i-16] */ \
     eor RT4lo, RT4lo, RT3lo, lsr#8; \
     adc RT2hi, RT4hi; \
     \
     /* w[0] = nw */ \
-    str RT2lo, [%sp, #(w(i))]; \
+    str RT2lo, [sp, #(w(i))]; \
     adds RT1lo, RWlo; \
-    str RT2hi, [%sp, #(w(i)) + 4]; \
+    str RT2hi, [sp, #(w(i)) + 4]; \
     adc RT1hi, RWhi;
 
 #define W_64_79(_a,_h,i) \
     adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
-    ldr RWlo, [%sp, #(w(i-16))]; \
+    ldr RWlo, [sp, #(w(i-16))]; \
     adc RT2hi, RWhi; \
-    ldr RWhi, [%sp, #(w(i-16)) + 4]; \
-    ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
-    ldr RT1hi, [%sp, #(_h) + 4]; \
-    str RT2lo, [%sp, #(_a)]; \
-    str RT2hi, [%sp, #(_a) + 4]; \
+    ldr RWhi, [sp, #(w(i-16)) + 4]; \
+    ldr RT1lo, [sp, #(_h)]; /* Load H */ \
+    ldr RT1hi, [sp, #(_h) + 4]; \
+    str RT2lo, [sp, #(_a)]; \
+    str RT2hi, [sp, #(_a) + 4]; \
     adds RT1lo, RWlo; \
     adc RT1hi, RWhi;
 
 
 _gcry_sha512_transform_arm:
        /* Input:
-        *      %r0: SHA512_CONTEXT
-        *      %r1: data
-        *      %r2: u64 k[] constants
-        *      %r3: nblks
+        *      r0: SHA512_CONTEXT
+        *      r1: data
+        *      r2: u64 k[] constants
+        *      r3: nblks
         */
-       push {%r4-%r11, %ip, %lr};
-       sub %sp, %sp, #STACK_MAX;
-       movs RWlo, %r3;
-       str %r0, [%sp, #(ctx)];
+       push {r4-r11, ip, lr};
+       sub sp, sp, #STACK_MAX;
+       movs RWlo, r3;
+       str r0, [sp, #(ctx)];
 
        beq .Ldone;
 
 .Loop_blocks:
-       str RWlo, [%sp, #nblks];
+       str RWlo, [sp, #nblks];
 
        /* Load context to stack */
-       add RWhi, %sp, #(_a);
-       ldm %r0!,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+       add RWhi, sp, #(_a);
+       ldm r0!,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
-       ldm %r0,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+       ldm r0,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
        stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
        /* Load input to w[16] */
 
        /* test if data is unaligned */
-       tst %r1, #3;
+       tst r1, #3;
        beq 1f;
 
        /* unaligned load */
-       add RWhi, %sp, #(w(0));
-       read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       add RWhi, sp, #(w(0));
+       read_be64_unaligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       read_be64_unaligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       read_be64_unaligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       read_be64_unaligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        b 2f;
 1:
        /* aligned load */
-       add RWhi, %sp, #(w(0));
-       read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       add RWhi, sp, #(w(0));
+       read_be64_aligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       read_be64_aligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       read_be64_aligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
        stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+       read_be64_aligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
 2:
-       add %r1, #(16 * 8);
+       add r1, #(16 * 8);
        stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
-       str %r1, [%sp, #(data)];
+       str r1, [sp, #(data)];
 
        /* preload E & A */
-       ldr RElo, [%sp, #(_e)];
-       ldr REhi, [%sp, #(_e) + 4];
+       ldr RElo, [sp, #(_e)];
+       ldr REhi, [sp, #(_e) + 4];
        mov RWlo, #0;
-       ldr RT2lo, [%sp, #(_a)];
+       ldr RT2lo, [sp, #(_a)];
        mov RRND, #(80-16);
-       ldr RT2hi, [%sp, #(_a) + 4];
+       ldr RT2hi, [sp, #(_a) + 4];
        mov RWhi, #0;
 
 .Loop_rounds:
@@ -406,58 +406,58 @@ _gcry_sha512_transform_arm:
        R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30);
        R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31);
 
-       ldr %r0, [%sp, #(ctx)];
+       ldr r0, [sp, #(ctx)];
        adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */
-       ldr %r1, [%sp, #(data)];
+       ldr r1, [sp, #(data)];
        adc RT2hi, RWhi;
 
-       ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+       ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
        adds RT1lo, RT2lo;
-       ldr RT2lo, [%sp, #(_b + 0)];
+       ldr RT2lo, [sp, #(_b + 0)];
        adc  RT1hi, RT2hi;
-       ldr RT2hi, [%sp, #(_b + 4)];
+       ldr RT2hi, [sp, #(_b + 4)];
        adds RWlo, RT2lo;
-       ldr RT2lo, [%sp, #(_c + 0)];
+       ldr RT2lo, [sp, #(_c + 0)];
        adc  RWhi, RT2hi;
-       ldr RT2hi, [%sp, #(_c + 4)];
+       ldr RT2hi, [sp, #(_c + 4)];
        adds RT3lo, RT2lo;
-       ldr RT2lo, [%sp, #(_d + 0)];
+       ldr RT2lo, [sp, #(_d + 0)];
        adc  RT3hi, RT2hi;
-       ldr RT2hi, [%sp, #(_d + 4)];
+       ldr RT2hi, [sp, #(_d + 4)];
        adds RT4lo, RT2lo;
-       ldr RT2lo, [%sp, #(_e + 0)];
+       ldr RT2lo, [sp, #(_e + 0)];
        adc  RT4hi, RT2hi;
-       stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+       stm r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
 
-       ldr RT2hi, [%sp, #(_e + 4)];
-       ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+       ldr RT2hi, [sp, #(_e + 4)];
+       ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
        adds RT1lo, RT2lo;
-       ldr RT2lo, [%sp, #(_f + 0)];
+       ldr RT2lo, [sp, #(_f + 0)];
        adc  RT1hi, RT2hi;
-       ldr RT2hi, [%sp, #(_f + 4)];
+       ldr RT2hi, [sp, #(_f + 4)];
        adds RWlo, RT2lo;
-       ldr RT2lo, [%sp, #(_g + 0)];
+       ldr RT2lo, [sp, #(_g + 0)];
        adc  RWhi, RT2hi;
-       ldr RT2hi, [%sp, #(_g + 4)];
+       ldr RT2hi, [sp, #(_g + 4)];
        adds RT3lo, RT2lo;
-       ldr RT2lo, [%sp, #(_h + 0)];
+       ldr RT2lo, [sp, #(_h + 0)];
        adc  RT3hi, RT2hi;
-       ldr RT2hi, [%sp, #(_h + 4)];
+       ldr RT2hi, [sp, #(_h + 4)];
        adds RT4lo, RT2lo;
        adc  RT4hi, RT2hi;
-       stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
-       sub %r0, %r0, #(4 * 8);
-       ldr RWlo, [%sp, #nblks];
+       stm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+       sub r0, r0, #(4 * 8);
+       ldr RWlo, [sp, #nblks];
 
        sub RK, #(80 * 8);
        subs RWlo, #1;
        bne .Loop_blocks;
 
 .Ldone:
-       mov %r0, #STACK_MAX;
+       mov r0, #STACK_MAX;
 __out:
-       add %sp, %sp, #STACK_MAX;
-       pop {%r4-%r11, %ip, %pc};
+       add sp, sp, #STACK_MAX;
+       pop {r4-r11, ip, pc};
 .size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm;
 
 #endif
index 2b186b4778853933534404374693812854dda4a9..a1df73b8e9e7d484605f60d3e9e32fc9b395eb86 100644 (file)
@@ -40,7 +40,7 @@
 #define hd_g ((hd_f) + 8)
 
 /* register macros */
-#define RK %r2
+#define RK r2
 
 #define RA d0
 #define RB d1
 
 _gcry_sha512_transform_armv7_neon:
        /* Input:
-        *      %r0: SHA512_CONTEXT
-        *      %r1: data
-        *      %r2: u64 k[] constants
-        *      %r3: nblks
+        *      r0: SHA512_CONTEXT
+        *      r1: data
+        *      r2: u64 k[] constants
+        *      r3: nblks
         */
-       push {%lr};
+       push {lr};
 
-       mov %lr, #0;
+       mov lr, #0;
 
        /* Load context to d0-d7 */
-       vld1.64 {RA-RD}, [%r0]!;
-       vld1.64 {RE-RH}, [%r0];
-       sub %r0, #(4*8);
+       vld1.64 {RA-RD}, [r0]!;
+       vld1.64 {RE-RH}, [r0];
+       sub r0, #(4*8);
 
        /* Load input to w[16], d16-d31 */
        /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
-       vld1.64 {RW0-RW3}, [%r1]!;
-       vld1.64 {RW4-RW7}, [%r1]!;
-       vld1.64 {RW8-RW11}, [%r1]!;
-       vld1.64 {RW12-RW15}, [%r1]!;
+       vld1.64 {RW0-RW3}, [r1]!;
+       vld1.64 {RW4-RW7}, [r1]!;
+       vld1.64 {RW8-RW11}, [r1]!;
+       vld1.64 {RW12-RW15}, [r1]!;
 #ifdef __ARMEL__
        /* byteswap */
        vrev64.8 RW01q, RW01q;
@@ -334,46 +334,46 @@ _gcry_sha512_transform_armv7_neon:
        rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
        rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
        rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
-       add %lr, #16;
+       add lr, #16;
        rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
-       cmp %lr, #64;
+       cmp lr, #64;
        rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
        bne .Loop_rounds;
 
-       subs %r3, #1;
+       subs r3, #1;
 
        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
        beq .Lhandle_tail;
-       vld1.64 {RW0-RW3}, [%r1]!;
+       vld1.64 {RW0-RW3}, [r1]!;
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
 #ifdef __ARMEL__
        vrev64.8 RW01q, RW01q;
        vrev64.8 RW23q, RW23q;
 #endif
-       vld1.64 {RW4-RW7}, [%r1]!;
+       vld1.64 {RW4-RW7}, [r1]!;
        rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
        rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
 #ifdef __ARMEL__
        vrev64.8 RW45q, RW45q;
        vrev64.8 RW67q, RW67q;
 #endif
-       vld1.64 {RW8-RW11}, [%r1]!;
+       vld1.64 {RW8-RW11}, [r1]!;
        rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
 #ifdef __ARMEL__
        vrev64.8 RW89q, RW89q;
        vrev64.8 RW1011q, RW1011q;
 #endif
-       vld1.64 {RW12-RW15}, [%r1]!;
+       vld1.64 {RW12-RW15}, [r1]!;
        vadd_rg_RT0(RA);
        vadd_rg_RT1(RA);
 
        /* Load context */
-       vld1.64 {RT0-RT3}, [%r0]!;
-       vld1.64 {RT4-RT7}, [%r0];
-       sub %r0, #(4*8);
+       vld1.64 {RT0-RT3}, [r0]!;
+       vld1.64 {RT4-RT7}, [r0];
+       sub r0, #(4*8);
 
 #ifdef __ARMEL__
        vrev64.8 RW1213q, RW1213q;
@@ -390,11 +390,11 @@ _gcry_sha512_transform_armv7_neon:
        vadd.u64 RH, RT7;
 
        /* Store the first half of context */
-       vst1.64 {RA-RD}, [%r0]!;
+       vst1.64 {RA-RD}, [r0]!;
        sub RK, $(8*80);
-       vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
-       mov %lr, #0;
-       sub %r0, #(4*8);
+       vst1.64 {RE-RH}, [r0]; /* Store the last half of context */
+       mov lr, #0;
+       sub r0, #(4*8);
 
        b .Loop;
 .ltorg
@@ -408,11 +408,11 @@ _gcry_sha512_transform_armv7_neon:
        rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
 
        /* Load context to d16-d23 */
-       vld1.64 {RW0-RW3}, [%r0]!;
+       vld1.64 {RW0-RW3}, [r0]!;
        vadd_rg_RT0(RA);
-       vld1.64 {RW4-RW7}, [%r0];
+       vld1.64 {RW4-RW7}, [r0];
        vadd_rg_RT1(RA);
-       sub %r0, #(4*8);
+       sub r0, #(4*8);
 
        vadd.u64 RA, RW0;
        vadd.u64 RB, RW1;
@@ -424,7 +424,7 @@ _gcry_sha512_transform_armv7_neon:
        vadd.u64 RH, RW7;
 
        /* Store the first half of context */
-       vst1.64 {RA-RD}, [%r0]!;
+       vst1.64 {RA-RD}, [r0]!;
 
        /* Clear used registers */
        /* d16-d31 */
@@ -432,7 +432,7 @@ _gcry_sha512_transform_armv7_neon:
        CLEAR_REG(RW23q);
        CLEAR_REG(RW45q);
        CLEAR_REG(RW67q);
-       vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+       vst1.64 {RE-RH}, [r0]; /* Store the last half of context */
        CLEAR_REG(RW89q);
        CLEAR_REG(RW1011q);
        CLEAR_REG(RW1213q);
@@ -440,13 +440,13 @@ _gcry_sha512_transform_armv7_neon:
        /* d8-d15 */
        vpop {RT0-RT7};
        /* d0-d7 (q0-q3) */
-       CLEAR_REG(%q0);
-       CLEAR_REG(%q1);
-       CLEAR_REG(%q2);
-       CLEAR_REG(%q3);
+       CLEAR_REG(q0);
+       CLEAR_REG(q1);
+       CLEAR_REG(q2);
+       CLEAR_REG(q3);
 
-       eor %r0, %r0;
-       pop {%pc};
+       eor r0, r0;
+       pop {pc};
 .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;
 
 #endif
diff --git a/cipher/sha512-armv8-aarch64-ce.S b/cipher/sha512-armv8-aarch64-ce.S
new file mode 100644 (file)
index 0000000..fa22519
--- /dev/null
@@ -0,0 +1,383 @@
+/* sha512-armv8-aarch64-ce.S - ARM/CE accelerated SHA-512 transform function
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4) && \
+    defined(USE_SHA512)
+
+.arch armv8.2-a+sha3+sm4
+
+.text
+
+
+/* Register macros */
+
+#define Qv0 q0
+#define Qv1 q1
+#define Qv2 q2
+#define Qv3 q3
+#define Qv4 q4
+
+#define vT0 v5
+#define vT1 v6
+#define QvT1 q6
+#define vT2 v7
+#define vT3 v16
+
+#define vH01 v17
+#define vH23 v18
+#define vH45 v19
+#define vH67 v20
+
+#define vW0 v21
+#define vW1 v22
+#define vW2 v23
+#define vW3 v24
+#define vW4 v25
+#define vW5 v26
+#define vW6 v27
+#define vW7 v28
+
+#define vK0 v29
+#define vK1 v30
+#define vK2 v31
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+
+#define do_add(a, b) add a.2d, a.2d, b.2d;
+
+#define load_k_3() ld1 {vK0.2d-vK2.2d}, [x3], #48;
+#define load_k_last() ld1 {vK0.2d}, [x3];
+
+#define load_msg1(...) \
+        ld1 {vW0.16b-vW3.16b}, [x1], #64;
+
+#define load_msg2(...) \
+        rev64 vW0.16b, vW0.16b;
+
+#define load_msg3(...) \
+        rev64 vW1.16b, vW1.16b;
+
+#define load_msg4(...) \
+        ld1 {vW4.16b-vW7.16b}, [x1], #64;
+
+#define load_msg5(...) \
+        rev64 vW2.16b, vW2.16b;
+
+#define load_msg6(...) \
+        rev64 vW3.16b, vW3.16b;
+
+#define load_msg7(...) \
+        rev64 vW4.16b, vW4.16b;
+
+#define load_msg8(...) \
+        rev64 vW5.16b, vW5.16b;
+
+#define load_msg9(...) \
+        rev64 vW6.16b, vW6.16b;
+
+#define load_msg10(...) \
+        rev64 vW7.16b, vW7.16b;
+
+#define schedule1(w0, w1, w2, w3, w4, w5, w6, w7) \
+        sha512su0 w0.2d, w1.2d; \
+
+#define schedule2(w0, w1, w2, w3, w4, w5, w6, w7) \
+        ext vT2.16b, w4.16b, w5.16b, #8; \
+        sha512su1 w0.2d, w7.2d, vT2.2d;
+
+#define do_round2(ab, cd, ef, gh, cd_out, \
+                  load_nextk_op, k, \
+                  sched_op1, sched_op2, w0, w1, w2, w3, w4, w5, w6, w7) \
+        add vT3.2d, k.2d, w0.2d; \
+            load_nextk_op(); \
+        ext vT1.16b, ef.16b, gh.16b, #8; \
+        ext vT3.16b, vT3.16b, vT3.16b, #8; \
+        ext vT0.16b, cd.16b, ef.16b, #8; \
+        add gh.2d, gh.2d, vT3.2d; \
+            sched_op1(w0, w1, w2, w3, w4, w5, w6, w7); \
+        sha512h Q##gh, Q##vT1, vT0.2d; \
+            sched_op2(w0, w1, w2, w3, w4, w5, w6, w7); \
+        add cd_out.2d, gh.2d, cd.2d; \
+        sha512h2 Q##gh, Q##cd, ab.2d; \
+
+
+/* Other functional macros */
+
+#undef CLEAR_REG
+#define CLEAR_REG(reg, ...) movi reg.16b, #0;
+
+
+/*
+ * unsigned int
+ * _gcry_sha512_transform_armv8_ce (u64 state[8], const void *input_data,
+ *                                  size_t num_blks, const u64 k[80])
+ */
+.align 4
+.globl _gcry_sha512_transform_armv8_ce
+ELF(.type  _gcry_sha512_transform_armv8_ce,%function;)
+_gcry_sha512_transform_armv8_ce:
+  /* input:
+   *   x0: ctx, CTX
+   *   x1: data (128*nblks bytes)
+   *   x2: nblks
+   *   x3: k table
+   */
+  CFI_STARTPROC()
+
+  cbz x2, .Ldo_nothing
+
+  mov x4, x3
+
+  ld1 {vH01.2d-vH67.2d}, [x0]  /* load state */
+
+  load_msg1()
+  mov v0.16b, vH01.16b
+  mov v1.16b, vH23.16b
+  load_k_3()
+  load_msg2()
+  load_msg3()
+  load_msg4()
+  mov v2.16b, vH45.16b
+  mov v3.16b, vH67.16b
+  load_msg5()
+  load_msg6()
+  load_msg7()
+  load_msg8()
+  load_msg9()
+  load_msg10()
+
+.Loop:
+  sub x2, x2, #1
+
+  # rounds 1-16
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK0,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK1,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  # rounds 17-32
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_3, vK2,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK0,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK1,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v3, v0, v4, v2, v1,
+            load_k_3, vK2,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK0,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK1,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v1, v4, v3, v0, v2,
+            load_k_3, vK2,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK0,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  # rounds 33-48
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK1,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_3, vK2,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  # rounds 49-64
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK0,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK1,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v3, v0, v4, v2, v1,
+            load_k_3, vK2,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK0,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK1,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v1, v4, v3, v0, v2,
+            load_k_3, vK2,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK0,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK1,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  cbz x2, .Lend
+
+  # rounds 65-80
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            _, _, vW0, , , , , , , )
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            _, _, vW1, , , , , , , )
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            _, _, vW2, , , , , , , )
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            _, _, vW3, , , , , , , )
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            load_msg1, _, vW4, , , , , , , )
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            load_msg2, _, vW5, , , , , , , )
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_last, vK2,
+            load_msg3, _, vW6, , , , , , , )
+  mov x3, x4
+  do_round2(v1, v4, v3, v0, v2,
+            load_k_3,    vK0,
+            load_msg4, load_msg5, vW7, , , , , , , )
+
+  load_msg6()
+  load_msg7()
+
+  add vH01.2d, vH01.2d, v0.2d
+  add vH23.2d, vH23.2d, v1.2d
+  add vH45.2d, vH45.2d, v2.2d
+  add vH67.2d, vH67.2d, v3.2d
+  load_msg8()
+  load_msg9()
+  load_msg10()
+  mov v0.16b, vH01.16b
+  mov v1.16b, vH23.16b
+  mov v2.16b, vH45.16b
+  mov v3.16b, vH67.16b
+
+  b .Loop
+
+.Lend:
+
+  # rounds 65-80
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            CLEAR_REG, _, vW0, , , , , , , )
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            CLEAR_REG, _, vW1, , , , , , , )
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            CLEAR_REG, _, vW2, , , , , , , )
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            CLEAR_REG, _, vW3, , , , , , , )
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            CLEAR_REG, _, vW4, , , , , , , )
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            CLEAR_REG, _, vW5, , , , , , , )
+  CLEAR_REG(vK1)
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_last, vK2,
+            CLEAR_REG, _, vW6, , , , , , , )
+  CLEAR_REG(vK2)
+  do_round2(v1, v4, v3, v0, v2,
+            _,           vK0,
+            CLEAR_REG, _, vW7, , , , , , , )
+  CLEAR_REG(vK0)
+
+  CLEAR_REG(v4)
+  add vH01.2d, vH01.2d, v0.2d
+  CLEAR_REG(v0)
+  add vH23.2d, vH23.2d, v1.2d
+  CLEAR_REG(v1)
+  add vH45.2d, vH45.2d, v2.2d
+  CLEAR_REG(v2)
+  add vH67.2d, vH67.2d, v3.2d
+  CLEAR_REG(v3)
+  CLEAR_REG(vT0)
+  CLEAR_REG(vT1)
+  CLEAR_REG(vT2)
+  CLEAR_REG(vT3)
+
+  st1 {vH01.2d-vH67.2d}, [x0] /* store state */
+
+  CLEAR_REG(vH01)
+  CLEAR_REG(vH23)
+  CLEAR_REG(vH45)
+  CLEAR_REG(vH67)
+
+.Ldo_nothing:
+  mov x0, #0
+  ret_spec_stop
+  CFI_ENDPROC()
+ELF(.size _gcry_sha512_transform_armv8_ce,.-_gcry_sha512_transform_armv8_ce;)
+
+#endif
index bfc4435d5c25c41f8bde3973593e609a4a7def68..1bd38060ba8b6d300724580dd14ca84bf5465400 100644 (file)
@@ -408,6 +408,11 @@ _gcry_sha512_transform_amd64_avx:
 ;;; Binary Data
 */
 
+SECTION_RODATA
+
+ELF(.type _sha512_avx_consts,@object)
+_sha512_avx_consts:
+
 .align 16
 
 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
index a431e196a920c44c9c5a58d31ca71bbd75769671..7b60bf1d376459b5f41e85fc686ff2480e0ce966 100644 (file)
@@ -445,6 +445,11 @@ _gcry_sha512_transform_amd64_avx2:
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 /*;; Binary Data */
 
+SECTION_RODATA
+
+ELF(.type _sha512_avx2_consts,@object)
+_sha512_avx2_consts:
+
 .align 64
 /* K[t] used in SHA512 hashing */
 .LK512:
diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S
new file mode 100644 (file)
index 0000000..61c72e5
--- /dev/null
@@ -0,0 +1,465 @@
+/* sha512-avx512-amd64.S - amd64/AVX512 implementation of SHA-512 transform
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Based on implementation from file "sha512-avx2-bmi2-amd64.S":
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+;   notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+;   notice, this list of conditions and the following disclaimer in the
+;   documentation and/or other materials provided with the
+;   distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+;   contributors may be used to endorse or promote products derived from
+;   this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    defined(USE_SHA512)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+.text
+
+/* Virtual Registers */
+#define Y_0 ymm0
+#define Y_1 ymm1
+#define Y_2 ymm2
+#define Y_3 ymm3
+
+#define YTMP0 ymm4
+#define YTMP1 ymm5
+#define YTMP2 ymm6
+#define YTMP3 ymm7
+#define YTMP4 ymm8
+#define XFER YTMP0
+
+#define BYTE_FLIP_MASK ymm9
+#define PERM_VPALIGNR_8 ymm10
+
+#define MASK_DC_00 k1
+
+#define INP rdi /* 1st arg */
+#define CTX rsi /* 2nd arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define SRND r8d
+#define RSP_SAVE r9
+
+#define TBL rcx
+
+#define a xmm11
+#define b xmm12
+#define c xmm13
+#define d xmm14
+#define e xmm15
+#define f xmm16
+#define g xmm17
+#define h xmm18
+
+#define y0 xmm19
+#define y1 xmm20
+#define y2 xmm21
+#define y3 xmm22
+
+/* Local variables (stack frame) */
+#define frame_XFER         0
+#define frame_XFER_size    (4*4*8)
+#define frame_size         (frame_XFER + frame_XFER_size)
+
+#define clear_reg(x) vpxorq x,x,x
+
+/* addm [mem], reg */
+/* Add reg to mem using reg-mem add and store */
+#define addm(p1, p2) \
+       vmovq   y0, p1; \
+       vpaddq  p2, p2, y0; \
+       vmovq   p1, p2;
+
+/* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */
+/* Load ymm with mem and byte swap each qword */
+#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
+       vmovdqu p1, p2; \
+       vpshufb p1, p1, p3
+
+/* MY_VPALIGNR(YDST_SRC1, YSRC2, RVAL) */
+/* YDST_SRC1 = {YDST_SRC1, YSRC2} >> RVAL*8 */
+#define MY_VPALIGNR(YDST_SRC1, YSRC2, RVAL) \
+       vpermt2q YDST_SRC1, PERM_VPALIGNR_##RVAL, YSRC2;
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+       /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \
+        * d += h; \
+        * h += Sum0 (a) + Maj (a, b, c); \
+        * \
+        * Ch(x, y, z) => ((x & y) + (~x & z)) \
+        * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \
+        */ \
+       \
+       vmovq y3, [XFERIN]; \
+       vmovdqa64 y2, e; \
+       vpaddq h, h, y3; \
+       vprorq y0, e, 41; \
+       vpternlogq y2, f, g, 0xca; /* Ch (e, f, g) */ \
+       vprorq y1, e, 18; \
+       vprorq y3, e, 14; \
+       vpaddq h, h, y2; \
+       vpternlogq y0, y1, y3, 0x96; /* Sum1 (e) */ \
+       vpaddq h, h, y0; /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]) */ \
+       vpaddq d, d, h; /* d += h */
+
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+       vmovdqa64 y1, a; \
+       vprorq y0, a, 39; \
+       vpternlogq y1, b, c, 0xe8; /* Maj (a, b, c) */ \
+       vprorq y2, a, 34; \
+       vprorq y3, a, 28; \
+       vpternlogq y0, y2, y3, 0x96; /* Sum0 (a) */ \
+       vpaddq h, h, y1; \
+       vpaddq h, h, y0; /* h += Sum0 (a) + Maj (a, b, c) */
+
+#define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \
+       /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+               vmovdqa         YTMP0, Y_3; \
+               vmovdqa         YTMP1, Y_1; \
+               /* Extract w[t-7] */; \
+               vpermt2q        YTMP0, PERM_VPALIGNR_8, Y_2     /* YTMP0 = W[-7] */; \
+               /* Calculate w[t-16] + w[t-7] */; \
+               vpaddq          YTMP0, YTMP0, Y_0               /* YTMP0 = W[-7] + W[-16] */; \
+               /* Extract w[t-15] */; \
+               vpermt2q        YTMP1, PERM_VPALIGNR_8, Y_0     /* YTMP1 = W[-15] */; \
+       ONE_ROUND_PART1(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+               \
+               /* Calculate sigma0 */; \
+               \
+               /* Calculate w[t-15] ror 1 */; \
+               vprorq          YTMP3, YTMP1, 1;                /* YTMP3 = W[-15] ror 1 */; \
+               /* Calculate w[t-15] shr 7 */; \
+               vpsrlq          YTMP4, YTMP1, 7                 /* YTMP4 = W[-15] >> 7 */; \
+       \
+       ONE_ROUND_PART2(a, b, c, d, e, f, g, h); \
+       \
+       /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+               /* Calculate w[t-15] ror 8 */; \
+               vprorq          YTMP1, YTMP1, 8                 /* YTMP1 = W[-15] ror 8 */; \
+               /* XOR the three components */; \
+               vpternlogq      YTMP1, YTMP3, YTMP4, 0x96       /* YTMP1 = s0 = W[-15] ror 1 ^ W[-15] >> 7 ^ W[-15] ror 8 */; \
+               \
+               /* Add three components, w[t-16], w[t-7] and sigma0 */; \
+               vpaddq          YTMP0, YTMP0, YTMP1             /* YTMP0 = W[-16] + W[-7] + s0 */; \
+       ONE_ROUND_PART1(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+               /* Move to appropriate lanes for calculating w[16] and w[17] */; \
+               vshufi64x2      Y_0, YTMP0, YTMP0, 0x0          /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \
+               \
+               /* Calculate w[16] and w[17] in both 128 bit lanes */; \
+               \
+               /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \
+               vshufi64x2      YTMP2, Y_3, Y_3, 0b11           /* YTMP2 = W[-2] {BABA} */; \
+               vpsrlq          YTMP4, YTMP2, 6                 /* YTMP4 = W[-2] >> 6 {BABA} */; \
+       \
+       ONE_ROUND_PART2(h, a, b, c, d, e, f, g); \
+       \
+       /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+               vprorq          YTMP3, YTMP2, 19                /* YTMP3 = W[-2] ror 19 {BABA} */; \
+               vprorq          YTMP1, YTMP2, 61                /* YTMP1 = W[-2] ror 61 {BABA} */; \
+               vpternlogq      YTMP4, YTMP3, YTMP1, 0x96       /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \
+               \
+       ONE_ROUND_PART1(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+               /* Add sigma1 to the other components to get w[16] and w[17] */; \
+               vpaddq          Y_0, Y_0, YTMP4                 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \
+               \
+               /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \
+               vpsrlq          YTMP4, Y_0, 6                   /* YTMP4 = W[-2] >> 6 {DC--} */; \
+       \
+       ONE_ROUND_PART2(g, h, a, b, c, d, e, f); \
+       \
+       /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+               vprorq          YTMP3, Y_0, 19                  /* YTMP3 = W[-2] ror 19 {DC--} */; \
+               vprorq          YTMP1, Y_0, 61                  /* YTMP1 = W[-2] ror 61 {DC--} */; \
+               vpternlogq      YTMP4, YTMP3, YTMP1, 0x96       /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \
+               \
+       ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \
+               /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \
+               /* Form w[19], w[18], w[17], w[16] */; \
+               vpaddq          Y_0{MASK_DC_00}, YTMP0, YTMP4   /* Y_0 = {W[3], W[2], W[1], W[0]} */; \
+               \
+               vpaddq          XFER, Y_0, [TBL + (4+X)*32]; \
+               vmovdqa         [rsp + frame_XFER + X*32], XFER; \
+       ONE_ROUND_PART2(f, g, h, a, b, c, d, e)
+
+#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \
+       ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \
+       ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \
+       ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+       ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+       ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+       ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e)
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_avx512(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+;   message blocks.
+; L is the message length in SHA512 blocks
+*/
+.globl _gcry_sha512_transform_amd64_avx512
+ELF(.type _gcry_sha512_transform_amd64_avx512,@function;)
+.align 16
+_gcry_sha512_transform_amd64_avx512:
+       CFI_STARTPROC()
+       xor     eax, eax
+
+       cmp     rdx, 0
+       je      .Lnowork
+
+       spec_stop_avx512_intel_syntax;
+
+       /* Setup mask register for DC:BA merging. */
+       mov     eax, 0b1100
+       kmovd   MASK_DC_00, eax
+
+       /* Allocate Stack Space */
+       mov     RSP_SAVE, rsp
+       CFI_DEF_CFA_REGISTER(RSP_SAVE);
+       sub     rsp, frame_size
+       and     rsp, ~(0x40 - 1)
+
+       /*; load initial digest */
+       vmovq   a,[8*0 + CTX]
+       vmovq   b,[8*1 + CTX]
+       vmovq   c,[8*2 + CTX]
+       vmovq   d,[8*3 + CTX]
+       vmovq   e,[8*4 + CTX]
+       vmovq   f,[8*5 + CTX]
+       vmovq   g,[8*6 + CTX]
+       vmovq   h,[8*7 + CTX]
+
+       vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
+       vpmovzxbq PERM_VPALIGNR_8, [.LPERM_VPALIGNR_8 ADD_RIP]
+
+       lea     TBL,[.LK512 ADD_RIP]
+
+       /*; byte swap first 16 qwords */
+       COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+       COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+       COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+       COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
+
+       lea     INP, [INP + 128]
+
+       vpaddq  XFER, Y_0, [TBL + 0*32]
+       vmovdqa [rsp + frame_XFER + 0*32], XFER
+       vpaddq  XFER, Y_1, [TBL + 1*32]
+       vmovdqa [rsp + frame_XFER + 1*32], XFER
+       vpaddq  XFER, Y_2, [TBL + 2*32]
+       vmovdqa [rsp + frame_XFER + 2*32], XFER
+       vpaddq  XFER, Y_3, [TBL + 3*32]
+       vmovdqa [rsp + frame_XFER + 3*32], XFER
+
+       /*; schedule 64 input qwords, by doing 4 iterations of 16 rounds each */
+       mov     SRND, 4
+
+.align 16
+.Loop0:
+       FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h)
+       FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d)
+       FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h)
+       FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d)
+       lea     TBL, [TBL + 4*32]
+
+       sub     SRND, 1
+       jne     .Loop0
+
+       sub     NUM_BLKS, 1
+       je      .Ldone_hash
+
+       lea     TBL, [.LK512 ADD_RIP]
+
+       /* load next block and byte swap */
+       COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+       COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+       COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+       COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
+
+       lea     INP, [INP + 128]
+
+       DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
+       vpaddq  XFER, Y_0, [TBL + 0*32]
+       vmovdqa [rsp + frame_XFER + 0*32], XFER
+       DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
+       vpaddq  XFER, Y_1, [TBL + 1*32]
+       vmovdqa [rsp + frame_XFER + 1*32], XFER
+       DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
+       vpaddq  XFER, Y_2, [TBL + 2*32]
+       vmovdqa [rsp + frame_XFER + 2*32], XFER
+       DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
+       vpaddq  XFER, Y_3, [TBL + 3*32]
+       vmovdqa [rsp + frame_XFER + 3*32], XFER
+
+       addm([8*0 + CTX],a)
+       addm([8*1 + CTX],b)
+       addm([8*2 + CTX],c)
+       addm([8*3 + CTX],d)
+       addm([8*4 + CTX],e)
+       addm([8*5 + CTX],f)
+       addm([8*6 + CTX],g)
+       addm([8*7 + CTX],h)
+
+       /*; schedule 64 input qwords, by doing 4 iterations of 16 rounds each */
+       mov     SRND, 4
+
+       jmp     .Loop0
+
+.Ldone_hash:
+       DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
+       DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
+       DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
+       DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
+
+       addm([8*0 + CTX],a)
+       xor     eax, eax /* burn stack */
+       addm([8*1 + CTX],b)
+       addm([8*2 + CTX],c)
+       addm([8*3 + CTX],d)
+       addm([8*4 + CTX],e)
+       addm([8*5 + CTX],f)
+       addm([8*6 + CTX],g)
+       addm([8*7 + CTX],h)
+       kxord MASK_DC_00, MASK_DC_00, MASK_DC_00
+
+       vzeroall
+       vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */
+       vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */
+       vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */
+       vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */
+       clear_reg(ymm16);
+       clear_reg(ymm17);
+       clear_reg(ymm18);
+       clear_reg(ymm19);
+       clear_reg(ymm20);
+       clear_reg(ymm21);
+       clear_reg(ymm22);
+
+       /* Restore Stack Pointer */
+       mov     rsp, RSP_SAVE
+       CFI_DEF_CFA_REGISTER(rsp)
+
+.Lnowork:
+       ret_spec_stop
+       CFI_ENDPROC()
+ELF(.size _gcry_sha512_transform_amd64_avx512,.-_gcry_sha512_transform_amd64_avx512)
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+/*;; Binary Data */
+
+SECTION_RODATA
+
+ELF(.type _gcry_sha512_avx512_consts,@object)
+_gcry_sha512_avx512_consts:
+.align 64
+/* K[t] used in SHA512 hashing */
+.LK512:
+       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
+       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+       .quad   0x3956c25bf348b538,0x59f111f1b605d019
+       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
+       .quad   0xd807aa98a3030242,0x12835b0145706fbe
+       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
+       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
+       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
+       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
+       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
+       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
+       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
+       .quad   0x06ca6351e003826f,0x142929670a0e6e70
+       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
+       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
+       .quad   0x81c2c92e47edaee6,0x92722c851482353b
+       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
+       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
+       .quad   0xd192e819d6ef5218,0xd69906245565a910
+       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
+       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
+       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
+       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
+       .quad   0x90befffa23631e28,0xa4506cebde82bde9
+       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
+       .quad   0xca273eceea26619c,0xd186b8c721c0c207
+       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
+       .quad   0x113f9804bef90dae,0x1b710b35131c471b
+       .quad   0x28db77f523047d84,0x32caab7b40c72493
+       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
+.align 32
+.LPSHUFFLE_BYTE_FLIP_MASK:     .octa 0x08090a0b0c0d0e0f0001020304050607
+                               .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.align 4
+.LPERM_VPALIGNR_8:             .byte 5, 6, 7, 0
+ELF(.size _gcry_sha512_avx512_consts,.-_gcry_sha512_avx512_consts)
+
+#endif
+#endif
index 31ea25bf9a4163f81eb129db96f1e1635dfa509e..d213c241ebcdfefc60e8a8bd659db0e65a7127de 100644 (file)
@@ -1,5 +1,5 @@
 /* sha512-ppc.c - PowerPC vcrypto implementation of SHA-512 transform
- * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2019,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -41,57 +41,68 @@ typedef vector unsigned long long vector2x_u64;
 #define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
 #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
 
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
 
-static const u64 K[80] =
-  {
-    U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
-    U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
-    U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
-    U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
-    U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
-    U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
-    U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
-    U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
-    U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
-    U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
-    U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
-    U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
-    U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
-    U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
-    U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
-    U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
-    U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
-    U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
-    U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
-    U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
-    U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
-    U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
-    U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
-    U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
-    U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
-    U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
-    U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
-    U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
-    U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
-    U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
-    U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
-    U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
-    U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
-    U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
-    U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
-    U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
-    U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
-    U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
-    U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
-    U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
-  };
 
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+#endif
 
-static ASM_FUNC_ATTR_INLINE u64
-ror64 (u64 v, u64 shift)
-{
-  return (v >> (shift & 63)) ^ (v << ((64 - shift) & 63));
-}
+
+static const vector2x_u64 K[80] =
+  {
+    { U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd) },
+    { U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc) },
+    { U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019) },
+    { U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118) },
+    { U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe) },
+    { U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2) },
+    { U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1) },
+    { U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694) },
+    { U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3) },
+    { U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65) },
+    { U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483) },
+    { U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5) },
+    { U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210) },
+    { U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4) },
+    { U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725) },
+    { U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70) },
+    { U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926) },
+    { U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df) },
+    { U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8) },
+    { U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b) },
+    { U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001) },
+    { U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30) },
+    { U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910) },
+    { U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8) },
+    { U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53) },
+    { U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8) },
+    { U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb) },
+    { U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3) },
+    { U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60) },
+    { U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec) },
+    { U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9) },
+    { U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b) },
+    { U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207) },
+    { U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178) },
+    { U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6) },
+    { U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b) },
+    { U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493) },
+    { U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c) },
+    { U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a) },
+    { U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) }
+  };
 
 
 static ASM_FUNC_ATTR_INLINE vector2x_u64
@@ -123,6 +134,17 @@ vec_vshasigma_u64(vector2x_u64 v, unsigned int a, unsigned int b)
 }
 
 
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+vec_add_u64(vector2x_u64 v, vector2x_u64 w)
+{
+  __asm__ ("vaddudm %0,%1,%2"
+          : "=v" (v)
+          : "v" (v), "v" (w)
+          : "memory");
+  return v;
+}
+
+
 static ASM_FUNC_ATTR_INLINE vector2x_u64
 vec_u64_load(unsigned long offset, const void *ptr)
 {
@@ -171,19 +193,59 @@ vec_u64_store(vector2x_u64 vecu64, unsigned long offset, void *ptr)
 }
 
 
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+vec_u64_load_be(unsigned long offset, const void *ptr)
+{
+  vector2x_u64 vecu64;
+#if __GNUC__ >= 4
+  if (__builtin_constant_p (offset) && offset == 0)
+    __asm__ volatile ("lxvd2x %x0,0,%1\n\t"
+                     : "=wa" (vecu64)
+                     : "r" ((uintptr_t)ptr)
+                     : "memory");
+  else
+#endif
+    __asm__ volatile ("lxvd2x %x0,%1,%2\n\t"
+                     : "=wa" (vecu64)
+                     : "r" (offset), "r" ((uintptr_t)ptr)
+                     : "memory", "r0");
+#ifndef WORDS_BIGENDIAN
+  return (vector2x_u64)vec_reve((vector16x_u8)vecu64);
+#else
+  return vecu64;
+#endif
+}
+
+
 /* SHA2 round in vector registers */
-#define R(a,b,c,d,e,f,g,h,k,w) do                             \
+#define R(a,b,c,d,e,f,g,h,ki,w) do                            \
     {                                                         \
-      t1  = (h);                                              \
-      t1 += ((k) + (w));                                      \
-      t1 += Cho((e),(f),(g));                                 \
-      t1 += Sum1((e));                                        \
-      t2  = Sum0((a));                                        \
-      t2 += Maj((a),(b),(c));                                 \
-      d  += t1;                                               \
-      h   = t1 + t2;                                          \
+      t1 = vec_add_u64((h), (w));                             \
+      t2 = Cho((e),(f),(g));                                  \
+      t1 = vec_add_u64(t1, GETK(ki));                         \
+      t1 = vec_add_u64(t1, t2);                               \
+      t1 = Sum1add(t1, e);                                    \
+      t2 = Maj((a),(b),(c));                                  \
+      t2 = Sum0add(t2, a);                                    \
+      h  = vec_add_u64(t1, t2);                               \
+      d += t1;                                                \
     } while (0)
 
+#define GETK(kidx) \
+    ({ \
+      if (((kidx) % 2) == 0) \
+       { \
+         ktmp = *(kptr++); \
+         if ((kidx) < 79) \
+           asm volatile("" : "+r" (kptr) :: "memory"); \
+       } \
+      else \
+       { \
+         ktmp = vec_mergel(ktmp, ktmp); \
+       } \
+      ktmp; \
+    })
+
 #define Cho(b, c, d)  (vec_sel(d, c, b))
 
 #define Maj(c, d, b)  (vec_sel(c, b, c ^ d))
@@ -192,29 +254,98 @@ vec_u64_store(vector2x_u64 vecu64, unsigned long offset, void *ptr)
 
 #define Sum1(x)       (vec_vshasigma_u64(x, 1, 15))
 
-
-/* Message expansion on general purpose registers */
-#define S0(x) (ror64 ((x), 1) ^ ror64 ((x), 8) ^ ((x) >> 7))
-#define S1(x) (ror64 ((x), 19) ^ ror64 ((x), 61) ^ ((x) >> 6))
-
-#define I(i) ( w[i] = buf_get_be64(data + i * 8) )
-#define WN(i) ({ w[i&0x0f] +=    w[(i-7) &0x0f];  \
-                w[i&0x0f] += S0(w[(i-15)&0x0f]); \
-                w[i&0x0f] += S1(w[(i-2) &0x0f]); \
-                w[i&0x0f]; })
-#define W(i) ({ u64 r = w[i&0x0f]; WN(i); r; })
-#define L(i) w[i&0x0f]
-
-
-unsigned int ASM_FUNC_ATTR
-_gcry_sha512_transform_ppc8(u64 state[8],
-                           const unsigned char *data, size_t nblks)
+#define S0(x)         (vec_vshasigma_u64(x, 0, 0))
+
+#define S1(x)         (vec_vshasigma_u64(x, 0, 15))
+
+#define Xadd(X, d, x) vec_add_u64(d, X(x))
+
+#define Sum0add(d, x) Xadd(Sum0, d, x)
+
+#define Sum1add(d, x) Xadd(Sum1, d, x)
+
+#define S0add(d, x)   Xadd(S0, d, x)
+
+#define S1add(d, x)   Xadd(S1, d, x)
+
+#define I(i) \
+    ({ \
+      if (((i) % 2) == 0) \
+       { \
+         w[i] = vec_u64_load_be(0, data); \
+         data += 2 * 8; \
+         if ((i) / 2 < 7) \
+           asm volatile("" : "+r"(data) :: "memory"); \
+       } \
+      else \
+       { \
+         w[i] = vec_mergel(w[(i) - 1], w[(i) - 1]); \
+       } \
+    })
+
+#define WN(i) ({ w[(i)&0x0f] += w[((i)-7) &0x0f];  \
+                w[(i)&0x0f] = S0add(w[(i)&0x0f], w[((i)-15)&0x0f]); \
+                w[(i)&0x0f] = S1add(w[(i)&0x0f], w[((i)-2) &0x0f]); })
+
+#define W(i) ({ vector2x_u64 r = w[(i)&0x0f]; WN(i); r; })
+
+#define L(i) w[(i)&0x0f]
+
+#define I2(i) \
+    ({ \
+      if (((i) % 2) == 0) \
+       { \
+         w[i] = vec_u64_load_be(0, data); \
+       } \
+      else \
+       { \
+         vector2x_u64 it1 = vec_u64_load_be(128, data); \
+         vector2x_u64 it2 = vec_mergeh(w[(i) - 1], it1); \
+         w[i] = vec_mergel(w[(i) - 1], it1); \
+         w[(i) - 1] = it2; \
+         if ((i) < 15) \
+           { \
+             data += 2 * 8; \
+             asm volatile("" : "+r"(data) :: "memory"); \
+           } \
+         else \
+           { \
+             data += 2 * 8 + 128; \
+             asm volatile("" : "+r"(data) :: "memory"); \
+           } \
+       } \
+    })
+
+#define W2(i) \
+    ({ \
+      vector2x_u64 wt1 = w[(i)&0x0f]; \
+      WN(i); \
+      w2[(i) / 2] = (((i) % 2) == 0) ? wt1 : vec_mergel(w2[(i) / 2], wt1); \
+      wt1; \
+    })
+
+#define L2(i) \
+    ({ \
+      vector2x_u64 lt1 = w[(i)&0x0f]; \
+      w2[(i) / 2] = (((i) % 2) == 0) ? lt1 : vec_mergel(w2[(i) / 2], lt1); \
+      lt1; \
+    })
+
+#define WL(i) \
+    ({ \
+      vector2x_u64 wlt1 = w2[(i) / 2]; \
+      if (((i) % 2) == 0 && (i) < 79) \
+       w2[(i) / 2] = vec_mergel(wlt1, wlt1); \
+      wlt1; \
+    })
+
+static ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 unsigned int
+sha512_transform_ppc(u64 state[8], const unsigned char *data, size_t nblks)
 {
-  /* GPRs used for message expansion as vector intrinsics based generates
-   * slower code. */
   vector2x_u64 h0, h1, h2, h3, h4, h5, h6, h7;
   vector2x_u64 a, b, c, d, e, f, g, h, t1, t2;
-  u64 w[16];
+  vector2x_u64 w[16];
+  vector2x_u64 w2[80 / 2];
 
   h0 = vec_u64_load (8 * 0, (unsigned long long *)state);
   h1 = vec_rol_elems (h0, 1);
@@ -227,6 +358,9 @@ _gcry_sha512_transform_ppc8(u64 state[8],
 
   while (nblks >= 2)
     {
+      const vector2x_u64 *kptr = K;
+      vector2x_u64 ktmp;
+
       a = h0;
       b = h1;
       c = h2;
@@ -236,100 +370,96 @@ _gcry_sha512_transform_ppc8(u64 state[8],
       g = h6;
       h = h7;
 
-      I(0); I(1); I(2); I(3);
-      I(4); I(5); I(6); I(7);
-      I(8); I(9); I(10); I(11);
-      I(12); I(13); I(14); I(15);
-      data += 128;
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
-
-      R(a, b, c, d, e, f, g, h, K[64], L(64));
-      R(h, a, b, c, d, e, f, g, K[65], L(65));
-      R(g, h, a, b, c, d, e, f, K[66], L(66));
-      R(f, g, h, a, b, c, d, e, K[67], L(67));
-      I(0); I(1); I(2); I(3);
-      R(e, f, g, h, a, b, c, d, K[68], L(68));
-      R(d, e, f, g, h, a, b, c, K[69], L(69));
-      R(c, d, e, f, g, h, a, b, K[70], L(70));
-      R(b, c, d, e, f, g, h, a, K[71], L(71));
-      I(4); I(5); I(6); I(7);
-      R(a, b, c, d, e, f, g, h, K[72], L(72));
-      R(h, a, b, c, d, e, f, g, K[73], L(73));
-      R(g, h, a, b, c, d, e, f, K[74], L(74));
-      R(f, g, h, a, b, c, d, e, K[75], L(75));
-      I(8); I(9); I(10); I(11);
-      R(e, f, g, h, a, b, c, d, K[76], L(76));
-      R(d, e, f, g, h, a, b, c, K[77], L(77));
-      R(c, d, e, f, g, h, a, b, K[78], L(78));
-      R(b, c, d, e, f, g, h, a, K[79], L(79));
-      I(12); I(13); I(14); I(15);
-      data += 128;
+      I2(0); I2(1); I2(2); I2(3);
+      I2(4); I2(5); I2(6); I2(7);
+      I2(8); I2(9); I2(10); I2(11);
+      I2(12); I2(13); I2(14); I2(15);
+
+      R(a, b, c, d, e, f, g, h, 0, W2(0));
+      R(h, a, b, c, d, e, f, g, 1, W2(1));
+      R(g, h, a, b, c, d, e, f, 2, W2(2));
+      R(f, g, h, a, b, c, d, e, 3, W2(3));
+      R(e, f, g, h, a, b, c, d, 4, W2(4));
+      R(d, e, f, g, h, a, b, c, 5, W2(5));
+      R(c, d, e, f, g, h, a, b, 6, W2(6));
+      R(b, c, d, e, f, g, h, a, 7, W2(7));
+
+      R(a, b, c, d, e, f, g, h, 8, W2(8));
+      R(h, a, b, c, d, e, f, g, 9, W2(9));
+      R(g, h, a, b, c, d, e, f, 10, W2(10));
+      R(f, g, h, a, b, c, d, e, 11, W2(11));
+      R(e, f, g, h, a, b, c, d, 12, W2(12));
+      R(d, e, f, g, h, a, b, c, 13, W2(13));
+      R(c, d, e, f, g, h, a, b, 14, W2(14));
+      R(b, c, d, e, f, g, h, a, 15, W2(15));
+
+      R(a, b, c, d, e, f, g, h, 16, W2(16));
+      R(h, a, b, c, d, e, f, g, 17, W2(17));
+      R(g, h, a, b, c, d, e, f, 18, W2(18));
+      R(f, g, h, a, b, c, d, e, 19, W2(19));
+      R(e, f, g, h, a, b, c, d, 20, W2(20));
+      R(d, e, f, g, h, a, b, c, 21, W2(21));
+      R(c, d, e, f, g, h, a, b, 22, W2(22));
+      R(b, c, d, e, f, g, h, a, 23, W2(23));
+      R(a, b, c, d, e, f, g, h, 24, W2(24));
+      R(h, a, b, c, d, e, f, g, 25, W2(25));
+      R(g, h, a, b, c, d, e, f, 26, W2(26));
+      R(f, g, h, a, b, c, d, e, 27, W2(27));
+      R(e, f, g, h, a, b, c, d, 28, W2(28));
+      R(d, e, f, g, h, a, b, c, 29, W2(29));
+      R(c, d, e, f, g, h, a, b, 30, W2(30));
+      R(b, c, d, e, f, g, h, a, 31, W2(31));
+
+      R(a, b, c, d, e, f, g, h, 32, W2(32));
+      R(h, a, b, c, d, e, f, g, 33, W2(33));
+      R(g, h, a, b, c, d, e, f, 34, W2(34));
+      R(f, g, h, a, b, c, d, e, 35, W2(35));
+      R(e, f, g, h, a, b, c, d, 36, W2(36));
+      R(d, e, f, g, h, a, b, c, 37, W2(37));
+      R(c, d, e, f, g, h, a, b, 38, W2(38));
+      R(b, c, d, e, f, g, h, a, 39, W2(39));
+      R(a, b, c, d, e, f, g, h, 40, W2(40));
+      R(h, a, b, c, d, e, f, g, 41, W2(41));
+      R(g, h, a, b, c, d, e, f, 42, W2(42));
+      R(f, g, h, a, b, c, d, e, 43, W2(43));
+      R(e, f, g, h, a, b, c, d, 44, W2(44));
+      R(d, e, f, g, h, a, b, c, 45, W2(45));
+      R(c, d, e, f, g, h, a, b, 46, W2(46));
+      R(b, c, d, e, f, g, h, a, 47, W2(47));
+
+      R(a, b, c, d, e, f, g, h, 48, W2(48));
+      R(h, a, b, c, d, e, f, g, 49, W2(49));
+      R(g, h, a, b, c, d, e, f, 50, W2(50));
+      R(f, g, h, a, b, c, d, e, 51, W2(51));
+      R(e, f, g, h, a, b, c, d, 52, W2(52));
+      R(d, e, f, g, h, a, b, c, 53, W2(53));
+      R(c, d, e, f, g, h, a, b, 54, W2(54));
+      R(b, c, d, e, f, g, h, a, 55, W2(55));
+      R(a, b, c, d, e, f, g, h, 56, W2(56));
+      R(h, a, b, c, d, e, f, g, 57, W2(57));
+      R(g, h, a, b, c, d, e, f, 58, W2(58));
+      R(f, g, h, a, b, c, d, e, 59, W2(59));
+      R(e, f, g, h, a, b, c, d, 60, W2(60));
+      R(d, e, f, g, h, a, b, c, 61, W2(61));
+      R(c, d, e, f, g, h, a, b, 62, W2(62));
+      R(b, c, d, e, f, g, h, a, 63, W2(63));
+
+      R(a, b, c, d, e, f, g, h, 64, L2(64));
+      R(h, a, b, c, d, e, f, g, 65, L2(65));
+      R(g, h, a, b, c, d, e, f, 66, L2(66));
+      R(f, g, h, a, b, c, d, e, 67, L2(67));
+      R(e, f, g, h, a, b, c, d, 68, L2(68));
+      R(d, e, f, g, h, a, b, c, 69, L2(69));
+      R(c, d, e, f, g, h, a, b, 70, L2(70));
+      R(b, c, d, e, f, g, h, a, 71, L2(71));
+      R(a, b, c, d, e, f, g, h, 72, L2(72));
+      R(h, a, b, c, d, e, f, g, 73, L2(73));
+      R(g, h, a, b, c, d, e, f, 74, L2(74));
+      R(f, g, h, a, b, c, d, e, 75, L2(75));
+      R(e, f, g, h, a, b, c, d, 76, L2(76));
+      R(d, e, f, g, h, a, b, c, 77, L2(77));
+      R(c, d, e, f, g, h, a, b, 78, L2(78));
+      R(b, c, d, e, f, g, h, a, 79, L2(79));
 
       h0 += a;
       h1 += b;
@@ -339,6 +469,9 @@ _gcry_sha512_transform_ppc8(u64 state[8],
       h5 += f;
       h6 += g;
       h7 += h;
+
+      kptr = K;
+
       a = h0;
       b = h1;
       c = h2;
@@ -348,90 +481,91 @@ _gcry_sha512_transform_ppc8(u64 state[8],
       g = h6;
       h = h7;
 
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
-
-      R(a, b, c, d, e, f, g, h, K[64], L(64));
-      R(h, a, b, c, d, e, f, g, K[65], L(65));
-      R(g, h, a, b, c, d, e, f, K[66], L(66));
-      R(f, g, h, a, b, c, d, e, K[67], L(67));
-      R(e, f, g, h, a, b, c, d, K[68], L(68));
-      R(d, e, f, g, h, a, b, c, K[69], L(69));
-      R(c, d, e, f, g, h, a, b, K[70], L(70));
-      R(b, c, d, e, f, g, h, a, K[71], L(71));
-      R(a, b, c, d, e, f, g, h, K[72], L(72));
-      R(h, a, b, c, d, e, f, g, K[73], L(73));
-      R(g, h, a, b, c, d, e, f, K[74], L(74));
-      R(f, g, h, a, b, c, d, e, K[75], L(75));
-      R(e, f, g, h, a, b, c, d, K[76], L(76));
-      R(d, e, f, g, h, a, b, c, K[77], L(77));
-      R(c, d, e, f, g, h, a, b, K[78], L(78));
-      R(b, c, d, e, f, g, h, a, K[79], L(79));
+      R(a, b, c, d, e, f, g, h, 0, WL(0));
+      R(h, a, b, c, d, e, f, g, 1, WL(1));
+      R(g, h, a, b, c, d, e, f, 2, WL(2));
+      R(f, g, h, a, b, c, d, e, 3, WL(3));
+      R(e, f, g, h, a, b, c, d, 4, WL(4));
+      R(d, e, f, g, h, a, b, c, 5, WL(5));
+      R(c, d, e, f, g, h, a, b, 6, WL(6));
+      R(b, c, d, e, f, g, h, a, 7, WL(7));
+
+      R(a, b, c, d, e, f, g, h, 8, WL(8));
+      R(h, a, b, c, d, e, f, g, 9, WL(9));
+      R(g, h, a, b, c, d, e, f, 10, WL(10));
+      R(f, g, h, a, b, c, d, e, 11, WL(11));
+      R(e, f, g, h, a, b, c, d, 12, WL(12));
+      R(d, e, f, g, h, a, b, c, 13, WL(13));
+      R(c, d, e, f, g, h, a, b, 14, WL(14));
+      R(b, c, d, e, f, g, h, a, 15, WL(15));
+
+      R(a, b, c, d, e, f, g, h, 16, WL(16));
+      R(h, a, b, c, d, e, f, g, 17, WL(17));
+      R(g, h, a, b, c, d, e, f, 18, WL(18));
+      R(f, g, h, a, b, c, d, e, 19, WL(19));
+      R(e, f, g, h, a, b, c, d, 20, WL(20));
+      R(d, e, f, g, h, a, b, c, 21, WL(21));
+      R(c, d, e, f, g, h, a, b, 22, WL(22));
+      R(b, c, d, e, f, g, h, a, 23, WL(23));
+      R(a, b, c, d, e, f, g, h, 24, WL(24));
+      R(h, a, b, c, d, e, f, g, 25, WL(25));
+      R(g, h, a, b, c, d, e, f, 26, WL(26));
+      R(f, g, h, a, b, c, d, e, 27, WL(27));
+      R(e, f, g, h, a, b, c, d, 28, WL(28));
+      R(d, e, f, g, h, a, b, c, 29, WL(29));
+      R(c, d, e, f, g, h, a, b, 30, WL(30));
+      R(b, c, d, e, f, g, h, a, 31, WL(31));
+
+      R(a, b, c, d, e, f, g, h, 32, WL(32));
+      R(h, a, b, c, d, e, f, g, 33, WL(33));
+      R(g, h, a, b, c, d, e, f, 34, WL(34));
+      R(f, g, h, a, b, c, d, e, 35, WL(35));
+      R(e, f, g, h, a, b, c, d, 36, WL(36));
+      R(d, e, f, g, h, a, b, c, 37, WL(37));
+      R(c, d, e, f, g, h, a, b, 38, WL(38));
+      R(b, c, d, e, f, g, h, a, 39, WL(39));
+      R(a, b, c, d, e, f, g, h, 40, WL(40));
+      R(h, a, b, c, d, e, f, g, 41, WL(41));
+      R(g, h, a, b, c, d, e, f, 42, WL(42));
+      R(f, g, h, a, b, c, d, e, 43, WL(43));
+      R(e, f, g, h, a, b, c, d, 44, WL(44));
+      R(d, e, f, g, h, a, b, c, 45, WL(45));
+      R(c, d, e, f, g, h, a, b, 46, WL(46));
+      R(b, c, d, e, f, g, h, a, 47, WL(47));
+
+      R(a, b, c, d, e, f, g, h, 48, WL(48));
+      R(h, a, b, c, d, e, f, g, 49, WL(49));
+      R(g, h, a, b, c, d, e, f, 50, WL(50));
+      R(f, g, h, a, b, c, d, e, 51, WL(51));
+      R(e, f, g, h, a, b, c, d, 52, WL(52));
+      R(d, e, f, g, h, a, b, c, 53, WL(53));
+      R(c, d, e, f, g, h, a, b, 54, WL(54));
+      R(b, c, d, e, f, g, h, a, 55, WL(55));
+      R(a, b, c, d, e, f, g, h, 56, WL(56));
+      R(h, a, b, c, d, e, f, g, 57, WL(57));
+      R(g, h, a, b, c, d, e, f, 58, WL(58));
+      R(f, g, h, a, b, c, d, e, 59, WL(59));
+      R(e, f, g, h, a, b, c, d, 60, WL(60));
+      R(d, e, f, g, h, a, b, c, 61, WL(61));
+      R(c, d, e, f, g, h, a, b, 62, WL(62));
+      R(b, c, d, e, f, g, h, a, 63, WL(63));
+
+      R(a, b, c, d, e, f, g, h, 64, WL(64));
+      R(h, a, b, c, d, e, f, g, 65, WL(65));
+      R(g, h, a, b, c, d, e, f, 66, WL(66));
+      R(f, g, h, a, b, c, d, e, 67, WL(67));
+      R(e, f, g, h, a, b, c, d, 68, WL(68));
+      R(d, e, f, g, h, a, b, c, 69, WL(69));
+      R(c, d, e, f, g, h, a, b, 70, WL(70));
+      R(b, c, d, e, f, g, h, a, 71, WL(71));
+      R(a, b, c, d, e, f, g, h, 72, WL(72));
+      R(h, a, b, c, d, e, f, g, 73, WL(73));
+      R(g, h, a, b, c, d, e, f, 74, WL(74));
+      R(f, g, h, a, b, c, d, e, 75, WL(75));
+      R(e, f, g, h, a, b, c, d, 76, WL(76));
+      R(d, e, f, g, h, a, b, c, 77, WL(77));
+      R(c, d, e, f, g, h, a, b, 78, WL(78));
+      R(b, c, d, e, f, g, h, a, 79, WL(79));
 
       h0 += a;
       h1 += b;
@@ -445,8 +579,11 @@ _gcry_sha512_transform_ppc8(u64 state[8],
       nblks -= 2;
     }
 
-  while (nblks)
+  if (nblks)
     {
+      const vector2x_u64 *kptr = K;
+      vector2x_u64 ktmp;
+
       a = h0;
       b = h1;
       c = h2;
@@ -460,91 +597,92 @@ _gcry_sha512_transform_ppc8(u64 state[8],
       I(4); I(5); I(6); I(7);
       I(8); I(9); I(10); I(11);
       I(12); I(13); I(14); I(15);
-      data += 128;
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
-
-      R(a, b, c, d, e, f, g, h, K[64], L(64));
-      R(h, a, b, c, d, e, f, g, K[65], L(65));
-      R(g, h, a, b, c, d, e, f, K[66], L(66));
-      R(f, g, h, a, b, c, d, e, K[67], L(67));
-      R(e, f, g, h, a, b, c, d, K[68], L(68));
-      R(d, e, f, g, h, a, b, c, K[69], L(69));
-      R(c, d, e, f, g, h, a, b, K[70], L(70));
-      R(b, c, d, e, f, g, h, a, K[71], L(71));
-      R(a, b, c, d, e, f, g, h, K[72], L(72));
-      R(h, a, b, c, d, e, f, g, K[73], L(73));
-      R(g, h, a, b, c, d, e, f, K[74], L(74));
-      R(f, g, h, a, b, c, d, e, K[75], L(75));
-      R(e, f, g, h, a, b, c, d, K[76], L(76));
-      R(d, e, f, g, h, a, b, c, K[77], L(77));
-      R(c, d, e, f, g, h, a, b, K[78], L(78));
-      R(b, c, d, e, f, g, h, a, K[79], L(79));
+
+      R(a, b, c, d, e, f, g, h, 0, W(0));
+      R(h, a, b, c, d, e, f, g, 1, W(1));
+      R(g, h, a, b, c, d, e, f, 2, W(2));
+      R(f, g, h, a, b, c, d, e, 3, W(3));
+      R(e, f, g, h, a, b, c, d, 4, W(4));
+      R(d, e, f, g, h, a, b, c, 5, W(5));
+      R(c, d, e, f, g, h, a, b, 6, W(6));
+      R(b, c, d, e, f, g, h, a, 7, W(7));
+
+      R(a, b, c, d, e, f, g, h, 8, W(8));
+      R(h, a, b, c, d, e, f, g, 9, W(9));
+      R(g, h, a, b, c, d, e, f, 10, W(10));
+      R(f, g, h, a, b, c, d, e, 11, W(11));
+      R(e, f, g, h, a, b, c, d, 12, W(12));
+      R(d, e, f, g, h, a, b, c, 13, W(13));
+      R(c, d, e, f, g, h, a, b, 14, W(14));
+      R(b, c, d, e, f, g, h, a, 15, W(15));
+
+      R(a, b, c, d, e, f, g, h, 16, W(16));
+      R(h, a, b, c, d, e, f, g, 17, W(17));
+      R(g, h, a, b, c, d, e, f, 18, W(18));
+      R(f, g, h, a, b, c, d, e, 19, W(19));
+      R(e, f, g, h, a, b, c, d, 20, W(20));
+      R(d, e, f, g, h, a, b, c, 21, W(21));
+      R(c, d, e, f, g, h, a, b, 22, W(22));
+      R(b, c, d, e, f, g, h, a, 23, W(23));
+      R(a, b, c, d, e, f, g, h, 24, W(24));
+      R(h, a, b, c, d, e, f, g, 25, W(25));
+      R(g, h, a, b, c, d, e, f, 26, W(26));
+      R(f, g, h, a, b, c, d, e, 27, W(27));
+      R(e, f, g, h, a, b, c, d, 28, W(28));
+      R(d, e, f, g, h, a, b, c, 29, W(29));
+      R(c, d, e, f, g, h, a, b, 30, W(30));
+      R(b, c, d, e, f, g, h, a, 31, W(31));
+
+      R(a, b, c, d, e, f, g, h, 32, W(32));
+      R(h, a, b, c, d, e, f, g, 33, W(33));
+      R(g, h, a, b, c, d, e, f, 34, W(34));
+      R(f, g, h, a, b, c, d, e, 35, W(35));
+      R(e, f, g, h, a, b, c, d, 36, W(36));
+      R(d, e, f, g, h, a, b, c, 37, W(37));
+      R(c, d, e, f, g, h, a, b, 38, W(38));
+      R(b, c, d, e, f, g, h, a, 39, W(39));
+      R(a, b, c, d, e, f, g, h, 40, W(40));
+      R(h, a, b, c, d, e, f, g, 41, W(41));
+      R(g, h, a, b, c, d, e, f, 42, W(42));
+      R(f, g, h, a, b, c, d, e, 43, W(43));
+      R(e, f, g, h, a, b, c, d, 44, W(44));
+      R(d, e, f, g, h, a, b, c, 45, W(45));
+      R(c, d, e, f, g, h, a, b, 46, W(46));
+      R(b, c, d, e, f, g, h, a, 47, W(47));
+
+      R(a, b, c, d, e, f, g, h, 48, W(48));
+      R(h, a, b, c, d, e, f, g, 49, W(49));
+      R(g, h, a, b, c, d, e, f, 50, W(50));
+      R(f, g, h, a, b, c, d, e, 51, W(51));
+      R(e, f, g, h, a, b, c, d, 52, W(52));
+      R(d, e, f, g, h, a, b, c, 53, W(53));
+      R(c, d, e, f, g, h, a, b, 54, W(54));
+      R(b, c, d, e, f, g, h, a, 55, W(55));
+      R(a, b, c, d, e, f, g, h, 56, W(56));
+      R(h, a, b, c, d, e, f, g, 57, W(57));
+      R(g, h, a, b, c, d, e, f, 58, W(58));
+      R(f, g, h, a, b, c, d, e, 59, W(59));
+      R(e, f, g, h, a, b, c, d, 60, W(60));
+      R(d, e, f, g, h, a, b, c, 61, W(61));
+      R(c, d, e, f, g, h, a, b, 62, W(62));
+      R(b, c, d, e, f, g, h, a, 63, W(63));
+
+      R(a, b, c, d, e, f, g, h, 64, L(64));
+      R(h, a, b, c, d, e, f, g, 65, L(65));
+      R(g, h, a, b, c, d, e, f, 66, L(66));
+      R(f, g, h, a, b, c, d, e, 67, L(67));
+      R(e, f, g, h, a, b, c, d, 68, L(68));
+      R(d, e, f, g, h, a, b, c, 69, L(69));
+      R(c, d, e, f, g, h, a, b, 70, L(70));
+      R(b, c, d, e, f, g, h, a, 71, L(71));
+      R(a, b, c, d, e, f, g, h, 72, L(72));
+      R(h, a, b, c, d, e, f, g, 73, L(73));
+      R(g, h, a, b, c, d, e, f, 74, L(74));
+      R(f, g, h, a, b, c, d, e, 75, L(75));
+      R(e, f, g, h, a, b, c, d, 76, L(76));
+      R(d, e, f, g, h, a, b, c, 77, L(77));
+      R(c, d, e, f, g, h, a, b, 78, L(78));
+      R(b, c, d, e, f, g, h, a, 79, L(79));
 
       h0 += a;
       h1 += b;
@@ -567,403 +705,21 @@ _gcry_sha512_transform_ppc8(u64 state[8],
   vec_u64_store (h4, 8 * 4, (unsigned long long *)state);
   vec_u64_store (h6, 8 * 6, (unsigned long long *)state);
 
-  return sizeof(w);
+  return sizeof(w) + sizeof(w2);
 }
-#undef R
-#undef Cho
-#undef Maj
-#undef Sum0
-#undef Sum1
-#undef S0
-#undef S1
-#undef I
-#undef W
-#undef I2
-#undef W2
-#undef R2
-
-
-/* SHA2 round in general purpose registers */
-#define R(a,b,c,d,e,f,g,h,k,w) do                                 \
-          {                                                       \
-            t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + ((k) + (w));\
-            t2 = Sum0((a)) + Maj((a),(b),(c));                    \
-            d += t1;                                              \
-            h  = t1 + t2;                                         \
-          } while (0)
-
-#define Cho(x, y, z)  ((x & y) + (~x & z))
-
-#define Maj(z, x, y)  ((x & y) + (z & (x ^ y)))
-
-#define Sum0(x)       (ror64(x, 28) ^ ror64(x ^ ror64(x, 39-34), 34))
-
-#define Sum1(x)       (ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41))
-
-
-/* Message expansion on general purpose registers */
-#define S0(x) (ror64 ((x), 1) ^ ror64 ((x), 8) ^ ((x) >> 7))
-#define S1(x) (ror64 ((x), 19) ^ ror64 ((x), 61) ^ ((x) >> 6))
-
-#define I(i) ( w[i] = buf_get_be64(data + i * 8) )
-#define WN(i) ({ w[i&0x0f] +=    w[(i-7) &0x0f];  \
-                w[i&0x0f] += S0(w[(i-15)&0x0f]); \
-                w[i&0x0f] += S1(w[(i-2) &0x0f]); \
-                w[i&0x0f]; })
-#define W(i) ({ u64 r = w[i&0x0f]; WN(i); r; })
-#define L(i) w[i&0x0f]
 
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_sha512_transform_ppc8(u64 state[8], const unsigned char *data,
+                           size_t nblks)
+{
+  return sha512_transform_ppc(state, data, nblks);
+}
 
-unsigned int ASM_FUNC_ATTR
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
 _gcry_sha512_transform_ppc9(u64 state[8], const unsigned char *data,
                            size_t nblks)
 {
-  /* GPRs used for round function and message expansion as vector intrinsics
-   * based generates slower code for POWER9. */
-  u64 a, b, c, d, e, f, g, h, t1, t2;
-  u64 w[16];
-
-  a = state[0];
-  b = state[1];
-  c = state[2];
-  d = state[3];
-  e = state[4];
-  f = state[5];
-  g = state[6];
-  h = state[7];
-
-  while (nblks >= 2)
-    {
-      I(0); I(1); I(2); I(3);
-      I(4); I(5); I(6); I(7);
-      I(8); I(9); I(10); I(11);
-      I(12); I(13); I(14); I(15);
-      data += 128;
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
-
-      R(a, b, c, d, e, f, g, h, K[64], L(64));
-      R(h, a, b, c, d, e, f, g, K[65], L(65));
-      R(g, h, a, b, c, d, e, f, K[66], L(66));
-      R(f, g, h, a, b, c, d, e, K[67], L(67));
-      I(0); I(1); I(2); I(3);
-      R(e, f, g, h, a, b, c, d, K[68], L(68));
-      R(d, e, f, g, h, a, b, c, K[69], L(69));
-      R(c, d, e, f, g, h, a, b, K[70], L(70));
-      R(b, c, d, e, f, g, h, a, K[71], L(71));
-      I(4); I(5); I(6); I(7);
-      R(a, b, c, d, e, f, g, h, K[72], L(72));
-      R(h, a, b, c, d, e, f, g, K[73], L(73));
-      R(g, h, a, b, c, d, e, f, K[74], L(74));
-      R(f, g, h, a, b, c, d, e, K[75], L(75));
-      I(8); I(9); I(10); I(11);
-      R(e, f, g, h, a, b, c, d, K[76], L(76));
-      R(d, e, f, g, h, a, b, c, K[77], L(77));
-      R(c, d, e, f, g, h, a, b, K[78], L(78));
-      R(b, c, d, e, f, g, h, a, K[79], L(79));
-      I(12); I(13); I(14); I(15);
-      data += 128;
-
-      a += state[0];
-      b += state[1];
-      c += state[2];
-      d += state[3];
-      e += state[4];
-      f += state[5];
-      g += state[6];
-      h += state[7];
-      state[0] = a;
-      state[1] = b;
-      state[2] = c;
-      state[3] = d;
-      state[4] = e;
-      state[5] = f;
-      state[6] = g;
-      state[7] = h;
-
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
-
-      R(a, b, c, d, e, f, g, h, K[64], L(64));
-      R(h, a, b, c, d, e, f, g, K[65], L(65));
-      R(g, h, a, b, c, d, e, f, K[66], L(66));
-      R(f, g, h, a, b, c, d, e, K[67], L(67));
-      R(e, f, g, h, a, b, c, d, K[68], L(68));
-      R(d, e, f, g, h, a, b, c, K[69], L(69));
-      R(c, d, e, f, g, h, a, b, K[70], L(70));
-      R(b, c, d, e, f, g, h, a, K[71], L(71));
-      R(a, b, c, d, e, f, g, h, K[72], L(72));
-      R(h, a, b, c, d, e, f, g, K[73], L(73));
-      R(g, h, a, b, c, d, e, f, K[74], L(74));
-      R(f, g, h, a, b, c, d, e, K[75], L(75));
-      R(e, f, g, h, a, b, c, d, K[76], L(76));
-      R(d, e, f, g, h, a, b, c, K[77], L(77));
-      R(c, d, e, f, g, h, a, b, K[78], L(78));
-      R(b, c, d, e, f, g, h, a, K[79], L(79));
-
-      a += state[0];
-      b += state[1];
-      c += state[2];
-      d += state[3];
-      e += state[4];
-      f += state[5];
-      g += state[6];
-      h += state[7];
-      state[0] = a;
-      state[1] = b;
-      state[2] = c;
-      state[3] = d;
-      state[4] = e;
-      state[5] = f;
-      state[6] = g;
-      state[7] = h;
-
-      nblks -= 2;
-    }
-
-  while (nblks)
-    {
-      I(0); I(1); I(2); I(3);
-      I(4); I(5); I(6); I(7);
-      I(8); I(9); I(10); I(11);
-      I(12); I(13); I(14); I(15);
-      data += 128;
-      R(a, b, c, d, e, f, g, h, K[0], W(0));
-      R(h, a, b, c, d, e, f, g, K[1], W(1));
-      R(g, h, a, b, c, d, e, f, K[2], W(2));
-      R(f, g, h, a, b, c, d, e, K[3], W(3));
-      R(e, f, g, h, a, b, c, d, K[4], W(4));
-      R(d, e, f, g, h, a, b, c, K[5], W(5));
-      R(c, d, e, f, g, h, a, b, K[6], W(6));
-      R(b, c, d, e, f, g, h, a, K[7], W(7));
-      R(a, b, c, d, e, f, g, h, K[8], W(8));
-      R(h, a, b, c, d, e, f, g, K[9], W(9));
-      R(g, h, a, b, c, d, e, f, K[10], W(10));
-      R(f, g, h, a, b, c, d, e, K[11], W(11));
-      R(e, f, g, h, a, b, c, d, K[12], W(12));
-      R(d, e, f, g, h, a, b, c, K[13], W(13));
-      R(c, d, e, f, g, h, a, b, K[14], W(14));
-      R(b, c, d, e, f, g, h, a, K[15], W(15));
-
-      R(a, b, c, d, e, f, g, h, K[16], W(16));
-      R(h, a, b, c, d, e, f, g, K[17], W(17));
-      R(g, h, a, b, c, d, e, f, K[18], W(18));
-      R(f, g, h, a, b, c, d, e, K[19], W(19));
-      R(e, f, g, h, a, b, c, d, K[20], W(20));
-      R(d, e, f, g, h, a, b, c, K[21], W(21));
-      R(c, d, e, f, g, h, a, b, K[22], W(22));
-      R(b, c, d, e, f, g, h, a, K[23], W(23));
-      R(a, b, c, d, e, f, g, h, K[24], W(24));
-      R(h, a, b, c, d, e, f, g, K[25], W(25));
-      R(g, h, a, b, c, d, e, f, K[26], W(26));
-      R(f, g, h, a, b, c, d, e, K[27], W(27));
-      R(e, f, g, h, a, b, c, d, K[28], W(28));
-      R(d, e, f, g, h, a, b, c, K[29], W(29));
-      R(c, d, e, f, g, h, a, b, K[30], W(30));
-      R(b, c, d, e, f, g, h, a, K[31], W(31));
-
-      R(a, b, c, d, e, f, g, h, K[32], W(32));
-      R(h, a, b, c, d, e, f, g, K[33], W(33));
-      R(g, h, a, b, c, d, e, f, K[34], W(34));
-      R(f, g, h, a, b, c, d, e, K[35], W(35));
-      R(e, f, g, h, a, b, c, d, K[36], W(36));
-      R(d, e, f, g, h, a, b, c, K[37], W(37));
-      R(c, d, e, f, g, h, a, b, K[38], W(38));
-      R(b, c, d, e, f, g, h, a, K[39], W(39));
-      R(a, b, c, d, e, f, g, h, K[40], W(40));
-      R(h, a, b, c, d, e, f, g, K[41], W(41));
-      R(g, h, a, b, c, d, e, f, K[42], W(42));
-      R(f, g, h, a, b, c, d, e, K[43], W(43));
-      R(e, f, g, h, a, b, c, d, K[44], W(44));
-      R(d, e, f, g, h, a, b, c, K[45], W(45));
-      R(c, d, e, f, g, h, a, b, K[46], W(46));
-      R(b, c, d, e, f, g, h, a, K[47], W(47));
-
-      R(a, b, c, d, e, f, g, h, K[48], W(48));
-      R(h, a, b, c, d, e, f, g, K[49], W(49));
-      R(g, h, a, b, c, d, e, f, K[50], W(50));
-      R(f, g, h, a, b, c, d, e, K[51], W(51));
-      R(e, f, g, h, a, b, c, d, K[52], W(52));
-      R(d, e, f, g, h, a, b, c, K[53], W(53));
-      R(c, d, e, f, g, h, a, b, K[54], W(54));
-      R(b, c, d, e, f, g, h, a, K[55], W(55));
-      R(a, b, c, d, e, f, g, h, K[56], W(56));
-      R(h, a, b, c, d, e, f, g, K[57], W(57));
-      R(g, h, a, b, c, d, e, f, K[58], W(58));
-      R(f, g, h, a, b, c, d, e, K[59], W(59));
-      R(e, f, g, h, a, b, c, d, K[60], W(60));
-      R(d, e, f, g, h, a, b, c, K[61], W(61));
-      R(c, d, e, f, g, h, a, b, K[62], W(62));
-      R(b, c, d, e, f, g, h, a, K[63], W(63));
-
-      R(a, b, c, d, e, f, g, h, K[64], L(64));
-      R(h, a, b, c, d, e, f, g, K[65], L(65));
-      R(g, h, a, b, c, d, e, f, K[66], L(66));
-      R(f, g, h, a, b, c, d, e, K[67], L(67));
-      R(e, f, g, h, a, b, c, d, K[68], L(68));
-      R(d, e, f, g, h, a, b, c, K[69], L(69));
-      R(c, d, e, f, g, h, a, b, K[70], L(70));
-      R(b, c, d, e, f, g, h, a, K[71], L(71));
-      R(a, b, c, d, e, f, g, h, K[72], L(72));
-      R(h, a, b, c, d, e, f, g, K[73], L(73));
-      R(g, h, a, b, c, d, e, f, K[74], L(74));
-      R(f, g, h, a, b, c, d, e, K[75], L(75));
-      R(e, f, g, h, a, b, c, d, K[76], L(76));
-      R(d, e, f, g, h, a, b, c, K[77], L(77));
-      R(c, d, e, f, g, h, a, b, K[78], L(78));
-      R(b, c, d, e, f, g, h, a, K[79], L(79));
-
-      a += state[0];
-      b += state[1];
-      c += state[2];
-      d += state[3];
-      e += state[4];
-      f += state[5];
-      g += state[6];
-      h += state[7];
-      state[0] = a;
-      state[1] = b;
-      state[2] = c;
-      state[3] = d;
-      state[4] = e;
-      state[5] = f;
-      state[6] = g;
-      state[7] = h;
-
-      nblks--;
-    }
-
-  return sizeof(w);
+  return sha512_transform_ppc(state, data, nblks);
 }
 
 #endif /* ENABLE_PPC_CRYPTO_SUPPORT */
index 9cc308920ece5369f9af41cfa50f3fd7f948ba95..bfd3bb54dd22eddd26fe3a706410887a24e9a4d8 100644 (file)
@@ -414,6 +414,11 @@ _gcry_sha512_transform_amd64_ssse3:
 ;;; Binary Data
 */
 
+SECTION_RODATA
+
+ELF(.type _sha512_ssse3_consts,@object)
+_sha512_ssse3_consts:
+
 .align 16
 
 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
index 57a74664aa68b169e719db88b7dcf5207f7ab01d..bf3f3ff2ed72159b76fa089be7c42df282d6b704 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 #include "hash-common.h"
 
 
+/* Helper macro to force alignment to 64 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+
 /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
 #undef USE_ARM_NEON_ASM
 #ifdef ENABLE_NEON_SUPPORT
 # define USE_ARM_ASM 1
 #endif
 
+/* USE_ARM64_SHA512 indicates whether to enable ARMv8 SHA512 extension assembly
+ * code. */
+#undef USE_ARM64_SHA512
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4)
+#  define USE_ARM64_SHA512 1
+# endif
+#endif
+
 
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
 #endif
 
 
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
+
 /* USE_SSSE3_I386 indicates whether to compile with Intel SSSE3/i386 code. */
 #undef USE_SSSE3_I386
 #if defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \
@@ -148,7 +177,7 @@ typedef struct
 } SHA512_CONTEXT;
 
 
-static const u64 k[] =
+static ATTR_ALIGNED_64 const u64 k[] =
   {
     U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
     U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
@@ -197,7 +226,8 @@ static const u64 k[] =
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #undef ASM_EXTRA_STACK
-#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2)
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) \
+    || defined(USE_AVX512)
 # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 #  define ASM_FUNC_ABI __attribute__((sysv_abi))
 #  define ASM_EXTRA_STACK (10 * 16 + 4 * sizeof(void *))
@@ -208,6 +238,21 @@ static const u64 k[] =
 #endif
 
 
+#ifdef USE_ARM64_SHA512
+unsigned int _gcry_sha512_transform_armv8_ce (u64 state[8],
+                                              const unsigned char *data,
+                                              size_t num_blks,
+                                              const u64 k[]);
+
+static unsigned int
+do_sha512_transform_armv8_ce(void *ctx, const unsigned char *data,
+                             size_t nblks)
+{
+  SHA512_CONTEXT *hd = ctx;
+  return _gcry_sha512_transform_armv8_ce (hd->state.h, data, nblks, k);
+}
+#endif
+
 #ifdef USE_ARM_NEON_ASM
 unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
                                                 const unsigned char *data,
@@ -232,8 +277,10 @@ do_sha512_transform_amd64_ssse3(void *ctx, const unsigned char *data,
                                 size_t nblks)
 {
   SHA512_CONTEXT *hd = ctx;
-  return _gcry_sha512_transform_amd64_ssse3 (data, &hd->state, nblks)
-         + ASM_EXTRA_STACK;
+  unsigned int burn;
+  burn = _gcry_sha512_transform_amd64_ssse3 (data, &hd->state, nblks);
+  burn += burn > 0 ? ASM_EXTRA_STACK : 0;
+  return burn;
 }
 #endif
 
@@ -247,8 +294,10 @@ do_sha512_transform_amd64_avx(void *ctx, const unsigned char *data,
                               size_t nblks)
 {
   SHA512_CONTEXT *hd = ctx;
-  return _gcry_sha512_transform_amd64_avx (data, &hd->state, nblks)
-         + ASM_EXTRA_STACK;
+  unsigned int burn;
+  burn = _gcry_sha512_transform_amd64_avx (data, &hd->state, nblks);
+  burn += burn > 0 ? ASM_EXTRA_STACK : 0;
+  return burn;
 }
 #endif
 
@@ -262,8 +311,27 @@ do_sha512_transform_amd64_avx2(void *ctx, const unsigned char *data,
                                size_t nblks)
 {
   SHA512_CONTEXT *hd = ctx;
-  return _gcry_sha512_transform_amd64_avx2 (data, &hd->state, nblks)
-         + ASM_EXTRA_STACK;
+  unsigned int burn;
+  burn = _gcry_sha512_transform_amd64_avx2 (data, &hd->state, nblks);
+  burn += burn > 0 ? ASM_EXTRA_STACK : 0;
+  return burn;
+}
+#endif
+
+#ifdef USE_AVX512
+unsigned int _gcry_sha512_transform_amd64_avx512(const void *input_data,
+                                                void *state,
+                                                size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha512_transform_amd64_avx512(void *ctx, const unsigned char *data,
+                                 size_t nblks)
+{
+  SHA512_CONTEXT *hd = ctx;
+  unsigned int burn;
+  burn = _gcry_sha512_transform_amd64_avx512 (data, &hd->state, nblks);
+  burn += burn > 0 ? ASM_EXTRA_STACK : 0;
+  return burn;
 }
 #endif
 
@@ -381,6 +449,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
   if ((features & HWF_ARM_NEON) != 0)
     ctx->bctx.bwrite = do_sha512_transform_armv7_neon;
 #endif
+#ifdef USE_ARM64_SHA512
+  if ((features & HWF_ARM_NEON) && (features & HWF_ARM_SHA512))
+    ctx->bctx.bwrite = do_sha512_transform_armv8_ce;
+#endif
 #ifdef USE_SSSE3
   if ((features & HWF_INTEL_SSSE3) != 0)
     ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3;
@@ -393,6 +465,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
   if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2))
     ctx->bctx.bwrite = do_sha512_transform_amd64_avx2;
 #endif
+#ifdef USE_AVX512
+  if ((features & HWF_INTEL_AVX512) && (features & HWF_INTEL_CPU))
+    ctx->bctx.bwrite = do_sha512_transform_amd64_avx512;
+#endif
 #ifdef USE_PPC_CRYPTO
   if ((features & HWF_PPC_VCRYPTO) != 0)
     ctx->bctx.bwrite = do_sha512_transform_ppc8;
index 3fb8900638875276d7b4788a6d9e43b2d946325c..7ef64666dc72c93ecbee7f3f9be4ac8306dc95de 100644 (file)
@@ -29,7 +29,7 @@
 
 /* Constants */
 
-.text
+SECTION_RODATA
 .align 4
 ELF(.type _gcry_sm3_aarch64_consts,@object)
 _gcry_sm3_aarch64_consts:
@@ -383,6 +383,9 @@ ELF(.size _gcry_sm3_aarch64_consts,.-_gcry_sm3_aarch64_consts)
 #define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
         SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
 
+
+.text
+
 /*
  * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
  *
@@ -390,7 +393,7 @@ ELF(.size _gcry_sm3_aarch64_consts,.-_gcry_sm3_aarch64_consts)
  * _gcry_sm3_transform_aarch64 (void *ctx, const unsigned char *data,
  *                              size_t nblks)
  */
-.align 3
+.align 4
 .globl _gcry_sm3_transform_aarch64
 ELF(.type _gcry_sm3_transform_aarch64,%function;)
 _gcry_sm3_transform_aarch64:
diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S
new file mode 100644 (file)
index 0000000..5f5f599
--- /dev/null
@@ -0,0 +1,221 @@
+/* sm3-armv8-aarch64-ce.S  -  ARMv8/AArch64/CE accelerated SM3 cipher
+ *
+ * Copyright (C) 2022 Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(USE_SM3)
+
+.cpu generic+simd+crypto
+
+/* Must be consistent with register macros */
+#define vecnum_v0       0
+#define vecnum_v1       1
+#define vecnum_v2       2
+#define vecnum_v3       3
+#define vecnum_v4       4
+#define vecnum_CTX1     16
+#define vecnum_CTX2     17
+#define vecnum_SS1      18
+#define vecnum_WT       19
+#define vecnum_K0       20
+#define vecnum_K1       21
+#define vecnum_K2       22
+#define vecnum_K3       23
+#define vecnum_RTMP0    24
+#define vecnum_RTMP1    25
+
+#define sm3partw1(vd, vn, vm) \
+    .inst (0xce60c000 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm3partw2(vd, vn, vm) \
+    .inst (0xce60c400 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm3ss1(vd, vn, vm, va) \
+    .inst (0xce400000 | (vecnum_##vm << 16) | (vecnum_##va << 10) \
+            | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm3tt1a(vd, vn, vm, imm2) \
+    .inst (0xce408000 | (vecnum_##vm << 16) | imm2 << 12 \
+            | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm3tt1b(vd, vn, vm, imm2) \
+    .inst (0xce408400 | (vecnum_##vm << 16) | imm2 << 12 \
+            | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm3tt2a(vd, vn, vm, imm2) \
+    .inst (0xce408800 | (vecnum_##vm << 16) | imm2 << 12 \
+            | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm3tt2b(vd, vn, vm, imm2) \
+    .inst (0xce408c00 | (vecnum_##vm << 16) | imm2 << 12 \
+            | (vecnum_##vn << 5) | vecnum_##vd)
+
+/* Constants */
+
+SECTION_RODATA
+.align 4
+ELF(.type _gcry_sm3_armv8_ce_consts,@object)
+_gcry_sm3_armv8_ce_consts:
+.Lsm3_Ktable:
+    .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
+    .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
+    .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
+    .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
+    .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
+    .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
+    .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
+    .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+    .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
+    .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
+    .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
+    .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
+    .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
+    .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
+    .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
+    .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+ELF(.size _gcry_sm3_armv8_ce_consts,.-_gcry_sm3_armv8_ce_consts)
+
+/* Register macros */
+
+/* Must be consistent with vecnum_ macros */
+#define CTX1    v16
+#define CTX2    v17
+#define SS1     v18
+#define WT      v19
+
+#define K0      v20
+#define K1      v21
+#define K2      v22
+#define K3      v23
+
+#define RTMP0   v24
+#define RTMP1   v25
+
+/* Helper macros. */
+
+#define _(...) /*_*/
+
+#define SCHED_W_1(s0, s1, s2, s3, s4) ext       s4.16b, s1.16b, s2.16b, #12
+#define SCHED_W_2(s0, s1, s2, s3, s4) ext       RTMP0.16b, s0.16b, s1.16b, #12
+#define SCHED_W_3(s0, s1, s2, s3, s4) ext       RTMP1.16b, s2.16b, s3.16b, #8
+#define SCHED_W_4(s0, s1, s2, s3, s4) sm3partw1(s4, s0, s3)
+#define SCHED_W_5(s0, s1, s2, s3, s4) sm3partw2(s4, RTMP1, RTMP0)
+
+#define SCHED_W(n, s0, s1, s2, s3, s4) SCHED_W_##n(s0, s1, s2, s3, s4)
+
+#define R(ab, s0, s1, s2, s3, s4, IOP)                  \
+        ld4     {K0.s, K1.s, K2.s, K3.s}[3], [x3], #16; \
+        eor     WT.16b, s0.16b, s1.16b;                 \
+                                                        \
+        sm3ss1(SS1, CTX1, CTX2, K0);                    \
+      IOP(1, s0, s1, s2, s3, s4);                       \
+        sm3tt1##ab(CTX1, SS1, WT, 0);                   \
+        sm3tt2##ab(CTX2, SS1, s0, 0);                   \
+                                                        \
+      IOP(2, s0, s1, s2, s3, s4);                       \
+        sm3ss1(SS1, CTX1, CTX2, K1);                    \
+      IOP(3, s0, s1, s2, s3, s4);                       \
+        sm3tt1##ab(CTX1, SS1, WT, 1);                   \
+        sm3tt2##ab(CTX2, SS1, s0, 1);                   \
+                                                        \
+        sm3ss1(SS1, CTX1, CTX2, K2);                    \
+      IOP(4, s0, s1, s2, s3, s4);                       \
+        sm3tt1##ab(CTX1, SS1, WT, 2);                   \
+        sm3tt2##ab(CTX2, SS1, s0, 2);                   \
+                                                        \
+        sm3ss1(SS1, CTX1, CTX2, K3);                    \
+      IOP(5, s0, s1, s2, s3, s4);                       \
+        sm3tt1##ab(CTX1, SS1, WT, 3);                   \
+        sm3tt2##ab(CTX2, SS1, s0, 3);
+
+#define R1(s0, s1, s2, s3, s4, IOP)  R(a, s0, s1, s2, s3, s4, IOP)
+#define R2(s0, s1, s2, s3, s4, IOP)  R(b, s0, s1, s2, s3, s4, IOP)
+
+
+.text
+
+.align 4
+.global _gcry_sm3_transform_armv8_ce
+ELF(.type _gcry_sm3_transform_armv8_ce,%function;)
+_gcry_sm3_transform_armv8_ce:
+    /* input:
+     *   x0: CTX
+     *   x1: data
+     *   x2: nblocks
+     */
+    CFI_STARTPROC();
+
+    ld1         {CTX1.4s, CTX2.4s}, [x0];
+    rev64       CTX1.4s, CTX1.4s;
+    rev64       CTX2.4s, CTX2.4s;
+    ext         CTX1.16b, CTX1.16b, CTX1.16b, #8;
+    ext         CTX2.16b, CTX2.16b, CTX2.16b, #8;
+
+.Lloop:
+    GET_DATA_POINTER(x3, .Lsm3_Ktable);
+    ld1         {v0.16b-v3.16b}, [x1], #64;
+    sub         x2, x2, #1;
+
+    mov         v6.16b, CTX1.16b;
+    mov         v7.16b, CTX2.16b;
+
+    rev32       v0.16b, v0.16b;
+    rev32       v1.16b, v1.16b;
+    rev32       v2.16b, v2.16b;
+    rev32       v3.16b, v3.16b;
+
+    R1(v0, v1, v2, v3, v4, SCHED_W);
+    R1(v1, v2, v3, v4, v0, SCHED_W);
+    R1(v2, v3, v4, v0, v1, SCHED_W);
+    R1(v3, v4, v0, v1, v2, SCHED_W);
+    R2(v4, v0, v1, v2, v3, SCHED_W);
+    R2(v0, v1, v2, v3, v4, SCHED_W);
+    R2(v1, v2, v3, v4, v0, SCHED_W);
+    R2(v2, v3, v4, v0, v1, SCHED_W);
+    R2(v3, v4, v0, v1, v2, SCHED_W);
+    R2(v4, v0, v1, v2, v3, SCHED_W);
+    R2(v0, v1, v2, v3, v4, SCHED_W);
+    R2(v1, v2, v3, v4, v0, SCHED_W);
+    R2(v2, v3, v4, v0, v1, SCHED_W);
+    R2(v3, v4, v0, v1, v2, _);
+    R2(v4, v0, v1, v2, v3, _);
+    R2(v0, v1, v2, v3, v4, _);
+
+    eor         CTX1.16b, CTX1.16b, v6.16b;
+    eor         CTX2.16b, CTX2.16b, v7.16b;
+
+    cbnz        x2, .Lloop;
+
+    /* save state */
+    rev64       CTX1.4s, CTX1.4s;
+    rev64       CTX2.4s, CTX2.4s;
+    ext         CTX1.16b, CTX1.16b, CTX1.16b, #8;
+    ext         CTX2.16b, CTX2.16b, CTX2.16b, #8;
+    st1         {CTX1.4s, CTX2.4s}, [x0];
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm3_transform_armv8_ce, .-_gcry_sm3_transform_armv8_ce;)
+
+#endif
index d9b6206a824b2593a610d7a7cf5da5c38a03d012..ef923165357c5f5a7f57bf5987d060f557a0245b 100644 (file)
@@ -41,7 +41,7 @@
 
 /* Constants */
 
-.text
+SECTION_RODATA
 .align 16
 ELF(.type _gcry_sm3_avx2_consts,@object)
 _gcry_sm3_avx2_consts:
@@ -334,6 +334,8 @@ ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts)
         vpxor w0, XTMP4, XTMP1; \
         vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);
 
+.text
+
 /*
  * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
  *
index 0ab5f5067edbbdd80b282c8e66b34904c026a088..bfe9f4c25225c1743fb88428f53c3dcf77aee280 100644 (file)
 # endif
 #endif
 
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension code. */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) && \
+     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+#   define USE_ARM_CE 1
+# endif
+#endif
+
 
 typedef struct {
   gcry_md_block_ctx_t bctx;
@@ -117,6 +127,20 @@ do_sm3_transform_aarch64(void *context, const unsigned char *data, size_t nblks)
 }
 #endif /* USE_AARCH64_SIMD */
 
+#ifdef USE_ARM_CE
+void _gcry_sm3_transform_armv8_ce(void *state, const void *input_data,
+                                    size_t num_blks);
+
+static unsigned int
+do_sm3_transform_armv8_ce(void *context, const unsigned char *data,
+                            size_t nblks)
+{
+  SM3_CONTEXT *hd = context;
+  _gcry_sm3_transform_armv8_ce (hd->h, data, nblks);
+  return 0;
+}
+#endif /* USE_ARM_CE */
+
 
 static unsigned int
 transform (void *c, const unsigned char *data, size_t nblks);
@@ -153,6 +177,10 @@ sm3_init (void *context, unsigned int flags)
   if (features & HWF_ARM_NEON)
     hd->bctx.bwrite = do_sm3_transform_aarch64;
 #endif
+#ifdef USE_ARM_CE
+  if (features & HWF_ARM_SM3)
+    hd->bctx.bwrite = do_sm3_transform_armv8_ce;
+#endif
 
   (void)features;
 }
diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S
new file mode 100644 (file)
index 0000000..cce6fcc
--- /dev/null
@@ -0,0 +1,644 @@
+/* sm4-aarch64.S  -  ARMv8/AArch64 accelerated SM4 cipher
+ *
+ * Copyright (C) 2022 Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+    defined(USE_SM4)
+
+.cpu generic+simd
+
+/* Constants */
+
+SECTION_RODATA
+.align 4
+ELF(.type _gcry_sm4_aarch64_consts,@object)
+_gcry_sm4_aarch64_consts:
+.Lsm4_sbox:
+  .byte 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7
+  .byte 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05
+  .byte 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3
+  .byte 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99
+  .byte 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a
+  .byte 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62
+  .byte 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95
+  .byte 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6
+  .byte 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba
+  .byte 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8
+  .byte 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b
+  .byte 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35
+  .byte 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2
+  .byte 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87
+  .byte 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52
+  .byte 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e
+  .byte 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5
+  .byte 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1
+  .byte 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55
+  .byte 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3
+  .byte 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60
+  .byte 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f
+  .byte 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f
+  .byte 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51
+  .byte 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f
+  .byte 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8
+  .byte 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd
+  .byte 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0
+  .byte 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e
+  .byte 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84
+  .byte 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20
+  .byte 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
+ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts)
+
+/* Register macros */
+
+#define RTMP0   v8
+#define RTMP1   v9
+#define RTMP2   v10
+#define RTMP3   v11
+
+#define RX0     v12
+#define RX1     v13
+#define RKEY    v14
+#define RIV     v15
+
+/* Helper macros. */
+
+#define preload_sbox(ptr)                   \
+        GET_DATA_POINTER(ptr, .Lsm4_sbox);  \
+        ld1 {v16.16b-v19.16b}, [ptr], #64;  \
+        ld1 {v20.16b-v23.16b}, [ptr], #64;  \
+        ld1 {v24.16b-v27.16b}, [ptr], #64;  \
+        ld1 {v28.16b-v31.16b}, [ptr];
+
+#define transpose_4x4(s0, s1, s2, s3)       \
+        zip1 RTMP0.4s, s0.4s, s1.4s;        \
+        zip1 RTMP1.4s, s2.4s, s3.4s;        \
+        zip2 RTMP2.4s, s0.4s, s1.4s;        \
+        zip2 RTMP3.4s, s2.4s, s3.4s;        \
+        zip1 s0.2d, RTMP0.2d, RTMP1.2d;     \
+        zip2 s1.2d, RTMP0.2d, RTMP1.2d;     \
+        zip1 s2.2d, RTMP2.2d, RTMP3.2d;     \
+        zip2 s3.2d, RTMP2.2d, RTMP3.2d;
+
+#define rotate_clockwise_90(s0, s1, s2, s3) \
+        zip1 RTMP0.4s, s1.4s, s0.4s;        \
+        zip2 RTMP1.4s, s1.4s, s0.4s;        \
+        zip1 RTMP2.4s, s3.4s, s2.4s;        \
+        zip2 RTMP3.4s, s3.4s, s2.4s;        \
+        zip1 s0.2d, RTMP2.2d, RTMP0.2d;     \
+        zip2 s1.2d, RTMP2.2d, RTMP0.2d;     \
+        zip1 s2.2d, RTMP3.2d, RTMP1.2d;     \
+        zip2 s3.2d, RTMP3.2d, RTMP1.2d;
+
+
+.text
+
+.align 4
+ELF(.type sm4_aarch64_crypt_blk1_4,%function;)
+sm4_aarch64_crypt_blk1_4:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: num blocks (1..4)
+     */
+    CFI_STARTPROC();
+    VPUSH_ABI;
+
+    preload_sbox(x5);
+
+    ld1 {v0.16b}, [x2], #16;
+    mov v1.16b, v0.16b;
+    mov v2.16b, v0.16b;
+    mov v3.16b, v0.16b;
+    cmp x3, #2;
+    blt .Lblk4_load_input_done;
+    ld1 {v1.16b}, [x2], #16;
+    beq .Lblk4_load_input_done;
+    ld1 {v2.16b}, [x2], #16;
+    cmp x3, #3;
+    beq .Lblk4_load_input_done;
+    ld1 {v3.16b}, [x2];
+
+.Lblk4_load_input_done:
+
+    rev32 v0.16b, v0.16b;
+    rev32 v1.16b, v1.16b;
+    rev32 v2.16b, v2.16b;
+    rev32 v3.16b, v3.16b;
+
+    transpose_4x4(v0, v1, v2, v3);
+
+#define ROUND(round, s0, s1, s2, s3)                     \
+        dup RX0.4s, RKEY.s[round];                       \
+        /* rk ^ s1 ^ s2 ^ s3 */                          \
+        eor RTMP1.16b, s2.16b, s3.16b;                   \
+        eor RX0.16b, RX0.16b, s1.16b;                    \
+        eor RX0.16b, RX0.16b, RTMP1.16b;                 \
+                                                         \
+        /* sbox, non-linear part */                      \
+        movi RTMP3.16b, #64;    /* sizeof(sbox) / 4 */   \
+        tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;       \
+        sub RX0.16b, RX0.16b, RTMP3.16b;                 \
+        tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;       \
+        sub RX0.16b, RX0.16b, RTMP3.16b;                 \
+        tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;       \
+        sub RX0.16b, RX0.16b, RTMP3.16b;                 \
+        tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;       \
+                                                         \
+        /* linear part */                                \
+        shl RTMP1.4s, RTMP0.4s, #8;                      \
+        shl RTMP2.4s, RTMP0.4s, #16;                     \
+        shl RTMP3.4s, RTMP0.4s, #24;                     \
+        sri RTMP1.4s, RTMP0.4s, #(32-8);                 \
+        sri RTMP2.4s, RTMP0.4s, #(32-16);                \
+        sri RTMP3.4s, RTMP0.4s, #(32-24);                \
+        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */     \
+        eor RTMP1.16b, RTMP1.16b, RTMP0.16b;             \
+        eor RTMP1.16b, RTMP1.16b, RTMP2.16b;             \
+        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
+        eor RTMP3.16b, RTMP3.16b, RTMP0.16b;             \
+        shl RTMP2.4s, RTMP1.4s, 2;                       \
+        sri RTMP2.4s, RTMP1.4s, #(32-2);                 \
+        eor RTMP3.16b, RTMP3.16b, RTMP2.16b;             \
+        /* s0 ^= RTMP3 */                                \
+        eor s0.16b, s0.16b, RTMP3.16b;
+
+    mov x6, 8;
+.Lroundloop4:
+    ld1 {RKEY.4s}, [x0], #16;
+    subs x6, x6, #1;
+
+    ROUND(0, v0, v1, v2, v3);
+    ROUND(1, v1, v2, v3, v0);
+    ROUND(2, v2, v3, v0, v1);
+    ROUND(3, v3, v0, v1, v2);
+
+    bne .Lroundloop4;
+
+#undef ROUND
+
+    rotate_clockwise_90(v0, v1, v2, v3);
+    rev32 v0.16b, v0.16b;
+    rev32 v1.16b, v1.16b;
+    rev32 v2.16b, v2.16b;
+    rev32 v3.16b, v3.16b;
+
+    st1 {v0.16b}, [x1], #16;
+    cmp x3, #2;
+    blt .Lblk4_store_output_done;
+    st1 {v1.16b}, [x1], #16;
+    beq .Lblk4_store_output_done;
+    st1 {v2.16b}, [x1], #16;
+    cmp x3, #3;
+    beq .Lblk4_store_output_done;
+    st1 {v3.16b}, [x1];
+
+.Lblk4_store_output_done:
+    VPOP_ABI;
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size sm4_aarch64_crypt_blk1_4,.-sm4_aarch64_crypt_blk1_4;)
+
+.align 4
+ELF(.type __sm4_crypt_blk8,%function;)
+__sm4_crypt_blk8:
+    /* input:
+     *   x0: round key array, CTX
+     *   v16-v31: fill with sbox
+     *   v0, v1, v2, v3, v4, v5, v6, v7: eight parallel plaintext blocks
+     * output:
+     *   v0, v1, v2, v3, v4, v5, v6, v7: eight parallel ciphertext blocks
+     */
+    CFI_STARTPROC();
+
+    rev32 v0.16b, v0.16b;
+    rev32 v1.16b, v1.16b;
+    rev32 v2.16b, v2.16b;
+    rev32 v3.16b, v3.16b;
+    rev32 v4.16b, v4.16b;
+    rev32 v5.16b, v5.16b;
+    rev32 v6.16b, v6.16b;
+    rev32 v7.16b, v7.16b;
+
+    transpose_4x4(v0, v1, v2, v3);
+    transpose_4x4(v4, v5, v6, v7);
+
+#define ROUND(round, s0, s1, s2, s3, t0, t1, t2, t3)     \
+        /* rk ^ s1 ^ s2 ^ s3 */                          \
+        dup RX0.4s, RKEY.s[round];                       \
+        eor RTMP0.16b, s2.16b, s3.16b;                   \
+        mov RX1.16b, RX0.16b;                            \
+        eor RTMP1.16b, t2.16b, t3.16b;                   \
+        eor RX0.16b, RX0.16b, s1.16b;                    \
+        eor RX1.16b, RX1.16b, t1.16b;                    \
+        eor RX0.16b, RX0.16b, RTMP0.16b;                 \
+        eor RX1.16b, RX1.16b, RTMP1.16b;                 \
+                                                         \
+        /* sbox, non-linear part */                      \
+        movi RTMP3.16b, #64;    /* sizeof(sbox) / 4 */   \
+        tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;       \
+        tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;       \
+        sub RX0.16b, RX0.16b, RTMP3.16b;                 \
+        sub RX1.16b, RX1.16b, RTMP3.16b;                 \
+        tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;       \
+        tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;       \
+        sub RX0.16b, RX0.16b, RTMP3.16b;                 \
+        sub RX1.16b, RX1.16b, RTMP3.16b;                 \
+        tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;       \
+        tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;       \
+        sub RX0.16b, RX0.16b, RTMP3.16b;                 \
+        sub RX1.16b, RX1.16b, RTMP3.16b;                 \
+        tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;       \
+        tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;       \
+                                                         \
+        /* linear part */                                \
+        shl RX0.4s, RTMP0.4s, #8;                        \
+        shl RX1.4s, RTMP1.4s, #8;                        \
+        shl RTMP2.4s, RTMP0.4s, #16;                     \
+        shl RTMP3.4s, RTMP1.4s, #16;                     \
+        sri RX0.4s, RTMP0.4s, #(32 - 8);                 \
+        sri RX1.4s, RTMP1.4s, #(32 - 8);                 \
+        sri RTMP2.4s, RTMP0.4s, #(32 - 16);              \
+        sri RTMP3.4s, RTMP1.4s, #(32 - 16);              \
+        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */        \
+        eor RX0.16b, RX0.16b, RTMP0.16b;                 \
+        eor RX1.16b, RX1.16b, RTMP1.16b;                 \
+        eor RX0.16b, RX0.16b, RTMP2.16b;                 \
+        eor RX1.16b, RX1.16b, RTMP3.16b;                 \
+        /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
+        shl RTMP2.4s, RTMP0.4s, #24;                     \
+        shl RTMP3.4s, RTMP1.4s, #24;                     \
+        sri RTMP2.4s, RTMP0.4s, #(32 - 24);              \
+        sri RTMP3.4s, RTMP1.4s, #(32 - 24);              \
+        eor RTMP0.16b, RTMP0.16b, RTMP2.16b;             \
+        eor RTMP1.16b, RTMP1.16b, RTMP3.16b;             \
+        shl RTMP2.4s, RX0.4s, #2;                        \
+        shl RTMP3.4s, RX1.4s, #2;                        \
+        sri RTMP2.4s, RX0.4s, #(32 - 2);                 \
+        sri RTMP3.4s, RX1.4s, #(32 - 2);                 \
+        eor RTMP0.16b, RTMP0.16b, RTMP2.16b;             \
+        eor RTMP1.16b, RTMP1.16b, RTMP3.16b;             \
+        /* s0/t0 ^= RTMP0/1 */                           \
+        eor s0.16b, s0.16b, RTMP0.16b;                   \
+        eor t0.16b, t0.16b, RTMP1.16b;
+
+    mov x6, 8;
+.Lroundloop8:
+    ld1 {RKEY.4s}, [x0], #16;
+    subs x6, x6, #1;
+
+    ROUND(0, v0, v1, v2, v3, v4, v5, v6, v7);
+    ROUND(1, v1, v2, v3, v0, v5, v6, v7, v4);
+    ROUND(2, v2, v3, v0, v1, v6, v7, v4, v5);
+    ROUND(3, v3, v0, v1, v2, v7, v4, v5, v6);
+
+    bne .Lroundloop8;
+
+#undef ROUND
+
+    rotate_clockwise_90(v0, v1, v2, v3);
+    rotate_clockwise_90(v4, v5, v6, v7);
+    rev32 v0.16b, v0.16b;
+    rev32 v1.16b, v1.16b;
+    rev32 v2.16b, v2.16b;
+    rev32 v3.16b, v3.16b;
+    rev32 v4.16b, v4.16b;
+    rev32 v5.16b, v5.16b;
+    rev32 v6.16b, v6.16b;
+    rev32 v7.16b, v7.16b;
+
+    sub x0, x0, #128;       /* repoint to rkey */
+    ret;
+    CFI_ENDPROC();
+ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;)
+
+.align 4
+.global _gcry_sm4_aarch64_crypt_blk1_8
+ELF(.type _gcry_sm4_aarch64_crypt_blk1_8,%function;)
+_gcry_sm4_aarch64_crypt_blk1_8:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: num blocks (1..8)
+     */
+    CFI_STARTPROC();
+
+    cmp x3, #5;
+    blt sm4_aarch64_crypt_blk1_4;
+
+    stp x29, x30, [sp, #-16]!;
+    CFI_ADJUST_CFA_OFFSET(16);
+    CFI_REG_ON_STACK(29, 0);
+    CFI_REG_ON_STACK(30, 8);
+    VPUSH_ABI;
+
+    preload_sbox(x5);
+
+    ld1 {v0.16b-v3.16b}, [x2], #64;
+    ld1 {v4.16b}, [x2], #16;
+    mov v5.16b, v4.16b;
+    mov v6.16b, v4.16b;
+    mov v7.16b, v4.16b;
+    beq .Lblk8_load_input_done;
+    ld1 {v5.16b}, [x2], #16;
+    cmp x3, #7;
+    blt .Lblk8_load_input_done;
+    ld1 {v6.16b}, [x2], #16;
+    beq .Lblk8_load_input_done;
+    ld1 {v7.16b}, [x2];
+
+.Lblk8_load_input_done:
+    bl __sm4_crypt_blk8;
+
+    cmp x3, #6;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+    st1 {v4.16b}, [x1], #16;
+    blt .Lblk8_store_output_done;
+    st1 {v5.16b}, [x1], #16;
+    beq .Lblk8_store_output_done;
+    st1 {v6.16b}, [x1], #16;
+    cmp x3, #7;
+    beq .Lblk8_store_output_done;
+    st1 {v7.16b}, [x1];
+
+.Lblk8_store_output_done:
+    VPOP_ABI;
+    ldp x29, x30, [sp], #16;
+    CFI_ADJUST_CFA_OFFSET(-16);
+    CFI_RESTORE(x29);
+    CFI_RESTORE(x30);
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_crypt_blk1_8,.-_gcry_sm4_aarch64_crypt_blk1_8;)
+
+
+.align 4
+.global _gcry_sm4_aarch64_crypt
+ELF(.type _gcry_sm4_aarch64_crypt,%function;)
+_gcry_sm4_aarch64_crypt:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    stp x29, x30, [sp, #-16]!;
+    CFI_ADJUST_CFA_OFFSET(16);
+    CFI_REG_ON_STACK(29, 0);
+    CFI_REG_ON_STACK(30, 8);
+    VPUSH_ABI;
+
+    preload_sbox(x5);
+
+.Lcrypt_loop_blk:
+    subs x3, x3, #8;
+    bmi .Lcrypt_end;
+
+    ld1 {v0.16b-v3.16b}, [x2], #64;
+    ld1 {v4.16b-v7.16b}, [x2], #64;
+    bl __sm4_crypt_blk8;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+    b .Lcrypt_loop_blk;
+
+.Lcrypt_end:
+    VPOP_ABI;
+    ldp x29, x30, [sp], #16;
+    CFI_ADJUST_CFA_OFFSET(-16);
+    CFI_RESTORE(x29);
+    CFI_RESTORE(x30);
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_crypt,.-_gcry_sm4_aarch64_crypt;)
+
+
+.align 4
+.global _gcry_sm4_aarch64_cbc_dec
+ELF(.type _gcry_sm4_aarch64_cbc_dec,%function;)
+_gcry_sm4_aarch64_cbc_dec:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: iv (big endian, 128 bit)
+     *   x4: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    stp x29, x30, [sp, #-16]!;
+    CFI_ADJUST_CFA_OFFSET(16);
+    CFI_REG_ON_STACK(29, 0);
+    CFI_REG_ON_STACK(30, 8);
+    VPUSH_ABI;
+
+    preload_sbox(x5);
+    ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+    subs x4, x4, #8;
+    bmi .Lcbc_end;
+
+    ld1 {v0.16b-v3.16b}, [x2], #64;
+    ld1 {v4.16b-v7.16b}, [x2];
+
+    bl __sm4_crypt_blk8;
+
+    sub x2, x2, #64;
+    eor v0.16b, v0.16b, RIV.16b;
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v1.16b, v1.16b, RTMP0.16b;
+    eor v2.16b, v2.16b, RTMP1.16b;
+    eor v3.16b, v3.16b, RTMP2.16b;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+
+    eor v4.16b, v4.16b, RTMP3.16b;
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v5.16b, v5.16b, RTMP0.16b;
+    eor v6.16b, v6.16b, RTMP1.16b;
+    eor v7.16b, v7.16b, RTMP2.16b;
+
+    mov RIV.16b, RTMP3.16b;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    b .Lcbc_loop_blk;
+
+.Lcbc_end:
+    /* store new IV */
+    st1 {RIV.16b}, [x3];
+
+    VPOP_ABI;
+    ldp x29, x30, [sp], #16;
+    CFI_ADJUST_CFA_OFFSET(-16);
+    CFI_RESTORE(x29);
+    CFI_RESTORE(x30);
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_cbc_dec,.-_gcry_sm4_aarch64_cbc_dec;)
+
+.align 4
+.global _gcry_sm4_aarch64_cfb_dec
+ELF(.type _gcry_sm4_aarch64_cfb_dec,%function;)
+_gcry_sm4_aarch64_cfb_dec:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: iv (big endian, 128 bit)
+     *   x4: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    stp x29, x30, [sp, #-16]!;
+    CFI_ADJUST_CFA_OFFSET(16);
+    CFI_REG_ON_STACK(29, 0);
+    CFI_REG_ON_STACK(30, 8);
+    VPUSH_ABI;
+
+    preload_sbox(x5);
+    ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+    subs x4, x4, #8;
+    bmi .Lcfb_end;
+
+    ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+    ld1 {v4.16b-v7.16b}, [x2];
+
+    bl __sm4_crypt_blk8;
+
+    sub x2, x2, #48;
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v0.16b, v0.16b, RTMP0.16b;
+    eor v1.16b, v1.16b, RTMP1.16b;
+    eor v2.16b, v2.16b, RTMP2.16b;
+    eor v3.16b, v3.16b, RTMP3.16b;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v4.16b, v4.16b, RTMP0.16b;
+    eor v5.16b, v5.16b, RTMP1.16b;
+    eor v6.16b, v6.16b, RTMP2.16b;
+    eor v7.16b, v7.16b, RTMP3.16b;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    mov v0.16b, RTMP3.16b;
+
+    b .Lcfb_loop_blk;
+
+.Lcfb_end:
+    /* store new IV */
+    st1 {v0.16b}, [x3];
+
+    VPOP_ABI;
+    ldp x29, x30, [sp], #16;
+    CFI_ADJUST_CFA_OFFSET(-16);
+    CFI_RESTORE(x29);
+    CFI_RESTORE(x30);
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_cfb_dec,.-_gcry_sm4_aarch64_cfb_dec;)
+
+.align 4
+.global _gcry_sm4_aarch64_ctr_enc
+ELF(.type _gcry_sm4_aarch64_ctr_enc,%function;)
+_gcry_sm4_aarch64_ctr_enc:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: ctr (big endian, 128 bit)
+     *   x4: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    stp x29, x30, [sp, #-16]!;
+    CFI_ADJUST_CFA_OFFSET(16);
+    CFI_REG_ON_STACK(29, 0);
+    CFI_REG_ON_STACK(30, 8);
+    VPUSH_ABI;
+
+    preload_sbox(x5);
+
+    ldp x7, x8, [x3];
+    rev x7, x7;
+    rev x8, x8;
+
+.Lctr_loop_blk:
+    subs x4, x4, #8;
+    bmi .Lctr_end;
+
+#define inc_le128(vctr)       \
+    mov vctr.d[1], x8;        \
+    mov vctr.d[0], x7;        \
+    adds x8, x8, #1;          \
+    adc x7, x7, xzr;          \
+    rev64 vctr.16b, vctr.16b;
+
+    /* construct CTRs */
+    inc_le128(v0);      /* +0 */
+    inc_le128(v1);      /* +1 */
+    inc_le128(v2);      /* +2 */
+    inc_le128(v3);      /* +3 */
+    inc_le128(v4);      /* +4 */
+    inc_le128(v5);      /* +5 */
+    inc_le128(v6);      /* +6 */
+    inc_le128(v7);      /* +7 */
+
+    bl __sm4_crypt_blk8;
+
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v0.16b, v0.16b, RTMP0.16b;
+    eor v1.16b, v1.16b, RTMP1.16b;
+    eor v2.16b, v2.16b, RTMP2.16b;
+    eor v3.16b, v3.16b, RTMP3.16b;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v4.16b, v4.16b, RTMP0.16b;
+    eor v5.16b, v5.16b, RTMP1.16b;
+    eor v6.16b, v6.16b, RTMP2.16b;
+    eor v7.16b, v7.16b, RTMP3.16b;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    b .Lctr_loop_blk;
+
+.Lctr_end:
+    /* store new CTR */
+    rev x7, x7;
+    rev x8, x8;
+    stp x7, x8, [x3];
+
+    VPOP_ABI;
+    ldp x29, x30, [sp], #16;
+    CFI_ADJUST_CFA_OFFSET(-16);
+    CFI_RESTORE(x29);
+    CFI_RESTORE(x30);
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_ctr_enc,.-_gcry_sm4_aarch64_ctr_enc;)
+
+#endif
index 7a99e070da865b8705a86e8381ae368c4d03f1c0..ca9be44a16842288a93ab35a89e18ebb75b7069a 100644 (file)
@@ -1,6 +1,6 @@
 /* sm4-avx-aesni-amd64.S  -  AES-NI/AVX implementation of SM4 cipher
  *
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
   4-way && 8-way SM4 with AES-NI and AVX
  **********************************************************************/
 
-.text
+SECTION_RODATA
 .align 16
 
+ELF(.type _sm4_aesni_avx_consts,@object)
+_sm4_aesni_avx_consts:
+
 /*
  * Following four affine transform look-up tables are from work by
  * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 .Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+/* CTR byte addition constants */
+.Lbige_addb_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
-.align 8
+.text
+
+.align 16
 .globl _gcry_sm4_aesni_avx_expand_key
 ELF(.type   _gcry_sm4_aesni_avx_expand_key,@function;)
 _gcry_sm4_aesni_avx_expand_key:
@@ -244,7 +281,7 @@ _gcry_sm4_aesni_avx_expand_key:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;)
 
-.align 8
+.align 16
 ELF(.type   sm4_aesni_avx_crypt_blk1_4,@function;)
 sm4_aesni_avx_crypt_blk1_4:
        /* input:
@@ -349,7 +386,7 @@ sm4_aesni_avx_crypt_blk1_4:
        CFI_ENDPROC();
 ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;)
 
-.align 8
+.align 16
 ELF(.type __sm4_crypt_blk8,@function;)
 __sm4_crypt_blk8:
        /* input:
@@ -458,7 +495,7 @@ __sm4_crypt_blk8:
        CFI_ENDPROC();
 ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_crypt_blk1_8
 ELF(.type   _gcry_sm4_aesni_avx_crypt_blk1_8,@function;)
 _gcry_sm4_aesni_avx_crypt_blk1_8:
@@ -512,7 +549,7 @@ _gcry_sm4_aesni_avx_crypt_blk1_8:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_ctr_enc
 ELF(.type   _gcry_sm4_aesni_avx_ctr_enc,@function;)
 _gcry_sm4_aesni_avx_ctr_enc:
@@ -524,6 +561,9 @@ _gcry_sm4_aesni_avx_ctr_enc:
         */
        CFI_STARTPROC();
 
+       cmpb $(0x100 - 8), 15(%rcx);
+       jbe .Lctr_byteadd;
+
        /* load IV and byteswap */
        vmovdqu (%rcx), RA0;
 
@@ -560,6 +600,8 @@ _gcry_sm4_aesni_avx_ctr_enc:
        /* store new IV */
        vmovdqu RTMP1, (%rcx);
 
+.align 8
+.Lload_ctr_done:
        call __sm4_crypt_blk8;
 
        vpxor (0 * 16)(%rdx), RA0, RA0;
@@ -583,10 +625,39 @@ _gcry_sm4_aesni_avx_ctr_enc:
        vzeroall;
 
        ret_spec_stop;
+       .align 8
+
+.Lctr_byteadd_full_ctr_carry:
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $8, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_xmm;
+.align 8
+.Lctr_byteadd:
+       vmovdqu (%rcx), RA0;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $8, 15(%rcx);
+.Lctr_byteadd_xmm:
+       vpaddb .Lbige_addb_1 rRIP, RA0, RA1;
+       vpaddb .Lbige_addb_2 rRIP, RA0, RA2;
+       vpaddb .Lbige_addb_3 rRIP, RA0, RA3;
+       vpaddb .Lbige_addb_4 rRIP, RA0, RB0;
+       vpaddb .Lbige_addb_5 rRIP, RA0, RB1;
+       vpaddb .Lbige_addb_6 rRIP, RA0, RB2;
+       vpaddb .Lbige_addb_7 rRIP, RA0, RB3;
+
+       jmp .Lload_ctr_done;
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_cbc_dec
 ELF(.type   _gcry_sm4_aesni_avx_cbc_dec,@function;)
 _gcry_sm4_aesni_avx_cbc_dec:
@@ -635,7 +706,7 @@ _gcry_sm4_aesni_avx_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_cfb_dec
 ELF(.type   _gcry_sm4_aesni_avx_cfb_dec,@function;)
 _gcry_sm4_aesni_avx_cfb_dec:
@@ -687,7 +758,7 @@ _gcry_sm4_aesni_avx_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_ocb_enc
 ELF(.type _gcry_sm4_aesni_avx_ocb_enc,@function;)
 
@@ -786,7 +857,7 @@ _gcry_sm4_aesni_avx_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_ocb_dec
 ELF(.type _gcry_sm4_aesni_avx_ocb_dec,@function;)
 
@@ -895,7 +966,7 @@ _gcry_sm4_aesni_avx_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx_ocb_auth
 ELF(.type _gcry_sm4_aesni_avx_ocb_auth,@function;)
 
index 7a8b9558f1cfa816829b3e0208d16daef434b5d2..03f979fa6c3a9b9e346ea265c4e9546645cdb410 100644 (file)
@@ -1,6 +1,6 @@
 /* sm4-avx2-amd64.S  -  AVX2 implementation of SM4 cipher
  *
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
 #define RA1          %ymm9
 #define RA2          %ymm10
 #define RA3          %ymm11
+#define RA0x         %xmm8
+#define RA1x         %xmm9
+#define RA2x         %xmm10
+#define RA3x         %xmm11
 
 #define RB0          %ymm12
 #define RB1          %ymm13
 #define RB2          %ymm14
 #define RB3          %ymm15
+#define RB0x         %xmm12
+#define RB1x         %xmm13
+#define RB2x         %xmm14
+#define RB3x         %xmm15
 
 #define RNOT         %ymm0
 #define RBSWAP       %ymm1
   16-way SM4 with AES-NI and AVX
  **********************************************************************/
 
-.text
+SECTION_RODATA
 .align 16
 
+ELF(.type _sm4_aesni_avx2_consts,@object)
+_sm4_aesni_avx2_consts:
+
 /*
  * Following four affine transform look-up tables are from work by
  * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 .Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+/* CTR byte addition constants */
+.align 32
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:
        .long 0x0f0f0f0f
 
-.align 8
+.text
+
+.align 16
 ELF(.type   __sm4_crypt_blk16,@function;)
 __sm4_crypt_blk16:
        /* input:
@@ -252,14 +292,14 @@ __sm4_crypt_blk16:
 
        leaq (32*4)(%rdi), %rax;
 .align 16
-.Lroundloop_blk8:
+.Lroundloop_blk16:
        ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
        ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
        ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
        ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
-       jne .Lroundloop_blk8;
+       jne .Lroundloop_blk16;
 
 #undef ROUND
 
@@ -280,13 +320,73 @@ __sm4_crypt_blk16:
        CFI_ENDPROC();
 ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)
 
+.align 16
+.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
+ELF(.type   _gcry_sm4_aesni_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_aesni_avx2_crypt_blk1_16:
+       /* input:
+        *      %rdi: round key array, CTX
+        *      %rsi: dst (1..16 blocks)
+        *      %rdx: src (1..16 blocks)
+        *      %rcx: num blocks (1..16)
+        */
+       CFI_STARTPROC();
+
+#define LOAD_INPUT(offset, yreg) \
+       cmpq $(1 + 2 * (offset)), %rcx; \
+       jb .Lblk16_load_input_done; \
+       ja 1f; \
+         vmovdqu (offset) * 32(%rdx), yreg##x; \
+         jmp .Lblk16_load_input_done; \
+       1: \
+         vmovdqu (offset) * 32(%rdx), yreg;
+
+       LOAD_INPUT(0, RA0);
+       LOAD_INPUT(1, RA1);
+       LOAD_INPUT(2, RA2);
+       LOAD_INPUT(3, RA3);
+       LOAD_INPUT(4, RB0);
+       LOAD_INPUT(5, RB1);
+       LOAD_INPUT(6, RB2);
+       LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+       call __sm4_crypt_blk16;
+
+#define STORE_OUTPUT(yreg, offset) \
+       cmpq $(1 + 2 * (offset)), %rcx; \
+       jb .Lblk16_store_output_done; \
+       ja 1f; \
+         vmovdqu yreg##x, (offset) * 32(%rsi); \
+         jmp .Lblk16_store_output_done; \
+       1: \
+         vmovdqu yreg, (offset) * 32(%rsi);
+
+       STORE_OUTPUT(RA0, 0);
+       STORE_OUTPUT(RA1, 1);
+       STORE_OUTPUT(RA2, 2);
+       STORE_OUTPUT(RA3, 3);
+       STORE_OUTPUT(RB0, 4);
+       STORE_OUTPUT(RB1, 5);
+       STORE_OUTPUT(RB2, 6);
+       STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+       vzeroall;
+       xorl %eax, %eax;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_16;)
+
 #define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx2_ctr_enc
 ELF(.type   _gcry_sm4_aesni_avx2_ctr_enc,@function;)
 _gcry_sm4_aesni_avx2_ctr_enc:
@@ -298,11 +398,12 @@ _gcry_sm4_aesni_avx2_ctr_enc:
         */
        CFI_STARTPROC();
 
+       cmpb $(0x100 - 16), 15(%rcx);
+       jbe .Lctr_byteadd;
+
        movq 8(%rcx), %rax;
        bswapq %rax;
 
-       vzeroupper;
-
        vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
@@ -367,11 +468,12 @@ _gcry_sm4_aesni_avx2_ctr_enc:
        vextracti128 $1, RTMP0, RTMP0x;
        vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
 
-.align 4
 .Lctr_carry_done:
        /* store new IV */
        vmovdqu RTMP0x, (%rcx);
 
+.align 8
+.Lload_ctr_done:
        call __sm4_crypt_blk16;
 
        vpxor (0 * 32)(%rdx), RA0, RA0;
@@ -395,10 +497,40 @@ _gcry_sm4_aesni_avx2_ctr_enc:
        vzeroall;
 
        ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $16, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+       vbroadcasti128 (%rcx), RB3;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $16, 15(%rcx);
+.Lctr_byteadd_ymm:
+       vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+       vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+       vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+       vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+       vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+       vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+       vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+       vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+       jmp .Lload_ctr_done;
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx2_cbc_dec
 ELF(.type   _gcry_sm4_aesni_avx2_cbc_dec,@function;)
 _gcry_sm4_aesni_avx2_cbc_dec:
@@ -410,8 +542,6 @@ _gcry_sm4_aesni_avx2_cbc_dec:
         */
        CFI_STARTPROC();
 
-       vzeroupper;
-
        vmovdqu (0 * 32)(%rdx), RA0;
        vmovdqu (1 * 32)(%rdx), RA1;
        vmovdqu (2 * 32)(%rdx), RA2;
@@ -451,7 +581,7 @@ _gcry_sm4_aesni_avx2_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx2_cfb_dec
 ELF(.type   _gcry_sm4_aesni_avx2_cfb_dec,@function;)
 _gcry_sm4_aesni_avx2_cfb_dec:
@@ -463,8 +593,6 @@ _gcry_sm4_aesni_avx2_cfb_dec:
         */
        CFI_STARTPROC();
 
-       vzeroupper;
-
        /* Load input */
        vmovdqu (%rcx), RNOTx;
        vinserti128 $1, (%rdx), RNOT, RA0;
@@ -506,7 +634,7 @@ _gcry_sm4_aesni_avx2_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx2_ocb_enc
 ELF(.type _gcry_sm4_aesni_avx2_ocb_enc,@function;)
 
@@ -521,8 +649,6 @@ _gcry_sm4_aesni_avx2_ocb_enc:
         */
        CFI_STARTPROC();
 
-       vzeroupper;
-
        subq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(4 * 8);
 
@@ -620,7 +746,7 @@ _gcry_sm4_aesni_avx2_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx2_ocb_dec
 ELF(.type _gcry_sm4_aesni_avx2_ocb_dec,@function;)
 
@@ -635,8 +761,6 @@ _gcry_sm4_aesni_avx2_ocb_dec:
         */
        CFI_STARTPROC();
 
-       vzeroupper;
-
        subq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(4 * 8);
 
@@ -744,7 +868,7 @@ _gcry_sm4_aesni_avx2_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_sm4_aesni_avx2_ocb_auth
 ELF(.type _gcry_sm4_aesni_avx2_ocb_auth,@function;)
 
@@ -758,8 +882,6 @@ _gcry_sm4_aesni_avx2_ocb_auth:
         */
        CFI_STARTPROC();
 
-       vzeroupper;
-
        subq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(4 * 8);
 
diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
new file mode 100644 (file)
index 0000000..eea56cd
--- /dev/null
@@ -0,0 +1,731 @@
+/* sm4-armv8-aarch64-ce.S  -  ARMv8/AArch64/CE accelerated SM4 cipher
+ *
+ * Copyright (C) 2022 Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(USE_SM4)
+
+.cpu generic+simd+crypto
+
+#define vecnum_v0 0
+#define vecnum_v1 1
+#define vecnum_v2 2
+#define vecnum_v3 3
+#define vecnum_v4 4
+#define vecnum_v5 5
+#define vecnum_v6 6
+#define vecnum_v7 7
+#define vecnum_v16 16
+#define vecnum_v24 24
+#define vecnum_v25 25
+#define vecnum_v26 26
+#define vecnum_v27 27
+#define vecnum_v28 28
+#define vecnum_v29 29
+#define vecnum_v30 30
+#define vecnum_v31 31
+
+#define sm4e(vd, vn) \
+   .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm4ekey(vd, vn, vm) \
+   .inst (0xce60c800 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
+
+.text
+
+/* Register macros */
+
+#define RTMP0   v16
+#define RTMP1   v17
+#define RTMP2   v18
+#define RTMP3   v19
+
+#define RIV     v20
+#define RMASK   v21
+
+/* Helper macros. */
+
+#define load_rkey(ptr)                     \
+        ld1 {v24.16b-v27.16b}, [ptr], #64; \
+        ld1 {v28.16b-v31.16b}, [ptr];
+
+#define SM4_CRYPT_BLK(b0)                       \
+        rev32       b0.16b, b0.16b;             \
+        sm4e(b0, v24);                          \
+        sm4e(b0, v25);                          \
+        sm4e(b0, v26);                          \
+        sm4e(b0, v27);                          \
+        sm4e(b0, v28);                          \
+        sm4e(b0, v29);                          \
+        sm4e(b0, v30);                          \
+        sm4e(b0, v31);                          \
+        rev64       b0.4s, b0.4s;               \
+        ext         b0.16b, b0.16b, b0.16b, #8; \
+        rev32       b0.16b, b0.16b;
+
+#define crypt_blk4(b0, b1, b2, b3)         \
+        rev32 b0.16b, b0.16b;              \
+        rev32 b1.16b, b1.16b;              \
+        rev32 b2.16b, b2.16b;              \
+        rev32 b3.16b, b3.16b;              \
+        sm4e(b0, v24);                     \
+        sm4e(b1, v24);                     \
+        sm4e(b2, v24);                     \
+        sm4e(b3, v24);                     \
+        sm4e(b0, v25);                     \
+        sm4e(b1, v25);                     \
+        sm4e(b2, v25);                     \
+        sm4e(b3, v25);                     \
+        sm4e(b0, v26);                     \
+        sm4e(b1, v26);                     \
+        sm4e(b2, v26);                     \
+        sm4e(b3, v26);                     \
+        sm4e(b0, v27);                     \
+        sm4e(b1, v27);                     \
+        sm4e(b2, v27);                     \
+        sm4e(b3, v27);                     \
+        sm4e(b0, v28);                     \
+        sm4e(b1, v28);                     \
+        sm4e(b2, v28);                     \
+        sm4e(b3, v28);                     \
+        sm4e(b0, v29);                     \
+        sm4e(b1, v29);                     \
+        sm4e(b2, v29);                     \
+        sm4e(b3, v29);                     \
+        sm4e(b0, v30);                     \
+        sm4e(b1, v30);                     \
+        sm4e(b2, v30);                     \
+        sm4e(b3, v30);                     \
+        sm4e(b0, v31);                     \
+        sm4e(b1, v31);                     \
+        sm4e(b2, v31);                     \
+        sm4e(b3, v31);                     \
+        rev64 b0.4s, b0.4s;                \
+        rev64 b1.4s, b1.4s;                \
+        rev64 b2.4s, b2.4s;                \
+        rev64 b3.4s, b3.4s;                \
+        ext b0.16b, b0.16b, b0.16b, #8;    \
+        ext b1.16b, b1.16b, b1.16b, #8;    \
+        ext b2.16b, b2.16b, b2.16b, #8;    \
+        ext b3.16b, b3.16b, b3.16b, #8;    \
+        rev32 b0.16b, b0.16b;              \
+        rev32 b1.16b, b1.16b;              \
+        rev32 b2.16b, b2.16b;              \
+        rev32 b3.16b, b3.16b;
+
+#define crypt_blk8(b0, b1, b2, b3, b4, b5, b6, b7) \
+        rev32 b0.16b, b0.16b;              \
+        rev32 b1.16b, b1.16b;              \
+        rev32 b2.16b, b2.16b;              \
+        rev32 b3.16b, b3.16b;              \
+        rev32 b4.16b, b4.16b;              \
+        rev32 b5.16b, b5.16b;              \
+        rev32 b6.16b, b6.16b;              \
+        rev32 b7.16b, b7.16b;              \
+        sm4e(b0, v24);                     \
+        sm4e(b1, v24);                     \
+        sm4e(b2, v24);                     \
+        sm4e(b3, v24);                     \
+        sm4e(b4, v24);                     \
+        sm4e(b5, v24);                     \
+        sm4e(b6, v24);                     \
+        sm4e(b7, v24);                     \
+        sm4e(b0, v25);                     \
+        sm4e(b1, v25);                     \
+        sm4e(b2, v25);                     \
+        sm4e(b3, v25);                     \
+        sm4e(b4, v25);                     \
+        sm4e(b5, v25);                     \
+        sm4e(b6, v25);                     \
+        sm4e(b7, v25);                     \
+        sm4e(b0, v26);                     \
+        sm4e(b1, v26);                     \
+        sm4e(b2, v26);                     \
+        sm4e(b3, v26);                     \
+        sm4e(b4, v26);                     \
+        sm4e(b5, v26);                     \
+        sm4e(b6, v26);                     \
+        sm4e(b7, v26);                     \
+        sm4e(b0, v27);                     \
+        sm4e(b1, v27);                     \
+        sm4e(b2, v27);                     \
+        sm4e(b3, v27);                     \
+        sm4e(b4, v27);                     \
+        sm4e(b5, v27);                     \
+        sm4e(b6, v27);                     \
+        sm4e(b7, v27);                     \
+        sm4e(b0, v28);                     \
+        sm4e(b1, v28);                     \
+        sm4e(b2, v28);                     \
+        sm4e(b3, v28);                     \
+        sm4e(b4, v28);                     \
+        sm4e(b5, v28);                     \
+        sm4e(b6, v28);                     \
+        sm4e(b7, v28);                     \
+        sm4e(b0, v29);                     \
+        sm4e(b1, v29);                     \
+        sm4e(b2, v29);                     \
+        sm4e(b3, v29);                     \
+        sm4e(b4, v29);                     \
+        sm4e(b5, v29);                     \
+        sm4e(b6, v29);                     \
+        sm4e(b7, v29);                     \
+        sm4e(b0, v30);                     \
+        sm4e(b1, v30);                     \
+        sm4e(b2, v30);                     \
+        sm4e(b3, v30);                     \
+        sm4e(b4, v30);                     \
+        sm4e(b5, v30);                     \
+        sm4e(b6, v30);                     \
+        sm4e(b7, v30);                     \
+        sm4e(b0, v31);                     \
+        sm4e(b1, v31);                     \
+        sm4e(b2, v31);                     \
+        sm4e(b3, v31);                     \
+        sm4e(b4, v31);                     \
+        sm4e(b5, v31);                     \
+        sm4e(b6, v31);                     \
+        sm4e(b7, v31);                     \
+        rev64 b0.4s, b0.4s;                \
+        rev64 b1.4s, b1.4s;                \
+        rev64 b2.4s, b2.4s;                \
+        rev64 b3.4s, b3.4s;                \
+        rev64 b4.4s, b4.4s;                \
+        rev64 b5.4s, b5.4s;                \
+        rev64 b6.4s, b6.4s;                \
+        rev64 b7.4s, b7.4s;                \
+        ext b0.16b, b0.16b, b0.16b, #8;    \
+        ext b1.16b, b1.16b, b1.16b, #8;    \
+        ext b2.16b, b2.16b, b2.16b, #8;    \
+        ext b3.16b, b3.16b, b3.16b, #8;    \
+        ext b4.16b, b4.16b, b4.16b, #8;    \
+        ext b5.16b, b5.16b, b5.16b, #8;    \
+        ext b6.16b, b6.16b, b6.16b, #8;    \
+        ext b7.16b, b7.16b, b7.16b, #8;    \
+        rev32 b0.16b, b0.16b;              \
+        rev32 b1.16b, b1.16b;              \
+        rev32 b2.16b, b2.16b;              \
+        rev32 b3.16b, b3.16b;              \
+        rev32 b4.16b, b4.16b;              \
+        rev32 b5.16b, b5.16b;              \
+        rev32 b6.16b, b6.16b;              \
+        rev32 b7.16b, b7.16b;
+
+
+.align 4
+.global _gcry_sm4_armv8_ce_expand_key
+ELF(.type _gcry_sm4_armv8_ce_expand_key,%function;)
+_gcry_sm4_armv8_ce_expand_key:
+    /* input:
+     *   x0: 128-bit key
+     *   x1: rkey_enc
+     *   x2: rkey_dec
+     *   x3: fk array
+     *   x4: ck array
+     */
+    CFI_STARTPROC();
+
+    ld1 {v0.16b}, [x0];
+    rev32 v0.16b, v0.16b;
+    ld1 {v1.16b}, [x3];
+    load_rkey(x4);
+
+    /* input ^ fk */
+    eor v0.16b, v0.16b, v1.16b;
+
+    sm4ekey(v0, v0, v24);
+    sm4ekey(v1, v0, v25);
+    sm4ekey(v2, v1, v26);
+    sm4ekey(v3, v2, v27);
+    sm4ekey(v4, v3, v28);
+    sm4ekey(v5, v4, v29);
+    sm4ekey(v6, v5, v30);
+    sm4ekey(v7, v6, v31);
+
+    st1 {v0.16b-v3.16b}, [x1], #64;
+    st1 {v4.16b-v7.16b}, [x1];
+    rev64 v7.4s, v7.4s;
+    rev64 v6.4s, v6.4s;
+    rev64 v5.4s, v5.4s;
+    rev64 v4.4s, v4.4s;
+    rev64 v3.4s, v3.4s;
+    rev64 v2.4s, v2.4s;
+    rev64 v1.4s, v1.4s;
+    rev64 v0.4s, v0.4s;
+    ext v7.16b, v7.16b, v7.16b, #8;
+    ext v6.16b, v6.16b, v6.16b, #8;
+    ext v5.16b, v5.16b, v5.16b, #8;
+    ext v4.16b, v4.16b, v4.16b, #8;
+    ext v3.16b, v3.16b, v3.16b, #8;
+    ext v2.16b, v2.16b, v2.16b, #8;
+    ext v1.16b, v1.16b, v1.16b, #8;
+    ext v0.16b, v0.16b, v0.16b, #8;
+    st1 {v7.16b}, [x2], #16;
+    st1 {v6.16b}, [x2], #16;
+    st1 {v5.16b}, [x2], #16;
+    st1 {v4.16b}, [x2], #16;
+    st1 {v3.16b}, [x2], #16;
+    st1 {v2.16b}, [x2], #16;
+    st1 {v1.16b}, [x2], #16;
+    st1 {v0.16b}, [x2];
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_expand_key,.-_gcry_sm4_armv8_ce_expand_key;)
+
+.align 4
+ELF(.type sm4_armv8_ce_crypt_blk1_4,%function;)
+sm4_armv8_ce_crypt_blk1_4:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: num blocks (1..4)
+     */
+    CFI_STARTPROC();
+
+    load_rkey(x0);
+
+    ld1 {v0.16b}, [x2], #16;
+    mov v1.16b, v0.16b;
+    mov v2.16b, v0.16b;
+    mov v3.16b, v0.16b;
+    cmp x3, #2;
+    blt .Lblk4_load_input_done;
+    ld1 {v1.16b}, [x2], #16;
+    beq .Lblk4_load_input_done;
+    ld1 {v2.16b}, [x2], #16;
+    cmp x3, #3;
+    beq .Lblk4_load_input_done;
+    ld1 {v3.16b}, [x2];
+
+.Lblk4_load_input_done:
+    crypt_blk4(v0, v1, v2, v3);
+
+    st1 {v0.16b}, [x1], #16;
+    cmp x3, #2;
+    blt .Lblk4_store_output_done;
+    st1 {v1.16b}, [x1], #16;
+    beq .Lblk4_store_output_done;
+    st1 {v2.16b}, [x1], #16;
+    cmp x3, #3;
+    beq .Lblk4_store_output_done;
+    st1 {v3.16b}, [x1];
+
+.Lblk4_store_output_done:
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size sm4_armv8_ce_crypt_blk1_4,.-sm4_armv8_ce_crypt_blk1_4;)
+
+.align 4
+.global _gcry_sm4_armv8_ce_crypt_blk1_8
+ELF(.type _gcry_sm4_armv8_ce_crypt_blk1_8,%function;)
+_gcry_sm4_armv8_ce_crypt_blk1_8:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: num blocks (1..8)
+     */
+    CFI_STARTPROC();
+
+    cmp x3, #5;
+    blt sm4_armv8_ce_crypt_blk1_4;
+
+    load_rkey(x0);
+
+    ld1 {v0.16b-v3.16b}, [x2], #64;
+    ld1 {v4.16b}, [x2], #16;
+    mov v5.16b, v4.16b;
+    mov v6.16b, v4.16b;
+    mov v7.16b, v4.16b;
+    beq .Lblk8_load_input_done;
+    ld1 {v5.16b}, [x2], #16;
+    cmp x3, #7;
+    blt .Lblk8_load_input_done;
+    ld1 {v6.16b}, [x2], #16;
+    beq .Lblk8_load_input_done;
+    ld1 {v7.16b}, [x2];
+
+.Lblk8_load_input_done:
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    cmp x3, #6;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+    st1 {v4.16b}, [x1], #16;
+    blt .Lblk8_store_output_done;
+    st1 {v5.16b}, [x1], #16;
+    beq .Lblk8_store_output_done;
+    st1 {v6.16b}, [x1], #16;
+    cmp x3, #7;
+    beq .Lblk8_store_output_done;
+    st1 {v7.16b}, [x1];
+
+.Lblk8_store_output_done:
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_crypt_blk1_8,.-_gcry_sm4_armv8_ce_crypt_blk1_8;)
+
+.align 4
+.global _gcry_sm4_armv8_ce_crypt
+ELF(.type _gcry_sm4_armv8_ce_crypt,%function;)
+_gcry_sm4_armv8_ce_crypt:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    load_rkey(x0);
+
+.Lcrypt_loop_blk:
+    subs x3, x3, #8;
+    bmi .Lcrypt_end;
+
+    ld1 {v0.16b-v3.16b}, [x2], #64;
+    ld1 {v4.16b-v7.16b}, [x2], #64;
+
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    st1 {v0.16b-v3.16b}, [x1], #64;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    b .Lcrypt_loop_blk;
+
+.Lcrypt_end:
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_crypt,.-_gcry_sm4_armv8_ce_crypt;)
+
+.align 4
+.global _gcry_sm4_armv8_ce_cbc_dec
+ELF(.type _gcry_sm4_armv8_ce_cbc_dec,%function;)
+_gcry_sm4_armv8_ce_cbc_dec:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: iv (big endian, 128 bit)
+     *   x4: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    load_rkey(x0);
+    ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+    subs x4, x4, #8;
+    bmi .Lcbc_end;
+
+    ld1 {v0.16b-v3.16b}, [x2], #64;
+    ld1 {v4.16b-v7.16b}, [x2];
+
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    sub x2, x2, #64;
+    eor v0.16b, v0.16b, RIV.16b;
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v1.16b, v1.16b, RTMP0.16b;
+    eor v2.16b, v2.16b, RTMP1.16b;
+    eor v3.16b, v3.16b, RTMP2.16b;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+
+    eor v4.16b, v4.16b, RTMP3.16b;
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v5.16b, v5.16b, RTMP0.16b;
+    eor v6.16b, v6.16b, RTMP1.16b;
+    eor v7.16b, v7.16b, RTMP2.16b;
+
+    mov RIV.16b, RTMP3.16b;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    b .Lcbc_loop_blk;
+
+.Lcbc_end:
+    /* store new IV */
+    st1 {RIV.16b}, [x3];
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_cbc_dec,.-_gcry_sm4_armv8_ce_cbc_dec;)
+
+.align 4
+.global _gcry_sm4_armv8_ce_cfb_dec
+ELF(.type _gcry_sm4_armv8_ce_cfb_dec,%function;)
+_gcry_sm4_armv8_ce_cfb_dec:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: iv (big endian, 128 bit)
+     *   x4: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    load_rkey(x0);
+    ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+    subs x4, x4, #8;
+    bmi .Lcfb_end;
+
+    ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+    ld1 {v4.16b-v7.16b}, [x2];
+
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    sub x2, x2, #48;
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v0.16b, v0.16b, RTMP0.16b;
+    eor v1.16b, v1.16b, RTMP1.16b;
+    eor v2.16b, v2.16b, RTMP2.16b;
+    eor v3.16b, v3.16b, RTMP3.16b;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v4.16b, v4.16b, RTMP0.16b;
+    eor v5.16b, v5.16b, RTMP1.16b;
+    eor v6.16b, v6.16b, RTMP2.16b;
+    eor v7.16b, v7.16b, RTMP3.16b;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    mov v0.16b, RTMP3.16b;
+
+    b .Lcfb_loop_blk;
+
+.Lcfb_end:
+    /* store new IV */
+    st1 {v0.16b}, [x3];
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_cfb_dec,.-_gcry_sm4_armv8_ce_cfb_dec;)
+
+.align 4
+.global _gcry_sm4_armv8_ce_ctr_enc
+ELF(.type _gcry_sm4_armv8_ce_ctr_enc,%function;)
+_gcry_sm4_armv8_ce_ctr_enc:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: ctr (big endian, 128 bit)
+     *   x4: nblocks (multiples of 8)
+     */
+    CFI_STARTPROC();
+
+    load_rkey(x0);
+
+    ldp x7, x8, [x3];
+    rev x7, x7;
+    rev x8, x8;
+
+.Lctr_loop_blk:
+    subs x4, x4, #8;
+    bmi .Lctr_end;
+
+#define inc_le128(vctr)       \
+    mov vctr.d[1], x8;        \
+    mov vctr.d[0], x7;        \
+    adds x8, x8, #1;          \
+    adc x7, x7, xzr;          \
+    rev64 vctr.16b, vctr.16b;
+
+    /* construct CTRs */
+    inc_le128(v0);      /* +0 */
+    inc_le128(v1);      /* +1 */
+    inc_le128(v2);      /* +2 */
+    inc_le128(v3);      /* +3 */
+    inc_le128(v4);      /* +4 */
+    inc_le128(v5);      /* +5 */
+    inc_le128(v6);      /* +6 */
+    inc_le128(v7);      /* +7 */
+
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v0.16b, v0.16b, RTMP0.16b;
+    eor v1.16b, v1.16b, RTMP1.16b;
+    eor v2.16b, v2.16b, RTMP2.16b;
+    eor v3.16b, v3.16b, RTMP3.16b;
+    st1 {v0.16b-v3.16b}, [x1], #64;
+
+    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+    eor v4.16b, v4.16b, RTMP0.16b;
+    eor v5.16b, v5.16b, RTMP1.16b;
+    eor v6.16b, v6.16b, RTMP2.16b;
+    eor v7.16b, v7.16b, RTMP3.16b;
+    st1 {v4.16b-v7.16b}, [x1], #64;
+
+    b .Lctr_loop_blk;
+
+.Lctr_end:
+    /* store new CTR */
+    rev x7, x7;
+    rev x8, x8;
+    stp x7, x8, [x3];
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)
+
+.align 4
+.global _gcry_sm4_armv8_ce_xts_crypt
+ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
+_gcry_sm4_armv8_ce_xts_crypt:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: tweak (big endian, 128 bit)
+     *   x4: nblocks
+     */
+    CFI_STARTPROC()
+    VPUSH_ABI
+
+    load_rkey(x0)
+
+    mov         x7, #0x87
+    mov         x8, #0x1
+    mov         RMASK.d[0], x7
+    mov         RMASK.d[1], x8
+
+    ld1         {RIV.16b}, [x3]
+    mov         v8.16b, RIV.16b
+    ext         RIV.16b, RIV.16b, RIV.16b, #8
+
+.Lxts_loop_blk:
+    sub         x4, x4, #8
+    tbnz        x4, #63, .Lxts_tail8
+
+#define tweak_next(vt, vin, RTMP)                       \
+        sshr        RTMP.2d, RIV.2d, #63;               \
+        add         vt.2d, vin.2d, vin.2d;              \
+        and         RTMP.16b, RTMP.16b, RMASK.16b;      \
+        add         RIV.2d, RIV.2d, RIV.2d;             \
+        eor         vt.16b, vt.16b, RTMP.16b;
+
+    tweak_next( v9,  v8, RTMP0)
+    tweak_next(v10,  v9, RTMP1)
+    tweak_next(v11, v10, RTMP2)
+    tweak_next(v12, v11, RTMP3)
+    tweak_next(v13, v12, RTMP0)
+    tweak_next(v14, v13, RTMP1)
+    tweak_next(v15, v14, RTMP2)
+
+    ld1         {v0.16b-v3.16b}, [x2], #64
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+    ld1         {v4.16b-v7.16b}, [x2], #64
+    eor         v4.16b, v4.16b, v12.16b
+    eor         v5.16b, v5.16b, v13.16b
+    eor         v6.16b, v6.16b, v14.16b
+    eor         v7.16b, v7.16b, v15.16b
+
+    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+    st1         {v0.16b-v3.16b}, [x1], #64
+    eor         v4.16b, v4.16b, v12.16b
+    eor         v5.16b, v5.16b, v13.16b
+    eor         v6.16b, v6.16b, v14.16b
+    eor         v7.16b, v7.16b, v15.16b
+    st1         {v4.16b-v7.16b}, [x1], #64
+
+    tweak_next(v8, v15, RTMP3)
+
+    cbz         x4, .Lxts_end
+    b           .Lxts_loop_blk
+
+.Lxts_tail8:
+    add         x4, x4, #8
+    cmp         x4, #4
+    blt         .Lxts_tail4
+
+    sub         x4, x4, #4
+
+    tweak_next( v9,  v8, RTMP0)
+    tweak_next(v10,  v9, RTMP1)
+    tweak_next(v11, v10, RTMP2)
+
+    ld1         {v0.16b-v3.16b}, [x2], #64
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+
+    crypt_blk4(v0, v1, v2, v3);
+
+    eor         v0.16b, v0.16b,  v8.16b
+    eor         v1.16b, v1.16b,  v9.16b
+    eor         v2.16b, v2.16b, v10.16b
+    eor         v3.16b, v3.16b, v11.16b
+    st1         {v0.16b-v3.16b}, [x1], #64
+
+    tweak_next(v8, v11, RTMP3)
+
+    cbz         x4, .Lxts_end
+
+.Lxts_tail4:
+    sub         x4, x4, #1
+
+    ld1         {v0.16b}, [x2], #16
+    eor         v0.16b, v0.16b, v8.16b
+
+    SM4_CRYPT_BLK(v0)
+
+    eor         v0.16b, v0.16b, v8.16b
+    st1         {v0.16b}, [x1], #16
+
+    tweak_next(v8, v8, RTMP0)
+
+    cbnz        x4, .Lxts_tail4
+
+.Lxts_end:
+    /* store new tweak */
+    st1         {v8.16b}, [x3]
+
+    CLEAR_REG(v8)
+    CLEAR_REG(v9)
+    CLEAR_REG(v10)
+    CLEAR_REG(v11)
+    CLEAR_REG(v12)
+    CLEAR_REG(v13)
+    CLEAR_REG(v14)
+    CLEAR_REG(v15)
+    CLEAR_REG(RIV)
+
+    VPOP_ABI
+    ret_spec_stop
+    CFI_ENDPROC()
+ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)
+
+#endif
diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S
new file mode 100644 (file)
index 0000000..f01a41b
--- /dev/null
@@ -0,0 +1,967 @@
+/* sm4-armv9-aarch64-sve-ce.S - ARMv9/AArch64 SVE Cryptography accelerated SM4
+ *
+ * Copyright (C) 2022 Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE2) && \
+    defined(USE_SM4)
+
+.cpu generic+simd+crypto+sve+sve2
+
+/* Constants */
+
+SECTION_RODATA
+.align 4
+ELF(.type _gcry_sm4_armv9_svesm4_consts,@object)
+_gcry_sm4_armv9_svesm4_consts:
+.Lbswap128_mask:
+    .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
+    .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
+    .byte 0x1c, 0x1d, 0x1e, 0x1f, 0x18, 0x19, 0x1a, 0x1b
+    .byte 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13
+    .byte 0x2c, 0x2d, 0x2e, 0x2f, 0x28, 0x29, 0x2a, 0x2b
+    .byte 0x24, 0x25, 0x26, 0x27, 0x20, 0x21, 0x22, 0x23
+    .byte 0x3c, 0x3d, 0x3e, 0x3f, 0x38, 0x39, 0x3a, 0x3b
+    .byte 0x34, 0x35, 0x36, 0x37, 0x30, 0x31, 0x32, 0x33
+    .byte 0x4c, 0x4d, 0x4e, 0x4f, 0x48, 0x49, 0x4a, 0x4b
+    .byte 0x44, 0x45, 0x46, 0x47, 0x40, 0x41, 0x42, 0x43
+    .byte 0x5c, 0x5d, 0x5e, 0x5f, 0x58, 0x59, 0x5a, 0x5b
+    .byte 0x54, 0x55, 0x56, 0x57, 0x50, 0x51, 0x52, 0x53
+    .byte 0x6c, 0x6d, 0x6e, 0x6f, 0x68, 0x69, 0x6a, 0x6b
+    .byte 0x64, 0x65, 0x66, 0x67, 0x60, 0x61, 0x62, 0x63
+    .byte 0x7c, 0x7d, 0x7e, 0x7f, 0x78, 0x79, 0x7a, 0x7b
+    .byte 0x74, 0x75, 0x76, 0x77, 0x70, 0x71, 0x72, 0x73
+    .byte 0x8c, 0x8d, 0x8e, 0x8f, 0x88, 0x89, 0x8a, 0x8b
+    .byte 0x84, 0x85, 0x86, 0x87, 0x80, 0x81, 0x82, 0x83
+    .byte 0x9c, 0x9d, 0x9e, 0x9f, 0x98, 0x99, 0x9a, 0x9b
+    .byte 0x94, 0x95, 0x96, 0x97, 0x90, 0x91, 0x92, 0x93
+    .byte 0xac, 0xad, 0xae, 0xaf, 0xa8, 0xa9, 0xaa, 0xab
+    .byte 0xa4, 0xa5, 0xa6, 0xa7, 0xa0, 0xa1, 0xa2, 0xa3
+    .byte 0xbc, 0xbd, 0xbe, 0xbf, 0xb8, 0xb9, 0xba, 0xbb
+    .byte 0xb4, 0xb5, 0xb6, 0xb7, 0xb0, 0xb1, 0xb2, 0xb3
+    .byte 0xcc, 0xcd, 0xce, 0xcf, 0xc8, 0xc9, 0xca, 0xcb
+    .byte 0xc4, 0xc5, 0xc6, 0xc7, 0xc0, 0xc1, 0xc2, 0xc3
+    .byte 0xdc, 0xdd, 0xde, 0xdf, 0xd8, 0xd9, 0xda, 0xdb
+    .byte 0xd4, 0xd5, 0xd6, 0xd7, 0xd0, 0xd1, 0xd2, 0xd3
+    .byte 0xec, 0xed, 0xee, 0xef, 0xe8, 0xe9, 0xea, 0xeb
+    .byte 0xe4, 0xe5, 0xe6, 0xe7, 0xe0, 0xe1, 0xe2, 0xe3
+    .byte 0xfc, 0xfd, 0xfe, 0xff, 0xf8, 0xf9, 0xfa, 0xfb
+    .byte 0xf4, 0xf5, 0xf6, 0xf7, 0xf0, 0xf1, 0xf2, 0xf3
+
+.Lle128_inc:
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ELF(.size _gcry_sm4_armv9_svesm4_consts,.-_gcry_sm4_armv9_svesm4_consts)
+
+/* Register macros */
+
+#define RCTR        z16
+#define RCTRv       v16
+#define RIV         z16
+#define RIVv        v16
+#define RSWAP128    z17
+#define RZERO       z18
+#define RLE128_INC  z19
+
+#define RTMP0       z20
+#define RTMP1       z21
+#define RTMP2       z22
+#define RTMP3       z23
+#define RTMP0v      v20
+
+#define vecnum_z0   0
+#define vecnum_z1   1
+#define vecnum_z2   2
+#define vecnum_z3   3
+#define vecnum_z4   4
+#define vecnum_z5   5
+#define vecnum_z6   6
+#define vecnum_z7   7
+#define vecnum_z8   8
+#define vecnum_z9   9
+#define vecnum_z10  10
+#define vecnum_z11  11
+#define vecnum_z12  12
+#define vecnum_z13  13
+#define vecnum_z14  14
+#define vecnum_z15  15
+#define vecnum_z16  16
+#define vecnum_z24  24
+#define vecnum_z25  25
+#define vecnum_z26  26
+#define vecnum_z27  27
+#define vecnum_z28  28
+#define vecnum_z29  29
+#define vecnum_z30  30
+#define vecnum_z31  31
+
+#define vecnum_v0 0
+#define vecnum_v15 15
+#define vecnum_v24 24
+#define vecnum_v25 25
+#define vecnum_v26 26
+#define vecnum_v27 27
+#define vecnum_v28 28
+#define vecnum_v29 29
+#define vecnum_v30 30
+#define vecnum_v31 31
+
+#define sm4e_ce(vd, vn) \
+    .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd)
+
+#define sm4e_sve(zd, zm) \
+    .inst (0x4523e000 | (vecnum_##zm << 5) | vecnum_##zd)
+
+/* Helper macros. */
+
+#define PREPARE()                                   \
+        GET_DATA_POINTER(x7, .Lbswap128_mask);     \
+        ptrue       p0.b, ALL;                      \
+        rdvl        x5, #1;                         \
+        ld1b        {RSWAP128.b}, p0/z, [x7];       \
+                                                    \
+        ld1         {v24.16b-v27.16b}, [x0], #64;   \
+        ld1         {v28.16b-v31.16b}, [x0];        \
+        dup         z24.q, z24.q[0];                \
+        dup         z25.q, z25.q[0];                \
+        dup         z26.q, z26.q[0];                \
+        dup         z27.q, z27.q[0];                \
+        dup         z28.q, z28.q[0];                \
+        dup         z29.q, z29.q[0];                \
+        dup         z30.q, z30.q[0];                \
+        dup         z31.q, z31.q[0];
+
+
+#define SM4_SVE_CE_CRYPT_BLK(b0)                    \
+        revb        b0.s, p0/m, b0.s;               \
+        sm4e_sve(b0, z24);                          \
+        sm4e_sve(b0, z25);                          \
+        sm4e_sve(b0, z26);                          \
+        sm4e_sve(b0, z27);                          \
+        sm4e_sve(b0, z28);                          \
+        sm4e_sve(b0, z29);                          \
+        sm4e_sve(b0, z30);                          \
+        sm4e_sve(b0, z31);                          \
+        tbl         b0.b, {b0.b}, RSWAP128.b;       \
+        revb        b0.s, p0/m, b0.s;
+
+
+#define SM4_SVE_CE_CRYPT_BLK4(b0, b1, b2, b3)       \
+        revb        b0.s, p0/m, b0.s;               \
+        revb        b1.s, p0/m, b1.s;               \
+        revb        b2.s, p0/m, b2.s;               \
+        revb        b3.s, p0/m, b3.s;               \
+        sm4e_sve(b0, z24);                          \
+        sm4e_sve(b1, z24);                          \
+        sm4e_sve(b2, z24);                          \
+        sm4e_sve(b3, z24);                          \
+        sm4e_sve(b0, z25);                          \
+        sm4e_sve(b1, z25);                          \
+        sm4e_sve(b2, z25);                          \
+        sm4e_sve(b3, z25);                          \
+        sm4e_sve(b0, z26);                          \
+        sm4e_sve(b1, z26);                          \
+        sm4e_sve(b2, z26);                          \
+        sm4e_sve(b3, z26);                          \
+        sm4e_sve(b0, z27);                          \
+        sm4e_sve(b1, z27);                          \
+        sm4e_sve(b2, z27);                          \
+        sm4e_sve(b3, z27);                          \
+        sm4e_sve(b0, z28);                          \
+        sm4e_sve(b1, z28);                          \
+        sm4e_sve(b2, z28);                          \
+        sm4e_sve(b3, z28);                          \
+        sm4e_sve(b0, z29);                          \
+        sm4e_sve(b1, z29);                          \
+        sm4e_sve(b2, z29);                          \
+        sm4e_sve(b3, z29);                          \
+        sm4e_sve(b0, z30);                          \
+        sm4e_sve(b1, z30);                          \
+        sm4e_sve(b2, z30);                          \
+        sm4e_sve(b3, z30);                          \
+        sm4e_sve(b0, z31);                          \
+        sm4e_sve(b1, z31);                          \
+        sm4e_sve(b2, z31);                          \
+        sm4e_sve(b3, z31);                          \
+        tbl         b0.b, {b0.b}, RSWAP128.b;       \
+        tbl         b1.b, {b1.b}, RSWAP128.b;       \
+        tbl         b2.b, {b2.b}, RSWAP128.b;       \
+        tbl         b3.b, {b3.b}, RSWAP128.b;       \
+        revb        b0.s, p0/m, b0.s;               \
+        revb        b1.s, p0/m, b1.s;               \
+        revb        b2.s, p0/m, b2.s;               \
+        revb        b3.s, p0/m, b3.s;
+
+
+#define SM4_SVE_CE_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+        revb        b0.s, p0/m, b0.s;               \
+        revb        b1.s, p0/m, b1.s;               \
+        revb        b2.s, p0/m, b2.s;               \
+        revb        b3.s, p0/m, b3.s;               \
+        revb        b4.s, p0/m, b4.s;               \
+        revb        b5.s, p0/m, b5.s;               \
+        revb        b6.s, p0/m, b6.s;               \
+        revb        b7.s, p0/m, b7.s;               \
+        sm4e_sve(b0, z24);                          \
+        sm4e_sve(b1, z24);                          \
+        sm4e_sve(b2, z24);                          \
+        sm4e_sve(b3, z24);                          \
+        sm4e_sve(b4, z24);                          \
+        sm4e_sve(b5, z24);                          \
+        sm4e_sve(b6, z24);                          \
+        sm4e_sve(b7, z24);                          \
+        sm4e_sve(b0, z25);                          \
+        sm4e_sve(b1, z25);                          \
+        sm4e_sve(b2, z25);                          \
+        sm4e_sve(b3, z25);                          \
+        sm4e_sve(b4, z25);                          \
+        sm4e_sve(b5, z25);                          \
+        sm4e_sve(b6, z25);                          \
+        sm4e_sve(b7, z25);                          \
+        sm4e_sve(b0, z26);                          \
+        sm4e_sve(b1, z26);                          \
+        sm4e_sve(b2, z26);                          \
+        sm4e_sve(b3, z26);                          \
+        sm4e_sve(b4, z26);                          \
+        sm4e_sve(b5, z26);                          \
+        sm4e_sve(b6, z26);                          \
+        sm4e_sve(b7, z26);                          \
+        sm4e_sve(b0, z27);                          \
+        sm4e_sve(b1, z27);                          \
+        sm4e_sve(b2, z27);                          \
+        sm4e_sve(b3, z27);                          \
+        sm4e_sve(b4, z27);                          \
+        sm4e_sve(b5, z27);                          \
+        sm4e_sve(b6, z27);                          \
+        sm4e_sve(b7, z27);                          \
+        sm4e_sve(b0, z28);                          \
+        sm4e_sve(b1, z28);                          \
+        sm4e_sve(b2, z28);                          \
+        sm4e_sve(b3, z28);                          \
+        sm4e_sve(b4, z28);                          \
+        sm4e_sve(b5, z28);                          \
+        sm4e_sve(b6, z28);                          \
+        sm4e_sve(b7, z28);                          \
+        sm4e_sve(b0, z29);                          \
+        sm4e_sve(b1, z29);                          \
+        sm4e_sve(b2, z29);                          \
+        sm4e_sve(b3, z29);                          \
+        sm4e_sve(b4, z29);                          \
+        sm4e_sve(b5, z29);                          \
+        sm4e_sve(b6, z29);                          \
+        sm4e_sve(b7, z29);                          \
+        sm4e_sve(b0, z30);                          \
+        sm4e_sve(b1, z30);                          \
+        sm4e_sve(b2, z30);                          \
+        sm4e_sve(b3, z30);                          \
+        sm4e_sve(b4, z30);                          \
+        sm4e_sve(b5, z30);                          \
+        sm4e_sve(b6, z30);                          \
+        sm4e_sve(b7, z30);                          \
+        sm4e_sve(b0, z31);                          \
+        sm4e_sve(b1, z31);                          \
+        sm4e_sve(b2, z31);                          \
+        sm4e_sve(b3, z31);                          \
+        sm4e_sve(b4, z31);                          \
+        sm4e_sve(b5, z31);                          \
+        sm4e_sve(b6, z31);                          \
+        sm4e_sve(b7, z31);                          \
+        tbl         b0.b, {b0.b}, RSWAP128.b;       \
+        tbl         b1.b, {b1.b}, RSWAP128.b;       \
+        tbl         b2.b, {b2.b}, RSWAP128.b;       \
+        tbl         b3.b, {b3.b}, RSWAP128.b;       \
+        tbl         b4.b, {b4.b}, RSWAP128.b;       \
+        tbl         b5.b, {b5.b}, RSWAP128.b;       \
+        tbl         b6.b, {b6.b}, RSWAP128.b;       \
+        tbl         b7.b, {b7.b}, RSWAP128.b;       \
+        revb        b0.s, p0/m, b0.s;               \
+        revb        b1.s, p0/m, b1.s;               \
+        revb        b2.s, p0/m, b2.s;               \
+        revb        b3.s, p0/m, b3.s;               \
+        revb        b4.s, p0/m, b4.s;               \
+        revb        b5.s, p0/m, b5.s;               \
+        revb        b6.s, p0/m, b6.s;               \
+        revb        b7.s, p0/m, b7.s;
+
+
+#define SM4_CE_CRYPT_BLK(b0)                        \
+        rev32       b0.16b, b0.16b;                 \
+        sm4e_ce(b0, v24);                           \
+        sm4e_ce(b0, v25);                           \
+        sm4e_ce(b0, v26);                           \
+        sm4e_ce(b0, v27);                           \
+        sm4e_ce(b0, v28);                           \
+        sm4e_ce(b0, v29);                           \
+        sm4e_ce(b0, v30);                           \
+        sm4e_ce(b0, v31);                           \
+        rev64       b0.4s, b0.4s;                   \
+        ext         b0.16b, b0.16b, b0.16b, #8;     \
+        rev32       b0.16b, b0.16b;
+
+
+.align 4
+.global _gcry_sm4_armv9_sve_ce_crypt
+ELF(.type _gcry_sm4_armv9_sve_ce_crypt,%function;)
+_gcry_sm4_armv9_sve_ce_crypt:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: nblocks
+     */
+    CFI_STARTPROC();
+
+    PREPARE();
+
+.Lcrypt_loop_blks:
+    sub         x3, x3, x5, LSR #1;         /* x3 - (8 * VL) */
+    tbnz        x3, #63, .Lcrypt_tail8;
+
+    ld1b        {z0.b}, p0/z, [x2];
+    ld1b        {z1.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {z2.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {z3.b}, p0/z, [x2, #3, MUL VL];
+    ld1b        {z4.b}, p0/z, [x2, #4, MUL VL];
+    ld1b        {z5.b}, p0/z, [x2, #5, MUL VL];
+    ld1b        {z6.b}, p0/z, [x2, #6, MUL VL];
+    ld1b        {z7.b}, p0/z, [x2, #7, MUL VL];
+    addvl       x2, x2, #8;
+
+    SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7);
+
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    st1b        {z4.b}, p0, [x1, #4, MUL VL];
+    st1b        {z5.b}, p0, [x1, #5, MUL VL];
+    st1b        {z6.b}, p0, [x1, #6, MUL VL];
+    st1b        {z7.b}, p0, [x1, #7, MUL VL];
+    addvl       x1, x1, #8;
+
+    cbz         x3, .Lcrypt_end;
+    b           .Lcrypt_loop_blks;
+
+.Lcrypt_tail8:
+    add         x3, x3, x5, LSR #1;
+    cmp         x3, x5, LSR #2;
+    blt         .Lcrypt_tail4;
+
+    sub         x3, x3, x5, LSR #2;     /* x3 - (4 * VL) */
+
+    ld1b        {z0.b}, p0/z, [x2];
+    ld1b        {z1.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {z2.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {z3.b}, p0/z, [x2, #3, MUL VL];
+    addvl       x2, x2, #4;
+
+    SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3);
+
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    addvl       x1, x1, #4;
+
+    cbz         x3, .Lcrypt_end;
+
+.Lcrypt_tail4:
+    cmp         x3, x5, LSR #4;
+    blt         .Lcrypt_tail;
+
+    sub         x3, x3, x5, LSR #4;     /* x3 - VL */
+
+    ld1b        {z0.b}, p0/z, [x2];
+    addvl       x2, x2, #1;
+
+    SM4_SVE_CE_CRYPT_BLK(z0);
+
+    st1b        {z0.b}, p0, [x1];
+    addvl       x1, x1, #1;
+
+    cbz         x3, .Lcrypt_end;
+
+.Lcrypt_tail:
+    sub         x3, x3, #1;
+
+    ld1         {v0.16b}, [x2], #16;
+    SM4_CE_CRYPT_BLK(v0);
+    st1         {v0.16b}, [x1], #16;
+
+    cbnz        x3, .Lcrypt_tail;
+
+.Lcrypt_end:
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv9_sve_ce_crypt,.-_gcry_sm4_armv9_sve_ce_crypt;)
+
+.align 4
+.global _gcry_sm4_armv9_sve_ce_cbc_dec
+ELF(.type _gcry_sm4_armv9_sve_ce_cbc_dec,%function;)
+_gcry_sm4_armv9_sve_ce_cbc_dec:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: iv (big endian, 128 bit)
+     *   x4: nblocks
+     */
+    CFI_STARTPROC();
+    VPUSH_ABI;
+
+    PREPARE();
+    ld1         {RIVv.16b}, [x3];
+    ext         RIV.b, RIV.b, RIV.b, #16;
+
+.Lcbc_loop_blks:
+    sub         x4, x4, x5, LSR #1;         /* x4 - (8 * VL) */
+    tbnz        x4, #63, .Lcbc_tail8;
+
+    ld1b        {z15.b}, p0/z, [x2];
+    ld1b        {z14.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {z13.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {z12.b}, p0/z, [x2, #3, MUL VL];
+    ld1b        {z11.b}, p0/z, [x2, #4, MUL VL];
+    ld1b        {z10.b}, p0/z, [x2, #5, MUL VL];
+    ld1b        {z9.b}, p0/z, [x2, #6, MUL VL];
+    ld1b        {z8.b}, p0/z, [x2, #7, MUL VL];
+    rev         z0.b, z15.b;
+    rev         z1.b, z14.b;
+    rev         z2.b, z13.b;
+    rev         z3.b, z12.b;
+    rev         z4.b, z11.b;
+    rev         z5.b, z10.b;
+    rev         z6.b, z9.b;
+    rev         z7.b, z8.b;
+    rev         RTMP0.b, RIV.b;
+    ext         z7.b, z7.b, z6.b, #16;
+    ext         z6.b, z6.b, z5.b, #16;
+    ext         z5.b, z5.b, z4.b, #16;
+    ext         z4.b, z4.b, z3.b, #16;
+    ext         z3.b, z3.b, z2.b, #16;
+    ext         z2.b, z2.b, z1.b, #16;
+    ext         z1.b, z1.b, z0.b, #16;
+    ext         z0.b, z0.b, RTMP0.b, #16;
+    rev         z7.b, z7.b;
+    rev         z6.b, z6.b;
+    rev         z5.b, z5.b;
+    rev         z4.b, z4.b;
+    rev         z3.b, z3.b;
+    rev         z2.b, z2.b;
+    rev         z1.b, z1.b;
+    rev         z0.b, z0.b;
+    mov         RIV.d, z8.d;
+
+    SM4_SVE_CE_CRYPT_BLK8(z15, z14, z13, z12, z11, z10, z9, z8);
+
+    eor         z0.d, z0.d, z15.d;
+    eor         z1.d, z1.d, z14.d;
+    eor         z2.d, z2.d, z13.d;
+    eor         z3.d, z3.d, z12.d;
+    eor         z4.d, z4.d, z11.d;
+    eor         z5.d, z5.d, z10.d;
+    eor         z6.d, z6.d, z9.d;
+    eor         z7.d, z7.d, z8.d;
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    st1b        {z4.b}, p0, [x1, #4, MUL VL];
+    st1b        {z5.b}, p0, [x1, #5, MUL VL];
+    st1b        {z6.b}, p0, [x1, #6, MUL VL];
+    st1b        {z7.b}, p0, [x1, #7, MUL VL];
+    addvl       x2, x2, #8;
+    addvl       x1, x1, #8;
+
+    cbz         x4, .Lcbc_end;
+    b           .Lcbc_loop_blks;
+
+.Lcbc_tail8:
+    add         x4, x4, x5, LSR #1;
+    cmp         x4, x5, LSR #2;
+    blt         .Lcbc_tail4;
+
+    sub         x4, x4, x5, LSR #2;         /* x4 - (4 * VL) */
+
+    ld1b        {z15.b}, p0/z, [x2];
+    ld1b        {z14.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {z13.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {z12.b}, p0/z, [x2, #3, MUL VL];
+    rev         z0.b, z15.b;
+    rev         z1.b, z14.b;
+    rev         z2.b, z13.b;
+    rev         z3.b, z12.b;
+    rev         RTMP0.b, RIV.b;
+    ext         z3.b, z3.b, z2.b, #16;
+    ext         z2.b, z2.b, z1.b, #16;
+    ext         z1.b, z1.b, z0.b, #16;
+    ext         z0.b, z0.b, RTMP0.b, #16;
+    rev         z3.b, z3.b;
+    rev         z2.b, z2.b;
+    rev         z1.b, z1.b;
+    rev         z0.b, z0.b;
+    mov         RIV.d, z12.d;
+
+    SM4_SVE_CE_CRYPT_BLK4(z15, z14, z13, z12);
+
+    eor         z0.d, z0.d, z15.d;
+    eor         z1.d, z1.d, z14.d;
+    eor         z2.d, z2.d, z13.d;
+    eor         z3.d, z3.d, z12.d;
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    addvl       x2, x2, #4;
+    addvl       x1, x1, #4;
+
+    cbz         x4, .Lcbc_end;
+
+.Lcbc_tail4:
+    cmp         x4, x5, LSR #4;
+    blt         .Lcbc_tail_ce;
+
+    sub         x4, x4, x5, LSR #4;         /* x4 - VL */
+
+    ld1b        {z15.b}, p0/z, [x2];
+    rev         RTMP0.b, RIV.b;
+    rev         z0.b, z15.b;
+    ext         z0.b, z0.b, RTMP0.b, #16;
+    rev         z0.b, z0.b;
+    mov         RIV.d, z15.d;
+
+    SM4_SVE_CE_CRYPT_BLK(z15);
+
+    eor         z0.d, z0.d, z15.d;
+    st1b        {z0.b}, p0, [x1];
+    addvl       x2, x2, #1;
+    addvl       x1, x1, #1;
+
+    cbz         x4, .Lcbc_end;
+    b           .Lcbc_tail4;
+
+.Lcbc_tail_ce:
+    rev         RIV.s, RIV.s;
+    tbl         RIV.b, {RIV.b}, RSWAP128.b;
+
+.Lcbc_tail:
+    sub         x4, x4, #1;
+
+    ld1         {v15.16b}, [x2], #16;
+    mov         v0.16b, RIVv.16b;
+    mov         RIVv.16b, v15.16b;
+    SM4_CE_CRYPT_BLK(v15);
+    eor         v0.16b, v0.16b, v15.16b;
+    st1         {v0.16b}, [x1], #16;
+
+    cbnz        x4, .Lcbc_tail;
+
+    ext         RIV.b, RIV.b, RIV.b, #16;
+
+.Lcbc_end:
+    /* store new IV */
+    rev         RIV.s, RIV.s;
+    tbl         RIV.b, {RIV.b}, RSWAP128.b;
+    st1         {RIVv.16b}, [x3];
+
+    VPOP_ABI;
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv9_sve_ce_cbc_dec,.-_gcry_sm4_armv9_sve_ce_cbc_dec;)
+
+.align 4
+.global _gcry_sm4_armv9_sve_ce_cfb_dec
+ELF(.type _gcry_sm4_armv9_sve_ce_cfb_dec,%function;)
+_gcry_sm4_armv9_sve_ce_cfb_dec:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: iv (big endian, 128 bit)
+     *   x4: nblocks
+     */
+    CFI_STARTPROC();
+    VPUSH_ABI;
+
+    PREPARE();
+    ld1         {RIVv.16b}, [x3];
+    ext         RIV.b, RIV.b, RIV.b, #16;
+
+.Lcfb_loop_blks:
+    sub         x4, x4, x5, LSR #1;         /* x4 - (8 * VL) */
+    tbnz        x4, #63, .Lcfb_tail8;
+
+    ld1b        {z15.b}, p0/z, [x2];
+    ld1b        {z14.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {z13.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {z12.b}, p0/z, [x2, #3, MUL VL];
+    ld1b        {z11.b}, p0/z, [x2, #4, MUL VL];
+    ld1b        {z10.b}, p0/z, [x2, #5, MUL VL];
+    ld1b        {z9.b}, p0/z, [x2, #6, MUL VL];
+    ld1b        {z8.b}, p0/z, [x2, #7, MUL VL];
+    rev         z0.b, z15.b;
+    rev         z1.b, z14.b;
+    rev         z2.b, z13.b;
+    rev         z3.b, z12.b;
+    rev         z4.b, z11.b;
+    rev         z5.b, z10.b;
+    rev         z6.b, z9.b;
+    rev         z7.b, z8.b;
+    rev         RTMP0.b, RIV.b;
+    ext         z7.b, z7.b, z6.b, #16;
+    ext         z6.b, z6.b, z5.b, #16;
+    ext         z5.b, z5.b, z4.b, #16;
+    ext         z4.b, z4.b, z3.b, #16;
+    ext         z3.b, z3.b, z2.b, #16;
+    ext         z2.b, z2.b, z1.b, #16;
+    ext         z1.b, z1.b, z0.b, #16;
+    ext         z0.b, z0.b, RTMP0.b, #16;
+    rev         z7.b, z7.b;
+    rev         z6.b, z6.b;
+    rev         z5.b, z5.b;
+    rev         z4.b, z4.b;
+    rev         z3.b, z3.b;
+    rev         z2.b, z2.b;
+    rev         z1.b, z1.b;
+    rev         z0.b, z0.b;
+    mov         RIV.d, z8.d;
+
+    SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7);
+
+    eor         z0.d, z0.d, z15.d;
+    eor         z1.d, z1.d, z14.d;
+    eor         z2.d, z2.d, z13.d;
+    eor         z3.d, z3.d, z12.d;
+    eor         z4.d, z4.d, z11.d;
+    eor         z5.d, z5.d, z10.d;
+    eor         z6.d, z6.d, z9.d;
+    eor         z7.d, z7.d, z8.d;
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    st1b        {z4.b}, p0, [x1, #4, MUL VL];
+    st1b        {z5.b}, p0, [x1, #5, MUL VL];
+    st1b        {z6.b}, p0, [x1, #6, MUL VL];
+    st1b        {z7.b}, p0, [x1, #7, MUL VL];
+    addvl       x2, x2, #8;
+    addvl       x1, x1, #8;
+
+    cbz         x4, .Lcfb_end;
+    b           .Lcfb_loop_blks;
+
+.Lcfb_tail8:
+    add         x4, x4, x5, LSR #1;
+    cmp         x4, x5, LSR #2;
+    blt         .Lcfb_tail4;
+
+    sub         x4, x4, x5, LSR #2;         /* x4 - (4 * VL) */
+
+    ld1b        {z15.b}, p0/z, [x2];
+    ld1b        {z14.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {z13.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {z12.b}, p0/z, [x2, #3, MUL VL];
+    rev         z0.b, z15.b;
+    rev         z1.b, z14.b;
+    rev         z2.b, z13.b;
+    rev         z3.b, z12.b;
+    rev         RTMP0.b, RIV.b;
+    ext         z3.b, z3.b, z2.b, #16;
+    ext         z2.b, z2.b, z1.b, #16;
+    ext         z1.b, z1.b, z0.b, #16;
+    ext         z0.b, z0.b, RTMP0.b, #16;
+    rev         z3.b, z3.b;
+    rev         z2.b, z2.b;
+    rev         z1.b, z1.b;
+    rev         z0.b, z0.b;
+    mov         RIV.d, z12.d;
+
+    SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3);
+
+    eor         z0.d, z0.d, z15.d;
+    eor         z1.d, z1.d, z14.d;
+    eor         z2.d, z2.d, z13.d;
+    eor         z3.d, z3.d, z12.d;
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    addvl       x2, x2, #4;
+    addvl       x1, x1, #4;
+
+    cbz         x4, .Lcfb_end;
+
+.Lcfb_tail4:
+    cmp         x4, x5, LSR #4;
+    blt         .Lcfb_tail_ce;
+
+    sub         x4, x4, x5, LSR #4;         /* x4 - VL */
+
+    ld1b        {z15.b}, p0/z, [x2];
+    rev         RTMP0.b, RIV.b;
+    rev         z0.b, z15.b;
+    ext         z0.b, z0.b, RTMP0.b, #16;
+    rev         z0.b, z0.b;
+    mov         RIV.d, z15.d;
+
+    SM4_SVE_CE_CRYPT_BLK(z0);
+
+    eor         z0.d, z0.d, z15.d;
+    st1b        {z0.b}, p0, [x1];
+    addvl       x2, x2, #1;
+    addvl       x1, x1, #1;
+
+    cbz         x4, .Lcfb_end;
+    b           .Lcfb_tail4;
+
+.Lcfb_tail_ce:
+    rev         RIV.s, RIV.s;
+    tbl         RIV.b, {RIV.b}, RSWAP128.b;
+
+.Lcfb_tail:
+    sub         x4, x4, #1;
+
+    ld1         {v15.16b}, [x2], #16;
+    mov         v0.16b, RIVv.16b;
+    mov         RIVv.16b, v15.16b;
+    SM4_CE_CRYPT_BLK(v0);
+    eor         v0.16b, v0.16b, v15.16b;
+    st1         {v0.16b}, [x1], #16;
+
+    cbnz        x4, .Lcfb_tail;
+
+    ext         RIV.b, RIV.b, RIV.b, #16;
+
+.Lcfb_end:
+    /* store new IV */
+    rev         RIV.s, RIV.s;
+    tbl         RIV.b, {RIV.b}, RSWAP128.b;
+    st1         {RIVv.16b}, [x3];
+
+    VPOP_ABI;
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv9_sve_ce_cfb_dec,.-_gcry_sm4_armv9_sve_ce_cfb_dec;)
+
+.align 4
+.global _gcry_sm4_armv9_sve_ce_ctr_enc
+ELF(.type _gcry_sm4_armv9_sve_ce_ctr_enc,%function;)
+_gcry_sm4_armv9_sve_ce_ctr_enc:
+    /* input:
+     *   x0: round key array, CTX
+     *   x1: dst
+     *   x2: src
+     *   x3: ctr (big endian, 128 bit)
+     *   x4: nblocks
+     */
+    CFI_STARTPROC();
+
+    PREPARE();
+
+    dup         RZERO.d, #0;
+    GET_DATA_POINTER(x6, .Lle128_inc);
+    ld1b        {RLE128_INC.b}, p0/z, [x6];
+
+    ldp         x7, x8, [x3];
+    rev         x7, x7;
+    rev         x8, x8;
+
+#define inc_le128(zctr)                             \
+        mov         RCTRv.d[1], x8;                 \
+        mov         RCTRv.d[0], x7;                 \
+        mov         zctr.d, RLE128_INC.d;           \
+        dup         RCTR.q, RCTR.q[0];              \
+        adds        x8, x8, x5, LSR #4;             \
+        adc         x7, x7, xzr;                    \
+        adclt       zctr.d, RCTR.d, RZERO.d;        \
+        adclt       RCTR.d, zctr.d, RZERO.d;        \
+        trn1        zctr.d, RCTR.d, zctr.d;         \
+        revb        zctr.d, p0/m, zctr.d;
+
+.Lctr_loop_blks:
+    sub         x4, x4, x5, LSR #1;         /* x4 - (8 * VL) */
+    tbnz        x4, #63, .Lctr_tail8;
+
+    inc_le128(z0);
+    inc_le128(z1);
+    inc_le128(z2);
+    inc_le128(z3);
+    inc_le128(z4);
+    inc_le128(z5);
+    inc_le128(z6);
+    inc_le128(z7);
+
+    SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7);
+
+    ld1b        {RTMP0.b}, p0/z, [x2];
+    ld1b        {RTMP1.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {RTMP2.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {RTMP3.b}, p0/z, [x2, #3, MUL VL];
+    eor         z0.d, z0.d, RTMP0.d;
+    eor         z1.d, z1.d, RTMP1.d;
+    eor         z2.d, z2.d, RTMP2.d;
+    eor         z3.d, z3.d, RTMP3.d;
+    ld1b        {RTMP0.b}, p0/z, [x2, #4, MUL VL];
+    ld1b        {RTMP1.b}, p0/z, [x2, #5, MUL VL];
+    ld1b        {RTMP2.b}, p0/z, [x2, #6, MUL VL];
+    ld1b        {RTMP3.b}, p0/z, [x2, #7, MUL VL];
+    eor         z4.d, z4.d, RTMP0.d;
+    eor         z5.d, z5.d, RTMP1.d;
+    eor         z6.d, z6.d, RTMP2.d;
+    eor         z7.d, z7.d, RTMP3.d;
+    addvl       x2, x2, #8;
+
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    st1b        {z4.b}, p0, [x1, #4, MUL VL];
+    st1b        {z5.b}, p0, [x1, #5, MUL VL];
+    st1b        {z6.b}, p0, [x1, #6, MUL VL];
+    st1b        {z7.b}, p0, [x1, #7, MUL VL];
+    addvl       x1, x1, #8;
+
+    cbz         x4, .Lctr_end;
+    b           .Lctr_loop_blks;
+
+.Lctr_tail8:
+    add         x4, x4, x5, LSR #1;
+    cmp         x4, x5, LSR #2;
+    blt         .Lctr_tail4;
+
+    sub         x4, x4, x5, LSR #2;         /* x4 - (4 * VL) */
+
+    inc_le128(z0);
+    inc_le128(z1);
+    inc_le128(z2);
+    inc_le128(z3);
+
+    SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3);
+
+    ld1b        {RTMP0.b}, p0/z, [x2];
+    ld1b        {RTMP1.b}, p0/z, [x2, #1, MUL VL];
+    ld1b        {RTMP2.b}, p0/z, [x2, #2, MUL VL];
+    ld1b        {RTMP3.b}, p0/z, [x2, #3, MUL VL];
+    eor         z0.d, z0.d, RTMP0.d;
+    eor         z1.d, z1.d, RTMP1.d;
+    eor         z2.d, z2.d, RTMP2.d;
+    eor         z3.d, z3.d, RTMP3.d;
+    st1b        {z0.b}, p0, [x1];
+    st1b        {z1.b}, p0, [x1, #1, MUL VL];
+    st1b        {z2.b}, p0, [x1, #2, MUL VL];
+    st1b        {z3.b}, p0, [x1, #3, MUL VL];
+    addvl       x2, x2, #4;
+    addvl       x1, x1, #4;
+
+    cbz         x4, .Lctr_end;
+
+.Lctr_tail4:
+    cmp         x4, x5, LSR #4;
+    blt         .Lctr_tail;
+
+    sub         x4, x4, x5, LSR #4;         /* x4 - VL */
+
+    inc_le128(z0);
+    SM4_SVE_CE_CRYPT_BLK(z0);
+    ld1b        {RTMP0.b}, p0/z, [x2];
+    eor         z0.d, z0.d, RTMP0.d;
+    st1b        {z0.b}, p0, [x1];
+    addvl       x2, x2, #1;
+    addvl       x1, x1, #1;
+
+    cbz         x4, .Lctr_end;
+    b           .Lctr_tail4;
+
+.Lctr_tail:
+    sub         x4, x4, #1;
+
+    /* inc_le128 for CE */
+    mov         v0.d[1], x8;
+    mov         v0.d[0], x7;
+    adds        x8, x8, #1;
+    adc         x7, x7, xzr;
+    rev64       v0.16b, v0.16b;
+
+    SM4_CE_CRYPT_BLK(v0);
+    ld1         {RTMP0v.16b}, [x2], #16;
+    eor         v0.16b, v0.16b, RTMP0v.16b;
+    st1         {v0.16b}, [x1], #16;
+
+    cbnz        x4, .Lctr_tail;
+
+.Lctr_end:
+    /* store new CTR */
+    rev x7, x7;
+    rev x8, x8;
+    stp x7, x8, [x3];
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv9_sve_ce_ctr_enc,.-_gcry_sm4_armv9_sve_ce_ctr_enc;)
+
+.align 4
+.global _gcry_sm4_armv9_sve_get_vl
+ELF(.type _gcry_sm4_armv9_sve_get_vl,%function;)
+_gcry_sm4_armv9_sve_get_vl:
+    CFI_STARTPROC();
+
+    /* VL in bytes */
+    rdvl        x0, #1;
+
+    ret_spec_stop;
+    CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv9_sve_get_vl,.-_gcry_sm4_armv9_sve_get_vl;)
+
+#endif
diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S
new file mode 100644 (file)
index 0000000..464da39
--- /dev/null
@@ -0,0 +1,1260 @@
+/* sm4-gfni-avx2-amd64.S  -  GFNI/AVX2 implementation of SM4 cipher
+ *
+ * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+       vpunpckhdq x1, x0, t2; \
+       vpunpckldq x1, x0, x0; \
+       \
+       vpunpckldq x3, x2, t1; \
+       vpunpckhdq x3, x2, x2; \
+       \
+       vpunpckhqdq t1, x0, x1; \
+       vpunpcklqdq t1, x0, x0; \
+       \
+       vpunpckhqdq x2, t2, x3; \
+       vpunpcklqdq x2, t2, x2;
+
+/**********************************************************************
+  4-way && 8-way SM4 with GFNI and AVX2
+ **********************************************************************/
+
+/* vector registers */
+#define RX0          %ymm0
+#define RX1          %ymm1
+#define RX0x         %xmm0
+#define RX1x         %xmm1
+
+#define RTMP0        %ymm2
+#define RTMP1        %ymm3
+#define RTMP2        %ymm4
+#define RTMP3        %ymm5
+#define RTMP4        %ymm6
+#define RTMP0x       %xmm2
+#define RTMP1x       %xmm3
+#define RTMP2x       %xmm4
+#define RTMP3x       %xmm5
+#define RTMP4x       %xmm6
+
+#define RNOT         %ymm7
+#define RNOTx        %xmm7
+
+#define RA0          %ymm8
+#define RA1          %ymm9
+#define RA2          %ymm10
+#define RA3          %ymm11
+#define RA0x         %xmm8
+#define RA1x         %xmm9
+#define RA2x         %xmm10
+#define RA3x         %xmm11
+
+#define RB0          %ymm12
+#define RB1          %ymm13
+#define RB2          %ymm14
+#define RB3          %ymm15
+#define RB0x         %xmm12
+#define RB1x         %xmm13
+#define RB2x         %xmm14
+#define RB3x         %xmm15
+
+SECTION_RODATA
+.align 32
+
+ELF(.type _sm4_gfni_avx2_consts,@object)
+_sm4_gfni_avx2_consts:
+
+/* Affine transform, SM4 field to AES field */
+.Lpre_affine_s:
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+
+/* Affine transform, AES field to SM4 field */
+.Lpost_affine_s:
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+
+/* Rotate left by 8 bits on 32-bit words with vpshufb */
+.Lrol_8:
+       .byte 0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06
+       .byte 0x0b, 0x08, 0x09, 0x0a, 0x0f, 0x0c, 0x0d, 0x0e
+       .byte 0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06
+       .byte 0x0b, 0x08, 0x09, 0x0a, 0x0f, 0x0c, 0x0d, 0x0e
+
+/* Rotate left by 16 bits on 32-bit words with vpshufb */
+.Lrol_16:
+       .byte 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05
+       .byte 0x0a, 0x0b, 0x08, 0x09, 0x0e, 0x0f, 0x0c, 0x0d
+       .byte 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05
+       .byte 0x0a, 0x0b, 0x08, 0x09, 0x0e, 0x0f, 0x0c, 0x0d
+
+/* Rotate left by 24 bits on 32-bit words with vpshufb */
+.Lrol_24:
+       .byte 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
+       .byte 0x09, 0x0a, 0x0b, 0x08, 0x0d, 0x0e, 0x0f, 0x0c
+       .byte 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
+       .byte 0x09, 0x0a, 0x0b, 0x08, 0x0d, 0x0e, 0x0f, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+       .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+/* CTR byte addition constants */
+.align 32
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
+.text
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_expand_key
+ELF(.type   _gcry_sm4_gfni_avx2_expand_key,@function;)
+/* Expand a 128-bit SM4 key into the 32 round keys, writing both the
+ * encryption schedule (forward order) and the decryption schedule
+ * (same keys in reverse order). The SM4 S-box is evaluated via the AES
+ * field: GF(2^8) affine transform in, AES inverse + affine transform out
+ * (vgf2p8affineqb / vgf2p8affineinvqb). */
+_gcry_sm4_gfni_avx2_expand_key:
+	/* input:
+	 *	%rdi: 128-bit key
+	 *	%rsi: rkey_enc
+	 *	%rdx: rkey_dec
+	 *	%rcx: fk array
+	 *	%r8: ck array
+	 */
+	CFI_STARTPROC();
+
+	/* Load key words and byte-swap to big-endian word order. */
+	vmovd 0*4(%rdi), RA0x;
+	vmovd 1*4(%rdi), RA1x;
+	vmovd 2*4(%rdi), RA2x;
+	vmovd 3*4(%rdi), RA3x;
+
+	vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+	vpshufb RTMP2x, RA0x, RA0x;
+	vpshufb RTMP2x, RA1x, RA1x;
+	vpshufb RTMP2x, RA2x, RA2x;
+	vpshufb RTMP2x, RA3x, RA3x;
+
+	/* XOR in the FK system constants. */
+	vmovd 0*4(%rcx), RB0x;
+	vmovd 1*4(%rcx), RB1x;
+	vmovd 2*4(%rcx), RB2x;
+	vmovd 3*4(%rcx), RB3x;
+	vpxor RB0x, RA0x, RA0x;
+	vpxor RB1x, RA1x, RA1x;
+	vpxor RB2x, RA2x, RA2x;
+	vpxor RB3x, RA3x, RA3x;
+
+/* One key-schedule round: s0 ^= L'(Sbox(s1 ^ s2 ^ s3 ^ ck[round])),
+ * where L'(x) = x ^ rol(x,13) ^ rol(x,23) (key-expansion linear map). */
+#define ROUND(round, s0, s1, s2, s3) \
+	vpbroadcastd (4*(round))(%r8), RX0x; \
+	vpxor s1, RX0x, RX0x; \
+	vpxor s2, RX0x, RX0x; \
+	vpxor s3, RX0x, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+	\
+	/* sbox, non-linear part */ \
+	vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \
+	vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \
+	\
+	/* linear part */ \
+	vpxor RX0x, s0, s0; /* s0 ^ x */ \
+	vpslld $13, RX0x, RTMP0x; \
+	vpsrld $19, RX0x, RTMP1x; \
+	vpslld $23, RX0x, RTMP2x; \
+	vpsrld $9, RX0x, RTMP3x; \
+	vpxor RTMP0x, RTMP1x, RTMP1x;  \
+	vpxor RTMP2x, RTMP3x, RTMP3x;  \
+	vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,13) */ \
+	vpxor RTMP3x, s0, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */
+
+	/* %rax = end of ck array (32 words); %rdx = one past rkey_dec end,
+	 * so decryption keys are written backwards from the end. */
+	leaq (32*4)(%r8), %rax;
+	leaq (32*4)(%rdx), %rdx;
+.align 16
+.Lroundloop_expand_key:
+	leaq (-4*4)(%rdx), %rdx;
+	ROUND(0, RA0x, RA1x, RA2x, RA3x);
+	ROUND(1, RA1x, RA2x, RA3x, RA0x);
+	ROUND(2, RA2x, RA3x, RA0x, RA1x);
+	ROUND(3, RA3x, RA0x, RA1x, RA2x);
+	leaq (4*4)(%r8), %r8;
+	/* Forward order for encryption, reversed order for decryption. */
+	vmovd RA0x, (0*4)(%rsi);
+	vmovd RA1x, (1*4)(%rsi);
+	vmovd RA2x, (2*4)(%rsi);
+	vmovd RA3x, (3*4)(%rsi);
+	vmovd RA0x, (3*4)(%rdx);
+	vmovd RA1x, (2*4)(%rdx);
+	vmovd RA2x, (1*4)(%rdx);
+	vmovd RA3x, (0*4)(%rdx);
+	leaq (4*4)(%rsi), %rsi;
+	cmpq %rax, %r8;
+	jne .Lroundloop_expand_key;
+
+#undef ROUND
+
+	/* Clear all vector registers (key material). */
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_expand_key,.-_gcry_sm4_gfni_avx2_expand_key;)
+
+.align 16
+ELF(.type   sm4_gfni_avx2_crypt_blk1_4,@function;)
+/* Encrypt/decrypt 1..4 blocks (direction is selected by which round-key
+ * schedule %rdi points at). Blocks are held one per 128-bit register and
+ * processed in parallel via a 4x4 word transpose. Missing inputs are
+ * duplicated from block 0 so the transpose always has 4 lanes; only the
+ * requested number of outputs is stored back. */
+sm4_gfni_avx2_crypt_blk1_4:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	%rsi: dst (1..4 blocks)
+	 *	%rdx: src (1..4 blocks)
+	 *	%rcx: num blocks (1..4)
+	 */
+	CFI_STARTPROC();
+
+	/* Load up to 4 blocks; pad with copies of block 0. */
+	vmovdqu 0*16(%rdx), RA0x;
+	vmovdqa RA0x, RA1x;
+	vmovdqa RA0x, RA2x;
+	vmovdqa RA0x, RA3x;
+	cmpq $2, %rcx;
+	jb .Lblk4_load_input_done;
+	vmovdqu 1*16(%rdx), RA1x;
+	je .Lblk4_load_input_done;
+	vmovdqu 2*16(%rdx), RA2x;
+	cmpq $3, %rcx;
+	je .Lblk4_load_input_done;
+	vmovdqu 3*16(%rdx), RA3x;
+
+.Lblk4_load_input_done:
+
+	vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+	vpshufb RTMP2x, RA0x, RA0x;
+	vpshufb RTMP2x, RA1x, RA1x;
+	vpshufb RTMP2x, RA2x, RA2x;
+	vpshufb RTMP2x, RA3x, RA3x;
+
+	/* Keep the three rotate shuffle masks resident in registers
+	 * (RB3x is free here since only 4 blocks are in flight). */
+	vmovdqa .Lrol_8 rRIP, RTMP2x;
+	vmovdqa .Lrol_16 rRIP, RTMP3x;
+	vmovdqa .Lrol_24 rRIP, RB3x;
+	transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+
+/* One cipher round: s0 ^= L(Sbox(s1 ^ s2 ^ s3 ^ rk)),
+ * L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24), built from
+ * byte-granular rotates (vpshufb) plus one 2-bit shift pair. */
+#define ROUND(round, s0, s1, s2, s3) \
+	vpbroadcastd (4*(round))(%rdi), RX0x; \
+	vpxor s1, RX0x, RX0x; \
+	vpxor s2, RX0x, RX0x; \
+	vpxor s3, RX0x, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+	\
+	/* sbox, non-linear part */ \
+	vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \
+	vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \
+	\
+	/* linear part */ \
+	vpxor RX0x, s0, s0; /* s0 ^ x */ \
+	vpshufb RTMP2x, RX0x, RTMP1x; \
+	vpxor RTMP1x, RX0x, RTMP0x; /* x ^ rol(x,8) */ \
+	vpshufb RTMP3x, RX0x, RTMP1x; \
+	vpxor RTMP1x, RTMP0x, RTMP0x; /* x ^ rol(x,8) ^ rol(x,16) */ \
+	vpshufb RB3x, RX0x, RTMP1x; \
+	vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+	vpslld $2, RTMP0x, RTMP1x; \
+	vpsrld $30, RTMP0x, RTMP0x; \
+	vpxor RTMP0x, s0, s0;  \
+	vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+	/* 8 loop iterations x 4 rounds = 32 SM4 rounds. */
+	leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+	ROUND(0, RA0x, RA1x, RA2x, RA3x);
+	ROUND(1, RA1x, RA2x, RA3x, RA0x);
+	ROUND(2, RA2x, RA3x, RA0x, RA1x);
+	ROUND(3, RA3x, RA0x, RA1x, RA2x);
+	leaq (4*4)(%rdi), %rdi;
+	cmpq %rax, %rdi;
+	jne .Lroundloop_blk4;
+
+#undef ROUND
+
+	/* Un-transpose and apply the final reverse/byteswap. */
+	vmovdqa .Lbswap128_mask rRIP, RTMP2x;
+
+	transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+	vpshufb RTMP2x, RA0x, RA0x;
+	vpshufb RTMP2x, RA1x, RA1x;
+	vpshufb RTMP2x, RA2x, RA2x;
+	vpshufb RTMP2x, RA3x, RA3x;
+
+	/* Store only the requested number of blocks. */
+	vmovdqu RA0x, 0*16(%rsi);
+	cmpq $2, %rcx;
+	jb .Lblk4_store_output_done;
+	vmovdqu RA1x, 1*16(%rsi);
+	je .Lblk4_store_output_done;
+	vmovdqu RA2x, 2*16(%rsi);
+	cmpq $3, %rcx;
+	je .Lblk4_store_output_done;
+	vmovdqu RA3x, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+	vzeroall;
+	xorl %eax, %eax;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size sm4_gfni_avx2_crypt_blk1_4,.-sm4_gfni_avx2_crypt_blk1_4;)
+
+.align 16
+ELF(.type __sm4_gfni_crypt_blk8,@function;)
+/* Core 8-block transform using only xmm registers: two independent
+ * 4-block groups (RA*, RB*) are processed with interleaved instruction
+ * streams to hide latency. Register-only calling convention; callers
+ * load RA0..RB3 and read results from the same registers. */
+__sm4_gfni_crypt_blk8:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+	 *						ciphertext blocks
+	 * output:
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+	 *						blocks
+	 */
+	CFI_STARTPROC();
+
+	vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+	vpshufb RTMP2x, RA0x, RA0x;
+	vpshufb RTMP2x, RA1x, RA1x;
+	vpshufb RTMP2x, RA2x, RA2x;
+	vpshufb RTMP2x, RA3x, RA3x;
+	vpshufb RTMP2x, RB0x, RB0x;
+	vpshufb RTMP2x, RB1x, RB1x;
+	vpshufb RTMP2x, RB2x, RB2x;
+	vpshufb RTMP2x, RB3x, RB3x;
+
+	transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+	transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x);
+
+/* One round over both 4-block groups; the indented r* stream mirrors the
+ * s* stream. RTMP4x is reloaded with each rotate mask in turn since all
+ * scratch registers are in use. */
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+	vpbroadcastd (4*(round))(%rdi), RX0x; \
+	vmovdqa .Lpre_affine_s rRIP, RTMP2x; \
+	vmovdqa .Lpost_affine_s rRIP, RTMP3x; \
+	vmovdqa RX0x, RX1x; \
+	vpxor s1, RX0x, RX0x; \
+	vpxor s2, RX0x, RX0x; \
+	vpxor s3, RX0x, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+	    vpxor r1, RX1x, RX1x; \
+	    vpxor r2, RX1x, RX1x; \
+	    vpxor r3, RX1x, RX1x; /* r1 ^ r2 ^ r3 ^ rk */ \
+	\
+	/* sbox, non-linear part */ \
+	vmovdqa .Lrol_8 rRIP, RTMP4x; \
+	vgf2p8affineqb $0x65, RTMP2x, RX0x, RX0x; \
+	vgf2p8affineinvqb $0xd3, RTMP3x, RX0x, RX0x; \
+	    vgf2p8affineqb $0x65, RTMP2x, RX1x, RX1x; \
+	    vgf2p8affineinvqb $0xd3, RTMP3x, RX1x, RX1x; \
+	\
+	/* linear part */ \
+	vpxor RX0x, s0, s0; /* s0 ^ x */ \
+	vpshufb RTMP4x, RX0x, RTMP1x; \
+	vpxor RTMP1x, RX0x, RTMP0x; /* x ^ rol(x,8) */ \
+	    vpxor RX1x, r0, r0; /* r0 ^ x */ \
+	    vpshufb RTMP4x, RX1x, RTMP3x; \
+	    vmovdqa .Lrol_16 rRIP, RTMP4x; \
+	    vpxor RTMP3x, RX1x, RTMP2x; /* x ^ rol(x,8) */ \
+	vpshufb RTMP4x, RX0x, RTMP1x; \
+	vpxor RTMP1x, RTMP0x, RTMP0x; /* x ^ rol(x,8) ^ rol(x,16) */ \
+	    vpshufb RTMP4x, RX1x, RTMP3x; \
+	    vmovdqa .Lrol_24 rRIP, RTMP4x; \
+	    vpxor RTMP3x, RTMP2x, RTMP2x; /* x ^ rol(x,8) ^ rol(x,16) */ \
+	vpshufb RTMP4x, RX0x, RTMP1x; \
+	vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+	vpslld $2, RTMP0x, RTMP1x; \
+	vpsrld $30, RTMP0x, RTMP0x; \
+	vpxor RTMP0x, s0, s0;  \
+	vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+	    vpshufb RTMP4x, RX1x, RTMP3x; \
+	    vpxor RTMP3x, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+	    vpslld $2, RTMP2x, RTMP3x; \
+	    vpsrld $30, RTMP2x, RTMP2x; \
+	    vpxor RTMP2x, r0, r0;  \
+	    vpxor RTMP3x, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+	/* 8 iterations x 4 rounds = 32 SM4 rounds. */
+	leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+	ROUND(0, RA0x, RA1x, RA2x, RA3x, RB0x, RB1x, RB2x, RB3x);
+	ROUND(1, RA1x, RA2x, RA3x, RA0x, RB1x, RB2x, RB3x, RB0x);
+	ROUND(2, RA2x, RA3x, RA0x, RA1x, RB2x, RB3x, RB0x, RB1x);
+	ROUND(3, RA3x, RA0x, RA1x, RA2x, RB3x, RB0x, RB1x, RB2x);
+	leaq (4*4)(%rdi), %rdi;
+	cmpq %rax, %rdi;
+	jne .Lroundloop_blk8;
+
+#undef ROUND
+
+	vmovdqa .Lbswap128_mask rRIP, RTMP2x;
+
+	transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+	transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x);
+	vpshufb RTMP2x, RA0x, RA0x;
+	vpshufb RTMP2x, RA1x, RA1x;
+	vpshufb RTMP2x, RA2x, RA2x;
+	vpshufb RTMP2x, RA3x, RA3x;
+	vpshufb RTMP2x, RB0x, RB0x;
+	vpshufb RTMP2x, RB1x, RB1x;
+	vpshufb RTMP2x, RB2x, RB2x;
+	vpshufb RTMP2x, RB3x, RB3x;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk8,.-__sm4_gfni_crypt_blk8;)
+
+.align 16
+ELF(.type   _gcry_sm4_gfni_avx2_crypt_blk1_8,@function;)
+/* Encrypt/decrypt 1..8 blocks. Counts of 1..4 tail-call into the
+ * 4-block routine; 5..8 use __sm4_gfni_crypt_blk8 with missing inputs
+ * padded from block 4 and only the requested outputs stored. */
+_gcry_sm4_gfni_avx2_crypt_blk1_8:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	%rsi: dst (1..8 blocks)
+	 *	%rdx: src (1..8 blocks)
+	 *	%rcx: num blocks (1..8)
+	 */
+	CFI_STARTPROC();
+
+	cmpq $5, %rcx;
+	jb sm4_gfni_avx2_crypt_blk1_4; /* tail-call for 1..4 blocks */
+	vmovdqu (0 * 16)(%rdx), RA0x;
+	vmovdqu (1 * 16)(%rdx), RA1x;
+	vmovdqu (2 * 16)(%rdx), RA2x;
+	vmovdqu (3 * 16)(%rdx), RA3x;
+	vmovdqu (4 * 16)(%rdx), RB0x;
+	vmovdqa RB0x, RB1x;
+	vmovdqa RB0x, RB2x;
+	vmovdqa RB0x, RB3x;
+	je .Lblk8_load_input_done; /* ZF still from cmpq $5 above */
+	vmovdqu (5 * 16)(%rdx), RB1x;
+	cmpq $7, %rcx;
+	jb .Lblk8_load_input_done;
+	vmovdqu (6 * 16)(%rdx), RB2x;
+	je .Lblk8_load_input_done;
+	vmovdqu (7 * 16)(%rdx), RB3x;
+
+.Lblk8_load_input_done:
+	call __sm4_gfni_crypt_blk8;
+
+	/* Store 5..8 blocks depending on %rcx. */
+	cmpq $6, %rcx;
+	vmovdqu RA0x, (0 * 16)(%rsi);
+	vmovdqu RA1x, (1 * 16)(%rsi);
+	vmovdqu RA2x, (2 * 16)(%rsi);
+	vmovdqu RA3x, (3 * 16)(%rsi);
+	vmovdqu RB0x, (4 * 16)(%rsi);
+	jb .Lblk8_store_output_done;
+	vmovdqu RB1x, (5 * 16)(%rsi);
+	je .Lblk8_store_output_done;
+	vmovdqu RB2x, (6 * 16)(%rsi);
+	cmpq $7, %rcx;
+	je .Lblk8_store_output_done;
+	vmovdqu RB3x, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+	vzeroall;
+	xorl %eax, %eax;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_8,.-_gcry_sm4_gfni_avx2_crypt_blk1_8;)
+
+/**********************************************************************
+  16-way SM4 with GFNI and AVX2
+ **********************************************************************/
+
+.align 16
+ELF(.type   __sm4_gfni_crypt_blk16,@function;)
+/* Core 16-block transform: same structure as __sm4_gfni_crypt_blk8 but
+ * on full ymm registers, so each register carries two blocks (one per
+ * 128-bit lane). Register-only calling convention via RA0..RB3. */
+__sm4_gfni_crypt_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+	 *						plaintext blocks
+	 * output:
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+	 *						ciphertext blocks
+	 */
+	CFI_STARTPROC();
+
+	/* 128-bit tables are broadcast to both ymm lanes. */
+	vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+	vpshufb RTMP2, RA0, RA0;
+	vpshufb RTMP2, RA1, RA1;
+	vpshufb RTMP2, RA2, RA2;
+	vpshufb RTMP2, RA3, RA3;
+	vpshufb RTMP2, RB0, RB0;
+	vpshufb RTMP2, RB1, RB1;
+	vpshufb RTMP2, RB2, RB2;
+	vpshufb RTMP2, RB3, RB3;
+
+	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+/* One round over both 8-block groups; ymm-width twin of the blk8 ROUND
+ * (see __sm4_gfni_crypt_blk8 in upstream for the scalar derivation). */
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+	vpbroadcastd (4*(round))(%rdi), RX0; \
+	vbroadcasti128 .Lpre_affine_s rRIP, RTMP2; \
+	vbroadcasti128 .Lpost_affine_s rRIP, RTMP3; \
+	vmovdqa RX0, RX1; \
+	vpxor s1, RX0, RX0; \
+	vpxor s2, RX0, RX0; \
+	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+	    vpxor r1, RX1, RX1; \
+	    vpxor r2, RX1, RX1; \
+	    vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+	\
+	/* sbox, non-linear part */ \
+	vbroadcasti128 .Lrol_8 rRIP, RTMP4; \
+	vgf2p8affineqb $0x65, RTMP2, RX0, RX0; \
+	vgf2p8affineinvqb $0xd3, RTMP3, RX0, RX0; \
+	    vgf2p8affineqb $0x65, RTMP2, RX1, RX1; \
+	    vgf2p8affineinvqb $0xd3, RTMP3, RX1, RX1; \
+	\
+	/* linear part */ \
+	vpxor RX0, s0, s0; /* s0 ^ x */ \
+	vpshufb RTMP4, RX0, RTMP1; \
+	vpxor RTMP1, RX0, RTMP0; /* x ^ rol(x,8) */ \
+	    vpxor RX1, r0, r0; /* r0 ^ x */ \
+	    vpshufb RTMP4, RX1, RTMP3; \
+	    vbroadcasti128 .Lrol_16 rRIP, RTMP4; \
+	    vpxor RTMP3, RX1, RTMP2; /* x ^ rol(x,8) */ \
+	vpshufb RTMP4, RX0, RTMP1; \
+	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+	    vpshufb RTMP4, RX1, RTMP3; \
+	    vbroadcasti128 .Lrol_24 rRIP, RTMP4; \
+	    vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+	vpshufb RTMP4, RX0, RTMP1; \
+	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+	vpslld $2, RTMP0, RTMP1; \
+	vpsrld $30, RTMP0, RTMP0; \
+	vpxor RTMP0, s0, s0;  \
+	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+	    vpshufb RTMP4, RX1, RTMP3; \
+	    vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+	    vpslld $2, RTMP2, RTMP3; \
+	    vpsrld $30, RTMP2, RTMP2; \
+	    vpxor RTMP2, r0, r0;  \
+	    vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+	/* 8 iterations x 4 rounds = 32 SM4 rounds. */
+	leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk16:
+	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+	leaq (4*4)(%rdi), %rdi;
+	cmpq %rax, %rdi;
+	jne .Lroundloop_blk16;
+
+#undef ROUND
+
+	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+	vpshufb RTMP2, RA0, RA0;
+	vpshufb RTMP2, RA1, RA1;
+	vpshufb RTMP2, RA2, RA2;
+	vpshufb RTMP2, RA3, RA3;
+	vpshufb RTMP2, RB0, RB0;
+	vpshufb RTMP2, RB1, RB1;
+	vpshufb RTMP2, RB2, RB2;
+	vpshufb RTMP2, RB3, RB3;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk16,.-__sm4_gfni_crypt_blk16;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_crypt_blk1_16
+ELF(.type   _gcry_sm4_gfni_avx2_crypt_blk1_16,@function;)
+/* Encrypt/decrypt 1..16 blocks. Counts of 1..8 tail-call into the
+ * 8-block entry point; 9..16 run through __sm4_gfni_crypt_blk16, with
+ * each ymm register holding two blocks. */
+_gcry_sm4_gfni_avx2_crypt_blk1_16:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	%rsi: dst (1..16 blocks)
+	 *	%rdx: src (1..16 blocks)
+	 *	%rcx: num blocks (1..16)
+	 */
+	CFI_STARTPROC();
+
+/* Load ymm register `yreg` with blocks 2*offset and 2*offset+1:
+ * if fewer than 2*offset+1 blocks remain, stop loading; if exactly
+ * 2*offset+1, load only the low 128-bit half. */
+#define LOAD_INPUT(offset, yreg) \
+	cmpq $(1 + 2 * (offset)), %rcx; \
+	jb .Lblk16_load_input_done; \
+	ja 1f; \
+	  vmovdqu (offset) * 32(%rdx), yreg##x; \
+	  jmp .Lblk16_load_input_done; \
+	1: \
+	  vmovdqu (offset) * 32(%rdx), yreg;
+
+	cmpq $8, %rcx;
+	jbe _gcry_sm4_gfni_avx2_crypt_blk1_8; /* tail-call for 1..8 blocks */
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	vmovdqu (2 * 32)(%rdx), RA2;
+	vmovdqu (3 * 32)(%rdx), RA3;
+	LOAD_INPUT(4, RB0);
+	LOAD_INPUT(5, RB1);
+	LOAD_INPUT(6, RB2);
+	LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+	call __sm4_gfni_crypt_blk16;
+
+/* Mirror of LOAD_INPUT for the output side. */
+#define STORE_OUTPUT(yreg, offset) \
+	cmpq $(1 + 2 * (offset)), %rcx; \
+	jb .Lblk16_store_output_done; \
+	ja 1f; \
+	  vmovdqu yreg##x, (offset) * 32(%rsi); \
+	  jmp .Lblk16_store_output_done; \
+	1: \
+	  vmovdqu yreg, (offset) * 32(%rsi);
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	STORE_OUTPUT(RB0, 4);
+	STORE_OUTPUT(RB1, 5);
+	STORE_OUTPUT(RB2, 6);
+	STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+	vzeroall;
+	xorl %eax, %eax;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_16,.-_gcry_sm4_gfni_avx2_crypt_blk1_16;)
+
+/* 128-bit little-endian increment of x (per 128-bit lane), tmp is
+ * scratch: add 1 to the low qword by subtracting minus_one (-1), and
+ * propagate the carry into the high qword when the low qword was all-ones
+ * (vpcmpeqq produces -1 there, shifted up and subtracted). */
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_ctr_enc
+ELF(.type   _gcry_sm4_gfni_avx2_ctr_enc,@function;)
+/* CTR-mode encrypt exactly 16 blocks: generate 16 counter values,
+ * run them through __sm4_gfni_crypt_blk16, XOR with src into dst and
+ * write back the incremented IV. A byte-addition fast path is taken
+ * when the low counter byte cannot overflow within 16 increments. */
+_gcry_sm4_gfni_avx2_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (big endian, 128bit)
+	 */
+	CFI_STARTPROC();
+
+	/* If last IV byte stays below 0x100 after +16, counters can be
+	 * built with plain byte adds instead of full 128-bit arithmetic. */
+	cmpb $(0x100 - 16), 15(%rcx);
+	jbe .Lctr_byteadd;
+
+	movq 8(%rcx), %rax;
+	bswapq %rax;
+
+	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+	vpcmpeqd RNOT, RNOT, RNOT;
+	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
+	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), RTMP4x;
+	vpshufb RTMP3x, RTMP4x, RTMP4x;
+	vmovdqa RTMP4x, RTMP0x;
+	inc_le128(RTMP4x, RNOTx, RTMP1x);
+	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+	vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+	/* check need for handling 64-bit overflow and carry */
+	cmpq $(0xffffffffffffffff - 16), %rax;
+	ja .Lhandle_ctr_carry;
+
+	/* construct IVs: no carry possible, so both lanes can be advanced
+	 * by 2 at once with a single 64-bit subtract of -2. */
+	vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+	vpshufb RTMP3, RTMP0, RA1;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+	vpshufb RTMP3, RTMP0, RA2;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+	vpshufb RTMP3, RTMP0, RA3;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+	vpshufb RTMP3, RTMP0, RB0;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+	vpshufb RTMP3, RTMP0, RB1;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+	vpshufb RTMP3, RTMP0, RB2;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+	vpshufb RTMP3, RTMP0, RB3;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+	vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+	jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+	/* construct IVs with full 128-bit increments (carry-safe). */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vextracti128 $1, RTMP0, RTMP0x;
+	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.Lctr_carry_done:
+	/* store new IV */
+	vmovdqu RTMP0x, (%rcx);
+
+.align 8
+.Lload_ctr_done:
+	call __sm4_gfni_crypt_blk16;
+
+	/* keystream XOR plaintext -> ciphertext */
+	vpxor (0 * 32)(%rdx), RA0, RA0;
+	vpxor (1 * 32)(%rdx), RA1, RA1;
+	vpxor (2 * 32)(%rdx), RA2, RA2;
+	vpxor (3 * 32)(%rdx), RA3, RA3;
+	vpxor (4 * 32)(%rdx), RB0, RB0;
+	vpxor (5 * 32)(%rdx), RB1, RB1;
+	vpxor (6 * 32)(%rdx), RB2, RB2;
+	vpxor (7 * 32)(%rdx), RB3, RB3;
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB3, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+	/* Advance stored IV by 16 with scalar 128-bit big-endian add. */
+	movq 8(%rcx), %r11;
+	movq (%rcx), %r10;
+	bswapq %r11;
+	bswapq %r10;
+	addq $16, %r11;
+	adcq $0, %r10;
+	bswapq %r11;
+	bswapq %r10;
+	movq %r11, 8(%rcx);
+	movq %r10, (%rcx);
+	jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+	vbroadcasti128 (%rcx), RB3;
+	je .Lctr_byteadd_full_ctr_carry; /* ZF from the cmpb at entry */
+	addb $16, 15(%rcx);
+.Lctr_byteadd_ymm:
+	/* Counters +0..+15 via last-byte additions (no carry possible). */
+	vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+	vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+	vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+	vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+	vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+	vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+	vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+	vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+	jmp .Lload_ctr_done;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_cbc_dec
+ELF(.type   _gcry_sm4_gfni_avx2_cbc_dec,@function;)
+/* CBC-mode decrypt exactly 16 blocks: block-decrypt all ciphertext,
+ * then XOR each result with the previous ciphertext block (IV for the
+ * first), and store the last ciphertext block as the new IV. */
+_gcry_sm4_gfni_avx2_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv
+	 */
+	CFI_STARTPROC();
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	vmovdqu (2 * 32)(%rdx), RA2;
+	vmovdqu (3 * 32)(%rdx), RA3;
+	vmovdqu (4 * 32)(%rdx), RB0;
+	vmovdqu (5 * 32)(%rdx), RB1;
+	vmovdqu (6 * 32)(%rdx), RB2;
+	vmovdqu (7 * 32)(%rdx), RB3;
+
+	call __sm4_gfni_crypt_blk16;
+
+	/* Build [IV | C0] in RNOT for block 0/1; the other XOR sources are
+	 * the ciphertext stream shifted back one block (offset + 16). */
+	vmovdqu (%rcx), RNOTx;
+	vinserti128 $1, (%rdx), RNOT, RNOT;
+	vpxor RNOT, RA0, RA0;
+	vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+	vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+	vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+	vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+	vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+	vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+	vmovdqu RNOTx, (%rcx); /* store new IV */
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB3, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_cbc_dec,.-_gcry_sm4_gfni_avx2_cbc_dec;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_cfb_dec
+ELF(.type   _gcry_sm4_gfni_avx2_cfb_dec,@function;)
+/* CFB-mode decrypt exactly 16 blocks: block-ENcrypt the stream
+ * [IV, C0..C14], XOR with C0..C15 to recover plaintext, and store C15
+ * as the new IV. Note CFB decryption uses the forward cipher. */
+_gcry_sm4_gfni_avx2_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv
+	 */
+	CFI_STARTPROC();
+
+	/* Load input: [IV | C0] into RA0, then C1.. shifted one block. */
+	vmovdqu (%rcx), RNOTx;
+	vinserti128 $1, (%rdx), RNOT, RA0;
+	vmovdqu (0 * 32 + 16)(%rdx), RA1;
+	vmovdqu (1 * 32 + 16)(%rdx), RA2;
+	vmovdqu (2 * 32 + 16)(%rdx), RA3;
+	vmovdqu (3 * 32 + 16)(%rdx), RB0;
+	vmovdqu (4 * 32 + 16)(%rdx), RB1;
+	vmovdqu (5 * 32 + 16)(%rdx), RB2;
+	vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+	/* Update IV */
+	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+	vmovdqu RNOTx, (%rcx);
+
+	call __sm4_gfni_crypt_blk16;
+
+	vpxor (0 * 32)(%rdx), RA0, RA0;
+	vpxor (1 * 32)(%rdx), RA1, RA1;
+	vpxor (2 * 32)(%rdx), RA2, RA2;
+	vpxor (3 * 32)(%rdx), RA3, RA3;
+	vpxor (4 * 32)(%rdx), RB0, RB0;
+	vpxor (5 * 32)(%rdx), RB1, RB1;
+	vpxor (6 * 32)(%rdx), RB2, RB2;
+	vpxor (7 * 32)(%rdx), RB3, RB3;
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB3, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_cfb_dec,.-_gcry_sm4_gfni_avx2_cfb_dec;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_ocb_enc
+ELF(.type _gcry_sm4_gfni_avx2_ocb_enc,@function;)
+
+_gcry_sm4_gfni_avx2_ocb_enc:
+       /* OCB-mode encryption of 16 blocks with SM4 (GFNI/AVX2).
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+
+       /* Spill %r10-%r13; they are used below as scratch for L pointers. */
+       subq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+       movq %r10, (0 * 8)(%rsp);
+       movq %r11, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       CFI_REL_OFFSET(%r10, 0 * 8);
+       CFI_REL_OFFSET(%r11, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+
+       vmovdqu (%rcx), RTMP0x; /* running offset */
+       vmovdqu (%r8), RTMP1x;  /* running checksum */
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+       /* Per pair of blocks: chain two offsets into RNOT (one per 128-bit
+        * lane), fold the plaintext into the checksum, whiten the plaintext
+        * with the offsets, and stash the offsets in dst as scratch storage
+        * (they are XORed back in after encryption, below). */
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+         vmovdqu (n * 32)(%rdx), yreg; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti128 $1, RTMP0x, RNOT, RNOT; \
+         vpxor yreg, RTMP1, RTMP1; \
+         vpxor yreg, RNOT, yreg; \
+         vmovdqu RNOT, (n * 32)(%rsi);
+
+       movq (0 * 8)(%r9), %r10;
+       movq (1 * 8)(%r9), %r11;
+       movq (2 * 8)(%r9), %r12;
+       movq (3 * 8)(%r9), %r13;
+       OCB_INPUT(0, %r10, %r11, RA0);
+       OCB_INPUT(1, %r12, %r13, RA1);
+       movq (4 * 8)(%r9), %r10;
+       movq (5 * 8)(%r9), %r11;
+       movq (6 * 8)(%r9), %r12;
+       movq (7 * 8)(%r9), %r13;
+       OCB_INPUT(2, %r10, %r11, RA2);
+       OCB_INPUT(3, %r12, %r13, RA3);
+       movq (8 * 8)(%r9), %r10;
+       movq (9 * 8)(%r9), %r11;
+       movq (10 * 8)(%r9), %r12;
+       movq (11 * 8)(%r9), %r13;
+       OCB_INPUT(4, %r10, %r11, RB0);
+       OCB_INPUT(5, %r12, %r13, RB1);
+       movq (12 * 8)(%r9), %r10;
+       movq (13 * 8)(%r9), %r11;
+       movq (14 * 8)(%r9), %r12;
+       movq (15 * 8)(%r9), %r13;
+       OCB_INPUT(6, %r10, %r11, RB2);
+       OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+       /* Fold the checksum's high 128-bit lane into the low lane and
+        * write back the updated offset and checksum state. */
+       vextracti128 $1, RTMP1, RNOTx;
+       vmovdqu RTMP0x, (%rcx);
+       vpxor RNOTx, RTMP1x, RTMP1x;
+       vmovdqu RTMP1x, (%r8);
+
+       movq (0 * 8)(%rsp), %r10;
+       movq (1 * 8)(%rsp), %r11;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       call __sm4_gfni_crypt_blk16;
+
+       addq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i); the offsets
+        * were stashed in dst by OCB_INPUT above. */
+       vpxor (0 * 32)(%rsi), RA0, RA0;
+       vpxor (1 * 32)(%rsi), RA1, RA1;
+       vpxor (2 * 32)(%rsi), RA2, RA2;
+       vpxor (3 * 32)(%rsi), RA3, RA3;
+       vpxor (4 * 32)(%rsi), RB0, RB0;
+       vpxor (5 * 32)(%rsi), RB1, RB1;
+       vpxor (6 * 32)(%rsi), RB2, RB2;
+       vpxor (7 * 32)(%rsi), RB3, RB3;
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vmovdqu RB3, (7 * 32)(%rsi);
+
+       vzeroall; /* wipe vector registers holding key-dependent data */
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ocb_enc,.-_gcry_sm4_gfni_avx2_ocb_enc;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_ocb_dec
+ELF(.type _gcry_sm4_gfni_avx2_ocb_dec,@function;)
+
+_gcry_sm4_gfni_avx2_ocb_dec:
+       /* OCB-mode decryption of 16 blocks with SM4 (GFNI/AVX2).
+        * Unlike the encryption path, the checksum is accumulated over the
+        * recovered plaintext after the cipher call.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+
+       /* Spill %r10-%r13; they are used below as scratch for L pointers. */
+       subq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+       movq %r10, (0 * 8)(%rsp);
+       movq %r11, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       CFI_REL_OFFSET(%r10, 0 * 8);
+       CFI_REL_OFFSET(%r11, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+
+       vmovdqu (%rcx), RTMP0x; /* running offset */
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+       /* Per pair of blocks: chain two offsets into RNOT (one per 128-bit
+        * lane), whiten the ciphertext with them, and stash the offsets in
+        * dst as scratch (XORed back in after the cipher call, below). */
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+         vmovdqu (n * 32)(%rdx), yreg; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti128 $1, RTMP0x, RNOT, RNOT; \
+         vpxor yreg, RNOT, yreg; \
+         vmovdqu RNOT, (n * 32)(%rsi);
+
+       movq (0 * 8)(%r9), %r10;
+       movq (1 * 8)(%r9), %r11;
+       movq (2 * 8)(%r9), %r12;
+       movq (3 * 8)(%r9), %r13;
+       OCB_INPUT(0, %r10, %r11, RA0);
+       OCB_INPUT(1, %r12, %r13, RA1);
+       movq (4 * 8)(%r9), %r10;
+       movq (5 * 8)(%r9), %r11;
+       movq (6 * 8)(%r9), %r12;
+       movq (7 * 8)(%r9), %r13;
+       OCB_INPUT(2, %r10, %r11, RA2);
+       OCB_INPUT(3, %r12, %r13, RA3);
+       movq (8 * 8)(%r9), %r10;
+       movq (9 * 8)(%r9), %r11;
+       movq (10 * 8)(%r9), %r12;
+       movq (11 * 8)(%r9), %r13;
+       OCB_INPUT(4, %r10, %r11, RB0);
+       OCB_INPUT(5, %r12, %r13, RB1);
+       movq (12 * 8)(%r9), %r10;
+       movq (13 * 8)(%r9), %r11;
+       movq (14 * 8)(%r9), %r12;
+       movq (15 * 8)(%r9), %r13;
+       OCB_INPUT(6, %r10, %r11, RB2);
+       OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+       vmovdqu RTMP0x, (%rcx); /* write back updated offset */
+
+       movq (0 * 8)(%rsp), %r10;
+       movq (1 * 8)(%rsp), %r11;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       call __sm4_gfni_crypt_blk16;
+
+       addq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+       vmovdqu (%r8), RTMP1x; /* load running checksum */
+
+       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i); the offsets
+        * were stashed in dst by OCB_INPUT above. */
+       vpxor (0 * 32)(%rsi), RA0, RA0;
+       vpxor (1 * 32)(%rsi), RA1, RA1;
+       vpxor (2 * 32)(%rsi), RA2, RA2;
+       vpxor (3 * 32)(%rsi), RA3, RA3;
+       vpxor (4 * 32)(%rsi), RB0, RB0;
+       vpxor (5 * 32)(%rsi), RB1, RB1;
+       vpxor (6 * 32)(%rsi), RB2, RB2;
+       vpxor (7 * 32)(%rsi), RB3, RB3;
+
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vpxor RA0, RTMP1, RTMP1;
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vpxor RA1, RTMP1, RTMP1;
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vpxor RA2, RTMP1, RTMP1;
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vpxor RA3, RTMP1, RTMP1;
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vpxor RB0, RTMP1, RTMP1;
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vpxor RB1, RTMP1, RTMP1;
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vpxor RB2, RTMP1, RTMP1;
+       vmovdqu RB3, (7 * 32)(%rsi);
+       vpxor RB3, RTMP1, RTMP1;
+
+       /* Fold the checksum's high 128-bit lane into the low lane and
+        * store the updated checksum. */
+       vextracti128 $1, RTMP1, RNOTx;
+       vpxor RNOTx, RTMP1x, RTMP1x;
+       vmovdqu RTMP1x, (%r8);
+
+       vzeroall; /* wipe vector registers holding key-dependent data */
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ocb_dec,.-_gcry_sm4_gfni_avx2_ocb_dec;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx2_ocb_auth
+ELF(.type _gcry_sm4_gfni_avx2_ocb_auth,@function;)
+
+_gcry_sm4_gfni_avx2_ocb_auth:
+       /* OCB authentication (AAD processing) of 16 blocks with SM4
+        * (GFNI/AVX2): Sum is updated with the xor of all encrypted,
+        * offset-whitened AAD blocks.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: abuf (16 blocks)
+        *      %rdx: offset
+        *      %rcx: checksum
+        *      %r8 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+
+       /* Spill %r10-%r13; they are used below as scratch for L pointers. */
+       subq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+       movq %r10, (0 * 8)(%rsp);
+       movq %r11, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       CFI_REL_OFFSET(%r10, 0 * 8);
+       CFI_REL_OFFSET(%r11, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+
+       vmovdqu (%rdx), RTMP0x; /* running offset */
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+       /* Per pair of blocks: chain two offsets into RNOT (one per 128-bit
+        * lane) and whiten the AAD block; nothing is written to memory. */
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+         vmovdqu (n * 32)(%rsi), yreg; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti128 $1, RTMP0x, RNOT, RNOT; \
+         vpxor yreg, RNOT, yreg;
+
+       movq (0 * 8)(%r8), %r10;
+       movq (1 * 8)(%r8), %r11;
+       movq (2 * 8)(%r8), %r12;
+       movq (3 * 8)(%r8), %r13;
+       OCB_INPUT(0, %r10, %r11, RA0);
+       OCB_INPUT(1, %r12, %r13, RA1);
+       movq (4 * 8)(%r8), %r10;
+       movq (5 * 8)(%r8), %r11;
+       movq (6 * 8)(%r8), %r12;
+       movq (7 * 8)(%r8), %r13;
+       OCB_INPUT(2, %r10, %r11, RA2);
+       OCB_INPUT(3, %r12, %r13, RA3);
+       movq (8 * 8)(%r8), %r10;
+       movq (9 * 8)(%r8), %r11;
+       movq (10 * 8)(%r8), %r12;
+       movq (11 * 8)(%r8), %r13;
+       OCB_INPUT(4, %r10, %r11, RB0);
+       OCB_INPUT(5, %r12, %r13, RB1);
+       movq (12 * 8)(%r8), %r10;
+       movq (13 * 8)(%r8), %r11;
+       movq (14 * 8)(%r8), %r12;
+       movq (15 * 8)(%r8), %r13;
+       OCB_INPUT(6, %r10, %r11, RB2);
+       OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+       vmovdqu RTMP0x, (%rdx); /* write back updated offset */
+
+       movq (0 * 8)(%rsp), %r10;
+       movq (1 * 8)(%rsp), %r11;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       call __sm4_gfni_crypt_blk16;
+
+       addq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+       /* XOR-reduce the 16 encrypted blocks down to one 256-bit value. */
+       vpxor RA0, RB0, RA0;
+       vpxor RA1, RB1, RA1;
+       vpxor RA2, RB2, RA2;
+       vpxor RA3, RB3, RA3;
+
+       vpxor RA1, RA0, RA0;
+       vpxor RA3, RA2, RA2;
+
+       vpxor RA2, RA0, RTMP1;
+
+       /* Fold the high 128-bit lane and the old checksum into the final
+        * 128-bit sum; store it back. */
+       vextracti128 $1, RTMP1, RNOTx;
+       vpxor (%rcx), RTMP1x, RTMP1x;
+       vpxor RNOTx, RTMP1x, RTMP1x;
+       vmovdqu RTMP1x, (%rcx);
+
+       vzeroall; /* wipe vector registers holding key-dependent data */
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ocb_auth,.-_gcry_sm4_gfni_avx2_ocb_auth;)
+
+#endif /*defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S
new file mode 100644 (file)
index 0000000..91f6e80
--- /dev/null
@@ -0,0 +1,1861 @@
+/* sm4-gfni-avx512-amd64.S  -  GFNI/AVX512 implementation of SM4 cipher
+ *
+ * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* Transpose four 32-bit words between 128-bit vectors.
+ *
+ * Treating x0..x3 as the rows of a 4x4 matrix of 32-bit words (per
+ * 128-bit lane), produce the transposed matrix back in x0..x3.
+ * t1 and t2 are clobbered as temporaries. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+       vpunpckhdq x1, x0, t2; \
+       vpunpckldq x1, x0, x0; \
+       \
+       vpunpckldq x3, x2, t1; \
+       vpunpckhdq x3, x2, x2; \
+       \
+       vpunpckhqdq t1, x0, x1; \
+       vpunpcklqdq t1, x0, x0; \
+       \
+       vpunpckhqdq x2, t2, x3; \
+       vpunpcklqdq x2, t2, x2;
+
+/**********************************************************************
+  4-way && 8-way SM4 with GFNI and AVX512 (128-bit vectors)
+ **********************************************************************/
+
+/* vector registers
+ *
+ * Each logical register has three aliases: plain (ymm), 'x' suffix
+ * (xmm, low 128 bits) and 'z' suffix (zmm, full 512 bits), so the same
+ * macros can be reused at different vector widths. */
+#define RX0          %ymm0
+#define RX1          %ymm1
+#define RX0x         %xmm0
+#define RX1x         %xmm1
+#define RX0z         %zmm0
+#define RX1z         %zmm1
+
+#define RTMP0        %ymm2
+#define RTMP1        %ymm3
+#define RTMP2        %ymm4
+#define RTMP3        %ymm5
+#define RTMP4        %ymm6
+#define RTMP0x       %xmm2
+#define RTMP1x       %xmm3
+#define RTMP2x       %xmm4
+#define RTMP3x       %xmm5
+#define RTMP4x       %xmm6
+#define RTMP0z       %zmm2
+#define RTMP1z       %zmm3
+#define RTMP2z       %zmm4
+#define RTMP3z       %zmm5
+#define RTMP4z       %zmm6
+
+#define RNOT         %ymm7
+#define RNOTx        %xmm7
+#define RNOTz        %zmm7
+
+#define RA0          %ymm8
+#define RA1          %ymm9
+#define RA2          %ymm10
+#define RA3          %ymm11
+#define RA0x         %xmm8
+#define RA1x         %xmm9
+#define RA2x         %xmm10
+#define RA3x         %xmm11
+#define RA0z         %zmm8
+#define RA1z         %zmm9
+#define RA2z         %zmm10
+#define RA3z         %zmm11
+
+#define RB0          %ymm12
+#define RB1          %ymm13
+#define RB2          %ymm14
+#define RB3          %ymm15
+#define RB0x         %xmm12
+#define RB1x         %xmm13
+#define RB2x         %xmm14
+#define RB3x         %xmm15
+#define RB0z         %zmm12
+#define RB1z         %zmm13
+#define RB2z         %zmm14
+#define RB3z         %zmm15
+
+SECTION_RODATA
+.align 32
+
+/* Affine transform, SM4 field to AES field
+ * (8x8 bit-matrix operand for vgf2p8affineqb, replicated per qword). */
+.Lpre_affine_s:
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+       .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+
+/* Affine transform, AES field to SM4 field */
+.Lpost_affine_s:
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+       .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+       .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+/* 128-bit counter increments; "_lo" constants add to the low qword of
+ * each 128-bit counter lane, "_hi" adds the carry to the high qword. */
+.Lcounter2222_lo:
+       .quad 2, 0
+.Lcounter4444_lo:
+       .quad 4, 0
+.Lcounter8888_lo:
+       .quad 8, 0
+.Lcounter16161616_lo:
+       .quad 16, 0
+.Lcounter1111_hi:
+       .quad 0, 1
+
+.align 64
+.Lcounter0123_lo:
+       .quad 0, 0
+       .quad 1, 0
+       .quad 2, 0
+       .quad 3, 0
+
+/* CTR byte addition constants (fast path: add to last IV byte only,
+ * valid while the last byte cannot overflow). */
+.align 64
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+.text
+
+.align 16
+.globl _gcry_sm4_gfni_avx512_expand_key
+ELF(.type   _gcry_sm4_gfni_avx512_expand_key,@function;)
+_gcry_sm4_gfni_avx512_expand_key:
+       /* SM4 key schedule: derive the 32 round keys from the 128-bit key.
+        * Encryption keys are written forward into rkey_enc; the same keys
+        * are written in reverse order into rkey_dec (SM4 decryption uses
+        * the reversed key schedule).
+        *
+        * input:
+        *      %rdi: 128-bit key
+        *      %rsi: rkey_enc
+        *      %rdx: rkey_dec
+        *      %rcx: fk array
+        *      %r8: ck array
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       vmovd 0*4(%rdi), RA0x;
+       vmovd 1*4(%rdi), RA1x;
+       vmovd 2*4(%rdi), RA2x;
+       vmovd 3*4(%rdi), RA3x;
+
+       vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+       vpshufb RTMP2x, RA0x, RA0x;
+       vpshufb RTMP2x, RA1x, RA1x;
+       vpshufb RTMP2x, RA2x, RA2x;
+       vpshufb RTMP2x, RA3x, RA3x;
+
+       /* Mix in the FK system constants. */
+       vmovd 0*4(%rcx), RB0x;
+       vmovd 1*4(%rcx), RB1x;
+       vmovd 2*4(%rcx), RB2x;
+       vmovd 3*4(%rcx), RB3x;
+       vpxor RB0x, RA0x, RA0x;
+       vpxor RB1x, RA1x, RA1x;
+       vpxor RB2x, RA2x, RA2x;
+       vpxor RB3x, RA3x, RA3x;
+
+/* One key-schedule round: T'(s1 ^ s2 ^ s3 ^ CK[round]) xored into s0.
+ * The S-box is evaluated via GFNI affine transforms to/from the AES
+ * field; the key-schedule linear transform L' is x ^ rol(x,13) ^
+ * rol(x,23). */
+#define ROUND(round, s0, s1, s2, s3) \
+       vpxord (4*(round))(%r8) {1to4}, s1, RX0x; \
+       vpternlogd $0x96, s2, s3, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+       \
+       /* sbox, non-linear part */ \
+       vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \
+       vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \
+       \
+       /* linear part */ \
+       vpxor RX0x, s0, s0; /* s0 ^ x */ \
+       vprold $13, RX0x, RTMP1x; \
+       vprold $23, RX0x, RTMP3x; \
+       vpternlogd $0x96, RTMP1x, RTMP3x, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */
+
+       leaq (32*4)(%r8), %rax;  /* %rax = end of CK array (loop bound) */
+       leaq (32*4)(%rdx), %rdx; /* %rdx = end of rkey_dec (filled backwards) */
+.align 16
+.Lroundloop_expand_key:
+       leaq (-4*4)(%rdx), %rdx;
+       ROUND(0, RA0x, RA1x, RA2x, RA3x);
+       ROUND(1, RA1x, RA2x, RA3x, RA0x);
+       ROUND(2, RA2x, RA3x, RA0x, RA1x);
+       ROUND(3, RA3x, RA0x, RA1x, RA2x);
+       leaq (4*4)(%r8), %r8;
+       vmovd RA0x, (0*4)(%rsi);
+       vmovd RA1x, (1*4)(%rsi);
+       vmovd RA2x, (2*4)(%rsi);
+       vmovd RA3x, (3*4)(%rsi);
+       /* Same four keys in reverse order for decryption. */
+       vmovd RA0x, (3*4)(%rdx);
+       vmovd RA1x, (2*4)(%rdx);
+       vmovd RA2x, (1*4)(%rdx);
+       vmovd RA3x, (0*4)(%rdx);
+       leaq (4*4)(%rsi), %rsi;
+       cmpq %rax, %r8;
+       jne .Lroundloop_expand_key;
+
+#undef ROUND
+
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_expand_key,.-_gcry_sm4_gfni_avx512_expand_key;)
+
+.align 16
+ELF(.type   sm4_gfni_avx512_crypt_blk1_4,@function;)
+sm4_gfni_avx512_crypt_blk1_4:
+       /* Crypt 1..4 blocks with 128-bit vectors (one block per xmm).
+        *
+        * input:
+        *      %rdi: round key array, CTX
+        *      %rsi: dst (1..4 blocks)
+        *      %rdx: src (1..4 blocks)
+        *      %rcx: num blocks (1..4)
+        */
+       CFI_STARTPROC();
+
+       /* Load up to 4 blocks; unused registers are filled with copies of
+        * block 0 so all four lanes hold valid data for the transpose. */
+       vmovdqu 0*16(%rdx), RA0x;
+       vmovdqa RA0x, RA1x;
+       vmovdqa RA0x, RA2x;
+       vmovdqa RA0x, RA3x;
+       cmpq $2, %rcx;
+       jb .Lblk4_load_input_done;
+       vmovdqu 1*16(%rdx), RA1x;
+       je .Lblk4_load_input_done;
+       vmovdqu 2*16(%rdx), RA2x;
+       cmpq $3, %rcx;
+       je .Lblk4_load_input_done;
+       vmovdqu 3*16(%rdx), RA3x;
+
+.Lblk4_load_input_done:
+
+       vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+       vpshufb RTMP2x, RA0x, RA0x;
+       vpshufb RTMP2x, RA1x, RA1x;
+       vpshufb RTMP2x, RA2x, RA2x;
+       vpshufb RTMP2x, RA3x, RA3x;
+
+       /* Word-sliced layout: after transpose, RAnx holds word n of all
+        * four blocks. */
+       transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+
+/* One cipher round: s0 ^= T(s1 ^ s2 ^ s3 ^ rk); S-box via GFNI affine
+ * transforms, linear transform L is x ^ rol(x,2) ^ rol(x,10) ^
+ * rol(x,18) ^ rol(x,24). */
+#define ROUND(round, s0, s1, s2, s3) \
+       vpxord (4*(round))(%rdi) {1to4}, s1, RX0x; \
+       vpternlogd $0x96, s2, s3, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+       \
+       /* sbox, non-linear part */ \
+       vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \
+       vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \
+       \
+       /* linear part */ \
+       vprold $2, RX0x, RTMP0x; \
+       vprold $10, RX0x, RTMP1x; \
+       vprold $18, RX0x, RTMP2x; \
+       vpternlogd $0x96, RTMP0x, RX0x, s0; /* s0 ^ x ^ rol(x,2) */ \
+       vprold $24, RX0x, RX0x; \
+       vpternlogd $0x96, RTMP1x, RTMP2x, RX0x; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+       vpxor RX0x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+       leaq (32*4)(%rdi), %rax; /* end of the 32 round keys (loop bound) */
+.align 16
+.Lroundloop_blk4:
+       ROUND(0, RA0x, RA1x, RA2x, RA3x);
+       ROUND(1, RA1x, RA2x, RA3x, RA0x);
+       ROUND(2, RA2x, RA3x, RA0x, RA1x);
+       ROUND(3, RA3x, RA0x, RA1x, RA2x);
+       leaq (4*4)(%rdi), %rdi;
+       cmpq %rax, %rdi;
+       jne .Lroundloop_blk4;
+
+#undef ROUND
+
+       vmovdqa .Lbswap128_mask rRIP, RTMP2x;
+
+       /* Back to block-per-register layout, with the final reversal. */
+       transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+       vpshufb RTMP2x, RA0x, RA0x;
+       vpshufb RTMP2x, RA1x, RA1x;
+       vpshufb RTMP2x, RA2x, RA2x;
+       vpshufb RTMP2x, RA3x, RA3x;
+
+       /* Store only the requested number of blocks. */
+       vmovdqu RA0x, 0*16(%rsi);
+       cmpq $2, %rcx;
+       jb .Lblk4_store_output_done;
+       vmovdqu RA1x, 1*16(%rsi);
+       je .Lblk4_store_output_done;
+       vmovdqu RA2x, 2*16(%rsi);
+       cmpq $3, %rcx;
+       je .Lblk4_store_output_done;
+       vmovdqu RA3x, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+       vzeroall; /* wipe vector registers holding key-dependent data */
+       xorl %eax, %eax; /* return 0 (presumably stack burn depth; confirm against caller) */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size sm4_gfni_avx512_crypt_blk1_4,.-sm4_gfni_avx512_crypt_blk1_4;)
+
+.align 16
+ELF(.type __sm4_gfni_crypt_blk8,@function;)
+__sm4_gfni_crypt_blk8:
+       /* Crypt 8 blocks as two interleaved 4-block groups (128-bit
+        * vectors).  NOTE: the routine itself is direction-agnostic; the
+        * round-key array order in %rdi (rkey_enc vs. rkey_dec) selects
+        * encryption or decryption.
+        *
+        * input:
+        *      %rdi: round key array, CTX
+        *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+        *                                              ciphertext blocks
+        * output:
+        *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+        *                                              blocks
+        */
+       CFI_STARTPROC();
+
+       vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+       vpshufb RTMP2x, RA0x, RA0x;
+       vpshufb RTMP2x, RA1x, RA1x;
+       vpshufb RTMP2x, RA2x, RA2x;
+       vpshufb RTMP2x, RA3x, RA3x;
+       vpshufb RTMP2x, RB0x, RB0x;
+       vpshufb RTMP2x, RB1x, RB1x;
+       vpshufb RTMP2x, RB2x, RB2x;
+       vpshufb RTMP2x, RB3x, RB3x;
+
+       /* Word-slice both 4-block groups. */
+       transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+       transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x);
+
+/* One round over both groups (s* = group A, r* = group B); the B-group
+ * instructions are indented one extra level to show the interleave. */
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+       vpbroadcastd (4*(round))(%rdi), RX1x; \
+       vmovdqa .Lpre_affine_s rRIP, RTMP2x; \
+       vmovdqa .Lpost_affine_s rRIP, RTMP3x; \
+       vpxor s1, RX1x, RX0x; \
+       vpternlogd $0x96, s2, s3, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+           vpxor r1, RX1x, RX1x; \
+           vpternlogd $0x96, r2, r3, RX1x; /* r1 ^ r2 ^ r3 ^ rk */ \
+       \
+       /* sbox, non-linear part */ \
+       vgf2p8affineqb $0x65, RTMP2x, RX0x, RX0x; \
+       vgf2p8affineinvqb $0xd3, RTMP3x, RX0x, RX0x; \
+           vgf2p8affineqb $0x65, RTMP2x, RX1x, RX1x; \
+           vgf2p8affineinvqb $0xd3, RTMP3x, RX1x, RX1x; \
+       \
+       /* linear part */ \
+       vprold $2, RX0x, RTMP0x; \
+       vprold $10, RX0x, RTMP1x; \
+       vprold $18, RX0x, RTMP2x; \
+       vpternlogd $0x96, RTMP0x, RX0x, s0; /* s0 ^ x ^ rol(x,2) */ \
+       vprold $24, RX0x, RX0x; \
+           vprold $2, RX1x, RTMP3x; \
+           vprold $10, RX1x, RTMP4x; \
+           vprold $18, RX1x, RTMP0x; \
+           vpternlogd $0x96, RTMP3x, RX1x, r0; /* r0 ^ x ^ rol(x,2) */ \
+           vprold $24, RX1x, RX1x; \
+       vpternlogd $0x96, RTMP1x, RTMP2x, RX0x; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+           vpternlogd $0x96, RTMP4x, RTMP0x, RX1x; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+       vpxor RX0x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+           vpxor RX1x, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+       leaq (32*4)(%rdi), %rax; /* end of the 32 round keys (loop bound) */
+.align 16
+.Lroundloop_blk8:
+       ROUND(0, RA0x, RA1x, RA2x, RA3x, RB0x, RB1x, RB2x, RB3x);
+       ROUND(1, RA1x, RA2x, RA3x, RA0x, RB1x, RB2x, RB3x, RB0x);
+       ROUND(2, RA2x, RA3x, RA0x, RA1x, RB2x, RB3x, RB0x, RB1x);
+       ROUND(3, RA3x, RA0x, RA1x, RA2x, RB3x, RB0x, RB1x, RB2x);
+       leaq (4*4)(%rdi), %rdi;
+       cmpq %rax, %rdi;
+       jne .Lroundloop_blk8;
+
+#undef ROUND
+
+       vmovdqa .Lbswap128_mask rRIP, RTMP2x;
+
+       /* Back to block-per-register layout, with the final reversal. */
+       transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+       transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x);
+       vpshufb RTMP2x, RA0x, RA0x;
+       vpshufb RTMP2x, RA1x, RA1x;
+       vpshufb RTMP2x, RA2x, RA2x;
+       vpshufb RTMP2x, RA3x, RA3x;
+       vpshufb RTMP2x, RB0x, RB0x;
+       vpshufb RTMP2x, RB1x, RB1x;
+       vpshufb RTMP2x, RB2x, RB2x;
+       vpshufb RTMP2x, RB3x, RB3x;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk8,.-__sm4_gfni_crypt_blk8;)
+
+.align 16
+ELF(.type   _gcry_sm4_gfni_avx512_crypt_blk1_8,@function;)
+_gcry_sm4_gfni_avx512_crypt_blk1_8:
+       /* Crypt 1..8 blocks; dispatches to the 1..4-block path when
+        * possible, otherwise runs the 8-way kernel with unused registers
+        * filled by copies of block 4.
+        *
+        * input:
+        *      %rdi: round key array, CTX
+        *      %rsi: dst (1..8 blocks)
+        *      %rdx: src (1..8 blocks)
+        *      %rcx: num blocks (1..8)
+        */
+       CFI_STARTPROC();
+
+       /* Tail-dispatch: <= 4 blocks handled by the blk1_4 routine. */
+       cmpq $5, %rcx;
+       jb sm4_gfni_avx512_crypt_blk1_4;
+       vmovdqu (0 * 16)(%rdx), RA0x;
+       vmovdqu (1 * 16)(%rdx), RA1x;
+       vmovdqu (2 * 16)(%rdx), RA2x;
+       vmovdqu (3 * 16)(%rdx), RA3x;
+       vmovdqu (4 * 16)(%rdx), RB0x;
+       vmovdqa RB0x, RB1x;
+       vmovdqa RB0x, RB2x;
+       vmovdqa RB0x, RB3x;
+       je .Lblk8_load_input_done;
+       vmovdqu (5 * 16)(%rdx), RB1x;
+       cmpq $7, %rcx;
+       jb .Lblk8_load_input_done;
+       vmovdqu (6 * 16)(%rdx), RB2x;
+       je .Lblk8_load_input_done;
+       vmovdqu (7 * 16)(%rdx), RB3x;
+
+.Lblk8_load_input_done:
+       call __sm4_gfni_crypt_blk8;
+
+       /* Store only the requested number of blocks (flags from the cmp
+        * survive the stores). */
+       cmpq $6, %rcx;
+       vmovdqu RA0x, (0 * 16)(%rsi);
+       vmovdqu RA1x, (1 * 16)(%rsi);
+       vmovdqu RA2x, (2 * 16)(%rsi);
+       vmovdqu RA3x, (3 * 16)(%rsi);
+       vmovdqu RB0x, (4 * 16)(%rsi);
+       jb .Lblk8_store_output_done;
+       vmovdqu RB1x, (5 * 16)(%rsi);
+       je .Lblk8_store_output_done;
+       vmovdqu RB2x, (6 * 16)(%rsi);
+       cmpq $7, %rcx;
+       je .Lblk8_store_output_done;
+       vmovdqu RB3x, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+       vzeroall; /* wipe vector registers holding key-dependent data */
+       xorl %eax, %eax; /* return 0 (presumably stack burn depth; confirm against caller) */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_crypt_blk1_8,.-_gcry_sm4_gfni_avx512_crypt_blk1_8;)
+
+/**********************************************************************
+  16-way SM4 with GFNI and AVX512 (256-bit vectors)
+ **********************************************************************/
+
+.align 16
+ELF(.type   __sm4_gfni_crypt_blk16,@function;)
+__sm4_gfni_crypt_blk16:
+       /* Crypt 16 blocks as two interleaved 8-block groups, two blocks
+        * per ymm register (one per 128-bit lane).
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+        *                                              plaintext blocks
+        * output:
+        *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+        *                                              ciphertext blocks
+        */
+       CFI_STARTPROC();
+
+       vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+       vpshufb RTMP2, RA0, RA0;
+       vpshufb RTMP2, RA1, RA1;
+       vpshufb RTMP2, RA2, RA2;
+       vpshufb RTMP2, RA3, RA3;
+       vpshufb RTMP2, RB0, RB0;
+       vpshufb RTMP2, RB1, RB1;
+       vpshufb RTMP2, RB2, RB2;
+       vpshufb RTMP2, RB3, RB3;
+
+       /* Word-slice both 8-block groups (per 128-bit lane). */
+       transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+       transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+/* One round over both groups (s* = group A, r* = group B); the B-group
+ * instructions are indented one extra level to show the interleave. */
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+       vpbroadcastd (4*(round))(%rdi), RX1; \
+       vbroadcasti128 .Lpre_affine_s rRIP, RTMP2; \
+       vbroadcasti128 .Lpost_affine_s rRIP, RTMP3; \
+       vpxor s1, RX1, RX0; \
+       vpternlogd $0x96, s2, s3, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+           vpxor r1, RX1, RX1; \
+           vpternlogd $0x96, r2, r3, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+       \
+       /* sbox, non-linear part */ \
+       vgf2p8affineqb $0x65, RTMP2, RX0, RX0; \
+       vgf2p8affineinvqb $0xd3, RTMP3, RX0, RX0; \
+           vgf2p8affineqb $0x65, RTMP2, RX1, RX1; \
+           vgf2p8affineinvqb $0xd3, RTMP3, RX1, RX1; \
+       \
+       /* linear part */ \
+       vprold $2, RX0, RTMP0; \
+       vprold $10, RX0, RTMP1; \
+       vprold $18, RX0, RTMP2; \
+       vpternlogd $0x96, RTMP0, RX0, s0; /* s0 ^ x ^ rol(x,2) */ \
+       vprold $24, RX0, RX0; \
+           vprold $2, RX1, RTMP3; \
+           vprold $10, RX1, RTMP4; \
+           vprold $18, RX1, RTMP0; \
+           vpternlogd $0x96, RTMP3, RX1, r0; /* r0 ^ x ^ rol(x,2) */ \
+           vprold $24, RX1, RX1; \
+       vpternlogd $0x96, RTMP1, RTMP2, RX0; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+           vpternlogd $0x96, RTMP4, RTMP0, RX1; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+       vpxor RX0, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+           vpxor RX1, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+       leaq (32*4)(%rdi), %rax; /* end of the 32 round keys (loop bound) */
+.align 16
+.Lroundloop_blk16:
+       ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+       ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+       ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+       ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+       leaq (4*4)(%rdi), %rdi;
+       cmpq %rax, %rdi;
+       jne .Lroundloop_blk16;
+
+#undef ROUND
+
+       vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+       /* Back to block-per-lane layout, with the final reversal. */
+       transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+       transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+       vpshufb RTMP2, RA0, RA0;
+       vpshufb RTMP2, RA1, RA1;
+       vpshufb RTMP2, RA2, RA2;
+       vpshufb RTMP2, RA3, RA3;
+       vpshufb RTMP2, RB0, RB0;
+       vpshufb RTMP2, RB1, RB1;
+       vpshufb RTMP2, RB2, RB2;
+       vpshufb RTMP2, RB3, RB3;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk16,.-__sm4_gfni_crypt_blk16;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx512_crypt_blk1_16
+ELF(.type   _gcry_sm4_gfni_avx512_crypt_blk1_16,@function;)
+_gcry_sm4_gfni_avx512_crypt_blk1_16:
+       /* Crypt 1..16 blocks; <= 8 blocks are tail-dispatched to blk1_8,
+        * otherwise the 16-way ymm kernel is used with partial loads/stores
+        * for odd block counts.
+        *
+        * input:
+        *      %rdi: round key array, CTX
+        *      %rsi: dst (1..16 blocks)
+        *      %rdx: src (1..16 blocks)
+        *      %rcx: num blocks (1..16)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+/* Load the ymm at `offset` (blocks 2*offset and 2*offset+1) with a
+ * count-aware width: count < 2*offset+1 -> nothing left to load;
+ * count == 2*offset+1 -> only the low 16 bytes (one block);
+ * count > 2*offset+1 -> the full 32 bytes (two blocks). */
+#define LOAD_INPUT(offset, yreg) \
+       cmpq $(1 + 2 * (offset)), %rcx; \
+       jb .Lblk16_load_input_done; \
+       ja 1f; \
+         vmovdqu (offset) * 32(%rdx), yreg##x; \
+         jmp .Lblk16_load_input_done; \
+       1: \
+         vmovdqu (offset) * 32(%rdx), yreg;
+
+       cmpq $8, %rcx;
+       jbe _gcry_sm4_gfni_avx512_crypt_blk1_8;
+       /* count > 8: the first eight blocks are always present. */
+       vmovdqu (0 * 32)(%rdx), RA0;
+       vmovdqu (1 * 32)(%rdx), RA1;
+       vmovdqu (2 * 32)(%rdx), RA2;
+       vmovdqu (3 * 32)(%rdx), RA3;
+       LOAD_INPUT(4, RB0);
+       LOAD_INPUT(5, RB1);
+       LOAD_INPUT(6, RB2);
+       LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+       call __sm4_gfni_crypt_blk16;
+
+/* Mirror of LOAD_INPUT for the output side: store one block, two
+ * blocks, or stop, depending on the remaining count. */
+#define STORE_OUTPUT(yreg, offset) \
+       cmpq $(1 + 2 * (offset)), %rcx; \
+       jb .Lblk16_store_output_done; \
+       ja 1f; \
+         vmovdqu yreg##x, (offset) * 32(%rsi); \
+         jmp .Lblk16_store_output_done; \
+       1: \
+         vmovdqu yreg, (offset) * 32(%rsi);
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       STORE_OUTPUT(RB0, 4);
+       STORE_OUTPUT(RB1, 5);
+       STORE_OUTPUT(RB2, 6);
+       STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+       vzeroall; /* wipe vector registers holding key-dependent data */
+       xorl %eax, %eax; /* return 0 (presumably stack burn depth; confirm against caller) */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_crypt_blk1_16,.-_gcry_sm4_gfni_avx512_crypt_blk1_16;)
+
+/* 128-bit little-endian add with carry across qword lanes:
+ *   out = in + lo_counter (per-qword add), then propagate the carry
+ *   from each low qword into the matching high qword.
+ * vpcmpuq $1 (unsigned less-than) sets a mask bit where the low qword
+ * overflowed (out < addend); kaddb doubling the mask shifts each carry
+ * bit from the low-qword lane to the adjacent high-qword lane, where
+ * the masked vpaddq adds hi_counter1 (= {0, 1} per 128 bits).
+ * Clobbers %k1. */
+#define add_le128(out, in, lo_counter, hi_counter1) \
+       vpaddq lo_counter, in, out; \
+       vpcmpuq $1, lo_counter, out, %k1; \
+       kaddb %k1, %k1, %k1; \
+       vpaddq hi_counter1, out, out{%k1};
+
+/* SM4 CTR-mode encryption of 16 blocks (GFNI + AVX512, 256-bit vectors). */
+.align 16
+.globl _gcry_sm4_gfni_avx512_ctr_enc
+ELF(.type   _gcry_sm4_gfni_avx512_ctr_enc,@function;)
+_gcry_sm4_gfni_avx512_ctr_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: iv (big endian, 128bit)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Fast path: if adding 16 to the lowest IV byte cannot carry out of
+        * that byte (byte <= 0xf0), counters can be built with plain byte
+        * additions and no byte-swapping. */
+       cmpb $(0x100 - 16), 15(%rcx);
+       jbe .Lctr_byteadd16;
+
+       vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
+       vmovdqa .Lcounter0123_lo rRIP, RTMP1;
+       vbroadcasti128 .Lcounter2222_lo rRIP, RTMP2;
+       vbroadcasti128 .Lcounter4444_lo rRIP, RTMP3;
+       vbroadcasti128 .Lcounter8888_lo rRIP, RTMP4;
+
+       /* load IV and byteswap */
+       movq 8(%rcx), %r11;
+       bswapq %r11;
+       vbroadcasti128 (%rcx), RB3;
+       vpshufb RTMP0, RB3, RB3;
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 16), %r11;
+       ja .Lhandle_ctr_carry_blk16;
+
+       /* construct IVs */
+       vpaddq RTMP1, RB3, RA0; /* +0:+1 */
+       vpaddq RTMP2, RA0, RA1; /* +2:+3 */
+       vpaddq RTMP3, RA0, RA2; /* +4:+5 */
+       vpaddq RTMP3, RA1, RA3; /* +6:+7 */
+       vpaddq RTMP4, RA0, RB0; /* +8... */
+       vpaddq RTMP4, RA1, RB1; /* +10... */
+       vpaddq RTMP4, RA2, RB2; /* +12... */
+       vpaddq RTMP4, RA3, RB3; /* +14... */
+
+       /* Update counter */
+       leaq 16(%r11), %r11;
+       bswapq %r11;
+       movq %r11, 8(%rcx);
+
+       jmp .Lctr_carry_done_blk16;
+
+.Lhandle_ctr_carry_blk16:
+       /* Slow path: the low 64-bit counter half overflows within these 16
+        * blocks, so each lane addition must propagate carry (add_le128). */
+       vbroadcasti128 .Lcounter1111_hi rRIP, RNOT;
+
+       /* construct IVs */
+       add_le128(RA0, RB3, RTMP1, RNOT); /* +0:+1 */
+       add_le128(RA1, RA0, RTMP2, RNOT); /* +2:+3 */
+       add_le128(RA2, RA0, RTMP3, RNOT); /* +4:+5 */
+       add_le128(RA3, RA1, RTMP3, RNOT); /* +6:+7 */
+       add_le128(RB0, RA0, RTMP4, RNOT); /* +8... */
+       add_le128(RB1, RA1, RTMP4, RNOT); /* +10... */
+       add_le128(RB2, RA2, RTMP4, RNOT); /* +12... */
+       add_le128(RB3, RA3, RTMP4, RNOT); /* +14... */
+
+       /* Update counter.  movq/bswapq do not modify CF, so the adcq picks
+        * up the carry produced by the addq into the high counter half. */
+       addq $16, %r11;
+       movq (%rcx), %r10;
+       bswapq %r10;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+
+.align 16
+.Lctr_carry_done_blk16:
+       /* Byte-swap IVs. */
+       vpshufb RTMP0, RA0, RA0;
+       vpshufb RTMP0, RA1, RA1;
+       vpshufb RTMP0, RA2, RA2;
+       vpshufb RTMP0, RA3, RA3;
+       vpshufb RTMP0, RB0, RB0;
+       vpshufb RTMP0, RB1, RB1;
+       vpshufb RTMP0, RB2, RB2;
+       vpshufb RTMP0, RB3, RB3;
+
+.align 16
+.Lload_ctr_done16:
+       /* Encrypt the counter blocks, then XOR with plaintext (CTR mode). */
+       call __sm4_gfni_crypt_blk16;
+
+       vpxor (0 * 32)(%rdx), RA0, RA0;
+       vpxor (1 * 32)(%rdx), RA1, RA1;
+       vpxor (2 * 32)(%rdx), RA2, RA2;
+       vpxor (3 * 32)(%rdx), RA3, RA3;
+       vpxor (4 * 32)(%rdx), RB0, RB0;
+       vpxor (5 * 32)(%rdx), RB1, RB1;
+       vpxor (6 * 32)(%rdx), RB2, RB2;
+       vpxor (7 * 32)(%rdx), RB3, RB3;
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vmovdqu RB3, (7 * 32)(%rsi);
+
+       /* Wipe vector registers and the mask register used by add_le128 so
+        * no counter/keystream-derived state is left behind. */
+       vzeroall;
+       kxorq %k1, %k1, %k1;
+
+       ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry16:
+       /* Low IV byte is exactly 0xf0: +16 carries beyond that byte, so do a
+        * full big-endian 128-bit increment of the stored IV. */
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $16, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_ymm16;
+.align 16
+.Lctr_byteadd16:
+       /* NOTE: ZF here is still from the cmpb at function entry (vbroadcasti128
+        * does not affect flags); equal means the +16 would carry. */
+       vbroadcasti128 (%rcx), RB3;
+       je .Lctr_byteadd_full_ctr_carry16;
+       addb $16, 15(%rcx);
+.Lctr_byteadd_ymm16:
+       /* Build the 16 counter blocks by byte-adding 0..15 to the base IV. */
+       vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+       vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+       vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+       vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+       vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+       vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+       vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+       vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+       jmp .Lload_ctr_done16;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ctr_enc,.-_gcry_sm4_gfni_avx512_ctr_enc;)
+
+/* SM4 CBC-mode decryption of 16 blocks (GFNI + AVX512, 256-bit vectors). */
+.align 16
+.globl _gcry_sm4_gfni_avx512_cbc_dec
+ELF(.type   _gcry_sm4_gfni_avx512_cbc_dec,@function;)
+_gcry_sm4_gfni_avx512_cbc_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       vmovdqu (0 * 32)(%rdx), RA0;
+       vmovdqu (1 * 32)(%rdx), RA1;
+       vmovdqu (2 * 32)(%rdx), RA2;
+       vmovdqu (3 * 32)(%rdx), RA3;
+       vmovdqu (4 * 32)(%rdx), RB0;
+       vmovdqu (5 * 32)(%rdx), RB1;
+       vmovdqu (6 * 32)(%rdx), RB2;
+       vmovdqu (7 * 32)(%rdx), RB3;
+
+       call __sm4_gfni_crypt_blk16;
+
+       /* CBC chaining: XOR each decrypted block with the previous ciphertext
+        * block.  RNOT is built as IV || C0; the remaining loads read the
+        * ciphertext stream shifted back by one 16-byte block (+16 offsets). */
+       vmovdqu (%rcx), RNOTx;
+       vinserti128 $1, (%rdx), RNOT, RNOT;
+       vpxor RNOT, RA0, RA0;
+       vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+       vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+       vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+       vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+       vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+       vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+       vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+       /* Last ciphertext block becomes the IV for the next call. */
+       vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+       vmovdqu RNOTx, (%rcx); /* store new IV */
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vmovdqu RB3, (7 * 32)(%rsi);
+
+       /* Wipe registers holding plaintext/ciphertext state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_cbc_dec,.-_gcry_sm4_gfni_avx512_cbc_dec;)
+
+/* SM4 CFB-mode decryption of 16 blocks (GFNI + AVX512, 256-bit vectors).
+ * CFB decrypt runs the cipher forward on IV || C0..C14 and XORs the result
+ * with the ciphertext, so only the encryption primitive is needed. */
+.align 16
+.globl _gcry_sm4_gfni_avx512_cfb_dec
+ELF(.type   _gcry_sm4_gfni_avx512_cfb_dec,@function;)
+_gcry_sm4_gfni_avx512_cfb_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Load input: the cipher input stream is the ciphertext shifted back
+        * by one block, with the IV in front (RA0 = IV || C0). */
+       vmovdqu (%rcx), RNOTx;
+       vinserti128 $1, (%rdx), RNOT, RA0;
+       vmovdqu (0 * 32 + 16)(%rdx), RA1;
+       vmovdqu (1 * 32 + 16)(%rdx), RA2;
+       vmovdqu (2 * 32 + 16)(%rdx), RA3;
+       vmovdqu (3 * 32 + 16)(%rdx), RB0;
+       vmovdqu (4 * 32 + 16)(%rdx), RB1;
+       vmovdqu (5 * 32 + 16)(%rdx), RB2;
+       vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+       /* Update IV: last ciphertext block chains into the next call. */
+       vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+       vmovdqu RNOTx, (%rcx);
+
+       call __sm4_gfni_crypt_blk16;
+
+       vpxor (0 * 32)(%rdx), RA0, RA0;
+       vpxor (1 * 32)(%rdx), RA1, RA1;
+       vpxor (2 * 32)(%rdx), RA2, RA2;
+       vpxor (3 * 32)(%rdx), RA3, RA3;
+       vpxor (4 * 32)(%rdx), RB0, RB0;
+       vpxor (5 * 32)(%rdx), RB1, RB1;
+       vpxor (6 * 32)(%rdx), RB2, RB2;
+       vpxor (7 * 32)(%rdx), RB3, RB3;
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vmovdqu RB3, (7 * 32)(%rsi);
+
+       /* Wipe registers holding plaintext/keystream state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_cfb_dec,.-_gcry_sm4_gfni_avx512_cfb_dec;)
+
+/* SM4 OCB-mode encryption of 16 blocks (GFNI + AVX512, 256-bit vectors). */
+.align 16
+.globl _gcry_sm4_gfni_avx512_ocb_enc
+ELF(.type _gcry_sm4_gfni_avx512_ocb_enc,@function;)
+
+_gcry_sm4_gfni_avx512_ocb_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Save the GPRs used below as scratch for the L-table pointers
+        * (%r12/%r13 are callee-saved in the SysV ABI). */
+       subq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+       movq %r10, (0 * 8)(%rsp);
+       movq %r11, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       CFI_REL_OFFSET(%r10, 0 * 8);
+       CFI_REL_OFFSET(%r11, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+
+       vmovdqu (%rcx), RTMP0x;
+       vmovdqu (%r8), RTMP1x;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+       /* OCB_INPUT: extend the running offset by two blocks, XOR the offsets
+        * into the plaintext pair (yreg), keep raw plaintext in inreg for the
+        * checksum, and stash the offsets in dst (%rsi) for the final XOR. */
+#define OCB_INPUT(n, l0reg, l1reg, yreg, inreg) \
+         vmovdqu (n * 32)(%rdx), inreg; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti128 $1, RTMP0x, RNOT, RNOT; \
+         vpxor inreg, RNOT, yreg; \
+         vmovdqu RNOT, (n * 32)(%rsi);
+
+       /* vpternlogd $0x96 = three-way XOR; used to fold plaintext blocks
+        * into the checksum accumulators as they are loaded. */
+       movq (0 * 8)(%r9), %r10;
+       movq (1 * 8)(%r9), %r11;
+       movq (2 * 8)(%r9), %r12;
+       movq (3 * 8)(%r9), %r13;
+       OCB_INPUT(0, %r10, %r11, RA0, RTMP2);
+       OCB_INPUT(1, %r12, %r13, RA1, RTMP3);
+       movq (4 * 8)(%r9), %r10;
+       movq (5 * 8)(%r9), %r11;
+       movq (6 * 8)(%r9), %r12;
+       movq (7 * 8)(%r9), %r13;
+       OCB_INPUT(2, %r10, %r11, RA2, RTMP4);
+       vpternlogd $0x96, RTMP2, RTMP3, RTMP4;
+       OCB_INPUT(3, %r12, %r13, RA3, RX0);
+       movq (8 * 8)(%r9), %r10;
+       movq (9 * 8)(%r9), %r11;
+       movq (10 * 8)(%r9), %r12;
+       movq (11 * 8)(%r9), %r13;
+       OCB_INPUT(4, %r10, %r11, RB0, RX1);
+       OCB_INPUT(5, %r12, %r13, RB1, RTMP2);
+       vpternlogd $0x96, RX0, RX1, RTMP2;
+       movq (12 * 8)(%r9), %r10;
+       movq (13 * 8)(%r9), %r11;
+       movq (14 * 8)(%r9), %r12;
+       movq (15 * 8)(%r9), %r13;
+       OCB_INPUT(6, %r10, %r11, RB2, RTMP3);
+       OCB_INPUT(7, %r12, %r13, RB3, RX0);
+       vpternlogd $0x96, RTMP3, RX0, RTMP1;
+#undef OCB_INPUT
+
+       /* Fold the partial checksums, reduce 256 -> 128 bits, and store the
+        * updated offset and checksum back to the caller's buffers. */
+       vpternlogd $0x96, RTMP4, RTMP2, RTMP1;
+       vextracti128 $1, RTMP1, RNOTx;
+       vmovdqu RTMP0x, (%rcx);
+       vpxor RNOTx, RTMP1x, RTMP1x;
+       vmovdqu RTMP1x, (%r8);
+
+       movq (0 * 8)(%rsp), %r10;
+       movq (1 * 8)(%rsp), %r11;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       call __sm4_gfni_crypt_blk16;
+
+       addq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+       /* XOR with the offsets previously parked in dst: C_i = Offset_i ^
+        * ENCIPHER(K, P_i ^ Offset_i). */
+       vpxor (0 * 32)(%rsi), RA0, RA0;
+       vpxor (1 * 32)(%rsi), RA1, RA1;
+       vpxor (2 * 32)(%rsi), RA2, RA2;
+       vpxor (3 * 32)(%rsi), RA3, RA3;
+       vpxor (4 * 32)(%rsi), RB0, RB0;
+       vpxor (5 * 32)(%rsi), RB1, RB1;
+       vpxor (6 * 32)(%rsi), RB2, RB2;
+       vpxor (7 * 32)(%rsi), RB3, RB3;
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vmovdqu RB3, (7 * 32)(%rsi);
+
+       /* Wipe registers holding plaintext/offset/checksum state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ocb_enc,.-_gcry_sm4_gfni_avx512_ocb_enc;)
+
+/* SM4 OCB-mode decryption of 16 blocks (GFNI + AVX512, 256-bit vectors).
+ * NOTE(review): unlike OCB encrypt, the checksum here is computed over the
+ * recovered plaintext after decryption, per the OCB definition. */
+.align 16
+.globl _gcry_sm4_gfni_avx512_ocb_dec
+ELF(.type _gcry_sm4_gfni_avx512_ocb_dec,@function;)
+
+_gcry_sm4_gfni_avx512_ocb_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Save GPRs used as scratch for the L-table pointers. */
+       subq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+       movq %r10, (0 * 8)(%rsp);
+       movq %r11, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       CFI_REL_OFFSET(%r10, 0 * 8);
+       CFI_REL_OFFSET(%r11, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+
+       vmovdqu (%rcx), RTMP0x;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+       /* OCB_INPUT: extend the running offset by two blocks, XOR them into
+        * the ciphertext pair, and park the offsets in dst for the post-
+        * decryption XOR. */
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+         vmovdqu (n * 32)(%rdx), yreg; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti128 $1, RTMP0x, RNOT, RNOT; \
+         vpxor yreg, RNOT, yreg; \
+         vmovdqu RNOT, (n * 32)(%rsi);
+
+       movq (0 * 8)(%r9), %r10;
+       movq (1 * 8)(%r9), %r11;
+       movq (2 * 8)(%r9), %r12;
+       movq (3 * 8)(%r9), %r13;
+       OCB_INPUT(0, %r10, %r11, RA0);
+       OCB_INPUT(1, %r12, %r13, RA1);
+       movq (4 * 8)(%r9), %r10;
+       movq (5 * 8)(%r9), %r11;
+       movq (6 * 8)(%r9), %r12;
+       movq (7 * 8)(%r9), %r13;
+       OCB_INPUT(2, %r10, %r11, RA2);
+       OCB_INPUT(3, %r12, %r13, RA3);
+       movq (8 * 8)(%r9), %r10;
+       movq (9 * 8)(%r9), %r11;
+       movq (10 * 8)(%r9), %r12;
+       movq (11 * 8)(%r9), %r13;
+       OCB_INPUT(4, %r10, %r11, RB0);
+       OCB_INPUT(5, %r12, %r13, RB1);
+       movq (12 * 8)(%r9), %r10;
+       movq (13 * 8)(%r9), %r11;
+       movq (14 * 8)(%r9), %r12;
+       movq (15 * 8)(%r9), %r13;
+       OCB_INPUT(6, %r10, %r11, RB2);
+       OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+       /* Store the updated offset. */
+       vmovdqu RTMP0x, (%rcx);
+
+       movq (0 * 8)(%rsp), %r10;
+       movq (1 * 8)(%rsp), %r11;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       call __sm4_gfni_crypt_blk16;
+
+       addq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+       /* XOR with the parked offsets to recover plaintext. */
+       vpxor (0 * 32)(%rsi), RA0, RA0;
+       vpxor (1 * 32)(%rsi), RA1, RA1;
+       vpxor (2 * 32)(%rsi), RA2, RA2;
+       vpxor (3 * 32)(%rsi), RA3, RA3;
+       vpxor (4 * 32)(%rsi), RB0, RB0;
+       vpxor (5 * 32)(%rsi), RB1, RB1;
+       vpxor (6 * 32)(%rsi), RB2, RB2;
+       vpxor (7 * 32)(%rsi), RB3, RB3;
+
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RA1, (1 * 32)(%rsi);
+       vmovdqu RA2, (2 * 32)(%rsi);
+       vmovdqu RA3, (3 * 32)(%rsi);
+       vmovdqu RB0, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RB2, (6 * 32)(%rsi);
+       vmovdqu RB3, (7 * 32)(%rsi);
+
+       /* Fold all plaintext blocks via three-way XOR (vpternlogd $0x96),
+        * reduce 256 -> 128 bits, and merge with the caller's checksum. */
+       vpternlogd $0x96, RA0, RA1, RA2;
+       vpternlogd $0x96, RA3, RB0, RB1;
+       vpternlogd $0x96, RB2, RB3, RA2;
+       vpxord RA2, RB1, RTMP1;
+
+       vextracti128 $1, RTMP1, RNOTx;
+       vpternlogd $0x96, (%r8), RNOTx, RTMP1x;
+       vmovdqu RTMP1x, (%r8);
+
+       /* Wipe registers holding plaintext/offset/checksum state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ocb_dec,.-_gcry_sm4_gfni_avx512_ocb_dec;)
+
+/* SM4 OCB authentication (AAD hashing) over 16 blocks (GFNI + AVX512).
+ * Encrypts A_i xor Offset_i and XOR-folds the results into the checksum;
+ * nothing is written to a data output buffer. */
+.align 16
+.globl _gcry_sm4_gfni_avx512_ocb_auth
+ELF(.type _gcry_sm4_gfni_avx512_ocb_auth,@function;)
+
+_gcry_sm4_gfni_avx512_ocb_auth:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: abuf (16 blocks)
+        *      %rdx: offset
+        *      %rcx: checksum
+        *      %r8 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Save GPRs used as scratch for the L-table pointers. */
+       subq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+       movq %r10, (0 * 8)(%rsp);
+       movq %r11, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       CFI_REL_OFFSET(%r10, 0 * 8);
+       CFI_REL_OFFSET(%r11, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+
+       vmovdqu (%rdx), RTMP0x;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+       /* OCB_INPUT: extend the running offset by two blocks and XOR them
+        * into the AAD pair; no store to an output buffer is needed here. */
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+         vmovdqu (n * 32)(%rsi), yreg; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti128 $1, RTMP0x, RNOT, RNOT; \
+         vpxor yreg, RNOT, yreg;
+
+       movq (0 * 8)(%r8), %r10;
+       movq (1 * 8)(%r8), %r11;
+       movq (2 * 8)(%r8), %r12;
+       movq (3 * 8)(%r8), %r13;
+       OCB_INPUT(0, %r10, %r11, RA0);
+       OCB_INPUT(1, %r12, %r13, RA1);
+       movq (4 * 8)(%r8), %r10;
+       movq (5 * 8)(%r8), %r11;
+       movq (6 * 8)(%r8), %r12;
+       movq (7 * 8)(%r8), %r13;
+       OCB_INPUT(2, %r10, %r11, RA2);
+       OCB_INPUT(3, %r12, %r13, RA3);
+       movq (8 * 8)(%r8), %r10;
+       movq (9 * 8)(%r8), %r11;
+       movq (10 * 8)(%r8), %r12;
+       movq (11 * 8)(%r8), %r13;
+       OCB_INPUT(4, %r10, %r11, RB0);
+       OCB_INPUT(5, %r12, %r13, RB1);
+       movq (12 * 8)(%r8), %r10;
+       movq (13 * 8)(%r8), %r11;
+       movq (14 * 8)(%r8), %r12;
+       movq (15 * 8)(%r8), %r13;
+       OCB_INPUT(6, %r10, %r11, RB2);
+       OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+       /* Store the updated offset. */
+       vmovdqu RTMP0x, (%rdx);
+
+       movq (0 * 8)(%rsp), %r10;
+       movq (1 * 8)(%rsp), %r11;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       call __sm4_gfni_crypt_blk16;
+
+       addq $(4 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+       /* Fold the 16 cipher outputs via three-way XOR (vpternlogd $0x96),
+        * reduce 256 -> 128 bits, and merge with the caller's checksum. */
+       vpternlogd $0x96, RA0, RA1, RA2;
+       vpternlogd $0x96, RA3, RB0, RB1;
+       vpternlogd $0x96, RB2, RB3, RA2;
+       vpxor RA2, RB1, RTMP1;
+
+       vextracti128 $1, RTMP1, RNOTx;
+       vpternlogd $0x96, (%rcx), RNOTx, RTMP1x;
+       vmovdqu RTMP1x, (%rcx);
+
+       /* Wipe registers holding AAD/offset/checksum state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ocb_auth,.-_gcry_sm4_gfni_avx512_ocb_auth;)
+
+/**********************************************************************
+  32-way SM4 with GFNI and AVX512 (512-bit vectors)
+ **********************************************************************/
+
+/* Core SM4 primitive: encrypt/decrypt 32 blocks in parallel.  Direction is
+ * determined by the round-key order in the context at %rdi (32 x 32-bit
+ * round keys, consumed sequentially). */
+.align 16
+ELF(.type   __sm4_gfni_crypt_blk32,@function;)
+__sm4_gfni_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      RA0z, RA1z, RA2z, RA3z, RB0z, RB1z, RB2z, RB3z: 32 parallel plaintext blocks
+        * output:
+        *      RA0z, RA1z, RA2z, RA3z, RB0z, RB1z, RB2z, RB3z: 32 parallel ciphertext blocks
+        */
+       CFI_STARTPROC();
+
+       /* Convert the big-endian input words to the cipher's working order. */
+       vbroadcasti32x4 .Lbswap32_mask rRIP, RTMP2z;
+       vpshufb RTMP2z, RA0z, RA0z;
+       vpshufb RTMP2z, RA1z, RA1z;
+       vpshufb RTMP2z, RA2z, RA2z;
+       vpshufb RTMP2z, RA3z, RA3z;
+       vpshufb RTMP2z, RB0z, RB0z;
+       vpshufb RTMP2z, RB1z, RB1z;
+       vpshufb RTMP2z, RB2z, RB2z;
+       vpshufb RTMP2z, RB3z, RB3z;
+
+       /* GF(2^8) affine transform constants for computing the SM4 sbox via
+        * vgf2p8affineqb/vgf2p8affineinvqb; kept in zmm16/17 across rounds. */
+       vbroadcasti32x4 .Lpre_affine_s rRIP, %zmm16;
+       vbroadcasti32x4 .Lpost_affine_s rRIP, %zmm17;
+
+       transpose_4x4(RA0z, RA1z, RA2z, RA3z, RTMP0z, RTMP1z);
+       transpose_4x4(RB0z, RB1z, RB2z, RB3z, RTMP0z, RTMP1z);
+
+       /* One SM4 round over both 16-block halves.  The s* chain (A blocks)
+        * and the indented r* chain (B blocks) are independent and interleaved
+        * to hide instruction latency. */
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+       vpbroadcastd (4*(round))(%rdi), RX1z; \
+       vpxord s1, RX1z, RX0z; \
+       vpternlogd $0x96, s2, s3, RX0z; /* s1 ^ s2 ^ s3 ^ rk */ \
+           vpxord r1, RX1z, RX1z; \
+           vpternlogd $0x96, r2, r3, RX1z; /* r1 ^ r2 ^ r3 ^ rk */ \
+       \
+       /* sbox, non-linear part */ \
+       vgf2p8affineqb $0x65, %zmm16, RX0z, RX0z; \
+       vgf2p8affineinvqb $0xd3, %zmm17, RX0z, RX0z; \
+           vgf2p8affineqb $0x65, %zmm16, RX1z, RX1z; \
+           vgf2p8affineinvqb $0xd3, %zmm17, RX1z, RX1z; \
+       \
+       /* linear part */ \
+       vprold $2, RX0z, RTMP0z; \
+       vprold $10, RX0z, RTMP1z; \
+       vprold $18, RX0z, RTMP2z; \
+       vpternlogd $0x96, RTMP0z, RX0z, s0; /* s0 ^ x ^ rol(x,2) */ \
+       vprold $24, RX0z, RX0z; \
+           vprold $2, RX1z, RTMP3z; \
+           vprold $10, RX1z, RTMP4z; \
+           vprold $18, RX1z, RTMP0z; \
+           vpternlogd $0x96, RTMP3z, RX1z, r0; /* r0 ^ x ^ rol(x,2) */ \
+           vprold $24, RX1z, RX1z; \
+       vpternlogd $0x96, RTMP1z, RTMP2z, RX0z; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+           vpternlogd $0x96, RTMP4z, RTMP0z, RX1z; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+       vpxord RX0z, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+           vpxord RX1z, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+       /* 32 rounds total: 8 loop iterations x 4 unrolled rounds, advancing
+        * %rdi through the round-key array until it reaches %rax (end). */
+       leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk32:
+       ROUND(0, RA0z, RA1z, RA2z, RA3z, RB0z, RB1z, RB2z, RB3z);
+       ROUND(1, RA1z, RA2z, RA3z, RA0z, RB1z, RB2z, RB3z, RB0z);
+       ROUND(2, RA2z, RA3z, RA0z, RA1z, RB2z, RB3z, RB0z, RB1z);
+       ROUND(3, RA3z, RA0z, RA1z, RA2z, RB3z, RB0z, RB1z, RB2z);
+       leaq (4*4)(%rdi), %rdi;
+       cmpq %rax, %rdi;
+       jne .Lroundloop_blk32;
+
+#undef ROUND
+
+       vbroadcasti32x4 .Lbswap128_mask rRIP, RTMP2z;
+
+       transpose_4x4(RA0z, RA1z, RA2z, RA3z, RTMP0z, RTMP1z);
+       transpose_4x4(RB0z, RB1z, RB2z, RB3z, RTMP0z, RTMP1z);
+       vpshufb RTMP2z, RA0z, RA0z;
+       vpshufb RTMP2z, RA1z, RA1z;
+       vpshufb RTMP2z, RA2z, RA2z;
+       vpshufb RTMP2z, RA3z, RA3z;
+       vpshufb RTMP2z, RB0z, RB0z;
+       vpshufb RTMP2z, RB1z, RB1z;
+       vpshufb RTMP2z, RB2z, RB2z;
+       vpshufb RTMP2z, RB3z, RB3z;
+
+       /* Clear zmm16/17 (not covered by callers' vzeroall, which only
+        * touches zmm0-15). */
+       vpxord %zmm16, %zmm16, %zmm16;
+       vpxord %zmm17, %zmm17, %zmm17;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk32,.-__sm4_gfni_crypt_blk32;)
+
+/* ECB-style crypt of exactly 32 blocks; thin load/call/store wrapper around
+ * __sm4_gfni_crypt_blk32.  Returns 0 in %eax (no stack burn needed). */
+.align 16
+.globl _gcry_sm4_gfni_avx512_crypt_blk32
+ELF(.type   _gcry_sm4_gfni_avx512_crypt_blk32,@function;)
+_gcry_sm4_gfni_avx512_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Load input */
+       vmovdqu32 (0 * 64)(%rdx), RA0z;
+       vmovdqu32 (1 * 64)(%rdx), RA1z;
+       vmovdqu32 (2 * 64)(%rdx), RA2z;
+       vmovdqu32 (3 * 64)(%rdx), RA3z;
+       vmovdqu32 (4 * 64)(%rdx), RB0z;
+       vmovdqu32 (5 * 64)(%rdx), RB1z;
+       vmovdqu32 (6 * 64)(%rdx), RB2z;
+       vmovdqu32 (7 * 64)(%rdx), RB3z;
+
+       call __sm4_gfni_crypt_blk32;
+
+       vmovdqu32 RA0z, (0 * 64)(%rsi);
+       vmovdqu32 RA1z, (1 * 64)(%rsi);
+       vmovdqu32 RA2z, (2 * 64)(%rsi);
+       vmovdqu32 RA3z, (3 * 64)(%rsi);
+       vmovdqu32 RB0z, (4 * 64)(%rsi);
+       vmovdqu32 RB1z, (5 * 64)(%rsi);
+       vmovdqu32 RB2z, (6 * 64)(%rsi);
+       vmovdqu32 RB3z, (7 * 64)(%rsi);
+
+       /* Return 0 and wipe data-bearing vector registers. */
+       xorl %eax, %eax;
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_crypt_blk32,.-_gcry_sm4_gfni_avx512_crypt_blk32;)
+
+/* SM4 CTR-mode encryption of 32 blocks (GFNI + AVX512, 512-bit vectors).
+ * Same structure as the 16-block version, with 4 counters per zmm lane. */
+.align 16
+.globl _gcry_sm4_gfni_avx512_ctr_enc_blk32
+ELF(.type   _gcry_sm4_gfni_avx512_ctr_enc_blk32,@function;)
+_gcry_sm4_gfni_avx512_ctr_enc_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %rcx: iv (big endian, 128bit)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Fast path: if adding 32 to the lowest IV byte cannot carry out of
+        * that byte (byte <= 0xe0), build counters with plain byte adds. */
+       cmpb $(0x100 - 32), 15(%rcx);
+       jbe .Lctr_byteadd32;
+
+       vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z;
+       vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z;
+       vbroadcasti64x2 .Lcounter4444_lo rRIP, RTMP2z;
+       vbroadcasti64x2 .Lcounter8888_lo rRIP, RTMP3z;
+       vbroadcasti64x2 .Lcounter16161616_lo rRIP, RTMP4z;
+
+       /* load IV and byteswap */
+       movq 8(%rcx), %r11;
+       bswapq %r11;
+       vbroadcasti64x2 (%rcx), RB3z;
+       vpshufb RTMP0z, RB3z, RB3z;
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 32), %r11;
+       ja .Lhandle_ctr_carry_blk32;
+
+       /* construct IVs */
+       vpaddq RTMP1z, RB3z, RA0z; /* +0:+1:+2:+3 */
+       vpaddq RTMP2z, RA0z, RA1z; /* +4:+5:+6:+7 */
+       vpaddq RTMP3z, RA0z, RA2z; /* +8:+9:+10:+11 */
+       vpaddq RTMP3z, RA1z, RA3z; /* +12:+13:+14:+15 */
+       vpaddq RTMP4z, RA0z, RB0z; /* +16... */
+       vpaddq RTMP4z, RA1z, RB1z; /* +20... */
+       vpaddq RTMP4z, RA2z, RB2z; /* +24... */
+       vpaddq RTMP4z, RA3z, RB3z; /* +28... */
+
+       /* Update counter */
+       leaq 32(%r11), %r11;
+       bswapq %r11;
+       movq %r11, 8(%rcx);
+
+       jmp .Lctr_carry_done_blk32;
+
+.Lhandle_ctr_carry_blk32:
+       /* Slow path: the low 64-bit counter half overflows within these 32
+        * blocks; use add_le128 to propagate per-lane carries. */
+       vbroadcasti64x2 .Lcounter1111_hi rRIP, RNOTz;
+
+       /* construct IVs */
+       add_le128(RA0z, RB3z, RTMP1z, RNOTz); /* +0:+1:+2:+3 */
+       add_le128(RA1z, RA0z, RTMP2z, RNOTz); /* +4:+5:+6:+7 */
+       add_le128(RA2z, RA0z, RTMP3z, RNOTz); /* +8:+9:+10:+11 */
+       add_le128(RA3z, RA1z, RTMP3z, RNOTz); /* +12:+13:+14:+15 */
+       add_le128(RB0z, RA0z, RTMP4z, RNOTz); /* +16... */
+       add_le128(RB1z, RA1z, RTMP4z, RNOTz); /* +20... */
+       add_le128(RB2z, RA2z, RTMP4z, RNOTz); /* +24... */
+       add_le128(RB3z, RA3z, RTMP4z, RNOTz); /* +28... */
+
+       /* Update counter.  movq/bswapq do not modify CF, so the adcq picks
+        * up the carry from the addq into the high counter half. */
+       addq $32, %r11;
+       movq (%rcx), %r10;
+       bswapq %r10;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+
+.align 16
+.Lctr_carry_done_blk32:
+       /* Byte-swap IVs. */
+       vpshufb RTMP0z, RA0z, RA0z;
+       vpshufb RTMP0z, RA1z, RA1z;
+       vpshufb RTMP0z, RA2z, RA2z;
+       vpshufb RTMP0z, RA3z, RA3z;
+       vpshufb RTMP0z, RB0z, RB0z;
+       vpshufb RTMP0z, RB1z, RB1z;
+       vpshufb RTMP0z, RB2z, RB2z;
+       vpshufb RTMP0z, RB3z, RB3z;
+
+.align 16
+.Lload_ctr_done32:
+       /* Encrypt the counter blocks, then XOR with plaintext (CTR mode). */
+       call __sm4_gfni_crypt_blk32;
+
+       vpxord (0 * 64)(%rdx), RA0z, RA0z;
+       vpxord (1 * 64)(%rdx), RA1z, RA1z;
+       vpxord (2 * 64)(%rdx), RA2z, RA2z;
+       vpxord (3 * 64)(%rdx), RA3z, RA3z;
+       vpxord (4 * 64)(%rdx), RB0z, RB0z;
+       vpxord (5 * 64)(%rdx), RB1z, RB1z;
+       vpxord (6 * 64)(%rdx), RB2z, RB2z;
+       vpxord (7 * 64)(%rdx), RB3z, RB3z;
+
+       vmovdqu32 RA0z, (0 * 64)(%rsi);
+       vmovdqu32 RA1z, (1 * 64)(%rsi);
+       vmovdqu32 RA2z, (2 * 64)(%rsi);
+       vmovdqu32 RA3z, (3 * 64)(%rsi);
+       vmovdqu32 RB0z, (4 * 64)(%rsi);
+       vmovdqu32 RB1z, (5 * 64)(%rsi);
+       vmovdqu32 RB2z, (6 * 64)(%rsi);
+       vmovdqu32 RB3z, (7 * 64)(%rsi);
+
+       /* Wipe vector registers and the mask register used by add_le128. */
+       vzeroall;
+       kxorq %k1, %k1, %k1;
+
+       ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry32:
+       /* Low IV byte is exactly 0xe0: +32 carries beyond that byte, so do a
+        * full big-endian 128-bit increment of the stored IV. */
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $32, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_zmm32;
+.align 16
+.Lctr_byteadd32:
+       /* NOTE: ZF here is still from the cmpb at function entry
+        * (vbroadcasti64x2 does not affect flags). */
+       vbroadcasti64x2 (%rcx), RA3z;
+       je .Lctr_byteadd_full_ctr_carry32;
+       addb $32, 15(%rcx);
+.Lctr_byteadd_zmm32:
+       /* Build counters +0..+31 by byte addition: RB3z = base + 16, then
+        * each half gets per-lane offsets 0..15. */
+       vbroadcasti64x2 .Lbige_addb_16 rRIP, RB3z;
+       vpaddb RB3z, RA3z, RB3z;
+       vpaddb .Lbige_addb_0_1 rRIP, RA3z, RA0z;
+       vpaddb .Lbige_addb_4_5 rRIP, RA3z, RA1z;
+       vpaddb .Lbige_addb_8_9 rRIP, RA3z, RA2z;
+       vpaddb .Lbige_addb_12_13 rRIP, RA3z, RA3z;
+       vpaddb .Lbige_addb_0_1 rRIP, RB3z, RB0z;
+       vpaddb .Lbige_addb_4_5 rRIP, RB3z, RB1z;
+       vpaddb .Lbige_addb_8_9 rRIP, RB3z, RB2z;
+       vpaddb .Lbige_addb_12_13 rRIP, RB3z, RB3z;
+
+       jmp .Lload_ctr_done32;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ctr_enc_blk32,.-_gcry_sm4_gfni_avx512_ctr_enc_blk32;)
+
+/* SM4 CBC-mode decryption of 32 blocks (GFNI + AVX512, 512-bit vectors). */
+.align 16
+.globl _gcry_sm4_gfni_avx512_cbc_dec_blk32
+ELF(.type   _gcry_sm4_gfni_avx512_cbc_dec_blk32,@function;)
+_gcry_sm4_gfni_avx512_cbc_dec_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       vmovdqu32 (0 * 64)(%rdx), RA0z;
+       vmovdqu32 (1 * 64)(%rdx), RA1z;
+       vmovdqu32 (2 * 64)(%rdx), RA2z;
+       vmovdqu32 (3 * 64)(%rdx), RA3z;
+       vmovdqu32 (4 * 64)(%rdx), RB0z;
+       vmovdqu32 (5 * 64)(%rdx), RB1z;
+       vmovdqu32 (6 * 64)(%rdx), RB2z;
+       vmovdqu32 (7 * 64)(%rdx), RB3z;
+
+       call __sm4_gfni_crypt_blk32;
+
+       /* CBC chaining: build RNOTz = IV || C0 || C1 || C2 via two inserts,
+        * then XOR each decrypted block with the preceding ciphertext block
+        * (the remaining loads are the ciphertext stream shifted by -16). */
+       vmovdqu (%rcx), RNOTx;
+       vinserti64x2 $1, (0 * 16)(%rdx), RNOT, RNOT;
+       vinserti64x4 $1, (1 * 16)(%rdx), RNOTz, RNOTz;
+       vpxord RNOTz, RA0z, RA0z;
+       vpxord (0 * 64 + 48)(%rdx), RA1z, RA1z;
+       vpxord (1 * 64 + 48)(%rdx), RA2z, RA2z;
+       vpxord (2 * 64 + 48)(%rdx), RA3z, RA3z;
+       vpxord (3 * 64 + 48)(%rdx), RB0z, RB0z;
+       vpxord (4 * 64 + 48)(%rdx), RB1z, RB1z;
+       vpxord (5 * 64 + 48)(%rdx), RB2z, RB2z;
+       vpxord (6 * 64 + 48)(%rdx), RB3z, RB3z;
+       /* Last ciphertext block becomes the IV for the next call. */
+       vmovdqu (7 * 64 + 48)(%rdx), RNOTx;
+       vmovdqu RNOTx, (%rcx); /* store new IV */
+
+       vmovdqu32 RA0z, (0 * 64)(%rsi);
+       vmovdqu32 RA1z, (1 * 64)(%rsi);
+       vmovdqu32 RA2z, (2 * 64)(%rsi);
+       vmovdqu32 RA3z, (3 * 64)(%rsi);
+       vmovdqu32 RB0z, (4 * 64)(%rsi);
+       vmovdqu32 RB1z, (5 * 64)(%rsi);
+       vmovdqu32 RB2z, (6 * 64)(%rsi);
+       vmovdqu32 RB3z, (7 * 64)(%rsi);
+
+       /* Wipe registers holding plaintext/ciphertext state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_cbc_dec_blk32,.-_gcry_sm4_gfni_avx512_cbc_dec_blk32;)
+
+/* SM4 CFB-mode decryption of 32 blocks (GFNI + AVX512, 512-bit vectors).
+ * CFB decrypt runs the cipher forward on IV || C0..C30 and XORs the result
+ * with the ciphertext, so only the encryption primitive is needed. */
+.align 16
+.globl _gcry_sm4_gfni_avx512_cfb_dec_blk32
+ELF(.type   _gcry_sm4_gfni_avx512_cfb_dec_blk32,@function;)
+_gcry_sm4_gfni_avx512_cfb_dec_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       /* Load input: RA0z = IV || C0 || C1 || C2 via two inserts; the rest
+        * is the ciphertext stream shifted back by one 16-byte block. */
+       vmovdqu (%rcx), RA0x;
+       vinserti64x2 $1, (%rdx), RA0, RA0;
+       vinserti64x4 $1, 16(%rdx), RA0z, RA0z;
+       vmovdqu32 (0 * 64 + 48)(%rdx), RA1z;
+       vmovdqu32 (1 * 64 + 48)(%rdx), RA2z;
+       vmovdqu32 (2 * 64 + 48)(%rdx), RA3z;
+       vmovdqu32 (3 * 64 + 48)(%rdx), RB0z;
+       vmovdqu32 (4 * 64 + 48)(%rdx), RB1z;
+       vmovdqu32 (5 * 64 + 48)(%rdx), RB2z;
+       vmovdqu32 (6 * 64 + 48)(%rdx), RB3z;
+
+       /* Update IV: last ciphertext block chains into the next call. */
+       vmovdqu (7 * 64 + 48)(%rdx), RNOTx;
+       vmovdqu RNOTx, (%rcx);
+
+       call __sm4_gfni_crypt_blk32;
+
+       vpxord (0 * 64)(%rdx), RA0z, RA0z;
+       vpxord (1 * 64)(%rdx), RA1z, RA1z;
+       vpxord (2 * 64)(%rdx), RA2z, RA2z;
+       vpxord (3 * 64)(%rdx), RA3z, RA3z;
+       vpxord (4 * 64)(%rdx), RB0z, RB0z;
+       vpxord (5 * 64)(%rdx), RB1z, RB1z;
+       vpxord (6 * 64)(%rdx), RB2z, RB2z;
+       vpxord (7 * 64)(%rdx), RB3z, RB3z;
+
+       vmovdqu32 RA0z, (0 * 64)(%rsi);
+       vmovdqu32 RA1z, (1 * 64)(%rsi);
+       vmovdqu32 RA2z, (2 * 64)(%rsi);
+       vmovdqu32 RA3z, (3 * 64)(%rsi);
+       vmovdqu32 RB0z, (4 * 64)(%rsi);
+       vmovdqu32 RB1z, (5 * 64)(%rsi);
+       vmovdqu32 RB2z, (6 * 64)(%rsi);
+       vmovdqu32 RB3z, (7 * 64)(%rsi);
+
+       /* Wipe registers holding plaintext/keystream state. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_cfb_dec_blk32,.-_gcry_sm4_gfni_avx512_cfb_dec_blk32;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx512_ocb_enc_blk32
+ELF(.type _gcry_sm4_gfni_avx512_ocb_enc_blk32,@function;)
+_gcry_sm4_gfni_avx512_ocb_enc_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[32])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       subq $(5 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(5 * 8);
+
+       movq %r12, (0 * 8)(%rsp);
+       movq %r13, (1 * 8)(%rsp);
+       movq %r14, (2 * 8)(%rsp);
+       movq %r15, (3 * 8)(%rsp);
+       movq %rbx, (4 * 8)(%rsp);
+       CFI_REL_OFFSET(%r12, 0 * 8);
+       CFI_REL_OFFSET(%r13, 1 * 8);
+       CFI_REL_OFFSET(%r14, 2 * 8);
+       CFI_REL_OFFSET(%r15, 3 * 8);
+       CFI_REL_OFFSET(%rbx, 4 * 8);
+
+       vmovdqu (%rcx), RTMP0x;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg, zplain) \
+         vmovdqu32 (n * 64)(%rdx), zplain; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti64x2 $1, RTMP0x, RNOT, RNOT; \
+         vpxor (l2reg), RTMP0x, RTMP0x; \
+         vinserti64x2 $2, RTMP0x, RNOTz, RNOTz; \
+         vpxor (l3reg), RTMP0x, RTMP0x; \
+         vinserti64x2 $3, RTMP0x, RNOTz, RNOTz; \
+         vpxord zplain, RNOTz, zreg; \
+         vmovdqu32 RNOTz, (n * 64)(%rsi);
+
+#define OCB_LOAD_PTRS(n) \
+         movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \
+         movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \
+         movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \
+         movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \
+         movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \
+         movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \
+         movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \
+         movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx;
+
+       OCB_LOAD_PTRS(0);
+       OCB_INPUT(0, %r10, %r11, %r12, %r13, RA0z, RTMP1z);
+       OCB_INPUT(1, %r14, %r15, %rax, %rbx, RA1z, RTMP2z);
+       OCB_LOAD_PTRS(2);
+       OCB_INPUT(2, %r10, %r11, %r12, %r13, RA2z, RTMP3z);
+       vpternlogd $0x96, RTMP1z, RTMP2z, RTMP3z;
+       OCB_INPUT(3, %r14, %r15, %rax, %rbx, RA3z, RTMP4z);
+       OCB_LOAD_PTRS(4);
+       OCB_INPUT(4, %r10, %r11, %r12, %r13, RB0z, RX0z);
+       OCB_INPUT(5, %r14, %r15, %rax, %rbx, RB1z, RX1z);
+       vpternlogd $0x96, RTMP4z, RX0z, RX1z;
+       OCB_LOAD_PTRS(6);
+       OCB_INPUT(6, %r10, %r11, %r12, %r13, RB2z, RTMP4z);
+       OCB_INPUT(7, %r14, %r15, %rax, %rbx, RB3z, RX0z);
+#undef OCB_LOAD_PTRS
+#undef OCB_INPUT
+
+       vpternlogd $0x96, RTMP3z, RTMP4z, RX0z;
+       vpxord RX1z, RX0z, RNOTz;
+       vextracti64x4 $1, RNOTz, RTMP1;
+       vpxor RTMP1, RNOT, RNOT;
+       vextracti128 $1, RNOT, RTMP1x;
+       vpternlogd $0x96, (%r8), RTMP1x, RNOTx;
+
+       movq (0 * 8)(%rsp), %r12;
+       movq (1 * 8)(%rsp), %r13;
+       movq (2 * 8)(%rsp), %r14;
+       movq (3 * 8)(%rsp), %r15;
+       movq (4 * 8)(%rsp), %rbx;
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+       CFI_RESTORE(%r14);
+       CFI_RESTORE(%r15);
+       CFI_RESTORE(%rbx);
+
+       vmovdqu RTMP0x, (%rcx);
+       vmovdqu RNOTx, (%r8);
+
+       call __sm4_gfni_crypt_blk32;
+
+       addq $(5 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-5 * 8);
+
+       vpxord (0 * 64)(%rsi), RA0z, RA0z;
+       vpxord (1 * 64)(%rsi), RA1z, RA1z;
+       vpxord (2 * 64)(%rsi), RA2z, RA2z;
+       vpxord (3 * 64)(%rsi), RA3z, RA3z;
+       vpxord (4 * 64)(%rsi), RB0z, RB0z;
+       vpxord (5 * 64)(%rsi), RB1z, RB1z;
+       vpxord (6 * 64)(%rsi), RB2z, RB2z;
+       vpxord (7 * 64)(%rsi), RB3z, RB3z;
+
+       vmovdqu32 RA0z, (0 * 64)(%rsi);
+       vmovdqu32 RA1z, (1 * 64)(%rsi);
+       vmovdqu32 RA2z, (2 * 64)(%rsi);
+       vmovdqu32 RA3z, (3 * 64)(%rsi);
+       vmovdqu32 RB0z, (4 * 64)(%rsi);
+       vmovdqu32 RB1z, (5 * 64)(%rsi);
+       vmovdqu32 RB2z, (6 * 64)(%rsi);
+       vmovdqu32 RB3z, (7 * 64)(%rsi);
+
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ocb_enc_blk32,.-_gcry_sm4_gfni_avx512_ocb_enc_blk32;)
+
+.align 16
+.globl _gcry_sm4_gfni_avx512_ocb_dec_blk32
+ELF(.type _gcry_sm4_gfni_avx512_ocb_dec_blk32,@function;)
+_gcry_sm4_gfni_avx512_ocb_dec_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (32 blocks)
+        *      %rdx: src (32 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[32])
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       subq $(5 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(5 * 8);
+
+       movq %r12, (0 * 8)(%rsp);
+       movq %r13, (1 * 8)(%rsp);
+       movq %r14, (2 * 8)(%rsp);
+       movq %r15, (3 * 8)(%rsp);
+       movq %rbx, (4 * 8)(%rsp);
+       CFI_REL_OFFSET(%r12, 0 * 8);
+       CFI_REL_OFFSET(%r13, 1 * 8);
+       CFI_REL_OFFSET(%r14, 2 * 8);
+       CFI_REL_OFFSET(%r15, 3 * 8);
+       CFI_REL_OFFSET(%rbx, 4 * 8);
+
+       vmovdqu (%rcx), RTMP0x;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg) \
+         vmovdqu32 (n * 64)(%rdx), RTMP1z; \
+         vpxor (l0reg), RTMP0x, RNOTx; \
+         vpxor (l1reg), RNOTx, RTMP0x; \
+         vinserti64x2 $1, RTMP0x, RNOT, RNOT; \
+         vpxor (l2reg), RTMP0x, RTMP0x; \
+         vinserti64x2 $2, RTMP0x, RNOTz, RNOTz; \
+         vpxor (l3reg), RTMP0x, RTMP0x; \
+         vinserti64x2 $3, RTMP0x, RNOTz, RNOTz; \
+         vpxord RTMP1z, RNOTz, zreg; \
+         vmovdqu32 RNOTz, (n * 64)(%rsi);
+
+#define OCB_LOAD_PTRS(n) \
+         movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \
+         movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \
+         movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \
+         movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \
+         movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \
+         movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \
+         movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \
+         movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx;
+
+       OCB_LOAD_PTRS(0);
+       OCB_INPUT(0, %r10, %r11, %r12, %r13, RA0z);
+       OCB_INPUT(1, %r14, %r15, %rax, %rbx, RA1z);
+       OCB_LOAD_PTRS(2);
+       OCB_INPUT(2, %r10, %r11, %r12, %r13, RA2z);
+       OCB_INPUT(3, %r14, %r15, %rax, %rbx, RA3z);
+       OCB_LOAD_PTRS(4);
+       OCB_INPUT(4, %r10, %r11, %r12, %r13, RB0z);
+       OCB_INPUT(5, %r14, %r15, %rax, %rbx, RB1z);
+       OCB_LOAD_PTRS(6);
+       OCB_INPUT(6, %r10, %r11, %r12, %r13, RB2z);
+       OCB_INPUT(7, %r14, %r15, %rax, %rbx, RB3z);
+#undef OCB_LOAD_PTRS
+#undef OCB_INPUT
+
+       movq (0 * 8)(%rsp), %r12;
+       movq (1 * 8)(%rsp), %r13;
+       movq (2 * 8)(%rsp), %r14;
+       movq (3 * 8)(%rsp), %r15;
+       movq (4 * 8)(%rsp), %rbx;
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+       CFI_RESTORE(%r14);
+       CFI_RESTORE(%r15);
+       CFI_RESTORE(%rbx);
+
+       vmovdqu RTMP0x, (%rcx);
+
+       call __sm4_gfni_crypt_blk32;
+
+       addq $(5 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-5 * 8);
+
+       vpxord (0 * 64)(%rsi), RA0z, RA0z;
+       vpxord (1 * 64)(%rsi), RA1z, RA1z;
+       vpxord (2 * 64)(%rsi), RA2z, RA2z;
+       vpxord (3 * 64)(%rsi), RA3z, RA3z;
+       vpxord (4 * 64)(%rsi), RB0z, RB0z;
+       vpxord (5 * 64)(%rsi), RB1z, RB1z;
+       vpxord (6 * 64)(%rsi), RB2z, RB2z;
+       vpxord (7 * 64)(%rsi), RB3z, RB3z;
+
+       vmovdqu32 RA0z, (0 * 64)(%rsi);
+       vmovdqu32 RA1z, (1 * 64)(%rsi);
+       vmovdqu32 RA2z, (2 * 64)(%rsi);
+       vmovdqu32 RA3z, (3 * 64)(%rsi);
+       vmovdqu32 RB0z, (4 * 64)(%rsi);
+       vmovdqu32 RB1z, (5 * 64)(%rsi);
+       vmovdqu32 RB2z, (6 * 64)(%rsi);
+       vmovdqu32 RB3z, (7 * 64)(%rsi);
+
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+       vpternlogd $0x96, RA0z, RA1z, RA2z;
+       vpternlogd $0x96, RA3z, RB0z, RB1z;
+       vpternlogd $0x96, RB2z, RB3z, RA2z;
+       vpxord RA2z, RB1z, RTMP1z;
+
+       vextracti64x4 $1, RTMP1z, RNOT;
+       vpxor RNOT, RTMP1, RTMP1;
+       vextracti128 $1, RTMP1, RNOTx;
+       vpternlogd $0x96, (%r8), RNOTx, RTMP1x;
+       vmovdqu RTMP1x, (%r8);
+
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx512_ocb_dec_blk32,.-_gcry_sm4_gfni_avx512_ocb_dec_blk32;)
+
+#endif /*defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/cipher/sm4-ppc.c b/cipher/sm4-ppc.c
new file mode 100644 (file)
index 0000000..bb2c55e
--- /dev/null
@@ -0,0 +1,342 @@
+/* sm4-ppc.c  -  PowerPC implementation of SM4 cipher
+ *
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)
+
+#include <altivec.h>
+#include "bufhelp.h"
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+# undef HAVE_FUNC_ATTR_TARGET
+#endif
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+#ifdef __clang__
+/* clang has mismatching prototype for vec_sbox_be. */
+static ASM_FUNC_ATTR_INLINE vector16x_u8
+asm_sbox_be(vector16x_u8 b)
+{
+  vector16x_u8 o;
+  __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b));
+  return o;
+}
+#undef vec_sbox_be
+#define vec_sbox_be asm_sbox_be
+#endif /* __clang__ */
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+       t2 = (vector4x_u32)vec_mergel((vector4x_u32)x0, (vector4x_u32)x1); \
+       x0 = (vector4x_u32)vec_mergeh((vector4x_u32)x0, (vector4x_u32)x1); \
+       \
+       t1 = (vector4x_u32)vec_mergeh((vector4x_u32)x2, (vector4x_u32)x3); \
+       x2 = (vector4x_u32)vec_mergel((vector4x_u32)x2, (vector4x_u32)x3); \
+       \
+       x1 = (vector4x_u32)vec_mergel((vector2x_u64)x0, (vector2x_u64)t1); \
+       x0 = (vector4x_u32)vec_mergeh((vector2x_u64)x0, (vector2x_u64)t1); \
+       \
+       x3 = (vector4x_u32)vec_mergel((vector2x_u64)t2, (vector2x_u64)x2); \
+       x2 = (vector4x_u32)vec_mergeh((vector2x_u64)t2, (vector2x_u64)x2);
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) ({ \
+       tmp0 = x & mask4bit; \
+       x = (vector4x_u32)((vector16x_u8)x >> 4); \
+       \
+       tmp0 = (vector4x_u32)vec_perm((vector16x_u8)lo_t, (vector16x_u8)lo_t, \
+                                     (vector16x_u8)tmp0); \
+       x = (vector4x_u32)vec_perm((vector16x_u8)hi_t, (vector16x_u8)hi_t, \
+                                  (vector16x_u8)x); \
+       x = x ^ tmp0; \
+      })
+
+#define GET_RKEY(round) vec_splat(r4keys, round)
+
+#define ROUND4(round, s0, s1, s2, s3) ({ \
+       vector4x_u32 rkey = GET_RKEY(round); \
+       vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
+       filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+       rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
+       filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+       s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
+                   vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
+      })
+
+#define ROUND8(round, s0, s1, s2, s3, r0, r1, r2, r3) ({ \
+       vector4x_u32 rkey = GET_RKEY(round); \
+       vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
+       vector4x_u32 rx1 = rkey ^ r1 ^ r2 ^ r3; \
+       filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+       filter_8bit(rx1, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+       rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
+       rx1 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx1); \
+       filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+       filter_8bit(rx1, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+       s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
+                   vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
+       r0 ^= rx1 ^ vec_rl(rx1, rotate2) ^ vec_rl(rx1, rotate10) ^ \
+                   vec_rl(rx1, rotate18) ^ vec_rl(rx1, rotate24); \
+      })
+
+static const vector4x_u32 mask_0f =
+  { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f };
+static const vector2x_u64 pre_tf_lo_s =
+  { 0x9096E3E575730600ULL, 0xC6C0B5B323255056ULL };
+static const vector2x_u64 pre_tf_hi_s =
+  { 0xE341AA08EA48A301ULL, 0xF153B81AF85AB113ULL };
+static const vector2x_u64 post_tf_lo_s =
+  { 0x6F53C6FA95A93C00ULL, 0xD9E5704C231F8AB6ULL };
+static const vector2x_u64 post_tf_hi_s =
+  { 0x9A4635E9479BE834ULL, 0x25F98A56F824578BULL };
+static const vector4x_u32 rotate2 = { 2, 2, 2, 2 };
+static const vector4x_u32 rotate10 = { 10, 10, 10, 10 };
+static const vector4x_u32 rotate18 = { 18, 18, 18, 18 };
+static const vector4x_u32 rotate24 = { 24, 24, 24, 24 };
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk16(u32 *rk, byte *out, const byte *in)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 rb0, rb1, rb2, rb3;
+  vector4x_u32 rc0, rc1, rc2, rc3;
+  vector4x_u32 rd0, rd1, rd2, rd3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+  in += 8 * 16;
+  rc0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  rc1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  rc2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  rc3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rd0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rd1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rd2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rd3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
+  transpose_4x4(rc0, rc1, rc2, rc3, tmp0, tmp1);
+  transpose_4x4(rd0, rd1, rd2, rd3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
+      ROUND8(0, rc0, rc1, rc2, rc3, rd0, rd1, rd2, rd3);
+      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
+      ROUND8(1, rc1, rc2, rc3, rc0, rd1, rd2, rd3, rd0);
+      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
+      ROUND8(2, rc2, rc3, rc0, rc1, rd2, rd3, rd0, rd1);
+      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
+      ROUND8(3, rc3, rc0, rc1, rc2, rd3, rd0, rd1, rd2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
+  transpose_4x4(rc3, rc2, rc1, rc0, tmp0, tmp1);
+  transpose_4x4(rd3, rd2, rd1, rd0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
+  out += 8 * 16;
+  vec_xst((vector16x_u8)vec_revb(rc3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd0), 0, out + 7 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk8(u32 *rk, byte *out, const byte *in)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 rb0, rb1, rb2, rb3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
+      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
+      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
+      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk1_4(u32 *rk, byte *out, const byte *in, size_t nblks)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = ra0;
+  ra2 = ra0;
+  ra3 = ra0;
+  if (LIKELY(nblks > 1))
+    ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  if (LIKELY(nblks > 2))
+    ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  if (LIKELY(nblks > 3))
+    ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND4(0, ra0, ra1, ra2, ra3);
+      ROUND4(1, ra1, ra2, ra3, ra0);
+      ROUND4(2, ra2, ra3, ra0, ra1);
+      ROUND4(3, ra3, ra0, ra1, ra2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  if (LIKELY(nblks > 1))
+    vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  if (LIKELY(nblks > 2))
+    vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  if (LIKELY(nblks > 3))
+    vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks)
+{
+  if (nblks >= 16)
+    {
+      sm4_ppc_crypt_blk16(rk, out, in);
+      return;
+    }
+
+  while (nblks >= 8)
+    {
+      sm4_ppc_crypt_blk8(rk, out, in);
+      in += 8 * 16;
+      out += 8 * 16;
+      nblks -= 8;
+    }
+
+  while (nblks)
+    {
+      size_t currblks = nblks > 4 ? 4 : nblks;
+      sm4_ppc_crypt_blk1_4(rk, out, in, currblks);
+      in += currblks * 16;
+      out += currblks * 16;
+      nblks -= currblks;
+    }
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 void
+_gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                              size_t nblks)
+{
+  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 void
+_gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                              size_t nblks)
+{
+#ifdef HAVE_FUNC_ATTR_TARGET
+  /* Inline for POWER9 target optimization. */
+  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
+#else
+  /* Target selection not working, just call the other noinline function. */
+  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks);
+#endif
+}
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
index 81662988620ab48faceca5602c319030c647db09..06b843f8ebc29cf47f9482ffab04a5538bfe83e6 100644 (file)
@@ -1,7 +1,7 @@
 /* sm4.c  -  SM4 Cipher Algorithm
  * Copyright (C) 2020 Alibaba Group.
- * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020-2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ * Copyright (C) 2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -29,7 +29,7 @@
 #include "cipher.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
 
 /* Helper macro to force alignment to 64 bytes.  */
 #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
@@ -47,7 +47,7 @@
 # endif
 #endif
 
-/* USE_AESNI_AVX inidicates whether to compile with Intel AES-NI/AVX2 code. */
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
 #undef USE_AESNI_AVX2
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
 # if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
 # endif
 #endif
 
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_GFNI_AVX2 1
+# endif
+#endif
+
+/* USE_GFNI_AVX512 indicates whether to compile with Intel GFNI/AVX512 code. */
+#undef USE_GFNI_AVX512
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_GFNI_AVX512 1
+# endif
+#endif
+
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \
+    defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512)
 # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 #  define ASM_FUNC_ABI __attribute__((sysv_abi))
 # else
 # endif
 #endif
 
+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) && \
+     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+#   define USE_AARCH64_SIMD 1
+# endif
+#endif
+
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) && \
+     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+#   define USE_ARM_CE 1
+# endif
+#endif
+
+#undef USE_ARM_SVE_CE
+#ifdef ENABLE_SVE_SUPPORT
+# if defined(__AARCH64EL__) && \
+     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE2)
+#   define USE_ARM_SVE_CE 1
+# endif
+#endif
+
+#undef USE_PPC_CRYPTO
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)
+# define USE_PPC_CRYPTO 1
+#endif
+
 static const char *sm4_selftest (void);
 
 static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
@@ -78,22 +134,53 @@ static void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
 static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
                               void *outbuf_arg, const void *inbuf_arg,
                               size_t nblocks);
+static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak,
+                                 void *outbuf_arg, const void *inbuf_arg,
+                                 size_t nblocks, int encrypt);
+static void _gcry_sm4_ecb_crypt (void *context, void *outbuf_arg,
+                                const void *inbuf_arg, size_t nblocks,
+                                int encrypt);
+static void _gcry_sm4_ctr32le_enc(void *context, unsigned char *ctr,
+                                  void *outbuf_arg, const void *inbuf_arg,
+                                  size_t nblocks);
 static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                   const void *inbuf_arg, size_t nblocks,
                                   int encrypt);
 static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                                  size_t nblocks);
 
+typedef bulk_crypt_fn_t crypt_blk1_16_fn_t;
+
 typedef struct
 {
   u32 rkey_enc[32];
   u32 rkey_dec[32];
+  crypt_blk1_16_fn_t crypt_blk1_16;
 #ifdef USE_AESNI_AVX
   unsigned int use_aesni_avx:1;
 #endif
 #ifdef USE_AESNI_AVX2
   unsigned int use_aesni_avx2:1;
 #endif
+#ifdef USE_GFNI_AVX2
+  unsigned int use_gfni_avx2:1;
+#endif
+#ifdef USE_GFNI_AVX512
+  unsigned int use_gfni_avx512:1;
+#endif
+#ifdef USE_AARCH64_SIMD
+  unsigned int use_aarch64_simd:1;
+#endif
+#ifdef USE_ARM_CE
+  unsigned int use_arm_ce:1;
+#endif
+#ifdef USE_ARM_SVE_CE
+  unsigned int use_arm_sve_ce:1;
+#endif
+#ifdef USE_PPC_CRYPTO
+  unsigned int use_ppc8le:1;
+  unsigned int use_ppc9le:1;
+#endif
 } SM4_context;
 
 static const u32 fk[4] =
@@ -160,6 +247,8 @@ static const u32 ck[] =
   0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
 };
 
+static inline crypt_blk1_16_fn_t sm4_get_crypt_blk1_16_fn(SM4_context *ctx);
+
 #ifdef USE_AESNI_AVX
 extern void _gcry_sm4_aesni_avx_expand_key(const byte *key, u32 *rk_enc,
                                           u32 *rk_dec, const u32 *fk,
@@ -195,13 +284,21 @@ extern void _gcry_sm4_aesni_avx_ocb_auth(const u32 *rk_enc,
                                         const u64 Ls[8]) ASM_FUNC_ABI;
 
 extern unsigned int
-_gcry_sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
+_gcry_sm4_aesni_avx_crypt_blk1_8(u32 *rk, byte *out, const byte *in,
                                 unsigned int num_blks) ASM_FUNC_ABI;
 
 static inline unsigned int
-sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
-                          unsigned int num_blks)
+sm4_aesni_avx_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                            size_t num_blks)
 {
+  if (num_blks > 8)
+    {
+      _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, 8);
+      in += 8 * 16;
+      out += 8 * 16;
+      num_blks -= 8;
+    }
+
   return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
 }
 
@@ -239,8 +336,302 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
                                          unsigned char *offset,
                                          unsigned char *checksum,
                                          const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_aesni_avx2_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                  unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                            size_t num_blks)
+{
+#ifdef USE_AESNI_AVX
+  /* Use 128-bit register implementation for short input. */
+  if (num_blks <= 8)
+    return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+#endif
+
+  return _gcry_sm4_aesni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
 #endif /* USE_AESNI_AVX2 */
 
+#ifdef USE_GFNI_AVX2
+extern void _gcry_sm4_gfni_avx2_expand_key(const byte *key, u32 *rk_enc,
+                                           u32 *rk_dec, const u32 *fk,
+                                           const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ctr_enc(const u32 *rk_enc, byte *out,
+                                       const byte *in,
+                                       byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_cbc_dec(const u32 *rk_dec, byte *out,
+                                       const byte *in,
+                                       byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_cfb_dec(const u32 *rk_enc, byte *out,
+                                       const byte *in,
+                                       byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_enc(const u32 *rk_enc,
+                                       unsigned char *out,
+                                       const unsigned char *in,
+                                       unsigned char *offset,
+                                       unsigned char *checksum,
+                                       const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_dec(const u32 *rk_dec,
+                                       unsigned char *out,
+                                       const unsigned char *in,
+                                       unsigned char *offset,
+                                       unsigned char *checksum,
+                                       const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_auth(const u32 *rk_enc,
+                                        const unsigned char *abuf,
+                                        unsigned char *offset,
+                                        unsigned char *checksum,
+                                        const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx2_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                 unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_gfni_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                           size_t num_blks)
+{
+  return _gcry_sm4_gfni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
+#endif /* USE_GFNI_AVX2 */
+
+#ifdef USE_GFNI_AVX512
+extern void _gcry_sm4_gfni_avx512_expand_key(const byte *key, u32 *rk_enc,
+                                             u32 *rk_dec, const u32 *fk,
+                                             const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ctr_enc(const u32 *rk_enc, byte *out,
+                                          const byte *in,
+                                          byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cbc_dec(const u32 *rk_dec, byte *out,
+                                          const byte *in,
+                                          byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cfb_dec(const u32 *rk_enc, byte *out,
+                                          const byte *in,
+                                          byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_enc(const u32 *rk_enc,
+                                          unsigned char *out,
+                                          const unsigned char *in,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_dec(const u32 *rk_dec,
+                                          unsigned char *out,
+                                          const unsigned char *in,
+                                          unsigned char *offset,
+                                          unsigned char *checksum,
+                                          const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_auth(const u32 *rk_enc,
+                                           const unsigned char *abuf,
+                                           unsigned char *offset,
+                                           unsigned char *checksum,
+                                           const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ctr_enc_blk32(const u32 *rk_enc, byte *out,
+                                                const byte *in,
+                                                byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cbc_dec_blk32(const u32 *rk_enc, byte *out,
+                                                const byte *in,
+                                                byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cfb_dec_blk32(const u32 *rk_enc, byte *out,
+                                                const byte *in,
+                                                byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_enc_blk32(const u32 *rk_enc,
+                                                unsigned char *out,
+                                                const unsigned char *in,
+                                                unsigned char *offset,
+                                                unsigned char *checksum,
+                                                const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_dec_blk32(const u32 *rk_dec,
+                                                unsigned char *out,
+                                                const unsigned char *in,
+                                                unsigned char *offset,
+                                                unsigned char *checksum,
+                                                const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx512_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                    unsigned int num_blks) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx512_crypt_blk32(u32 *rk, byte *out,
+                                  const byte *in) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_gfni_avx512_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                             size_t num_blks)
+{
+  return _gcry_sm4_gfni_avx512_crypt_blk1_16(rk, out, in, num_blks);
+}
+
+#endif /* USE_GFNI_AVX2 */
+
+#ifdef USE_AARCH64_SIMD
+extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
+                                   const byte *in,
+                                   size_t num_blocks);
+
+extern void _gcry_sm4_aarch64_ctr_enc(const u32 *rk_enc, byte *out,
+                                     const byte *in,
+                                     byte *ctr,
+                                     size_t nblocks);
+
+extern void _gcry_sm4_aarch64_cbc_dec(const u32 *rk_dec, byte *out,
+                                     const byte *in,
+                                     byte *iv,
+                                     size_t nblocks);
+
+extern void _gcry_sm4_aarch64_cfb_dec(const u32 *rk_enc, byte *out,
+                                     const byte *in,
+                                     byte *iv,
+                                     size_t nblocks);
+
+extern void _gcry_sm4_aarch64_crypt_blk1_8(u32 *rk, byte *out,
+                                          const byte *in,
+                                          size_t num_blocks);
+
+static inline unsigned int
+sm4_aarch64_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                         size_t num_blks)
+{
+  if (num_blks > 8)
+    {
+      _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, 8);
+      in += 8 * 16;
+      out += 8 * 16;
+      num_blks -= 8;
+    }
+
+  _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks);
+  return 0;
+}
+
+#endif /* USE_AARCH64_SIMD */
+
+#ifdef USE_ARM_CE
+extern void _gcry_sm4_armv8_ce_expand_key(const byte *key,
+                                         u32 *rkey_enc, u32 *rkey_dec,
+                                         const u32 *fk, const u32 *ck);
+
+extern void _gcry_sm4_armv8_ce_crypt(const u32 *rk, byte *out,
+                                    const byte *in,
+                                    size_t num_blocks);
+
+extern void _gcry_sm4_armv8_ce_ctr_enc(const u32 *rk_enc, byte *out,
+                                      const byte *in,
+                                      byte *ctr,
+                                      size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_cbc_dec(const u32 *rk_dec, byte *out,
+                                      const byte *in,
+                                      byte *iv,
+                                      size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out,
+                                      const byte *in,
+                                      byte *iv,
+                                      size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out,
+                                        const byte *in,
+                                        byte *tweak,
+                                        size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_crypt_blk1_8(u32 *rk, byte *out,
+                                           const byte *in,
+                                           size_t num_blocks);
+
+static inline unsigned int
+sm4_armv8_ce_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                          size_t num_blks)
+{
+  if (num_blks > 8)
+    {
+      _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, 8);
+      in += 8 * 16;
+      out += 8 * 16;
+      num_blks -= 8;
+    }
+
+  _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks);
+  return 0;
+}
+
+#endif /* USE_ARM_CE */
+
+#ifdef USE_ARM_SVE_CE
+extern void _gcry_sm4_armv9_sve_ce_crypt(u32 *rk, byte *out,
+                                        const byte *in,
+                                        size_t nblocks);
+
+extern void _gcry_sm4_armv9_sve_ce_ctr_enc(const u32 *rk_enc, byte *out,
+                                          const byte *in,
+                                          byte *ctr,
+                                          size_t nblocks);
+
+extern void _gcry_sm4_armv9_sve_ce_cbc_dec(const u32 *rk_dec, byte *out,
+                                          const byte *in,
+                                          byte *iv,
+                                          size_t nblocks);
+
+extern void _gcry_sm4_armv9_sve_ce_cfb_dec(const u32 *rk_enc, byte *out,
+                                          const byte *in,
+                                          byte *iv,
+                                          size_t nblocks);
+
+static inline unsigned int
+sm4_armv9_sve_ce_crypt_blk1_16(void *rk, byte *out, const byte *in,
+                              size_t num_blks)
+{
+  _gcry_sm4_armv9_sve_ce_crypt(rk, out, in, num_blks);
+  return 0;
+}
+
+extern unsigned int _gcry_sm4_armv9_sve_get_vl(void);
+#endif /* USE_ARM_SVE_CE */
+
+#ifdef USE_PPC_CRYPTO
+extern void _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                          size_t num_blks);
+
+extern void _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                          size_t num_blks);
+
+static inline unsigned int
+sm4_ppc8le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks)
+{
+  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, num_blks);
+  return 0;
+}
+
+static inline unsigned int
+sm4_ppc9le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks)
+{
+  _gcry_sm4_ppc9le_crypt_blk1_16(rk, out, in, num_blks);
+  return 0;
+}
+#endif /* USE_PPC_CRYPTO */
+
 static inline void prefetch_sbox_table(void)
 {
   const volatile byte *vtab = (void *)&sbox_table;
@@ -309,6 +700,24 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
   u32 rk[4];
   int i;
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      _gcry_sm4_gfni_avx512_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+                                        fk, ck);
+      return;
+    }
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      _gcry_sm4_gfni_avx2_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+                                      fk, ck);
+      return;
+    }
+#endif
+
 #ifdef USE_AESNI_AVX
   if (ctx->use_aesni_avx)
     {
@@ -318,6 +727,17 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
     }
 #endif
 
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    {
+      _gcry_sm4_armv8_ce_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+                                    fk, ck);
+      return;
+    }
+#endif
+
+  prefetch_sbox_table ();
+
   rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0];
   rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1];
   rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2];
@@ -372,12 +792,51 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
 #ifdef USE_AESNI_AVX2
   ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
 #endif
+#ifdef USE_GFNI_AVX2
+  ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_GFNI_AVX512
+  ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
+#endif
+#ifdef USE_AARCH64_SIMD
+  ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
+#endif
+#ifdef USE_ARM_CE
+  ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4);
+#endif
+#ifdef USE_ARM_SVE_CE
+  /* Only enabled when the SVE vector length is greater than 128 bits */
+  ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVE2) && (hwf & HWF_ARM_SVESM4)
+               && _gcry_sm4_armv9_sve_get_vl() > 16;
+#endif
+#ifdef USE_PPC_CRYPTO
+  ctx->use_ppc8le = (hwf & HWF_PPC_VCRYPTO) != 0;
+  ctx->use_ppc9le = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00);
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      /* Disable AESNI implementations when GFNI implementation is enabled. */
+#ifdef USE_AESNI_AVX
+      ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+      ctx->use_aesni_avx2 = 0;
+#endif
+    }
+#endif
+
+  ctx->crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
 
   /* Setup bulk encryption routines.  */
   memset (bulk_ops, 0, sizeof(*bulk_ops));
   bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
   bulk_ops->cfb_dec = _gcry_sm4_cfb_dec;
   bulk_ops->ctr_enc = _gcry_sm4_ctr_enc;
+  bulk_ops->xts_crypt = _gcry_sm4_xts_crypt;
+  bulk_ops->ecb_crypt = _gcry_sm4_ecb_crypt;
+  bulk_ops->ctr32le_enc = _gcry_sm4_ctr32le_enc;
   bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt;
   bulk_ops->ocb_auth  = _gcry_sm4_ocb_auth;
 
@@ -417,6 +876,21 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
 {
   SM4_context *ctx = context;
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    return sm4_armv8_ce_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
   prefetch_sbox_table ();
 
   return sm4_do_crypt (ctx->rkey_enc, outbuf, inbuf);
@@ -427,6 +901,21 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
 {
   SM4_context *ctx = context;
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    return sm4_armv8_ce_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
   prefetch_sbox_table ();
 
   return sm4_do_crypt (ctx->rkey_dec, outbuf, inbuf);
@@ -481,9 +970,10 @@ sm4_do_crypt_blks2 (const u32 *rk, byte *out, const byte *in)
 }
 
 static unsigned int
-sm4_crypt_blocks (const u32 *rk, byte *out, const byte *in,
-                 unsigned int num_blks)
+sm4_crypt_blocks (void *ctx, byte *out, const byte *in,
+                 size_t num_blks)
 {
+  const u32 *rk = ctx;
   unsigned int burn_depth = 0;
   unsigned int nburn;
 
@@ -510,6 +1000,70 @@ sm4_crypt_blocks (const u32 *rk, byte *out, const byte *in,
   return burn_depth;
 }
 
+static inline crypt_blk1_16_fn_t
+sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
+{
+  if (0)
+    ;
+#ifdef USE_GFNI_AVX512
+  else if (ctx->use_gfni_avx512)
+    {
+      return &sm4_gfni_avx512_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_GFNI_AVX2
+  else if (ctx->use_gfni_avx2)
+    {
+      return &sm4_gfni_avx2_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_AESNI_AVX2
+  else if (ctx->use_aesni_avx2)
+    {
+      return &sm4_aesni_avx2_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_AESNI_AVX
+  else if (ctx->use_aesni_avx)
+    {
+      return &sm4_aesni_avx_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_ARM_SVE_CE
+  else if (ctx->use_arm_sve_ce)
+    {
+      return &sm4_armv9_sve_ce_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_ARM_CE
+  else if (ctx->use_arm_ce)
+    {
+      return &sm4_armv8_ce_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_AARCH64_SIMD
+  else if (ctx->use_aarch64_simd)
+    {
+      return &sm4_aarch64_crypt_blk1_16;
+    }
+#endif
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc9le)
+    {
+      return &sm4_ppc9le_crypt_blk1_16;
+    }
+  else if (ctx->use_ppc8le)
+    {
+      return &sm4_ppc8le_crypt_blk1_16;
+    }
+#endif
+  else
+    {
+      (void)ctx;
+      return &sm4_crypt_blocks;
+    }
+}
+
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
    of size 16. */
@@ -523,6 +1077,47 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
   const byte *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_sm4_gfni_avx512_ctr_enc_blk32(ctx->rkey_enc,
+                                              outbuf, inbuf, ctr);
+
+          nblocks -= 32;
+          outbuf += 32 * 16;
+          inbuf += 32 * 16;
+        }
+
+      /* Process data in 16 block chunks. */
+      if (nblocks >= 16)
+        {
+          _gcry_sm4_gfni_avx512_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_sm4_gfni_avx2_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
     {
@@ -553,57 +1148,65 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
     }
 #endif
 
-  /* Process remaining blocks. */
-  if (nblocks)
+#ifdef USE_ARM_SVE_CE
+  if (ctx->use_arm_sve_ce)
     {
-      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
-                                  unsigned int num_blks);
-      byte tmpbuf[16 * 8];
-      unsigned int tmp_used = 16;
-
-      if (0)
-       ;
-#ifdef USE_AESNI_AVX
-      else if (ctx->use_aesni_avx)
-       {
-         crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
-       }
+      /* Process all blocks at a time. */
+      _gcry_sm4_armv9_sve_ce_ctr_enc(ctx->rkey_enc, outbuf, inbuf,
+                                    ctr, nblocks);
+      nblocks = 0;
+    }
 #endif
-      else
-       {
-         prefetch_sbox_table ();
-         crypt_blk1_8 = sm4_crypt_blocks;
-       }
-
-      /* Process remaining blocks. */
-      while (nblocks)
-       {
-         size_t curr_blks = nblocks > 8 ? 8 : nblocks;
-         size_t i;
 
-         if (curr_blks * 16 > tmp_used)
-           tmp_used = curr_blks * 16;
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    {
+      /* Process multiples of 8 blocks at a time. */
+      if (nblocks >= 8)
+        {
+          size_t nblks = nblocks & ~(8 - 1);
 
-         cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
-         for (i = 1; i < curr_blks; i++)
-           {
-             cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
-             cipher_block_add (&tmpbuf[i * 16], i, 16);
-           }
-         cipher_block_add (ctr, curr_blks, 16);
+          _gcry_sm4_armv8_ce_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr, nblks);
 
-         burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf,
-                                          curr_blks);
+          nblocks -= nblks;
+          outbuf += nblks * 16;
+          inbuf += nblks * 16;
+        }
+    }
+#endif
 
-         for (i = 0; i < curr_blks; i++)
-           {
-             cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
-             outbuf += 16;
-             inbuf += 16;
-           }
+#ifdef USE_AARCH64_SIMD
+  if (ctx->use_aarch64_simd)
+    {
+      /* Process multiples of 8 blocks at a time. */
+      if (nblocks >= 8)
+        {
+          size_t nblks = nblocks & ~(8 - 1);
 
-         nblocks -= curr_blks;
-       }
+          _gcry_sm4_aarch64_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr, nblks);
+
+          nblocks -= nblks;
+          outbuf += nblks * 16;
+          inbuf += nblks * 16;
+        }
+    }
+#endif
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
+      byte tmpbuf[16 * 16];
+      unsigned int tmp_used = 16;
+      size_t nburn;
+
+      if (crypt_blk1_16 == &sm4_crypt_blocks)
+       prefetch_sbox_table ();
+
+      nburn = bulk_ctr_enc_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf,
+                               nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16,
+                               &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
 
       wipememory(tmpbuf, tmp_used);
     }
@@ -624,6 +1227,46 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_sm4_gfni_avx512_cbc_dec_blk32(ctx->rkey_dec, outbuf, inbuf, iv);
+
+          nblocks -= 32;
+          outbuf += 32 * 16;
+          inbuf += 32 * 16;
+        }
+
+      /* Process data in 16 block chunks. */
+      if (nblocks >= 16)
+        {
+          _gcry_sm4_gfni_avx512_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_sm4_gfni_avx2_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
     {
@@ -654,52 +1297,67 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
     }
 #endif
 
-  /* Process remaining blocks. */
-  if (nblocks)
+#ifdef USE_ARM_SVE_CE
+  if (ctx->use_arm_sve_ce)
     {
-      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
-                                  unsigned int num_blks);
-      unsigned char savebuf[16 * 8];
-      unsigned int tmp_used = 16;
+      /* Process all blocks at a time. */
+      _gcry_sm4_armv9_sve_ce_cbc_dec(ctx->rkey_dec, outbuf, inbuf,
+                                    iv, nblocks);
+      nblocks = 0;
+    }
+#endif
 
-      if (0)
-       ;
-#ifdef USE_AESNI_AVX
-      else if (ctx->use_aesni_avx)
-       {
-         crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
-       }
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    {
+      /* Process multiples of 8 blocks at a time. */
+      if (nblocks >= 8)
+        {
+          size_t nblks = nblocks & ~(8 - 1);
+
+          _gcry_sm4_armv8_ce_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv, nblks);
+
+          nblocks -= nblks;
+          outbuf += nblks * 16;
+          inbuf += nblks * 16;
+        }
+    }
 #endif
-      else
-       {
-         prefetch_sbox_table ();
-         crypt_blk1_8 = sm4_crypt_blocks;
-       }
 
-      /* Process remaining blocks. */
-      while (nblocks)
-       {
-         size_t curr_blks = nblocks > 8 ? 8 : nblocks;
-         size_t i;
+#ifdef USE_AARCH64_SIMD
+  if (ctx->use_aarch64_simd)
+    {
+      /* Process multiples of 8 blocks at a time. */
+      if (nblocks >= 8)
+        {
+          size_t nblks = nblocks & ~(8 - 1);
 
-         if (curr_blks * 16 > tmp_used)
-           tmp_used = curr_blks * 16;
+          _gcry_sm4_aarch64_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv, nblks);
 
-         burn_stack_depth = crypt_blk1_8 (ctx->rkey_dec, savebuf, inbuf,
-                                          curr_blks);
+          nblocks -= nblks;
+          outbuf += nblks * 16;
+          inbuf += nblks * 16;
+        }
+    }
+#endif
 
-         for (i = 0; i < curr_blks; i++)
-           {
-             cipher_block_xor_n_copy_2(outbuf, &savebuf[i * 16], iv, inbuf,
-                                       16);
-             outbuf += 16;
-             inbuf += 16;
-           }
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
+      unsigned char tmpbuf[16 * 16];
+      unsigned int tmp_used = 16;
+      size_t nburn;
 
-         nblocks -= curr_blks;
-       }
+      if (crypt_blk1_16 == &sm4_crypt_blocks)
+       prefetch_sbox_table ();
 
-      wipememory(savebuf, tmp_used);
+      nburn = bulk_cbc_dec_128(ctx->rkey_dec, crypt_blk1_16, outbuf, inbuf,
+                               nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16,
+                               &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
     }
 
   if (burn_stack_depth)
@@ -718,6 +1376,46 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          _gcry_sm4_gfni_avx512_cfb_dec_blk32(ctx->rkey_enc, outbuf, inbuf, iv);
+
+          nblocks -= 32;
+          outbuf += 32 * 16;
+          inbuf += 32 * 16;
+        }
+
+      /* Process data in 16 block chunks. */
+      if (nblocks >= 16)
+        {
+          _gcry_sm4_gfni_avx512_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+    {
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_sm4_gfni_avx2_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * 16;
+          inbuf += 16 * 16;
+        }
+    }
+#endif
+
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
     {
@@ -748,62 +1446,230 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
     }
 #endif
 
+#ifdef USE_ARM_SVE_CE
+  if (ctx->use_arm_sve_ce)
+    {
+      /* Process all blocks at a time. */
+      _gcry_sm4_armv9_sve_ce_cfb_dec(ctx->rkey_enc, outbuf, inbuf,
+                                    iv, nblocks);
+      nblocks = 0;
+    }
+#endif
+
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    {
+      /* Process multiples of 8 blocks at a time. */
+      if (nblocks >= 8)
+        {
+          size_t nblks = nblocks & ~(8 - 1);
+
+          _gcry_sm4_armv8_ce_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv, nblks);
+
+          nblocks -= nblks;
+          outbuf += nblks * 16;
+          inbuf += nblks * 16;
+        }
+    }
+#endif
+
+#ifdef USE_AARCH64_SIMD
+  if (ctx->use_aarch64_simd)
+    {
+      /* Process multiples of 8 blocks at a time. */
+      if (nblocks >= 8)
+        {
+          size_t nblks = nblocks & ~(8 - 1);
+
+          _gcry_sm4_aarch64_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv, nblks);
+
+          nblocks -= nblks;
+          outbuf += nblks * 16;
+          inbuf += nblks * 16;
+        }
+    }
+#endif
+
   /* Process remaining blocks. */
   if (nblocks)
     {
-      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
-                                  unsigned int num_blks);
-      unsigned char ivbuf[16 * 8];
+      crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
+      unsigned char tmpbuf[16 * 16];
       unsigned int tmp_used = 16;
+      size_t nburn;
 
-      if (0)
-       ;
-#ifdef USE_AESNI_AVX
-      else if (ctx->use_aesni_avx)
-       {
-         crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
-       }
+      if (crypt_blk1_16 == &sm4_crypt_blocks)
+       prefetch_sbox_table ();
+
+      nburn = bulk_cfb_dec_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf,
+                               nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16,
+                               &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+static unsigned int
+sm4_crypt_blk1_32 (SM4_context *ctx, byte *outbuf, const byte *inbuf,
+                  size_t num_blks, u32 *rk)
+{
+  crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
+  unsigned int stack_burn_size = 0;
+  unsigned int nburn;
+
+  gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX512
+  if (num_blks == 32 && ctx->use_gfni_avx512)
+    {
+      return _gcry_sm4_gfni_avx512_crypt_blk32 (rk, outbuf, inbuf);
+    }
+#endif
+#ifdef USE_ARM_SVE_CE
+  if (ctx->use_arm_sve_ce)
+    {
+      _gcry_sm4_armv9_sve_ce_crypt (rk, outbuf, inbuf, num_blks);
+      return 0;
+    }
 #endif
-      else
-       {
-         prefetch_sbox_table ();
-         crypt_blk1_8 = sm4_crypt_blocks;
-       }
 
-      /* Process remaining blocks. */
-      while (nblocks)
-       {
-         size_t curr_blks = nblocks > 8 ? 8 : nblocks;
-         size_t i;
+  do
+    {
+      unsigned int curr_blks = num_blks > 16 ? 16 : num_blks;
+      nburn = crypt_blk1_16 (rk, outbuf, inbuf, curr_blks);
+      stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+      outbuf += curr_blks * 16;
+      inbuf += curr_blks * 16;
+      num_blks -= curr_blks;
+    }
+  while (num_blks > 0);
 
-         if (curr_blks * 16 > tmp_used)
-           tmp_used = curr_blks * 16;
+  return stack_burn_size;
+}
 
-         cipher_block_cpy (&ivbuf[0 * 16], iv, 16);
-         for (i = 1; i < curr_blks; i++)
-           cipher_block_cpy (&ivbuf[i * 16], &inbuf[(i - 1) * 16], 16);
-         cipher_block_cpy (iv, &inbuf[(i - 1) * 16], 16);
+static unsigned int
+sm4_encrypt_blk1_32 (void *context, byte *out, const byte *in,
+                    size_t num_blks)
+{
+  SM4_context *ctx = context;
+  return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_enc);
+}
+
+static unsigned int
+sm4_decrypt_blk1_32 (void *context, byte *out, const byte *in,
+                    size_t num_blks)
+{
+  SM4_context *ctx = context;
+  return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_dec);
+}
 
-         burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, ivbuf, ivbuf,
-                                          curr_blks);
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_sm4_ecb_crypt (void *context, void *outbuf_arg,
+                    const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  SM4_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
 
-         for (i = 0; i < curr_blks; i++)
-           {
-             cipher_block_xor (outbuf, inbuf, &ivbuf[i * 16], 16);
-             outbuf += 16;
-             inbuf += 16;
-           }
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      size_t nburn;
 
-         nblocks -= curr_blks;
-       }
+      if (ctx->crypt_blk1_16 == &sm4_crypt_blocks)
+       prefetch_sbox_table ();
 
-      wipememory(ivbuf, tmp_used);
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? sm4_encrypt_blk1_32
+                                              : sm4_decrypt_blk1_32,
+                                 outbuf, inbuf, nblocks, 32);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
     }
 
   if (burn_stack_depth)
     _gcry_burn_stack(burn_stack_depth);
 }
 
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  SM4_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+#ifdef USE_ARM_CE
+  if (ctx->use_arm_ce)
+    {
+      /* Process all blocks at a time. */
+      _gcry_sm4_armv8_ce_xts_crypt(encrypt ? ctx->rkey_enc : ctx->rkey_dec,
+                                   outbuf, inbuf, tweak, nblocks);
+
+      nblocks = 0;
+    }
+#endif
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[32 * 16];
+      unsigned int tmp_used = 16;
+      size_t nburn;
+
+      if (ctx->crypt_blk1_16 == &sm4_crypt_blocks)
+       prefetch_sbox_table ();
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? sm4_encrypt_blk1_32
+                                              : sm4_decrypt_blk1_32,
+                                 outbuf, inbuf, nblocks,
+                                 tweak, tmpbuf, sizeof(tmpbuf) / 16,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV). */
+static void
+_gcry_sm4_ctr32le_enc(void *context, unsigned char *ctr,
+                      void *outbuf_arg, const void *inbuf_arg,
+                      size_t nblocks)
+{
+  SM4_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[32 * 16];
+      unsigned int tmp_used = 16;
+      size_t nburn;
+
+      nburn = bulk_ctr32le_enc_128 (ctx, sm4_encrypt_blk1_32, outbuf, inbuf,
+                                    nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16,
+                                    &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
 static size_t
 _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -815,31 +1681,99 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   u64 blkn = c->u_mode.ocb.data_nblocks;
   int burn_stack_depth = 0;
 
-#ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      u64 Ls[32];
+      u64 *l;
+
+      if (nblocks >= 32)
+       {
+          l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
+
+         /* Process data in 32 block chunks. */
+         while (nblocks >= 32)
+           {
+             blkn += 32;
+             *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32);
+
+             if (encrypt)
+               _gcry_sm4_gfni_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf,
+                                                     inbuf, c->u_iv.iv,
+                                                     c->u_ctr.ctr, Ls);
+             else
+               _gcry_sm4_gfni_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf,
+                                                     inbuf, c->u_iv.iv,
+                                                     c->u_ctr.ctr, Ls);
+
+             nblocks -= 32;
+             outbuf += 32 * 16;
+             inbuf += 32 * 16;
+           }
+       }
+
+      if (nblocks >= 16)
+       {
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+         /* Process data in 16 block chunks. */
+         blkn += 16;
+         *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+         if (encrypt)
+           _gcry_sm4_gfni_avx512_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+                                         c->u_iv.iv, c->u_ctr.ctr, Ls);
+         else
+           _gcry_sm4_gfni_avx512_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+                                         c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+         nblocks -= 16;
+         outbuf += 16 * 16;
+         inbuf += 16 * 16;
+       }
+    }
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
     {
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+         /* Process data in 16 block chunks. */
+         while (nblocks >= 16)
            {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+             blkn += 16;
+             *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+             if (encrypt)
+               _gcry_sm4_gfni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+                                           c->u_iv.iv, c->u_ctr.ctr, Ls);
+             else
+               _gcry_sm4_gfni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+                                           c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+             nblocks -= 16;
+             outbuf += 16 * 16;
+             inbuf += 16 * 16;
            }
+       }
+    }
+#endif
 
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      u64 Ls[16];
+      u64 *l;
+
+      if (nblocks >= 16)
+       {
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -866,22 +1800,11 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   if (ctx->use_aesni_avx)
     {
       u64 Ls[8];
-      unsigned int n = 8 - (blkn % 8);
       u64 *l;
 
       if (nblocks >= 8)
        {
-         /* Use u64 to store pointers for x32 support (assembly function
-          * assumes 64-bit pointers). */
-         Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-         Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-         Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-         Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(7 + n) % 8];
+          l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
          /* Process data in 8 block chunks. */
          while (nblocks >= 8)
@@ -904,66 +1827,19 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     }
 #endif
 
+  /* Process remaining blocks. */
   if (nblocks)
     {
-      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
-                                  unsigned int num_blks);
-      const u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
-      unsigned char tmpbuf[16 * 8];
+      crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
+      u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
+      unsigned char tmpbuf[16 * 16];
       unsigned int tmp_used = 16;
+      size_t nburn;
 
-      if (0)
-       ;
-#ifdef USE_AESNI_AVX
-      else if (ctx->use_aesni_avx)
-       {
-         crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
-       }
-#endif
-      else
-       {
-         prefetch_sbox_table ();
-         crypt_blk1_8 = sm4_crypt_blocks;
-       }
-
-      while (nblocks)
-       {
-         size_t curr_blks = nblocks > 8 ? 8 : nblocks;
-         size_t i;
-
-         if (curr_blks * 16 > tmp_used)
-           tmp_used = curr_blks * 16;
-
-         for (i = 0; i < curr_blks; i++)
-           {
-             const unsigned char *l = ocb_get_l(c, ++blkn);
-
-             /* Checksum_i = Checksum_{i-1} xor P_i  */
-             if (encrypt)
-               cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
-
-             /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-             cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
-             cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
-                               c->u_iv.iv, 16);
-           }
-
-         /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-         crypt_blk1_8 (rk, outbuf, outbuf, curr_blks);
-
-         for (i = 0; i < curr_blks; i++)
-           {
-             cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
-
-             /* Checksum_i = Checksum_{i-1} xor P_i  */
-             if (!encrypt)
-                 cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
-           }
-
-         outbuf += curr_blks * 16;
-         inbuf  += curr_blks * 16;
-         nblocks -= curr_blks;
-       }
+      nburn = bulk_ocb_crypt_128 (c, rk, crypt_blk1_16, outbuf, inbuf, nblocks,
+                                  &blkn, encrypt, tmpbuf, sizeof(tmpbuf) / 16,
+                                  &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
 
       wipememory(tmpbuf, tmp_used);
     }
@@ -983,32 +1859,71 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
   SM4_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
+  int burn_stack_depth = 0;
 
-#ifdef USE_AESNI_AVX2
-  if (ctx->use_aesni_avx2)
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      u64 Ls[16];
+      u64 *l;
+
+      if (nblocks >= 16)
+        {
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+          /* Process data in 16 block chunks. */
+          while (nblocks >= 16)
+            {
+              blkn += 16;
+              *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 16);
+
+              _gcry_sm4_gfni_avx512_ocb_auth (ctx->rkey_enc, abuf,
+                                              c->u_mode.ocb.aad_offset,
+                                              c->u_mode.ocb.aad_sum, Ls);
+
+              nblocks -= 16;
+              abuf += 16 * 16;
+            }
+        }
+    }
+#endif
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
     {
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+         /* Process data in 16 block chunks. */
+         while (nblocks >= 16)
            {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+             blkn += 16;
+             *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+             _gcry_sm4_gfni_avx2_ocb_auth(ctx->rkey_enc, abuf,
+                                          c->u_mode.ocb.aad_offset,
+                                          c->u_mode.ocb.aad_sum, Ls);
+
+             nblocks -= 16;
+             abuf += 16 * 16;
            }
+       }
+    }
+#endif
 
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      u64 Ls[16];
+      u64 *l;
+
+      if (nblocks >= 16)
+       {
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -1031,22 +1946,11 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
   if (ctx->use_aesni_avx)
     {
       u64 Ls[8];
-      unsigned int n = 8 - (blkn % 8);
       u64 *l;
 
       if (nblocks >= 8)
        {
-         /* Use u64 to store pointers for x32 support (assembly function
-           * assumes 64-bit pointers). */
-         Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-         Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-         Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-         Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-         Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(7 + n) % 8];
+          l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
 
          /* Process data in 8 block chunks. */
          while (nblocks >= 8)
@@ -1065,102 +1969,27 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
     }
 #endif
 
+  /* Process remaining blocks. */
   if (nblocks)
     {
-      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
-                                  unsigned int num_blks);
-      unsigned char tmpbuf[16 * 8];
+      crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
+      unsigned char tmpbuf[16 * 16];
       unsigned int tmp_used = 16;
+      size_t nburn;
 
-      if (0)
-       ;
-#ifdef USE_AESNI_AVX
-      else if (ctx->use_aesni_avx)
-       {
-         crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
-       }
-#endif
-      else
-       {
-         prefetch_sbox_table ();
-         crypt_blk1_8 = sm4_crypt_blocks;
-       }
-
-      while (nblocks)
-       {
-         size_t curr_blks = nblocks > 8 ? 8 : nblocks;
-         size_t i;
-
-         if (curr_blks * 16 > tmp_used)
-           tmp_used = curr_blks * 16;
-
-         for (i = 0; i < curr_blks; i++)
-           {
-             const unsigned char *l = ocb_get_l(c, ++blkn);
-
-             /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-             cipher_block_xor_2dst (&tmpbuf[i * 16],
-                                    c->u_mode.ocb.aad_offset, l, 16);
-             cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
-           }
-
-         /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-         crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf, curr_blks);
-
-         for (i = 0; i < curr_blks; i++)
-           {
-             cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
-           }
-
-         abuf += curr_blks * 16;
-         nblocks -= curr_blks;
-       }
+      nburn = bulk_ocb_auth_128 (c, ctx->rkey_enc, crypt_blk1_16, abuf, nblocks,
+                                 &blkn, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
 
       wipememory(tmpbuf, tmp_used);
     }
 
   c->u_mode.ocb.aad_nblocks = blkn;
 
-  return 0;
-}
-
-/* Run the self-tests for SM4-CTR, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
-  const int nblocks = 16 - 1;
-  const int blocksize = 16;
-  const int context_size = sizeof(SM4_context);
-
-  return _gcry_selftest_helper_ctr("SM4", &sm4_setkey,
-           &sm4_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for SM4-CBC, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
-  const int nblocks = 16 - 1;
-  const int blocksize = 16;
-  const int context_size = sizeof(SM4_context);
-
-  return _gcry_selftest_helper_cbc("SM4", &sm4_setkey,
-           &sm4_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for SM4-CFB, tests bulk CFB decryption.
-   Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
-  const int nblocks = 16 - 1;
-  const int blocksize = 16;
-  const int context_size = sizeof(SM4_context);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 
-  return _gcry_selftest_helper_cfb("SM4", &sm4_setkey,
-           &sm4_encrypt, nblocks, blocksize, context_size);
+  return 0;
 }
 
 static const char *
@@ -1168,7 +1997,6 @@ sm4_selftest (void)
 {
   SM4_context ctx;
   byte scratch[16];
-  const char *r;
 
   static const byte plaintext[16] = {
     0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
@@ -1193,15 +2021,6 @@ sm4_selftest (void)
   if (memcmp (scratch, plaintext, sizeof (plaintext)))
     return "SM4 test decryption failed.";
 
-  if ( (r = selftest_ctr_128 ()) )
-    return r;
-
-  if ( (r = selftest_cbc_128 ()) )
-    return r;
-
-  if ( (r = selftest_cfb_128 ()) )
-    return r;
-
   return NULL;
 }
 
diff --git a/cipher/sntrup761.c b/cipher/sntrup761.c
new file mode 100644 (file)
index 0000000..7d6b85c
--- /dev/null
@@ -0,0 +1,1062 @@
+/* sntrup761.c  -  Streamlined NTRU Prime sntrup761 key-encapsulation method
+ * Copyright (C) 2023 Simon Josefsson <simon@josefsson.org>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * For a description of the algorithm, see:
+ *   https://ntruprime.cr.yp.to/
+ */
+
+/*
+ * Derived from public domain source, written by (in alphabetical order):
+ * - Daniel J. Bernstein
+ * - Chitchanok Chuengsatiansup
+ * - Tanja Lange
+ * - Christine van Vredendaal
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "sntrup761.h"
+
+/* from supercop-20201130/crypto_sort/int32/portable4/int32_minmax.inc */
+#define int32_MINMAX(a,b) \
+do { \
+  int64_t ab = (int64_t)b ^ (int64_t)a; \
+  int64_t c = (int64_t)b - (int64_t)a; \
+  c ^= ab & (c ^ b); \
+  c >>= 31; \
+  c &= ab; \
+  a ^= c; \
+  b ^= c; \
+} while(0)
+
+/* from supercop-20201130/crypto_sort/int32/portable4/sort.c */
+static void
+crypto_sort_int32 (void *array, long long n)
+{
+  long long top, p, q, r, i, j;
+  int32_t *x = array;
+
+  if (n < 2)
+    return;
+  top = 1;
+  while (top < n - top)
+    top += top;
+
+  for (p = top; p >= 1; p >>= 1)
+    {
+      i = 0;
+      while (i + 2 * p <= n)
+       {
+         for (j = i; j < i + p; ++j)
+           int32_MINMAX (x[j], x[j + p]);
+         i += 2 * p;
+       }
+      for (j = i; j < n - p; ++j)
+       int32_MINMAX (x[j], x[j + p]);
+
+      i = 0;
+      j = 0;
+      for (q = top; q > p; q >>= 1)
+       {
+         if (j != i)
+           for (;;)
+             {
+                int32_t a;
+
+               if (j == n - q)
+                 goto done;
+               a = x[j + p];
+               for (r = q; r > p; r >>= 1)
+                 int32_MINMAX (a, x[j + r]);
+               x[j + p] = a;
+               ++j;
+               if (j == i + p)
+                 {
+                   i += 2 * p;
+                   break;
+                 }
+             }
+         while (i + p <= n - q)
+           {
+             for (j = i; j < i + p; ++j)
+               {
+                 int32_t a = x[j + p];
+                 for (r = q; r > p; r >>= 1)
+                   int32_MINMAX (a, x[j + r]);
+                 x[j + p] = a;
+               }
+             i += 2 * p;
+           }
+         /* now i + p > n - q */
+         j = i;
+         while (j < n - q)
+           {
+             int32_t a = x[j + p];
+             for (r = q; r > p; r >>= 1)
+               int32_MINMAX (a, x[j + r]);
+             x[j + p] = a;
+             ++j;
+           }
+
+       done:;
+       }
+    }
+}
+
+/* from supercop-20201130/crypto_sort/uint32/useint32/sort.c */
+
+/* can save time by vectorizing xor loops */
+/* can save time by integrating xor loops with int32_sort */
+
+static void
+crypto_sort_uint32 (void *array, long long n)
+{
+  uint32_t *x = array;
+  long long j;
+  for (j = 0; j < n; ++j)
+    x[j] ^= 0x80000000;
+  crypto_sort_int32 (array, n);
+  for (j = 0; j < n; ++j)
+    x[j] ^= 0x80000000;
+}
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/uint32.c */
+
+/*
+CPU division instruction typically takes time depending on x.
+This software is designed to take time independent of x.
+Time still varies depending on m; user must ensure that m is constant.
+Time also varies on CPUs where multiplication is variable-time.
+There could be more CPU issues.
+There could also be compiler issues.
+*/
+
+static void
+uint32_divmod_uint14 (uint32_t * q, uint16_t * r, uint32_t x, uint16_t m)
+{
+  uint32_t v = 0x80000000;
+  uint32_t qpart;
+  uint32_t mask;
+
+  v /= m;
+
+  /* caller guarantees m > 0 */
+  /* caller guarantees m < 16384 */
+  /* vm <= 2^31 <= vm+m-1 */
+  /* xvm <= 2^31 x <= xvm+x(m-1) */
+
+  *q = 0;
+
+  qpart = (x * (uint64_t) v) >> 31;
+  /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */
+  /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */
+  /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */
+  /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */
+  /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */
+  /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */
+
+  x -= qpart * m;
+  *q += qpart;
+  /* x <= 49146 */
+
+  qpart = (x * (uint64_t) v) >> 31;
+  /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */
+  /* 0 <= newx <= m + 49146(2^14-1)/2^31 */
+  /* 0 <= newx <= m + 0.4 */
+  /* 0 <= newx <= m */
+
+  x -= qpart * m;
+  *q += qpart;
+  /* x <= m */
+
+  x -= m;
+  *q += 1;
+  mask = -(x >> 31);
+  x += mask & (uint32_t) m;
+  *q += mask;
+  /* x < m */
+
+  *r = x;
+}
+
+
+static uint16_t
+uint32_mod_uint14 (uint32_t x, uint16_t m)
+{
+  uint32_t q;
+  uint16_t r;
+  uint32_divmod_uint14 (&q, &r, x, m);
+  return r;
+}
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/int32.c */
+
+static void
+int32_divmod_uint14 (int32_t * q, uint16_t * r, int32_t x, uint16_t m)
+{
+  uint32_t uq, uq2;
+  uint16_t ur, ur2;
+  uint32_t mask;
+
+  uint32_divmod_uint14 (&uq, &ur, 0x80000000 + (uint32_t) x, m);
+  uint32_divmod_uint14 (&uq2, &ur2, 0x80000000, m);
+  ur -= ur2;
+  uq -= uq2;
+  mask = -(uint32_t) (ur >> 15);
+  ur += mask & m;
+  uq += mask;
+  *r = ur;
+  *q = uq;
+}
+
+
+static uint16_t
+int32_mod_uint14 (int32_t x, uint16_t m)
+{
+  int32_t q;
+  uint16_t r;
+  int32_divmod_uint14 (&q, &r, x, m);
+  return r;
+}
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/paramsmenu.h */
+#define p 761
+#define q 4591
+#define Rounded_bytes 1007
+#define Rq_bytes 1158
+#define w 286
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/Decode.h */
+
+/* Decode(R,s,M,len) */
+/* assumes 0 < M[i] < 16384 */
+/* produces 0 <= R[i] < M[i] */
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/Decode.c */
+
+static void
+Decode (uint16_t * out, const unsigned char *S, const uint16_t * M,
+       long long len)
+{
+  if (len == 1)
+    {
+      if (M[0] == 1)
+       *out = 0;
+      else if (M[0] <= 256)
+       *out = uint32_mod_uint14 (S[0], M[0]);
+      else
+       *out = uint32_mod_uint14 (S[0] + (((uint16_t) S[1]) << 8), M[0]);
+    }
+  if (len > 1)
+    {
+      uint16_t R2[(len + 1) / 2];
+      uint16_t M2[(len + 1) / 2];
+      uint16_t bottomr[len / 2];
+      uint32_t bottomt[len / 2];
+      long long i;
+      for (i = 0; i < len - 1; i += 2)
+       {
+         uint32_t m = M[i] * (uint32_t) M[i + 1];
+         if (m > 256 * 16383)
+           {
+             bottomt[i / 2] = 256 * 256;
+             bottomr[i / 2] = S[0] + 256 * S[1];
+             S += 2;
+             M2[i / 2] = (((m + 255) >> 8) + 255) >> 8;
+           }
+         else if (m >= 16384)
+           {
+             bottomt[i / 2] = 256;
+             bottomr[i / 2] = S[0];
+             S += 1;
+             M2[i / 2] = (m + 255) >> 8;
+           }
+         else
+           {
+             bottomt[i / 2] = 1;
+             bottomr[i / 2] = 0;
+             M2[i / 2] = m;
+           }
+       }
+      if (i < len)
+       M2[i / 2] = M[i];
+      Decode (R2, S, M2, (len + 1) / 2);
+      for (i = 0; i < len - 1; i += 2)
+       {
+         uint32_t r = bottomr[i / 2];
+         uint32_t r1;
+         uint16_t r0;
+         r += bottomt[i / 2] * R2[i / 2];
+         uint32_divmod_uint14 (&r1, &r0, r, M[i]);
+         r1 = uint32_mod_uint14 (r1, M[i + 1]);        /* only needed for invalid inputs */
+         *out++ = r0;
+         *out++ = r1;
+       }
+      if (i < len)
+       *out++ = R2[i / 2];
+    }
+}
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/Encode.h */
+
+/* Encode(s,R,M,len) */
+/* assumes 0 <= R[i] < M[i] < 16384 */
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/Encode.c */
+
+/* 0 <= R[i] < M[i] < 16384 */
+static void
+Encode (unsigned char *out, const uint16_t * R, const uint16_t * M,
+       long long len)
+{
+  if (len == 1)
+    {
+      uint16_t r = R[0];
+      uint16_t m = M[0];
+      while (m > 1)
+       {
+         *out++ = r;
+         r >>= 8;
+         m = (m + 255) >> 8;
+       }
+    }
+  if (len > 1)
+    {
+      uint16_t R2[(len + 1) / 2];
+      uint16_t M2[(len + 1) / 2];
+      long long i;
+      for (i = 0; i < len - 1; i += 2)
+       {
+         uint32_t m0 = M[i];
+         uint32_t r = R[i] + R[i + 1] * m0;
+         uint32_t m = M[i + 1] * m0;
+         while (m >= 16384)
+           {
+             *out++ = r;
+             r >>= 8;
+             m = (m + 255) >> 8;
+           }
+         R2[i / 2] = r;
+         M2[i / 2] = m;
+       }
+      if (i < len)
+       {
+         R2[i / 2] = R[i];
+         M2[i / 2] = M[i];
+       }
+      Encode (out, R2, M2, (len + 1) / 2);
+    }
+}
+
+/* from supercop-20201130/crypto_kem/sntrup761/ref/kem.c */
+
+/* ----- masks */
+
+/* return -1 if x!=0; else return 0 */
+static int
+int16_t_nonzero_mask (int16_t x)
+{
+  uint16_t u = x;              /* 0, else 1...65535 */
+  uint32_t v = u;              /* 0, else 1...65535 */
+  v = -v;                      /* 0, else 2^32-65535...2^32-1 */
+  v >>= 31;                    /* 0, else 1 */
+  return -v;                   /* 0, else -1 */
+}
+
+/* return -1 if x<0; otherwise return 0 */
+static int
+int16_t_negative_mask (int16_t x)
+{
+  uint16_t u = x;
+  u >>= 15;
+  return -(int) u;
+  /* alternative with gcc -fwrapv: */
+  /* x>>15 compiles to CPU's arithmetic right shift */
+}
+
+/* ----- arithmetic mod 3 */
+
+typedef int8_t small;
+
+/* F3 is always represented as -1,0,1 */
+/* so ZZ_fromF3 is a no-op */
+
+/* x must not be close to top int16_t */
+static small
+F3_freeze (int16_t x)
+{
+  return int32_mod_uint14 (x + 1, 3) - 1;
+}
+
+/* ----- arithmetic mod q */
+
+#define q12 ((q-1)/2)
+typedef int16_t Fq;
+/* always represented as -q12...q12 */
+/* so ZZ_fromFq is a no-op */
+
+/* x must not be close to top int32 */
+static Fq
+Fq_freeze (int32_t x)
+{
+  return int32_mod_uint14 (x + q12, q) - q12;
+}
+
+static Fq
+Fq_recip (Fq a1)
+{
+  int i = 1;
+  Fq ai = a1;
+
+  while (i < q - 2)
+    {
+      ai = Fq_freeze (a1 * (int32_t) ai);
+      i += 1;
+    }
+  return ai;
+}
+
+/* ----- small polynomials */
+
+/* 0 if Weightw_is(r), else -1 */
+static int
+Weightw_mask (small * r)
+{
+  int weight = 0;
+  int i;
+
+  for (i = 0; i < p; ++i)
+    weight += r[i] & 1;
+  return int16_t_nonzero_mask (weight - w);
+}
+
+/* R3_fromR(R_fromRq(r)) */
+static void
+R3_fromRq (small * out, const Fq * r)
+{
+  int i;
+  for (i = 0; i < p; ++i)
+    out[i] = F3_freeze (r[i]);
+}
+
+/* h = f*g in the ring R3 */
+static void
+R3_mult (small * h, const small * f, const small * g)
+{
+  small fg[p + p - 1];
+  small result;
+  int i, j;
+
+  for (i = 0; i < p; ++i)
+    {
+      result = 0;
+      for (j = 0; j <= i; ++j)
+       result = F3_freeze (result + f[j] * g[i - j]);
+      fg[i] = result;
+    }
+  for (i = p; i < p + p - 1; ++i)
+    {
+      result = 0;
+      for (j = i - p + 1; j < p; ++j)
+       result = F3_freeze (result + f[j] * g[i - j]);
+      fg[i] = result;
+    }
+
+  for (i = p + p - 2; i >= p; --i)
+    {
+      fg[i - p] = F3_freeze (fg[i - p] + fg[i]);
+      fg[i - p + 1] = F3_freeze (fg[i - p + 1] + fg[i]);
+    }
+
+  for (i = 0; i < p; ++i)
+    h[i] = fg[i];
+}
+
+/* returns 0 if recip succeeded; else -1 */
+static int
+R3_recip (small * out, const small * in)
+{
+  small f[p + 1], g[p + 1], v[p + 1], r[p + 1];
+  int i, loop, delta;
+  int sign, swap, t;
+
+  for (i = 0; i < p + 1; ++i)
+    v[i] = 0;
+  for (i = 0; i < p + 1; ++i)
+    r[i] = 0;
+  r[0] = 1;
+  for (i = 0; i < p; ++i)
+    f[i] = 0;
+  f[0] = 1;
+  f[p - 1] = f[p] = -1;
+  for (i = 0; i < p; ++i)
+    g[p - 1 - i] = in[i];
+  g[p] = 0;
+
+  delta = 1;
+
+  for (loop = 0; loop < 2 * p - 1; ++loop)
+    {
+      for (i = p; i > 0; --i)
+       v[i] = v[i - 1];
+      v[0] = 0;
+
+      sign = -g[0] * f[0];
+      swap = int16_t_negative_mask (-delta) & int16_t_nonzero_mask (g[0]);
+      delta ^= swap & (delta ^ -delta);
+      delta += 1;
+
+      for (i = 0; i < p + 1; ++i)
+       {
+         t = swap & (f[i] ^ g[i]);
+         f[i] ^= t;
+         g[i] ^= t;
+         t = swap & (v[i] ^ r[i]);
+         v[i] ^= t;
+         r[i] ^= t;
+       }
+
+      for (i = 0; i < p + 1; ++i)
+       g[i] = F3_freeze (g[i] + sign * f[i]);
+      for (i = 0; i < p + 1; ++i)
+       r[i] = F3_freeze (r[i] + sign * v[i]);
+
+      for (i = 0; i < p; ++i)
+       g[i] = g[i + 1];
+      g[p] = 0;
+    }
+
+  sign = f[0];
+  for (i = 0; i < p; ++i)
+    out[i] = sign * v[p - 1 - i];
+
+  return int16_t_nonzero_mask (delta);
+}
+
+/* ----- polynomials mod q */
+
+/* h = f*g in the ring Rq */
+static void
+Rq_mult_small (Fq * h, const Fq * f, const small * g)
+{
+  Fq fg[p + p - 1];
+  Fq result;
+  int i, j;
+
+  for (i = 0; i < p; ++i)
+    {
+      result = 0;
+      for (j = 0; j <= i; ++j)
+       result = Fq_freeze (result + f[j] * (int32_t) g[i - j]);
+      fg[i] = result;
+    }
+  for (i = p; i < p + p - 1; ++i)
+    {
+      result = 0;
+      for (j = i - p + 1; j < p; ++j)
+       result = Fq_freeze (result + f[j] * (int32_t) g[i - j]);
+      fg[i] = result;
+    }
+
+  for (i = p + p - 2; i >= p; --i)
+    {
+      fg[i - p] = Fq_freeze (fg[i - p] + fg[i]);
+      fg[i - p + 1] = Fq_freeze (fg[i - p + 1] + fg[i]);
+    }
+
+  for (i = 0; i < p; ++i)
+    h[i] = fg[i];
+}
+
+/* h = 3f in Rq */
+static void
+Rq_mult3 (Fq * h, const Fq * f)
+{
+  int i;
+
+  for (i = 0; i < p; ++i)
+    h[i] = Fq_freeze (3 * f[i]);
+}
+
+/* out = 1/(3*in) in Rq */
+/* returns 0 if recip succeeded; else -1 */
+static int
+Rq_recip3 (Fq * out, const small * in)
+{
+  Fq f[p + 1], g[p + 1], v[p + 1], r[p + 1];
+  int i, loop, delta;
+  int swap, t;
+  int32_t f0, g0;
+  Fq scale;
+
+  for (i = 0; i < p + 1; ++i)
+    v[i] = 0;
+  for (i = 0; i < p + 1; ++i)
+    r[i] = 0;
+  r[0] = Fq_recip (3);
+  for (i = 0; i < p; ++i)
+    f[i] = 0;
+  f[0] = 1;
+  f[p - 1] = f[p] = -1;
+  for (i = 0; i < p; ++i)
+    g[p - 1 - i] = in[i];
+  g[p] = 0;
+
+  delta = 1;
+
+  for (loop = 0; loop < 2 * p - 1; ++loop)
+    {
+      for (i = p; i > 0; --i)
+       v[i] = v[i - 1];
+      v[0] = 0;
+
+      swap = int16_t_negative_mask (-delta) & int16_t_nonzero_mask (g[0]);
+      delta ^= swap & (delta ^ -delta);
+      delta += 1;
+
+      for (i = 0; i < p + 1; ++i)
+       {
+         t = swap & (f[i] ^ g[i]);
+         f[i] ^= t;
+         g[i] ^= t;
+         t = swap & (v[i] ^ r[i]);
+         v[i] ^= t;
+         r[i] ^= t;
+       }
+
+      f0 = f[0];
+      g0 = g[0];
+      for (i = 0; i < p + 1; ++i)
+       g[i] = Fq_freeze (f0 * g[i] - g0 * f[i]);
+      for (i = 0; i < p + 1; ++i)
+       r[i] = Fq_freeze (f0 * r[i] - g0 * v[i]);
+
+      for (i = 0; i < p; ++i)
+       g[i] = g[i + 1];
+      g[p] = 0;
+    }
+
+  scale = Fq_recip (f[0]);
+  for (i = 0; i < p; ++i)
+    out[i] = Fq_freeze (scale * (int32_t) v[p - 1 - i]);
+
+  return int16_t_nonzero_mask (delta);
+}
+
+/* ----- rounded polynomials mod q */
+
+static void
+Round (Fq * out, const Fq * a)
+{
+  int i;
+  for (i = 0; i < p; ++i)
+    out[i] = a[i] - F3_freeze (a[i]);
+}
+
+/* ----- sorting to generate short polynomial */
+
+static void
+Short_fromlist (small * out, const uint32_t * in)
+{
+  uint32_t L[p];
+  int i;
+
+  for (i = 0; i < w; ++i)
+    L[i] = in[i] & (uint32_t) - 2;
+  for (i = w; i < p; ++i)
+    L[i] = (in[i] & (uint32_t) - 3) | 1;
+  crypto_sort_uint32 (L, p);
+  for (i = 0; i < p; ++i)
+    out[i] = (L[i] & 3) - 1;
+}
+
+/* ----- underlying hash function */
+
+#define Hash_bytes 32
+
+/* e.g., b = 0 means out = Hash0(in) */
+static void
+Hash_prefix (unsigned char *out, int b, const unsigned char *in, int inlen)
+{
+  unsigned char x[inlen + 1];
+  unsigned char h[64];
+  int i;
+
+  x[0] = b;
+  for (i = 0; i < inlen; ++i)
+    x[i + 1] = in[i];
+  crypto_hash_sha512 (h, x, inlen + 1);
+  for (i = 0; i < 32; ++i)
+    out[i] = h[i];
+}
+
+/* ----- higher-level randomness */
+
+static uint32_t
+urandom32 (void *random_ctx, sntrup761_random_func * random)
+{
+  unsigned char c[4];
+  uint32_t out[4];
+
+  random (random_ctx, 4, c);
+  out[0] = (uint32_t) c[0];
+  out[1] = ((uint32_t) c[1]) << 8;
+  out[2] = ((uint32_t) c[2]) << 16;
+  out[3] = ((uint32_t) c[3]) << 24;
+  return out[0] + out[1] + out[2] + out[3];
+}
+
+static void
+Short_random (small * out, void *random_ctx, sntrup761_random_func * random)
+{
+  uint32_t L[p];
+  int i;
+
+  for (i = 0; i < p; ++i)
+    L[i] = urandom32 (random_ctx, random);
+  Short_fromlist (out, L);
+}
+
+static void
+Small_random (small * out, void *random_ctx, sntrup761_random_func * random)
+{
+  int i;
+
+  for (i = 0; i < p; ++i)
+    out[i] = (((urandom32 (random_ctx, random) & 0x3fffffff) * 3) >> 30) - 1;
+}
+
+/* ----- Streamlined NTRU Prime Core */
+
+/* h,(f,ginv) = KeyGen() */
+static void
+KeyGen (Fq * h, small * f, small * ginv, void *random_ctx,
+       sntrup761_random_func * random)
+{
+  small g[p];
+  Fq finv[p];
+
+  for (;;)
+    {
+      Small_random (g, random_ctx, random);
+      if (R3_recip (ginv, g) == 0)
+       break;
+    }
+  Short_random (f, random_ctx, random);
+  Rq_recip3 (finv, f);         /* always works */
+  Rq_mult_small (h, finv, g);
+}
+
+/* c = Encrypt(r,h) */
+static void
+Encrypt (Fq * c, const small * r, const Fq * h)
+{
+  Fq hr[p];
+
+  Rq_mult_small (hr, h, r);
+  Round (c, hr);
+}
+
+/* r = Decrypt(c,(f,ginv)) */
+static void
+Decrypt (small * r, const Fq * c, const small * f, const small * ginv)
+{
+  Fq cf[p];
+  Fq cf3[p];
+  small e[p];
+  small ev[p];
+  int mask;
+  int i;
+
+  Rq_mult_small (cf, c, f);
+  Rq_mult3 (cf3, cf);
+  R3_fromRq (e, cf3);
+  R3_mult (ev, e, ginv);
+
+  mask = Weightw_mask (ev);    /* 0 if weight w, else -1 */
+  for (i = 0; i < w; ++i)
+    r[i] = ((ev[i] ^ 1) & ~mask) ^ 1;
+  for (i = w; i < p; ++i)
+    r[i] = ev[i] & ~mask;
+}
+
+/* ----- encoding small polynomials (including short polynomials) */
+
+#define Small_bytes ((p+3)/4)
+
+/* these are the only functions that rely on p mod 4 = 1 */
+
+static void
+Small_encode (unsigned char *s, const small * f)
+{
+  small x;
+  int i;
+
+  for (i = 0; i < p / 4; ++i)
+    {
+      x = *f++ + 1;
+      x += (*f++ + 1) << 2;
+      x += (*f++ + 1) << 4;
+      x += (*f++ + 1) << 6;
+      *s++ = x;
+    }
+  x = *f++ + 1;
+  *s++ = x;
+}
+
+static void
+Small_decode (small * f, const unsigned char *s)
+{
+  unsigned char x;
+  int i;
+
+  for (i = 0; i < p / 4; ++i)
+    {
+      x = *s++;
+      *f++ = ((small) (x & 3)) - 1;
+      x >>= 2;
+      *f++ = ((small) (x & 3)) - 1;
+      x >>= 2;
+      *f++ = ((small) (x & 3)) - 1;
+      x >>= 2;
+      *f++ = ((small) (x & 3)) - 1;
+    }
+  x = *s++;
+  *f++ = ((small) (x & 3)) - 1;
+}
+
+/* ----- encoding general polynomials */
+
+static void
+Rq_encode (unsigned char *s, const Fq * r)
+{
+  uint16_t R[p], M[p];
+  int i;
+
+  for (i = 0; i < p; ++i)
+    R[i] = r[i] + q12;
+  for (i = 0; i < p; ++i)
+    M[i] = q;
+  Encode (s, R, M, p);
+}
+
+static void
+Rq_decode (Fq * r, const unsigned char *s)
+{
+  uint16_t R[p], M[p];
+  int i;
+
+  for (i = 0; i < p; ++i)
+    M[i] = q;
+  Decode (R, s, M, p);
+  for (i = 0; i < p; ++i)
+    r[i] = ((Fq) R[i]) - q12;
+}
+
+/* ----- encoding rounded polynomials */
+
+static void
+Rounded_encode (unsigned char *s, const Fq * r)
+{
+  uint16_t R[p], M[p];
+  int i;
+
+  for (i = 0; i < p; ++i)
+    R[i] = ((r[i] + q12) * 10923) >> 15;
+  for (i = 0; i < p; ++i)
+    M[i] = (q + 2) / 3;
+  Encode (s, R, M, p);
+}
+
+static void
+Rounded_decode (Fq * r, const unsigned char *s)
+{
+  uint16_t R[p], M[p];
+  int i;
+
+  for (i = 0; i < p; ++i)
+    M[i] = (q + 2) / 3;
+  Decode (R, s, M, p);
+  for (i = 0; i < p; ++i)
+    r[i] = R[i] * 3 - q12;
+}
+
+/* ----- Streamlined NTRU Prime Core plus encoding */
+
+typedef small Inputs[p];       /* passed by reference */
+#define Inputs_random Short_random
+#define Inputs_encode Small_encode
+#define Inputs_bytes Small_bytes
+
+#define Ciphertexts_bytes Rounded_bytes
+#define SecretKeys_bytes (2*Small_bytes)
+#define PublicKeys_bytes Rq_bytes
+
+/* pk,sk = ZKeyGen() */
+static void
+ZKeyGen (unsigned char *pk, unsigned char *sk, void *random_ctx,
+        sntrup761_random_func * random)
+{
+  Fq h[p];
+  small f[p], v[p];
+
+  KeyGen (h, f, v, random_ctx, random);
+  Rq_encode (pk, h);
+  Small_encode (sk, f);
+  sk += Small_bytes;
+  Small_encode (sk, v);
+}
+
+/* C = ZEncrypt(r,pk) */
+static void
+ZEncrypt (unsigned char *C, const Inputs r, const unsigned char *pk)
+{
+  Fq h[p];
+  Fq c[p];
+  Rq_decode (h, pk);
+  Encrypt (c, r, h);
+  Rounded_encode (C, c);
+}
+
+/* r = ZDecrypt(C,sk) */
+static void
+ZDecrypt (Inputs r, const unsigned char *C, const unsigned char *sk)
+{
+  small f[p], v[p];
+  Fq c[p];
+
+  Small_decode (f, sk);
+  sk += Small_bytes;
+  Small_decode (v, sk);
+  Rounded_decode (c, C);
+  Decrypt (r, c, f, v);
+}
+
+/* ----- confirmation hash */
+
+#define Confirm_bytes 32
+
+/* h = HashConfirm(r,pk,cache); cache is Hash4(pk) */
+static void
+HashConfirm (unsigned char *h, const unsigned char *r,
+            /* const unsigned char *pk, */ const unsigned char *cache)
+{
+  unsigned char x[Hash_bytes * 2];
+  int i;
+
+  Hash_prefix (x, 3, r, Inputs_bytes);
+  for (i = 0; i < Hash_bytes; ++i)
+    x[Hash_bytes + i] = cache[i];
+  Hash_prefix (h, 2, x, sizeof x);
+}
+
+/* ----- session-key hash */
+
+/* k = HashSession(b,y,z) */
+static void
+HashSession (unsigned char *k, int b, const unsigned char *y,
+            const unsigned char *z)
+{
+  unsigned char x[Hash_bytes + Ciphertexts_bytes + Confirm_bytes];
+  int i;
+
+  Hash_prefix (x, 3, y, Inputs_bytes);
+  for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i)
+    x[Hash_bytes + i] = z[i];
+  Hash_prefix (k, b, x, sizeof x);
+}
+
+/* ----- Streamlined NTRU Prime */
+
+/* pk,sk = KEM_KeyGen() */
+void
+sntrup761_keypair (unsigned char *pk, unsigned char *sk, void *random_ctx,
+                  sntrup761_random_func * random)
+{
+  int i;
+
+  ZKeyGen (pk, sk, random_ctx, random);
+  sk += SecretKeys_bytes;
+  for (i = 0; i < PublicKeys_bytes; ++i)
+    *sk++ = pk[i];
+  random (random_ctx, Inputs_bytes, sk);
+  sk += Inputs_bytes;
+  Hash_prefix (sk, 4, pk, PublicKeys_bytes);
+}
+
+/* c,r_enc = Hide(r,pk,cache); cache is Hash4(pk) */
+static void
+Hide (unsigned char *c, unsigned char *r_enc, const Inputs r,
+      const unsigned char *pk, const unsigned char *cache)
+{
+  Inputs_encode (r_enc, r);
+  ZEncrypt (c, r, pk);
+  c += Ciphertexts_bytes;
+  HashConfirm (c, r_enc, cache);
+}
+
+/* c,k = Encap(pk) */
+void
+sntrup761_enc (unsigned char *c, unsigned char *k, const unsigned char *pk,
+              void *random_ctx, sntrup761_random_func * random)
+{
+  Inputs r;
+  unsigned char r_enc[Inputs_bytes];
+  unsigned char cache[Hash_bytes];
+
+  Hash_prefix (cache, 4, pk, PublicKeys_bytes);
+  Inputs_random (r, random_ctx, random);
+  Hide (c, r_enc, r, pk, cache);
+  HashSession (k, 1, r_enc, c);
+}
+
+/* 0 if matching ciphertext+confirm, else -1 */
+static int
+Ciphertexts_diff_mask (const unsigned char *c, const unsigned char *c2)
+{
+  uint16_t differentbits = 0;
+  int len = Ciphertexts_bytes + Confirm_bytes;
+
+  while (len-- > 0)
+    differentbits |= (*c++) ^ (*c2++);
+  return (1 & ((differentbits - 1) >> 8)) - 1;
+}
+
+/* k = Decap(c,sk) */
+void
+sntrup761_dec (unsigned char *k, const unsigned char *c, const unsigned char *sk)
+{
+  const unsigned char *pk = sk + SecretKeys_bytes;
+  const unsigned char *rho = pk + PublicKeys_bytes;
+  const unsigned char *cache = rho + Inputs_bytes;
+  Inputs r;
+  unsigned char r_enc[Inputs_bytes];
+  unsigned char cnew[Ciphertexts_bytes + Confirm_bytes];
+  int mask;
+  int i;
+
+  ZDecrypt (r, c, sk);
+  Hide (cnew, r_enc, r, pk, cache);
+  mask = Ciphertexts_diff_mask (c, cnew);
+  for (i = 0; i < Inputs_bytes; ++i)
+    r_enc[i] ^= mask & (r_enc[i] ^ rho[i]);
+  HashSession (k, 1 + mask, r_enc, c);
+}
diff --git a/cipher/sntrup761.h b/cipher/sntrup761.h
new file mode 100644 (file)
index 0000000..a9974bb
--- /dev/null
@@ -0,0 +1,73 @@
+/* sntrup761.h  -  Streamlined NTRU Prime sntrup761 key-encapsulation method
+ * Copyright (C) 2023 Simon Josefsson <simon@josefsson.org>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * For a description of the algorithm, see:
+ *   https://ntruprime.cr.yp.to/
+ */
+
+/*
+ * Derived from public domain source, written by (in alphabetical order):
+ * - Daniel J. Bernstein
+ * - Chitchanok Chuengsatiansup
+ * - Tanja Lange
+ * - Christine van Vredendaal
+ */
+
+#ifndef SNTRUP761_H
+#define SNTRUP761_H
+
+#include <string.h>
+#include <stdint.h>
+
+#ifdef _GCRYPT_IN_LIBGCRYPT
+/**** Start of the glue code to libgcrypt ****/
+#include "gcrypt-int.h"
+
+static inline void
+crypto_hash_sha512 (unsigned char *out,
+                   const unsigned char *in, size_t inlen)
+{
+  _gcry_md_hash_buffer (GCRY_MD_SHA512, out, in, inlen);
+}
+
+#define sntrup761_keypair _gcry_sntrup761_keypair
+#define sntrup761_enc     _gcry_sntrup761_enc
+#define sntrup761_dec     _gcry_sntrup761_dec
+/**** End of the glue code ****/
+#else
+#define SNTRUP761_SECRETKEY_SIZE 1763
+#define SNTRUP761_PUBLICKEY_SIZE 1158
+#define SNTRUP761_CIPHERTEXT_SIZE 1039
+#define SNTRUP761_SIZE 32
+#endif
+
+typedef void sntrup761_random_func (void *ctx, size_t length, uint8_t *dst);
+
+void
+sntrup761_keypair (uint8_t *pk, uint8_t *sk,
+                  void *random_ctx, sntrup761_random_func *random);
+
+void
+sntrup761_enc (uint8_t *c, uint8_t *k, const uint8_t *pk,
+              void *random_ctx, sntrup761_random_func *random);
+
+void
+sntrup761_dec (uint8_t *k, const uint8_t *c, const uint8_t *sk);
+
+#endif /* SNTRUP761_H */
index ae55359ca712e384e58d494674b8a73c5996f391..4e6b1239adeec197b0d876ecb18b1b0c4c3cd66f 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 /* See http://www.cs.technion.ac.il/~biham/Reports/Tiger/  */
index 7941fe3ac2f12a13e1524b68d5829d3f98fc13fc..b8314adbf75766832afa569ad3bf629aab44de40 100644 (file)
 .globl _gcry_twofish_arm_encrypt_block
 ELF(.type   _gcry_twofish_arm_encrypt_block,%function;)
 
+.align 4
 _gcry_twofish_arm_encrypt_block:
        /* input:
         *      x0: ctx
@@ -264,12 +265,12 @@ _gcry_twofish_arm_encrypt_block:
 
        ret_spec_stop;
        CFI_ENDPROC();
-.ltorg
 ELF(.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;)
 
 .globl _gcry_twofish_arm_decrypt_block
 ELF(.type   _gcry_twofish_arm_decrypt_block,%function;)
 
+.align 4
 _gcry_twofish_arm_decrypt_block:
        /* input:
         *      %r0: ctx
index a7a60553399bd257e1a1d428dc17d663a570ec29..913b252d409de0089f888154445b3f7e754ddad4 100644 (file)
        xorl (w + 4 * (m))(CTX), x; \
        movl x, (4 * (n))(out);
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_encrypt_block
 ELF(.type   _gcry_twofish_amd64_encrypt_block,@function;)
 
@@ -215,7 +215,7 @@ _gcry_twofish_amd64_encrypt_block:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_decrypt_block
 ELF(.type   _gcry_twofish_amd64_decrypt_block,@function;)
 
@@ -486,7 +486,7 @@ ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;
        rorq $32,                       RAB2; \
        outunpack3(RAB, 2);
 
-.align 8
+.align 16
 ELF(.type __twofish_enc_blk3,@function;)
 
 __twofish_enc_blk3:
@@ -515,7 +515,7 @@ __twofish_enc_blk3:
        CFI_ENDPROC();
 ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;)
 
-.align 8
+.align 16
 ELF(.type  __twofish_dec_blk3,@function;)
 
 __twofish_dec_blk3:
@@ -544,7 +544,81 @@ __twofish_dec_blk3:
        CFI_ENDPROC();
 ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
 
-.align 8
+.align 16
+.globl _gcry_twofish_amd64_blk3
+ELF(.type   _gcry_twofish_amd64_blk3,@function;)
+_gcry_twofish_amd64_blk3:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (3 blocks)
+        *      %rdx: src (3 blocks)
+        *      %ecx: encrypt (0 or 1)
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       subq $(8 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(8 * 8);
+       movq %rbp, (0 * 8)(%rsp);
+       movq %rbx, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       movq %r14, (4 * 8)(%rsp);
+       movq %r15, (5 * 8)(%rsp);
+       CFI_REL_OFFSET(%rbp, 0 * 8);
+       CFI_REL_OFFSET(%rbx, 1 * 8);
+       CFI_REL_OFFSET(%r12, 2 * 8);
+       CFI_REL_OFFSET(%r13, 3 * 8);
+       CFI_REL_OFFSET(%r14, 4 * 8);
+       CFI_REL_OFFSET(%r15, 5 * 8);
+
+       testl %ecx, %ecx;
+       movq %rdx, RX0;
+       movq %rsi, (6 * 8)(%rsp);
+
+       movq (0 * 8)(RX0), RAB0;
+       movq (1 * 8)(RX0), RCD0;
+       movq (2 * 8)(RX0), RAB1;
+       movq (3 * 8)(RX0), RCD1;
+       movq (4 * 8)(RX0), RAB2;
+       movq (5 * 8)(RX0), RCD2;
+
+       jz .Lblk1_3_dec;
+               call __twofish_enc_blk3;
+               jmp .Lblk1_3_end;
+       .Lblk1_3_dec:
+               call __twofish_dec_blk3;
+
+.Lblk1_3_end:
+       movq (6 * 8)(%rsp), RX0;
+       movq RCD0, (0 * 8)(RX0);
+       movq RAB0, (1 * 8)(RX0);
+       movq RCD1, (2 * 8)(RX0);
+       movq RAB1, (3 * 8)(RX0);
+       movq RCD2, (4 * 8)(RX0);
+       movq RAB2, (5 * 8)(RX0);
+
+       movq (0 * 8)(%rsp), %rbp;
+       movq (1 * 8)(%rsp), %rbx;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       movq (4 * 8)(%rsp), %r14;
+       movq (5 * 8)(%rsp), %r15;
+       CFI_RESTORE(%rbp);
+       CFI_RESTORE(%rbx);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+       CFI_RESTORE(%r14);
+       CFI_RESTORE(%r15);
+       addq $(8 * 8), %rsp;
+       CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;)
+
+.align 16
 .globl _gcry_twofish_amd64_ctr_enc
 ELF(.type   _gcry_twofish_amd64_ctr_enc,@function;)
 _gcry_twofish_amd64_ctr_enc:
@@ -645,7 +719,7 @@ _gcry_twofish_amd64_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_cbc_dec
 ELF(.type   _gcry_twofish_amd64_cbc_dec,@function;)
 _gcry_twofish_amd64_cbc_dec:
@@ -730,7 +804,7 @@ _gcry_twofish_amd64_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_cfb_dec
 ELF(.type   _gcry_twofish_amd64_cfb_dec,@function;)
 _gcry_twofish_amd64_cfb_dec:
@@ -815,7 +889,7 @@ _gcry_twofish_amd64_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_ocb_enc
 ELF(.type   _gcry_twofish_amd64_ocb_enc,@function;)
 _gcry_twofish_amd64_ocb_enc:
@@ -941,7 +1015,7 @@ _gcry_twofish_amd64_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_ocb_dec
 ELF(.type   _gcry_twofish_amd64_ocb_dec,@function;)
 _gcry_twofish_amd64_ocb_dec:
@@ -1075,7 +1149,7 @@ _gcry_twofish_amd64_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_amd64_ocb_auth
 ELF(.type   _gcry_twofish_amd64_ocb_auth,@function;)
 _gcry_twofish_amd64_ocb_auth:
index 2e1da6cd15ebcd240c9c28c7ba437686861b061e..b381e546e01a8fa31bfd29b95dd575933a3fe055 100644 (file)
 #define k  ((w) + 4 * 8)
 
 /* register macros */
-#define CTX %r0
-#define CTXs0 %r0
-#define CTXs1 %r1
-#define CTXs3 %r7
+#define CTX r0
+#define CTXs0 r0
+#define CTXs1 r1
+#define CTXs3 r7
 
-#define RA %r3
-#define RB %r4
-#define RC %r5
-#define RD %r6
+#define RA r3
+#define RB r4
+#define RC r5
+#define RD r6
 
-#define RX %r2
-#define RY %ip
+#define RX r2
+#define RY ip
 
-#define RMASK %lr
+#define RMASK lr
 
-#define RT0 %r8
-#define RT1 %r9
-#define RT2 %r10
-#define RT3 %r11
+#define RT0 r8
+#define RT1 r9
+#define RT2 r10
+#define RT3 r11
 
 /* helper macros */
 #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
 
 _gcry_twofish_arm_encrypt_block:
        /* input:
-        *      %r0: ctx
-        *      %r1: dst
-        *      %r2: src
+        *      r0: ctx
+        *      r1: dst
+        *      r2: src
         */
-       push {%r1, %r4-%r11, %ip, %lr};
+       push {r1, r4-r11, ip, lr};
 
        add RY, CTXs0, #w;
 
-       ldr_input_le(%r2, RA, RB, RC, RD, RT0);
+       ldr_input_le(r2, RA, RB, RC, RD, RT0);
 
        /* Input whitening */
        ldm RY, {RT0, RT1, RT2, RT3};
@@ -292,7 +292,7 @@ _gcry_twofish_arm_encrypt_block:
        last_encrypt_cycle(7);
 
        add RY, CTXs3, #(w + 4*4 - s3);
-       pop {%r1}; /* dst */
+       pop {r1}; /* dst */
 
        /* Output whitening */
        ldm RY, {RT0, RT1, RT2, RT3};
@@ -301,9 +301,9 @@ _gcry_twofish_arm_encrypt_block:
        eor RA, RA, RT2;
        eor RB, RB, RT3;
 
-       str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
+       str_output_le(r1, RC, RD, RA, RB, RT0, RT1);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .ltorg
 .size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;
 
@@ -313,15 +313,15 @@ _gcry_twofish_arm_encrypt_block:
 
 _gcry_twofish_arm_decrypt_block:
        /* input:
-        *      %r0: ctx
-        *      %r1: dst
-        *      %r2: src
+        *      r0: ctx
+        *      r1: dst
+        *      r2: src
         */
-       push {%r1, %r4-%r11, %ip, %lr};
+       push {r1, r4-r11, ip, lr};
 
        add CTXs3, CTXs0, #(s3 - s0);
 
-       ldr_input_le(%r2, RC, RD, RA, RB, RT0);
+       ldr_input_le(r2, RC, RD, RA, RB, RT0);
 
        add RY, CTXs3, #(w + 4*4 - s3);
        add CTXs3, CTXs0, #(s3 - s0);
@@ -345,7 +345,7 @@ _gcry_twofish_arm_decrypt_block:
        last_decrypt_cycle(0);
 
        add RY, CTXs0, #w;
-       pop {%r1}; /* dst */
+       pop {r1}; /* dst */
 
        /* Output whitening */
        ldm RY, {RT0, RT1, RT2, RT3};
@@ -354,9 +354,9 @@ _gcry_twofish_arm_decrypt_block:
        eor RC, RC, RT2;
        eor RD, RD, RT3;
 
-       str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
+       str_output_le(r1, RA, RB, RC, RD, RT0, RT1);
 
-       pop {%r4-%r11, %ip, %pc};
+       pop {r4-r11, ip, pc};
 .size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;
 
 #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
index 930ac792c1f05677e9871d8d96ec11162e67d7a3..3f61f87b1544cb3cec81fd510e0c800592ccbcec 100644 (file)
 /* register macros */
 #define CTX    %rdi
 
-#define RROUND  %rbp
-#define RROUNDd %ebp
+#define RROUND  %r13
+#define RROUNDd %r13d
 #define RS0    CTX
 #define RS1    %r8
 #define RS2    %r9
 #define RS3    %r10
 #define RK     %r11
-#define RW     %rax
+#define RW     %r12
+#define RIDX0  %rax
+#define RIDX0d %eax
+#define RIDX1  %rbx
+#define RIDX1d %ebx
+#define RIDX2  %r14
+#define RIDX3  %r15
 
 #define RA0    %ymm8
 #define RB0    %ymm9
 #define RX1    %ymm2
 #define RY1    %ymm3
 #define RT0    %ymm4
-#define RIDX   %ymm5
+#define RT1    %ymm5
 
 #define RX0x   %xmm0
 #define RY0x   %xmm1
 #define RX1x   %xmm2
 #define RY1x   %xmm3
 #define RT0x   %xmm4
-#define RIDXx  %xmm5
+#define RT1x   %xmm5
 
 #define RTMP0   RX0
 #define RTMP0x  RX0x
@@ -80,8 +86,8 @@
 #define RTMP2x  RY0x
 #define RTMP3   RY1
 #define RTMP3x  RY1x
-#define RTMP4   RIDX
-#define RTMP4x  RIDXx
+#define RTMP4   RT1
+#define RTMP4x  RT1x
 
 /* vpgatherdd mask and '-1' */
 #define RNOT   %ymm6
        leaq s2(CTX), RS2; \
        leaq s3(CTX), RS3; \
 
+#define do_gather(stoffs, byteoffs, rs, out) \
+       movzbl (stoffs + 0*4 + byteoffs)(%rsp), RIDX0d; \
+       movzbl (stoffs + 1*4 + byteoffs)(%rsp), RIDX1d; \
+       movzbq (stoffs + 2*4 + byteoffs)(%rsp), RIDX2; \
+       movzbq (stoffs + 3*4 + byteoffs)(%rsp), RIDX3; \
+       vmovd (rs, RIDX0, 4), RT1x; \
+       movzbl (stoffs + 4*4 + byteoffs)(%rsp), RIDX0d; \
+       vmovd (rs, RIDX0, 4), RT0x; \
+       vpinsrd $1, (rs, RIDX1, 4), RT1x, RT1x; \
+       movzbl (stoffs + 5*4 + byteoffs)(%rsp), RIDX1d; \
+       vpinsrd $1, (rs, RIDX1, 4), RT0x, RT0x; \
+       vpinsrd $2, (rs, RIDX2, 4), RT1x, RT1x; \
+       movzbq (stoffs + 6*4 + byteoffs)(%rsp), RIDX2; \
+       vpinsrd $2, (rs, RIDX2, 4), RT0x, RT0x; \
+       vpinsrd $3, (rs, RIDX3, 4), RT1x, RT1x; \
+       movzbq (stoffs + 7*4 + byteoffs)(%rsp), RIDX3; \
+       vpinsrd $3, (rs, RIDX3, 4), RT0x, RT0x; \
+       vinserti128 $1, RT0x, RT1, out;
+
 #define g16(ab, rs0, rs1, rs2, rs3, xy) \
-       vpand RBYTE, ab ## 0, RIDX; \
-       vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
-       vpcmpeqd RNOT, RNOT, RNOT; \
-               \
-               vpand RBYTE, ab ## 1, RIDX; \
-               vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
-               vpcmpeqd RNOT, RNOT, RNOT; \
-       \
-       vpsrld $8, ab ## 0, RIDX; \
-       vpand RBYTE, RIDX, RIDX; \
-       vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-       vpcmpeqd RNOT, RNOT, RNOT; \
-       vpxor RT0, xy ## 0, xy ## 0; \
-               \
-               vpsrld $8, ab ## 1, RIDX; \
-               vpand RBYTE, RIDX, RIDX; \
-               vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-               vpcmpeqd RNOT, RNOT, RNOT; \
-               vpxor RT0, xy ## 1, xy ## 1; \
-       \
-       vpsrld $16, ab ## 0, RIDX; \
-       vpand RBYTE, RIDX, RIDX; \
-       vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-       vpcmpeqd RNOT, RNOT, RNOT; \
-       vpxor RT0, xy ## 0, xy ## 0; \
-               \
-               vpsrld $16, ab ## 1, RIDX; \
-               vpand RBYTE, RIDX, RIDX; \
-               vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-               vpcmpeqd RNOT, RNOT, RNOT; \
-               vpxor RT0, xy ## 1, xy ## 1; \
-       \
-       vpsrld $24, ab ## 0, RIDX; \
-       vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-       vpcmpeqd RNOT, RNOT, RNOT; \
-       vpxor RT0, xy ## 0, xy ## 0; \
-               \
-               vpsrld $24, ab ## 1, RIDX; \
-               vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-               vpcmpeqd RNOT, RNOT, RNOT; \
-               vpxor RT0, xy ## 1, xy ## 1;
+       vmovdqa ab ## 0, 0(%rsp); \
+       vmovdqa ab ## 1, 32(%rsp); \
+       do_gather(0*32, 0, rs0, xy ## 0); \
+               do_gather(1*32, 0, rs0, xy ## 1); \
+       do_gather(0*32, 1, rs1, RT1); \
+       vpxor RT1, xy ## 0, xy ## 0; \
+               do_gather(1*32, 1, rs1, RT1); \
+               vpxor RT1, xy ## 1, xy ## 1; \
+       do_gather(0*32, 2, rs2, RT1); \
+       vpxor RT1, xy ## 0, xy ## 0; \
+               do_gather(1*32, 2, rs2, RT1); \
+               vpxor RT1, xy ## 1, xy ## 1; \
+       do_gather(0*32, 3, rs3, RT1); \
+       vpxor RT1, xy ## 0, xy ## 0; \
+               do_gather(1*32, 3, rs3, RT1); \
+               vpxor RT1, xy ## 1, xy ## 1;
 
 #define g1_16(a, x) \
        g16(a, RS0, RS1, RS2, RS3, x);
 #define encrypt_round_end16(a, b, c, d, nk, r) \
        vpaddd RY0, RX0, RX0; \
        vpaddd RX0, RY0, RY0; \
-       vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+       vpbroadcastd ((nk))(RK,r), RT0; \
        vpaddd RT0, RX0, RX0; \
-       vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+       vpbroadcastd 4+((nk))(RK,r), RT0; \
        vpaddd RT0, RY0, RY0; \
        \
        vpxor RY0, d ## 0, d ## 0; \
        \
                vpaddd RY1, RX1, RX1; \
                vpaddd RX1, RY1, RY1; \
-               vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+               vpbroadcastd ((nk))(RK,r), RT0; \
                vpaddd RT0, RX1, RX1; \
-               vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+               vpbroadcastd 4+((nk))(RK,r), RT0; \
                vpaddd RT0, RY1, RY1; \
                \
                vpxor RY1, d ## 1, d ## 1; \
 #define decrypt_round_end16(a, b, c, d, nk, r) \
        vpaddd RY0, RX0, RX0; \
        vpaddd RX0, RY0, RY0; \
-       vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+       vpbroadcastd ((nk))(RK,r), RT0; \
        vpaddd RT0, RX0, RX0; \
-       vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+       vpbroadcastd 4+((nk))(RK,r), RT0; \
        vpaddd RT0, RY0, RY0; \
        \
        vpxor RX0, c ## 0, c ## 0; \
        \
                vpaddd RY1, RX1, RX1; \
                vpaddd RX1, RY1, RY1; \
-               vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+               vpbroadcastd ((nk))(RK,r), RT0; \
                vpaddd RT0, RX1, RX1; \
-               vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+               vpbroadcastd 4+((nk))(RK,r), RT0; \
                vpaddd RT0, RY1, RY1; \
                \
                vpxor RX1, c ## 1, c ## 1; \
        \
        decrypt_round_end16(a, b, c, d, nk, r);
 
-#define encrypt_cycle16(r) \
-       encrypt_round16(RA, RB, RC, RD, 0, r); \
-       encrypt_round16(RC, RD, RA, RB, 8, r);
-
-#define encrypt_cycle_first16(r) \
-       encrypt_round_first16(RA, RB, RC, RD, 0, r); \
-       encrypt_round16(RC, RD, RA, RB, 8, r);
-
-#define encrypt_cycle_last16(r) \
-       encrypt_round16(RA, RB, RC, RD, 0, r); \
-       encrypt_round_last16(RC, RD, RA, RB, 8, r);
-
-#define decrypt_cycle16(r) \
-       decrypt_round16(RC, RD, RA, RB, 8, r); \
-       decrypt_round16(RA, RB, RC, RD, 0, r);
-
-#define decrypt_cycle_first16(r) \
-       decrypt_round_first16(RC, RD, RA, RB, 8, r); \
-       decrypt_round16(RA, RB, RC, RD, 0, r);
-
-#define decrypt_cycle_last16(r) \
-       decrypt_round16(RC, RD, RA, RB, 8, r); \
-       decrypt_round_last16(RA, RB, RC, RD, 0, r);
-
 #define transpose_4x4(x0,x1,x2,x3,t1,t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;
 
-#define read_blocks8(offs,a,b,c,d) \
-       vmovdqu 16*offs(RIO), a; \
-       vmovdqu 16*offs+32(RIO), b; \
-       vmovdqu 16*offs+64(RIO), c; \
-       vmovdqu 16*offs+96(RIO), d; \
-       \
-       transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define write_blocks8(offs,a,b,c,d) \
-       transpose_4x4(a, b, c, d, RX0, RY0); \
-       \
-       vmovdqu a, 16*offs(RIO); \
-       vmovdqu b, 16*offs+32(RIO); \
-       vmovdqu c, 16*offs+64(RIO); \
-       vmovdqu d, 16*offs+96(RIO);
-
 #define inpack_enc8(a,b,c,d) \
        vpbroadcastd 4*0(RW), RT0; \
        vpxor RT0, a, a; \
        outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
        outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
 
-.align 8
+.align 16
 ELF(.type __twofish_enc_blk16,@function;)
 __twofish_enc_blk16:
        /* input:
@@ -414,28 +374,68 @@ __twofish_enc_blk16:
         *                                              ciphertext blocks
         */
        CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+       subq $(64 + 5 * 8), %rsp;
+       andq $-64, %rsp;
+
+       movq %rbx, (64 + 0 * 8)(%rsp);
+       movq %r12, (64 + 1 * 8)(%rsp);
+       movq %r13, (64 + 2 * 8)(%rsp);
+       movq %r14, (64 + 3 * 8)(%rsp);
+       movq %r15, (64 + 4 * 8)(%rsp);
+       CFI_REG_ON_STACK(rbx, 64 + 0 * 8);
+       CFI_REG_ON_STACK(r12, 64 + 1 * 8);
+       CFI_REG_ON_STACK(r13, 64 + 2 * 8);
+       CFI_REG_ON_STACK(r14, 64 + 3 * 8);
+       CFI_REG_ON_STACK(r15, 64 + 4 * 8);
+
        init_round_constants();
 
        transpose4x4_16(RA, RB, RC, RD);
        inpack_enc16(RA, RB, RC, RD);
 
-       encrypt_cycle_first16(0);
-       encrypt_cycle16(2);
-       encrypt_cycle16(4);
-       encrypt_cycle16(6);
-       encrypt_cycle16(8);
-       encrypt_cycle16(10);
-       encrypt_cycle16(12);
-       encrypt_cycle_last16(14);
+       xorl RROUNDd, RROUNDd;
+
+       encrypt_round_first16(RA, RB, RC, RD, 0, RROUND);
+
+.align 16
+.Loop_enc16:
+       encrypt_round16(RC, RD, RA, RB, 8, RROUND);
+       encrypt_round16(RA, RB, RC, RD, 16, RROUND);
+       leal 16(RROUNDd), RROUNDd;
+       cmpl $8*14, RROUNDd;
+       jb .Loop_enc16;
+
+       encrypt_round_last16(RC, RD, RA, RB, 8, RROUND);
 
        outunpack_enc16(RA, RB, RC, RD);
        transpose4x4_16(RA, RB, RC, RD);
 
+       movq (64 + 0 * 8)(%rsp), %rbx;
+       movq (64 + 1 * 8)(%rsp), %r12;
+       movq (64 + 2 * 8)(%rsp), %r13;
+       movq (64 + 3 * 8)(%rsp), %r14;
+       movq (64 + 4 * 8)(%rsp), %r15;
+       CFI_RESTORE(%rbx);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+       CFI_RESTORE(%r14);
+       CFI_RESTORE(%r15);
+       vpxor RT0, RT0, RT0;
+       vmovdqa RT0, 0(%rsp);
+       vmovdqa RT0, 32(%rsp);
+       leave;
+       CFI_LEAVE();
+
        ret_spec_stop;
        CFI_ENDPROC();
 ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)
 
-.align 8
+.align 16
 ELF(.type __twofish_dec_blk16,@function;)
 __twofish_dec_blk16:
        /* input:
@@ -447,34 +447,119 @@ __twofish_dec_blk16:
         *                                              ciphertext blocks
         */
        CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+       subq $(64 + 5 * 8), %rsp;
+       andq $-64, %rsp;
+
+       movq %rbx, (64 + 0 * 8)(%rsp);
+       movq %r12, (64 + 1 * 8)(%rsp);
+       movq %r13, (64 + 2 * 8)(%rsp);
+       movq %r14, (64 + 3 * 8)(%rsp);
+       movq %r15, (64 + 4 * 8)(%rsp);
+       CFI_REG_ON_STACK(rbx, 64 + 0 * 8);
+       CFI_REG_ON_STACK(r12, 64 + 1 * 8);
+       CFI_REG_ON_STACK(r13, 64 + 2 * 8);
+       CFI_REG_ON_STACK(r14, 64 + 3 * 8);
+       CFI_REG_ON_STACK(r15, 64 + 4 * 8);
+
        init_round_constants();
 
        transpose4x4_16(RA, RB, RC, RD);
        inpack_dec16(RA, RB, RC, RD);
 
-       decrypt_cycle_first16(14);
-       decrypt_cycle16(12);
-       decrypt_cycle16(10);
-       decrypt_cycle16(8);
-       decrypt_cycle16(6);
-       decrypt_cycle16(4);
-       decrypt_cycle16(2);
-       decrypt_cycle_last16(0);
+       movl $14*8, RROUNDd;
+
+       decrypt_round_first16(RC, RD, RA, RB, 8, RROUND);
+
+.align 16
+.Loop_dec16:
+       decrypt_round16(RA, RB, RC, RD, 0, RROUND);
+       decrypt_round16(RC, RD, RA, RB, -8, RROUND);
+       subl $16, RROUNDd;
+       jnz .Loop_dec16;
+
+       decrypt_round_last16(RA, RB, RC, RD, 0, RROUND);
 
        outunpack_dec16(RA, RB, RC, RD);
        transpose4x4_16(RA, RB, RC, RD);
 
+       movq (64 + 0 * 8)(%rsp), %rbx;
+       movq (64 + 1 * 8)(%rsp), %r12;
+       movq (64 + 2 * 8)(%rsp), %r13;
+       movq (64 + 3 * 8)(%rsp), %r14;
+       movq (64 + 4 * 8)(%rsp), %r15;
+       CFI_RESTORE(%rbx);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+       CFI_RESTORE(%r14);
+       CFI_RESTORE(%r15);
+       vpxor RT0, RT0, RT0;
+       vmovdqa RT0, 0(%rsp);
+       vmovdqa RT0, 32(%rsp);
+       leave;
+       CFI_LEAVE();
+
        ret_spec_stop;
        CFI_ENDPROC();
 ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
 
+.align 16
+.globl _gcry_twofish_avx2_blk16
+ELF(.type   _gcry_twofish_avx2_blk16,@function;)
+_gcry_twofish_avx2_blk16:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %ecx: encrypt
+        */
+       CFI_STARTPROC();
+
+       vzeroupper;
+
+       vmovdqu (0 * 32)(%rdx), RA0;
+       vmovdqu (1 * 32)(%rdx), RB0;
+       vmovdqu (2 * 32)(%rdx), RC0;
+       vmovdqu (3 * 32)(%rdx), RD0;
+       vmovdqu (4 * 32)(%rdx), RA1;
+       vmovdqu (5 * 32)(%rdx), RB1;
+       vmovdqu (6 * 32)(%rdx), RC1;
+       vmovdqu (7 * 32)(%rdx), RD1;
+
+       testl %ecx, %ecx;
+       jz .Lblk16_dec;
+               call __twofish_enc_blk16;
+               jmp .Lblk16_end;
+       .Lblk16_dec:
+               call __twofish_dec_blk16;
+
+.Lblk16_end:
+       vmovdqu RA0, (0 * 32)(%rsi);
+       vmovdqu RB0, (1 * 32)(%rsi);
+       vmovdqu RC0, (2 * 32)(%rsi);
+       vmovdqu RD0, (3 * 32)(%rsi);
+       vmovdqu RA1, (4 * 32)(%rsi);
+       vmovdqu RB1, (5 * 32)(%rsi);
+       vmovdqu RC1, (6 * 32)(%rsi);
+       vmovdqu RD1, (7 * 32)(%rsi);
+
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;)
+
 #define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
 
-.align 8
+.align 16
 .globl _gcry_twofish_avx2_ctr_enc
 ELF(.type   _gcry_twofish_avx2_ctr_enc,@function;)
 _gcry_twofish_avx2_ctr_enc:
@@ -586,7 +671,7 @@ _gcry_twofish_avx2_ctr_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_avx2_cbc_dec
 ELF(.type   _gcry_twofish_avx2_cbc_dec,@function;)
 _gcry_twofish_avx2_cbc_dec:
@@ -639,7 +724,7 @@ _gcry_twofish_avx2_cbc_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_avx2_cfb_dec
 ELF(.type   _gcry_twofish_avx2_cfb_dec,@function;)
 _gcry_twofish_avx2_cfb_dec:
@@ -694,7 +779,7 @@ _gcry_twofish_avx2_cfb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_avx2_ocb_enc
 ELF(.type _gcry_twofish_avx2_ocb_enc,@function;)
 
@@ -808,7 +893,7 @@ _gcry_twofish_avx2_ocb_enc:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_avx2_ocb_dec
 ELF(.type _gcry_twofish_avx2_ocb_dec,@function;)
 
@@ -933,7 +1018,7 @@ _gcry_twofish_avx2_ocb_dec:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;)
 
-.align 8
+.align 16
 .globl _gcry_twofish_avx2_ocb_auth
 ELF(.type _gcry_twofish_avx2_ocb_auth,@function;)
 
@@ -1036,10 +1121,13 @@ _gcry_twofish_avx2_ocb_auth:
        CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;)
 
+SECTION_RODATA
+
 .align 16
 
 /* For CTR-mode IV byteswap */
- _gcry_twofish_bswap128_mask:
+ELF(.type _gcry_twofish_bswap128_mask,@object)
+_gcry_twofish_bswap128_mask:
 .Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;)
index d19e079046a6a6da391116af0f37e55cc429e079..11a6e251792189a7718c78cf8700d81da63dfc32 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  ********************************************************************
  *
  * This code is a "clean room" implementation, written from the paper
@@ -46,7 +46,7 @@
 #include "cipher.h"
 #include "bufhelp.h"
 #include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
 
 
 #define TWOFISH_BLOCKSIZE 16
@@ -101,7 +101,12 @@ static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                       int encrypt);
 static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                                      size_t nblocks);
-
+static void _gcry_twofish_xts_crypt (void *context, unsigned char *tweak,
+                                    void *outbuf_arg, const void *inbuf_arg,
+                                    size_t nblocks, int encrypt);
+static void _gcry_twofish_ecb_crypt (void *context, void *outbuf_arg,
+                                    const void *inbuf_arg, size_t nblocks,
+                                    int encrypt);
 
 /* Structure for an expanded Twofish key.  s contains the key-dependent
  * S-boxes composed with the MDS matrix; w contains the eight "whitening"
@@ -762,11 +767,7 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
   rc = do_twofish_setkey (ctx, key, keylen);
 
 #ifdef USE_AVX2
-  ctx->use_avx2 = 0;
-  if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
-    {
-      ctx->use_avx2 = 1;
-    }
+  ctx->use_avx2 = (hwfeatures & HWF_INTEL_AVX2) != 0;
 #endif
 
   /* Setup bulk encryption routines.  */
@@ -775,7 +776,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
   bulk_ops->cfb_dec = _gcry_twofish_cfb_dec;
   bulk_ops->ctr_enc = _gcry_twofish_ctr_enc;
   bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt;
-  bulk_ops->ocb_auth  = _gcry_twofish_ocb_auth;
+  bulk_ops->ocb_auth = _gcry_twofish_ocb_auth;
+  bulk_ops->xts_crypt = _gcry_twofish_xts_crypt;
+  bulk_ops->ecb_crypt = _gcry_twofish_ecb_crypt;
 
   (void)hwfeatures;
 
@@ -788,6 +791,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
 /* Assembler implementations of Twofish using AVX2.  Process 16 block in
    parallel.
  */
+extern void _gcry_twofish_avx2_blk16 (const TWOFISH_context *c, byte *out,
+                                     const byte *in, int encrypt) ASM_FUNC_ABI;
+
 extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
                                       unsigned char *out,
                                       const unsigned char *in,
@@ -835,6 +841,9 @@ extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c,
                                              byte *out, const byte *in);
 
 /* These assembly implementations process three blocks in parallel. */
+extern void _gcry_twofish_amd64_blk3(const TWOFISH_context *c, byte *out,
+                                    const byte *in, int encrypt);
+
 extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out,
                                        const byte *in, byte *ctr);
 
@@ -1358,27 +1367,11 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_avx2 = 0;
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
-
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -1471,27 +1464,11 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_avx2 = 0;
       u64 Ls[16];
-      unsigned int n = 16 - (blkn % 16);
       u64 *l;
-      int i;
 
       if (nblocks >= 16)
        {
-         for (i = 0; i < 16; i += 8)
-           {
-             /* Use u64 to store pointers for x32 support (assembly function
-              * assumes 64-bit pointers). */
-             Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
-             Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-             Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
-             Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
-           }
-
-         Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
-         l = &Ls[(15 + n) % 16];
+          l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
 
          /* Process data in 16 block chunks. */
          while (nblocks >= 16)
@@ -1533,7 +1510,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
        blkn += 3;
 
        twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-                             c->u_mode.ocb.aad_sum, Ls);
+                              c->u_mode.ocb.aad_sum, Ls);
 
        nblocks -= 3;
        abuf += 3 * TWOFISH_BLOCKSIZE;
@@ -1558,47 +1535,135 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   return nblocks;
 }
 
-\f
 
-/* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR
-   encryption.  Returns NULL on success. */
-static const char *
-selftest_ctr (void)
+static unsigned int
+twofish_crypt_blk1_16(void *context, byte *out, const byte *in,
+                     size_t num_blks, int encrypt)
 {
-  const int nblocks = 16+1;
-  const int blocksize = TWOFISH_BLOCKSIZE;
-  const int context_size = sizeof(TWOFISH_context);
+  TWOFISH_context *ctx = context;
+  unsigned int burn, burn_stack_depth = 0;
 
-  return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey,
-           &twofish_encrypt, nblocks, blocksize, context_size);
+#ifdef USE_AVX2
+  if (num_blks == 16 && ctx->use_avx2)
+    {
+      _gcry_twofish_avx2_blk16 (ctx, out, in, encrypt);
+      return 0;
+    }
+#endif
+
+#ifdef USE_AMD64_ASM
+  while (num_blks >= 3)
+    {
+      _gcry_twofish_amd64_blk3 (ctx, out, in, encrypt);
+      burn = 8 * sizeof(void *);
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += 3 * TWOFISH_BLOCKSIZE;
+      in += 3 * TWOFISH_BLOCKSIZE;
+      num_blks -= 3;
+    }
+#endif
+
+  while (num_blks >= 1)
+    {
+      if (encrypt)
+       burn = twofish_encrypt((void *)ctx, out, in);
+      else
+       burn = twofish_decrypt((void *)ctx, out, in);
+
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += TWOFISH_BLOCKSIZE;
+      in += TWOFISH_BLOCKSIZE;
+      num_blks--;
+    }
+
+  return burn_stack_depth;
 }
 
-/* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cbc (void)
+static unsigned int
+twofish_encrypt_blk1_16(void *ctx, byte *out, const byte *in,
+                       size_t num_blks)
 {
-  const int nblocks = 16+2;
-  const int blocksize = TWOFISH_BLOCKSIZE;
-  const int context_size = sizeof(TWOFISH_context);
+  return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 1);
+}
 
-  return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey,
-           &twofish_encrypt, nblocks, blocksize, context_size);
+static unsigned int
+twofish_decrypt_blk1_16(void *ctx, byte *out, const byte *in,
+                       size_t num_blks)
+{
+  return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 0);
 }
 
-/* Run the self-tests for TWOFISH-CFB, tests bulk CBC decryption.
-   Returns NULL on success. */
-static const char *
-selftest_cfb (void)
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_twofish_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+                        const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  TWOFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[16 * 16];
+      unsigned int tmp_used = 16;
+      size_t tmpbufsize = 15 * 16;
+      size_t nburn;
+
+#ifdef USE_AVX2
+      if (ctx->use_avx2)
+       tmpbufsize = 16 * 16;
+#endif
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16
+                                              : twofish_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks,
+                                 tweak, tmpbuf, tmpbufsize / 16,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg,
+                        size_t nblocks, int encrypt)
 {
-  const int nblocks = 16+2;
-  const int blocksize = TWOFISH_BLOCKSIZE;
-  const int context_size = sizeof(TWOFISH_context);
+  TWOFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      size_t fn_maxblocks = 15;
+      size_t nburn;
 
-  return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey,
-           &twofish_encrypt, nblocks, blocksize, context_size);
+#ifdef USE_AVX2
+      if (ctx->use_avx2)
+       fn_maxblocks = 16;
+#endif
+
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16
+                                              : twofish_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks, fn_maxblocks);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
+
 \f
 /* Test a single encryption and decryption with each key size. */
 
@@ -1608,7 +1673,6 @@ selftest (void)
   TWOFISH_context ctx; /* Expanded key. */
   byte scratch[16];    /* Encryption/decryption result buffer. */
   cipher_bulk_ops_t bulk_ops;
-  const char *r;
 
   /* Test vectors for single encryption/decryption.  Note that I am using
    * the vectors from the Twofish paper's "known answer test", I=3 for
@@ -1658,13 +1722,6 @@ selftest (void)
   if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
     return "Twofish-256 test decryption failed.";
 
-  if ((r = selftest_ctr()) != NULL)
-    return r;
-  if ((r = selftest_cbc()) != NULL)
-    return r;
-  if ((r = selftest_cfb()) != NULL)
-    return r;
-
   return NULL;
 }
 \f
index 82ac36ef4114936c2179cb5f2c0502df8cf0f12f..c001743a2e313cee2b6b3ba52ae4e8c5b5d26522 100644 (file)
@@ -126,8 +126,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
@@ -304,9 +304,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
index d7ca1b5d4380d4231b9e1b598a575c9dd591f46a..7684e4f312d8e810abb4f3302629d0ee4b04924e 100644 (file)
@@ -30,9 +30,9 @@ _gcry_compat_identification (void)
   static const char blurb[] =
     "\n\n"
     "This is Libgcrypt " PACKAGE_VERSION " - The GNU Crypto Library\n"
-    "Copyright (C) 2012-2022 g10 Code GmbH\n"
-    "Copyright (C) 2013-2022 Jussi Kivilinna\n"
     "Copyright (C) 2000-2018 Free Software Foundation, Inc.\n"
+    "Copyright (C) 2012-2024 g10 Code GmbH\n"
+    "Copyright (C) 2013-2024 Jussi Kivilinna\n"
     "\n"
     "(" BUILD_REVISION " " BUILD_TIMESTAMP ")\n"
     "\n\n";
index b5a7649129369fdb0382af59f69c963f6b3c0064..d3eb81bfcd3776077bf2ea46ddf37722a13a765f 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 883969db25a6d9e16066da5f858bbbbd01d26871..5e51c16be66c47e6284e26d368b5e556e20533ea 100644 (file)
@@ -36,6 +36,9 @@
 /* Enable support for Intel AVX2 instructions. */
 #undef ENABLE_AVX2_SUPPORT
 
+/* Enable support for Intel AVX512 instructions. */
+#undef ENABLE_AVX512_SUPPORT
+
 /* Enable support for Intel AVX instructions. */
 #undef ENABLE_AVX_SUPPORT
 
@@ -45,6 +48,9 @@
 /* Enable forcing 'soft' HW feature bits on (for testing). */
 #undef ENABLE_FORCE_SOFT_HWFEATURES
 
+/* Enable support for Intel GFNI instructions. */
+#undef ENABLE_GFNI_SUPPORT
+
 /* Define to support an HMAC based integrity check */
 #undef ENABLE_HMAC_BINARY_CHECK
 
 /* Enable support for Intel SSE4.1 instructions. */
 #undef ENABLE_SSE41_SUPPORT
 
+/* Enable support for ARMv9 SVE2 instructions. */
+#undef ENABLE_SVE2_SUPPORT
+
+/* Enable support for ARMv8 SVE instructions. */
+#undef ENABLE_SVE_SUPPORT
+
 /* Define FIPS module version for certification */
 #undef FIPS_MODULE_VERSION
 
 /* Define to 1 if the system has the type `byte'. */
 #undef HAVE_BYTE
 
+/* Defined if compiler supports clang PowerPC target attributes */
+#undef HAVE_CLANG_ATTRIBUTE_PPC_TARGET
+
 /* Define to 1 if you have the `clock' function. */
 #undef HAVE_CLOCK
 
 /* Define to 1 if you have the `clock_gettime' function. */
 #undef HAVE_CLOCK_GETTIME
 
+/* Defined if underlying compiler supports AArch64/NEON/crypto intrinsics */
+#undef HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS
+
+/* Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with
+   extra GCC flags */
+#undef HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS
+
 /* Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto
    intrinsics */
 #undef HAVE_COMPATIBLE_CC_PPC_ALTIVEC
    intrinsics with extra GCC flags */
 #undef HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS
 
+/* Defined if underlying compiler supports x86/AVX512 intrinsics */
+#undef HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS
+
 /* Defined if underlying assembler is compatible with ARMv8/Aarch64 assembly
    implementations */
 #undef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
    implementations */
 #undef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
 
+/* Defined if underlying assembler is compatible with i386 assembly
+   implementations */
+#undef HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS
+
+/* Defined if underlying assembler is compatible with WIN32 assembly
+   implementations */
+#undef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+
 /* Defined if underlying assembler is compatible with WIN64 assembly
    implementations */
 #undef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
    */
 #undef HAVE_GCC_ATTRIBUTE_MS_ABI
 
+/* Defined if compiler supports "__attribute__ ((optimize))" function
+   attribute */
+#undef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+
 /* Defined if a GCC style "__attribute__ ((packed))" is supported */
 #undef HAVE_GCC_ATTRIBUTE_PACKED
 
+/* Defined if compiler supports GCC PowerPC target attributes */
+#undef HAVE_GCC_ATTRIBUTE_PPC_TARGET
+
 /* Defined if compiler supports "__attribute__ ((sysv_abi))" function
    attribute */
 #undef HAVE_GCC_ATTRIBUTE_SYSV_ABI
 /* Defined if inline assembler supports AArch64 NEON instructions */
 #undef HAVE_GCC_INLINE_ASM_AARCH64_NEON
 
+/* Defined if inline assembler supports AArch64 SHA3/SHA512/SM3/SM4
+   instructions */
+#undef HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4
+
+/* Defined if inline assembler supports AArch64 SVE instructions */
+#undef HAVE_GCC_INLINE_ASM_AARCH64_SVE
+
+/* Defined if inline assembler supports AArch64 SVE2 instructions */
+#undef HAVE_GCC_INLINE_ASM_AARCH64_SVE2
+
 /* Defined if inline assembler supports AVX instructions */
 #undef HAVE_GCC_INLINE_ASM_AVX
 
 /* Defined if inline assembler supports AVX2 instructions */
 #undef HAVE_GCC_INLINE_ASM_AVX2
 
+/* Defined if inline assembler supports AVX512 instructions */
+#undef HAVE_GCC_INLINE_ASM_AVX512
+
 /* Defined if inline assembler supports BMI2 instructions */
 #undef HAVE_GCC_INLINE_ASM_BMI2
 
+/* Defined if inline assembler supports GFNI instructions */
+#undef HAVE_GCC_INLINE_ASM_GFNI
+
 /* Defined if inline assembler supports NEON instructions */
 #undef HAVE_GCC_INLINE_ASM_NEON
 
 /* Define to 1 if you have the `mmap' function. */
 #undef HAVE_MMAP
 
-/* Defined if the GNU Pth is available */
-#undef HAVE_PTH
-
 /* Define if we have pthread. */
 #undef HAVE_PTHREAD
 
 /* Define to 1 if you have the `sysconf' function. */
 #undef HAVE_SYSCONF
 
+/* Define to 1 if you have the `sysctlbyname' function. */
+#undef HAVE_SYSCTLBYNAME
+
 /* Define to 1 if you have the `syslog' function. */
 #undef HAVE_SYSLOG
 
 /* Define to 1 if you have the <sys/random.h> header file. */
 #undef HAVE_SYS_RANDOM_H
 
-/* Define to 1 if you have the <sys/socket.h> header file. */
-#undef HAVE_SYS_SOCKET_H
-
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #undef HAVE_SYS_STAT_H
 
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#undef HAVE_SYS_SYSCTL_H
+
 /* Define to 1 if you have the <sys/types.h> header file. */
 #undef HAVE_SYS_TYPES_H
 
 /* Define to 1 if you have the <wchar.h> header file. */
 #undef HAVE_WCHAR_H
 
-/* Define to 1 if you have the <ws2tcpip.h> header file. */
-#undef HAVE_WS2TCPIP_H
-
 /* Defined if this is not a regular release */
 #undef IS_DEVELOPMENT_VERSION
 
    */
 #undef LT_OBJDIR
 
-/* Define to use the (obsolete) malloc guarding feature */
-#undef M_GUARD
-
 /* defined to the name of the strong random device */
 #undef NAME_OF_DEV_RANDOM
 
 /* The size of `unsigned short', as computed by sizeof. */
 #undef SIZEOF_UNSIGNED_SHORT
 
+/* The size of `unsigned __int128', as computed by sizeof. */
+#undef SIZEOF_UNSIGNED___INT128
+
 /* The size of `void *', as computed by sizeof. */
 #undef SIZEOF_VOID_P
 
 /* Defined if this module should be included */
 #undef USE_ARCFOUR
 
+/* Defined if this module should be included */
+#undef USE_ARIA
+
 /* Defined if this module should be included */
 #undef USE_BLAKE2
 
 /* Defined if this module should be included */
 #undef USE_ELGAMAL
 
-/* Defined if the GNU Portable Thread Library should be used */
-#undef USE_GNU_PTH
-
 /* Defined if this module should be included */
 #undef USE_GOST28147
 
 /* Define to `unsigned int' if <sys/types.h> does not define. */
 #undef size_t
 
-/* type to use in place of socklen_t if not defined */
-#undef socklen_t
-
 /* Define to the type of an unsigned integer type wide enough to hold a
    pointer, if such a type exists, and if the system does not define it. */
 #undef uintptr_t
index 131114c68a87deab0e3c96264a40aebcc4e985bf..4a7bee2fbeb93c1e22144d77adcfbfa88284d475 100755 (executable)
--- a/configure
+++ b/configure
@@ -1,7 +1,7 @@
 #! /bin/sh
 # From configure.ac Revision.
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for libgcrypt 1.10.3.
+# Generated by GNU Autoconf 2.71 for libgcrypt 1.11.0.
 #
 # Report bugs to <https://bugs.gnupg.org>.
 #
@@ -184,6 +184,7 @@ test -x / || exit 1"
   as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
   eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
   test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
+test \$(( 1 + 1 )) = 2 || exit 1
 
   test -n \"\${ZSH_VERSION+set}\${BASH_VERSION+set}\" || (
     ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
@@ -191,8 +192,7 @@ test -x / || exit 1"
     ECHO=\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO\$ECHO
     PATH=/empty FPATH=/empty; export PATH FPATH
     test \"X\`printf %s \$ECHO\`\" = \"X\$ECHO\" \\
-      || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1
-test \$(( 1 + 1 )) = 2 || exit 1"
+      || test \"X\`print -r -- \$ECHO\`\" = \"X\$ECHO\" ) || exit 1"
   if (eval "$as_required") 2>/dev/null
 then :
   as_have_required=yes
@@ -622,8 +622,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='libgcrypt'
 PACKAGE_TARNAME='libgcrypt'
-PACKAGE_VERSION='1.10.3'
-PACKAGE_STRING='libgcrypt 1.10.3'
+PACKAGE_VERSION='1.11.0'
+PACKAGE_STRING='libgcrypt 1.11.0'
 PACKAGE_BUGREPORT='https://bugs.gnupg.org'
 PACKAGE_URL=''
 
@@ -700,6 +700,10 @@ DL_LIBS
 LIBOBJS
 ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_FALSE
 ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_TRUE
+ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_FALSE
+ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_TRUE
+ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_FALSE
+ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_TRUE
 MPI_MOD_C_UDIV_QRNND_FALSE
 MPI_MOD_C_UDIV_QRNND_TRUE
 MPI_MOD_C_UDIV_FALSE
@@ -737,9 +741,8 @@ MPI_MOD_ASM_MPIH_SUB1_TRUE
 MPI_MOD_ASM_MPIH_ADD1_FALSE
 MPI_MOD_ASM_MPIH_ADD1_TRUE
 MPI_SFLAGS
-PTH_LIBS
-PTH_CFLAGS
-PTH_CONFIG
+USE_GPGRT_CONFIG_FALSE
+USE_GPGRT_CONFIG_TRUE
 GPG_ERROR_MT_LIBS
 GPG_ERROR_MT_CFLAGS
 GPG_ERROR_LIBS
@@ -758,8 +761,6 @@ USE_HMAC_BINARY_CHECK_TRUE
 READELF
 OBJCOPY
 RUN_LARGE_DATA_TESTS
-ENABLE_RANDOM_DAEMON_FALSE
-ENABLE_RANDOM_DAEMON_TRUE
 emacs_local_vars_end
 emacs_local_vars_read_only
 emacs_local_vars_begin
@@ -782,9 +783,7 @@ NM
 ac_ct_DUMPBIN
 DUMPBIN
 LD
-FGREP
 EGREP
-GREP
 SED
 LIBTOOL
 OBJDUMP
@@ -794,6 +793,8 @@ EXEEXT_FOR_BUILD
 CC_FOR_BUILD
 VERSION_NUMBER
 LDADD_FOR_TESTS_KLUDGE
+FGREP
+GREP
 am__fastdepCCAS_FALSE
 am__fastdepCCAS_TRUE
 CCASDEPMODE
@@ -922,9 +923,7 @@ enable_kdfs
 enable_random
 enable_dev_random
 with_egd_socket
-enable_random_daemon
 enable_asm
-enable_m_guard
 enable_large_data_tests
 enable_force_soft_hwfeatures
 with_capabilities
@@ -939,8 +938,12 @@ enable_sse41_support
 enable_drng_support
 enable_avx_support
 enable_avx2_support
+enable_avx512_support
+enable_gfni_support
 enable_neon_support
 enable_arm_crypto_support
+enable_sve_support
+enable_sve2_support
 enable_ppc_crypto_support
 enable_O_flag_munging
 enable_instrumentation_munging
@@ -949,7 +952,6 @@ enable_ld_version_script
 with_libtool_modification
 with_libgpg_error_prefix
 with_gpg_error_prefix
-with_pth_prefix
 enable_mpi_path
 enable_optimization
 enable_noexecstack
@@ -1516,7 +1518,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures libgcrypt 1.10.3 to adapt to many kinds of systems.
+\`configure' configures libgcrypt 1.11.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1587,7 +1589,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of libgcrypt 1.10.3:";;
+     short | recursive ) echo "Configuration of libgcrypt 1.11.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1620,9 +1622,7 @@ Optional Features:
   --enable-kfds=kdfs      select the KDFs to include
   --enable-random=name    select which random number generator to use
   --disable-dev-random    disable the use of dev random
-  --enable-random-daemon  Build the experimental gcryptrnd
   --disable-asm           Disable MPI and cipher assembler modules
-  --enable-m-guard        Enable memory guard facility
   --enable-large-data-tests
                           Enable the real long ruinning large data tests
   --enable-force-soft-hwfeatures
@@ -1643,10 +1643,15 @@ Optional Features:
                           instruction)
   --disable-avx-support   Disable support for the Intel AVX instructions
   --disable-avx2-support  Disable support for the Intel AVX2 instructions
+  --disable-avx512-support
+                          Disable support for the Intel AVX512 instructions
+  --disable-gfni-support  Disable support for the Intel GFNI instructions
   --disable-neon-support  Disable support for the ARM NEON instructions
   --disable-arm-crypto-support
                           Disable support for the ARMv8 Crypto Extension
                           instructions
+  --disable-sve-support   Disable support for the ARMv8 SVE instructions
+  --disable-sve2-support  Disable support for the ARMv9 SVE2 instructions
   --disable-ppc-crypto-support
                           Disable support for the PPC crypto instructions
                           introduced in POWER 8 (PowerISA 2.07)
@@ -1687,7 +1692,6 @@ Optional Packages:
   --with-libgpg-error-prefix=PFX
                           prefix where GPG Error is installed (optional)
 
-  --with-pth-prefix=PFX   prefix where GNU Pth is installed (optional)
 
 Some influential environment variables:
   CC          C compiler command
@@ -1769,7 +1773,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-libgcrypt configure 1.10.3
+libgcrypt configure 1.11.0
 generated by GNU Autoconf 2.71
 
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -2314,7 +2318,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by libgcrypt $as_me 1.10.3, which was
+It was created by libgcrypt $as_me 1.11.0, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   $ $0$ac_configure_args_raw
@@ -2904,7 +2908,6 @@ as_fn_append ac_header_c_list " sys/types.h sys_types_h HAVE_SYS_TYPES_H"
 as_fn_append ac_header_c_list " unistd.h unistd_h HAVE_UNISTD_H"
 as_fn_append ac_header_c_list " wchar.h wchar_h HAVE_WCHAR_H"
 as_fn_append ac_header_c_list " minix/config.h minix_config_h HAVE_MINIX_CONFIG_H"
-as_fn_append ac_header_c_list " sys/socket.h sys_socket_h HAVE_SYS_SOCKET_H"
 as_fn_append ac_func_c_list " vprintf HAVE_VPRINTF"
 
 # Auxiliary files required by this configure script.
@@ -3083,9 +3086,9 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 #   (Interfaces removed:    CURRENT++, AGE=0, REVISION=0)
 #   (Interfaces added:      CURRENT++, AGE++, REVISION=0)
 #   (No interfaces changed:                   REVISION++)
-LIBGCRYPT_LT_CURRENT=24
-LIBGCRYPT_LT_AGE=4
-LIBGCRYPT_LT_REVISION=3
+LIBGCRYPT_LT_CURRENT=25
+LIBGCRYPT_LT_AGE=5
+LIBGCRYPT_LT_REVISION=0
 ################################################
 
 
@@ -3100,7 +3103,7 @@ LIBGCRYPT_CONFIG_API_VERSION=1
 
 # If you change the required gpg-error version, please remove
 # unnecessary error code defines in src/gcrypt-int.h.
-NEED_GPG_ERROR_VERSION=1.27
+NEED_GPG_ERROR_VERSION=1.49
 
 
 
@@ -3618,7 +3621,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='libgcrypt'
- VERSION='1.10.3'
+ VERSION='1.11.0'
 
 
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6640,6 +6643,148 @@ fi
   test -n "$AWK" && break
 done
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+printf %s "checking for grep that handles long lines and -e... " >&6; }
+if test ${ac_cv_path_GREP+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test -z "$GREP"; then
+  ac_path_GREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_prog in grep ggrep
+   do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_GREP="$as_dir$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_GREP" || continue
+# Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  printf %s 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    printf "%s\n" 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_GREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_GREP"; then
+    as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+printf "%s\n" "$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fgrep" >&5
+printf %s "checking for fgrep... " >&6; }
+if test ${ac_cv_path_FGREP+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if echo 'ab*c' | $GREP -F 'ab*c' >/dev/null 2>&1
+   then ac_cv_path_FGREP="$GREP -F"
+   else
+     if test -z "$FGREP"; then
+  ac_path_FGREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_prog in fgrep
+   do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_FGREP="$as_dir$ac_prog$ac_exec_ext"
+      as_fn_executable_p "$ac_path_FGREP" || continue
+# Check for GNU ac_path_FGREP and select it if it is found.
+  # Check for GNU $ac_path_FGREP
+case `"$ac_path_FGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_FGREP="$ac_path_FGREP" ac_path_FGREP_found=:;;
+*)
+  ac_count=0
+  printf %s 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    printf "%s\n" 'FGREP' >> "conftest.nl"
+    "$ac_path_FGREP" FGREP < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_FGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_FGREP="$ac_path_FGREP"
+      ac_path_FGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_FGREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_FGREP"; then
+    as_fn_error $? "no acceptable fgrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_FGREP=$FGREP
+fi
+
+   fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_FGREP" >&5
+printf "%s\n" "$ac_cv_path_FGREP" >&6; }
+ FGREP="$ac_cv_path_FGREP"
+
+
 
 # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE
 case $host in
@@ -6674,7 +6819,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.beam \
 esac
 
 
-VERSION_NUMBER=0x010a03
+VERSION_NUMBER=0x010b00
 
 
 # We need to compile and run a program on the build machine.
@@ -6902,75 +7047,6 @@ Xsed="$SED -e 1s/^X//"
 
 
 
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
-printf %s "checking for grep that handles long lines and -e... " >&6; }
-if test ${ac_cv_path_GREP+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-  if test -z "$GREP"; then
-  ac_path_GREP_found=false
-  # Loop through the user's path and test for each of PROGNAME-LIST
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
-do
-  IFS=$as_save_IFS
-  case $as_dir in #(((
-    '') as_dir=./ ;;
-    */) ;;
-    *) as_dir=$as_dir/ ;;
-  esac
-    for ac_prog in grep ggrep
-   do
-    for ac_exec_ext in '' $ac_executable_extensions; do
-      ac_path_GREP="$as_dir$ac_prog$ac_exec_ext"
-      as_fn_executable_p "$ac_path_GREP" || continue
-# Check for GNU ac_path_GREP and select it if it is found.
-  # Check for GNU $ac_path_GREP
-case `"$ac_path_GREP" --version 2>&1` in
-*GNU*)
-  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
-*)
-  ac_count=0
-  printf %s 0123456789 >"conftest.in"
-  while :
-  do
-    cat "conftest.in" "conftest.in" >"conftest.tmp"
-    mv "conftest.tmp" "conftest.in"
-    cp "conftest.in" "conftest.nl"
-    printf "%s\n" 'GREP' >> "conftest.nl"
-    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
-    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
-    as_fn_arith $ac_count + 1 && ac_count=$as_val
-    if test $ac_count -gt ${ac_path_GREP_max-0}; then
-      # Best one so far, save it but keep looking for a better one
-      ac_cv_path_GREP="$ac_path_GREP"
-      ac_path_GREP_max=$ac_count
-    fi
-    # 10*(2^10) chars as input seems more than enough
-    test $ac_count -gt 10 && break
-  done
-  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
-esac
-
-      $ac_path_GREP_found && break 3
-    done
-  done
-  done
-IFS=$as_save_IFS
-  if test -z "$ac_cv_path_GREP"; then
-    as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
-  fi
-else
-  ac_cv_path_GREP=$GREP
-fi
-
-fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
-printf "%s\n" "$ac_cv_path_GREP" >&6; }
- GREP="$ac_cv_path_GREP"
-
-
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
 printf %s "checking for egrep... " >&6; }
 if test ${ac_cv_path_EGREP+y}
@@ -7044,103 +7120,30 @@ printf "%s\n" "$ac_cv_path_EGREP" >&6; }
  EGREP="$ac_cv_path_EGREP"
 
 
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for fgrep" >&5
-printf %s "checking for fgrep... " >&6; }
-if test ${ac_cv_path_FGREP+y}
+test -z "$GREP" && GREP=grep
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Check whether --with-gnu-ld was given.
+if test ${with_gnu_ld+y}
 then :
-  printf %s "(cached) " >&6
-else $as_nop
-  if echo 'ab*c' | $GREP -F 'ab*c' >/dev/null 2>&1
-   then ac_cv_path_FGREP="$GREP -F"
-   else
-     if test -z "$FGREP"; then
-  ac_path_FGREP_found=false
-  # Loop through the user's path and test for each of PROGNAME-LIST
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
-do
-  IFS=$as_save_IFS
-  case $as_dir in #(((
-    '') as_dir=./ ;;
-    */) ;;
-    *) as_dir=$as_dir/ ;;
-  esac
-    for ac_prog in fgrep
-   do
-    for ac_exec_ext in '' $ac_executable_extensions; do
-      ac_path_FGREP="$as_dir$ac_prog$ac_exec_ext"
-      as_fn_executable_p "$ac_path_FGREP" || continue
-# Check for GNU ac_path_FGREP and select it if it is found.
-  # Check for GNU $ac_path_FGREP
-case `"$ac_path_FGREP" --version 2>&1` in
-*GNU*)
-  ac_cv_path_FGREP="$ac_path_FGREP" ac_path_FGREP_found=:;;
-*)
-  ac_count=0
-  printf %s 0123456789 >"conftest.in"
-  while :
-  do
-    cat "conftest.in" "conftest.in" >"conftest.tmp"
-    mv "conftest.tmp" "conftest.in"
-    cp "conftest.in" "conftest.nl"
-    printf "%s\n" 'FGREP' >> "conftest.nl"
-    "$ac_path_FGREP" FGREP < "conftest.nl" >"conftest.out" 2>/dev/null || break
-    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
-    as_fn_arith $ac_count + 1 && ac_count=$as_val
-    if test $ac_count -gt ${ac_path_FGREP_max-0}; then
-      # Best one so far, save it but keep looking for a better one
-      ac_cv_path_FGREP="$ac_path_FGREP"
-      ac_path_FGREP_max=$ac_count
-    fi
-    # 10*(2^10) chars as input seems more than enough
-    test $ac_count -gt 10 && break
-  done
-  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
-esac
-
-      $ac_path_FGREP_found && break 3
-    done
-  done
-  done
-IFS=$as_save_IFS
-  if test -z "$ac_cv_path_FGREP"; then
-    as_fn_error $? "no acceptable fgrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
-  fi
-else
-  ac_cv_path_FGREP=$FGREP
-fi
-
-   fi
-fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_FGREP" >&5
-printf "%s\n" "$ac_cv_path_FGREP" >&6; }
- FGREP="$ac_cv_path_FGREP"
-
-
-test -z "$GREP" && GREP=grep
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Check whether --with-gnu-ld was given.
-if test ${with_gnu_ld+y}
-then :
-  withval=$with_gnu_ld; test "$withval" = no || with_gnu_ld=yes
+  withval=$with_gnu_ld; test "$withval" = no || with_gnu_ld=yes
 else $as_nop
   with_gnu_ld=no
 fi
@@ -14710,7 +14713,7 @@ LIBGCRYPT_CONFIG_HOST="$host"
 # Definitions for symmetric ciphers.
 available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed"
 available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20"
-available_ciphers="$available_ciphers sm4"
+available_ciphers="$available_ciphers sm4 aria"
 enabled_ciphers=""
 
 # Definitions for public-key ciphers.
@@ -15255,6 +15258,39 @@ printf "%s\n" "$ac_cv_sizeof_unsigned_long_long" >&6; }
 printf "%s\n" "#define SIZEOF_UNSIGNED_LONG_LONG $ac_cv_sizeof_unsigned_long_long" >>confdefs.h
 
 
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking size of unsigned __int128" >&5
+printf %s "checking size of unsigned __int128... " >&6; }
+if test ${ac_cv_sizeof_unsigned___int128+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned __int128))" "ac_cv_sizeof_unsigned___int128"        "$ac_includes_default"
+then :
+
+else $as_nop
+  if test "$ac_cv_type_unsigned___int128" = yes; then
+     { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (unsigned __int128)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_unsigned___int128=0
+   fi
+fi
+
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned___int128" >&5
+printf "%s\n" "$ac_cv_sizeof_unsigned___int128" >&6; }
+
+
+
+printf "%s\n" "#define SIZEOF_UNSIGNED___INT128 $ac_cv_sizeof_unsigned___int128" >>confdefs.h
+
+
 # The cast to long int works around a bug in the HP C Compiler
 # version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
 # declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
 printf "%s\n" "#define EGD_SOCKET_NAME \"$egd_socket_name\"" >>confdefs.h
 
 
-# Implementation of the --enable-random-daemon
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the experimental random daemon is requested" >&5
-printf %s "checking whether the experimental random daemon is requested... " >&6; }
-# Check whether --enable-random-daemon was given.
-if test ${enable_random_daemon+y}
-then :
-  enableval=$enable_random_daemon; enable_random_daemon=$enableval
-else $as_nop
-  enable_random_daemon=no
-fi
-
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $enable_random_daemon" >&5
-printf "%s\n" "$enable_random_daemon" >&6; }
- if test x$enable_random_daemon = xyes; then
-  ENABLE_RANDOM_DAEMON_TRUE=
-  ENABLE_RANDOM_DAEMON_FALSE='#'
-else
-  ENABLE_RANDOM_DAEMON_TRUE='#'
-  ENABLE_RANDOM_DAEMON_FALSE=
-fi
-
-
 
 # Implementation of --disable-asm.
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether MPI and cipher assembler modules are requested" >&5
@@ -15673,25 +15687,6 @@ printf "%s\n" "#define ASM_DISABLED 1" >>confdefs.h
 
 fi
 
-# Implementation of the --enable-m-guard switch.
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether memory guard is requested" >&5
-printf %s "checking whether memory guard is requested... " >&6; }
-# Check whether --enable-m-guard was given.
-if test ${enable_m_guard+y}
-then :
-  enableval=$enable_m_guard; use_m_guard=$enableval
-else $as_nop
-  use_m_guard=no
-fi
-
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $use_m_guard" >&5
-printf "%s\n" "$use_m_guard" >&6; }
-if test "$use_m_guard" = yes ; then
-
-printf "%s\n" "#define M_GUARD 1" >>confdefs.h
-
-fi
-
 # Implementation of the --enable-large-data-tests switch.
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether to run large data tests" >&5
 printf %s "checking whether to run large data tests... " >&6; }
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $avx2support" >&5
 printf "%s\n" "$avx2support" >&6; }
 
+# Implementation of the --disable-avx512-support switch.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether AVX512 support is requested" >&5
+printf %s "checking whether AVX512 support is requested... " >&6; }
+# Check whether --enable-avx512-support was given.
+if test ${enable_avx512_support+y}
+then :
+  enableval=$enable_avx512_support; avx512support=$enableval
+else $as_nop
+  avx512support=yes
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $avx512support" >&5
+printf "%s\n" "$avx512support" >&6; }
+
+# Implementation of the --disable-gfni-support switch.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GFNI support is requested" >&5
+printf %s "checking whether GFNI support is requested... " >&6; }
+# Check whether --enable-gfni-support was given.
+if test ${enable_gfni_support+y}
+then :
+  enableval=$enable_gfni_support; gfnisupport=$enableval
+else $as_nop
+  gfnisupport=yes
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gfnisupport" >&5
+printf "%s\n" "$gfnisupport" >&6; }
+
 # Implementation of the --disable-neon-support switch.
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether NEON support is requested" >&5
 printf %s "checking whether NEON support is requested... " >&6; }
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $armcryptosupport" >&5
 printf "%s\n" "$armcryptosupport" >&6; }
 
+# Implementation of the --disable-sve-support switch.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether SVE support is requested" >&5
+printf %s "checking whether SVE support is requested... " >&6; }
+# Check whether --enable-sve-support was given.
+if test ${enable_sve_support+y}
+then :
+  enableval=$enable_sve_support; svesupport=$enableval
+else $as_nop
+  svesupport=yes
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $svesupport" >&5
+printf "%s\n" "$svesupport" >&6; }
+
+# Implementation of the --disable-sve2-support switch.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether SVE2 support is requested" >&5
+printf %s "checking whether SVE2 support is requested... " >&6; }
+# Check whether --enable-sve2-support was given.
+if test ${enable_sve2_support+y}
+then :
+  enableval=$enable_sve2_support; sve2support=$enableval
+else $as_nop
+  sve2support=yes
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $sve2support" >&5
+printf "%s\n" "$sve2support" >&6; }
+
 # Implementation of the --disable-ppc-crypto-support switch.
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether PPC crypto support is requested" >&5
 printf %s "checking whether PPC crypto support is requested... " >&6; }
   fi
 
   if test -n "$gpgrt_libdir"; then
+    # Add the --libdir option to GPGRT_CONFIG
     GPGRT_CONFIG="$GPGRT_CONFIG --libdir=$gpgrt_libdir"
-    if $GPGRT_CONFIG gpg-error >/dev/null 2>&1; then
-      GPG_ERROR_CONFIG="$GPGRT_CONFIG gpg-error"
-      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: Use gpgrt-config with $gpgrt_libdir as gpg-error-config" >&5
-printf "%s\n" "$as_me: Use gpgrt-config with $gpgrt_libdir as gpg-error-config" >&6;}
-      gpg_error_config_version=`$GPG_ERROR_CONFIG --modversion`
-    else
-      gpg_error_config_version=`$GPG_ERROR_CONFIG --version`
+    # Make sure if gpgrt-config really works, by testing config gpg-error
+    if ! $GPGRT_CONFIG gpg-error --exists; then
+      # If it doesn't work, clear the GPGRT_CONFIG variable.
       unset GPGRT_CONFIG
     fi
-  elif test "$GPG_ERROR_CONFIG" != "no"; then
-    gpg_error_config_version=`$GPG_ERROR_CONFIG --version`
+  else
+    # GPGRT_CONFIG found but no suitable dir for --libdir found.
+    # This is a failure.  Clear the GPGRT_CONFIG variable.
     unset GPGRT_CONFIG
   fi
 
+       if test x"$GPGRT_CONFIG" != x -a "$GPGRT_CONFIG" != "no"; then
+    GPG_ERROR_CONFIG="$GPGRT_CONFIG gpg-error"
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: Use gpgrt-config with $gpgrt_libdir as gpg-error-config" >&5
+printf "%s\n" "$as_me: Use gpgrt-config with $gpgrt_libdir as gpg-error-config" >&6;}
+    gpg_error_config_version=`$GPG_ERROR_CONFIG --modversion`
+  elif test x"$GPG_ERROR_CONFIG" != x -a "$GPG_ERROR_CONFIG" != "no"; then
+    gpg_error_config_version=`$GPG_ERROR_CONFIG --version`
+  else
+    gpg_error_config_version="0.0"
+  fi
+
   min_gpg_error_version="$NEED_GPG_ERROR_VERSION"
   ok=no
   if test "$GPG_ERROR_CONFIG" != "no"; then
 printf "%s\n" "#define GPG_ERR_SOURCE_DEFAULT GPG_ERR_SOURCE_GCRYPT" >>confdefs.h
 
 
-#
-# Check whether the GNU Pth library is available.  We require this
-# to build the optional gcryptrnd program.
-#
-
-# Check whether --with-pth-prefix was given.
-if test ${with_pth_prefix+y}
-then :
-  withval=$with_pth_prefix; pth_config_prefix="$withval"
-else $as_nop
-  pth_config_prefix=""
-fi
-
-if test x$pth_config_prefix != x ; then
-   PTH_CONFIG="$pth_config_prefix/bin/pth-config"
-fi
-if test "$enable_random_daemon" = "yes"; then
-  # Extract the first word of "pth-config", so it can be a program name with args.
-set dummy pth-config; ac_word=$2
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-printf %s "checking for $ac_word... " >&6; }
-if test ${ac_cv_path_PTH_CONFIG+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-  case $PTH_CONFIG in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_PTH_CONFIG="$PTH_CONFIG" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  case $as_dir in #(((
-    '') as_dir=./ ;;
-    */) ;;
-    *) as_dir=$as_dir/ ;;
-  esac
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
-    ac_cv_path_PTH_CONFIG="$as_dir$ac_word$ac_exec_ext"
-    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_PTH_CONFIG" && ac_cv_path_PTH_CONFIG="no"
-  ;;
-esac
-fi
-PTH_CONFIG=$ac_cv_path_PTH_CONFIG
-if test -n "$PTH_CONFIG"; then
-  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $PTH_CONFIG" >&5
-printf "%s\n" "$PTH_CONFIG" >&6; }
+ if test -n "$GPGRT_CONFIG" \
+                                  -a "$ac_cv_path_GPG_ERROR_CONFIG" = no; then
+  USE_GPGRT_CONFIG_TRUE=
+  USE_GPGRT_CONFIG_FALSE='#'
 else
-  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
-printf "%s\n" "no" >&6; }
-fi
-
-
-  if test "$PTH_CONFIG" = "no"; then
-    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING:
-***
-*** To build the Libgcrypt's random number daemon
-*** we need the support of the GNU Portable Threads Library.
-*** Download it from ftp://ftp.gnu.org/gnu/pth/
-*** On a Debian GNU/Linux system you might want to try
-***   apt-get install libpth-dev
-***" >&5
-printf "%s\n" "$as_me: WARNING:
-***
-*** To build the Libgcrypt's random number daemon
-*** we need the support of the GNU Portable Threads Library.
-*** Download it from ftp://ftp.gnu.org/gnu/pth/
-*** On a Debian GNU/Linux system you might want to try
-***   apt-get install libpth-dev
-***" >&2;}
-  else
-
-    _pth_version=`$PTH_CONFIG --version | awk 'NR==1 {print $3}'`
-    _req_version="1.3.7"
-
-    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for PTH - version >= $_req_version" >&5
-printf %s "checking for PTH - version >= $_req_version... " >&6; }
-    for _var in _pth_version _req_version; do
-        eval "_val=\"\$${_var}\""
-        _major=`echo $_val | sed 's/\([0-9]*\)\.\([0-9]*\)\([ab.]\)\([0-9]*\)/\1/'`
-        _minor=`echo $_val | sed 's/\([0-9]*\)\.\([0-9]*\)\([ab.]\)\([0-9]*\)/\2/'`
-        _rtype=`echo $_val | sed 's/\([0-9]*\)\.\([0-9]*\)\([ab.]\)\([0-9]*\)/\3/'`
-        _micro=`echo $_val | sed 's/\([0-9]*\)\.\([0-9]*\)\([ab.]\)\([0-9]*\)/\4/'`
-        case $_rtype in
-            "a" ) _rtype=0 ;;
-            "b" ) _rtype=1 ;;
-            "." ) _rtype=2 ;;
-        esac
-        _hex=`echo dummy | awk '{ printf("%d%02d%1d%02d", major, minor, rtype, micro); }' \
-              "major=$_major" "minor=$_minor" "rtype=$_rtype" "micro=$_micro"`
-        eval "${_var}_hex=\"\$_hex\""
-    done
-    have_pth=no
-    if test ".$_pth_version_hex" != .; then
-        if test ".$_req_version_hex" != .; then
-            if test $_pth_version_hex -ge $_req_version_hex; then
-                have_pth=yes
-            fi
-        fi
-    fi
-    if test $have_pth = yes; then
-       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-printf "%s\n" "yes" >&6; }
-       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether PTH installation is sane" >&5
-printf %s "checking whether PTH installation is sane... " >&6; }
-       if test ${gnupg_cv_pth_is_sane+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-
-         _gnupg_pth_save_cflags=$CFLAGS
-         _gnupg_pth_save_ldflags=$LDFLAGS
-         _gnupg_pth_save_libs=$LIBS
-         CFLAGS="$CFLAGS `$PTH_CONFIG --cflags`"
-         LDFLAGS="$LDFLAGS `$PTH_CONFIG --ldflags`"
-         LIBS="$LIBS `$PTH_CONFIG --libs`"
-         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <pth.h>
-
-int
-main (void)
-{
- pth_init ();
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"
-then :
-  gnupg_cv_pth_is_sane=yes
-else $as_nop
-  gnupg_cv_pth_is_sane=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.beam \
-    conftest$ac_exeext conftest.$ac_ext
-         CFLAGS=$_gnupg_pth_save_cflags
-         LDFLAGS=$_gnupg_pth_save_ldflags
-         LIBS=$_gnupg_pth_save_libs
-
-fi
-
-       if test $gnupg_cv_pth_is_sane != yes; then
-          have_pth=no
-       fi
-       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gnupg_cv_pth_is_sane" >&5
-printf "%s\n" "$gnupg_cv_pth_is_sane" >&6; }
-    else
-       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
-printf "%s\n" "no" >&6; }
-    fi
-
-    if test $have_pth = yes; then
-       PTH_CFLAGS=`$PTH_CONFIG --cflags`
-       PTH_LIBS=`$PTH_CONFIG --ldflags`
-       PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`"
-
-printf "%s\n" "#define USE_GNU_PTH 1" >>confdefs.h
-
-
-printf "%s\n" "#define HAVE_PTH 1" >>confdefs.h
-
-    fi
-  fi
+  USE_GPGRT_CONFIG_TRUE='#'
+  USE_GPGRT_CONFIG_FALSE=
 fi
 
 
-
 #
 # Check whether pthreads is available
 #
@@ -17023,6 +16915,12 @@ then :
   printf "%s\n" "#define HAVE_SYS_RANDOM_H 1" >>confdefs.h
 
 fi
+ac_fn_c_check_header_compile "$LINENO" "sys/sysctl.h" "ac_cv_header_sys_sysctl_h" "$ac_includes_default"
+if test "x$ac_cv_header_sys_sysctl_h" = xyes
+then :
+  printf "%s\n" "#define HAVE_SYS_SYSCTL_H 1" >>confdefs.h
+
+fi
 
 
 
@@ -17248,94 +17146,18 @@ printf "%s\n" "#define HAVE_U64 1" >>confdefs.h
 fi
 
 
-
-
-   if test $ac_cv_header_sys_socket_h = no; then
-                         ac_fn_c_check_header_compile "$LINENO" "ws2tcpip.h" "ac_cv_header_ws2tcpip_h" "$ac_includes_default"
-if test "x$ac_cv_header_ws2tcpip_h" = xyes
+#
+# Check for __builtin_bswap32 intrinsic.
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for __builtin_bswap32" >&5
+printf %s "checking for __builtin_bswap32... " >&6; }
+if test ${gcry_cv_have_builtin_bswap32+y}
 then :
-  printf "%s\n" "#define HAVE_WS2TCPIP_H 1" >>confdefs.h
-
-fi
-
-   fi
-
-   ac_fn_c_check_type "$LINENO" "socklen_t" "ac_cv_type_socklen_t" "
-/* <sys/types.h> is not needed according to POSIX, but the
-   <sys/socket.h> in i386-unknown-freebsd4.10 and
-   powerpc-apple-darwin5.5 required it. */
-#include <sys/types.h>
-#if HAVE_SYS_SOCKET_H
-# include <sys/socket.h>
-#elif HAVE_WS2TCPIP_H
-# include <ws2tcpip.h>
-#endif
-
-"
-if test "x$ac_cv_type_socklen_t" = xyes
-then :
-
-else $as_nop
-  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for socklen_t equivalent" >&5
-printf %s "checking for socklen_t equivalent... " >&6; }
-if test ${gl_cv_socklen_t_equiv+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-  # Systems have either "struct sockaddr *" or
-         # "void *" as the second argument to getpeername
-         gl_cv_socklen_t_equiv=
-         for arg2 in "struct sockaddr" void; do
-           for t in int size_t "unsigned int" "long int" "unsigned long int"; do
-             cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <sys/types.h>
-                   #include <sys/socket.h>
-
-                   int getpeername (int, $arg2 *, $t *);
-int
-main (void)
-{
-$t len;
-                  getpeername (0, 0, &len);
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"
-then :
-  gl_cv_socklen_t_equiv="$t"
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
-             test "$gl_cv_socklen_t_equiv" != "" && break
-           done
-           test "$gl_cv_socklen_t_equiv" != "" && break
-         done
-         if test "$gl_cv_socklen_t_equiv" = ""; then
-           as_fn_error $? "Cannot find a type to use in place of socklen_t" "$LINENO" 5
-         fi
-
-fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gl_cv_socklen_t_equiv" >&5
-printf "%s\n" "$gl_cv_socklen_t_equiv" >&6; }
-
-printf "%s\n" "#define socklen_t $gl_cv_socklen_t_equiv" >>confdefs.h
-
-fi
-
-
-#
-# Check for __builtin_bswap32 intrinsic.
-#
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for __builtin_bswap32" >&5
-printf %s "checking for __builtin_bswap32... " >&6; }
-if test ${gcry_cv_have_builtin_bswap32+y}
-then :
-  printf %s "(cached) " >&6
-else $as_nop
-  gcry_cv_have_builtin_bswap32=no
-        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+  printf %s "(cached) " >&6
+else $as_nop
+  gcry_cv_have_builtin_bswap32=no
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
 int
 main (void)
@@ -17986,7 +17808,7 @@ __asm__(
                 ".text\n\t"
                 /* Following causes error if assembler ignored '.syntax unified'.  */
                 "asmfunc:\n\t"
-                "add %r0, %r0, %r4, ror #12;\n\t"
+                "add r0, r0, r4, ror #12;\n\t"
 
                 /* Test if '.type' and '.size' are supported.  */
                 ".size asmfunc,.-asmfunc;\n\t"
@@ -18094,7 +17916,6 @@ __asm__(
                 ".long 0\n\t"
                 ".cfi_endproc\n\t"
             );
-            void asmfunc(void)
 int
 main (void)
 {
@@ -18194,8 +18015,8 @@ then :
 else $as_nop
   ac_cv_sys_symbol_underscore=no
    cat > conftest.$ac_ext <<EOF
-      void nm_test_func(){}
-      int main(){nm_test_func;return 0;}
+      void nm_test_func(void){}
+      int main(void){nm_test_func();return 0;}
 EOF
   if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
   (eval $ac_compile) 2>&5
@@ -18203,20 +18024,20 @@ EOF
   printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; then
     # Now try to grab the symbols.
-    ac_nlist=conftest.nm
-    if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$NM conftest.$ac_objext \| $lt_cv_sys_global_symbol_pipe \| cut -d \' \' -f 2 \> $ac_nlist\""; } >&5
-  (eval $NM conftest.$ac_objext \| $lt_cv_sys_global_symbol_pipe \| cut -d \' \' -f 2 \> $ac_nlist) 2>&5
+    nlist=conftest.nm
+    if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist\""; } >&5
+  (eval $NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist) 2>&5
   ac_status=$?
   printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && test -s "$ac_nlist"; then
+  test $ac_status = 0; } && test -s "$nlist"; then
       # See whether the symbols have a leading underscore.
-      if egrep '^_nm_test_func' "$ac_nlist" >/dev/null; then
+      if $GREP ' _nm_test_func$' "$nlist" >/dev/null; then
         ac_cv_sys_symbol_underscore=yes
       else
-        if egrep '^nm_test_func ' "$ac_nlist" >/dev/null; then
+        if $GREP ' nm_test_func$' "$nlist" >/dev/null; then
           :
         else
-          echo "configure: cannot find nm_test_func in $ac_nlist" >&5
+          echo "configure: cannot find nm_test_func in $nlist" >&5
         fi
       fi
     else
@@ -18428,6 +18249,8 @@ if test "$mpi_cpu_arch" != "x86" ; then
    sse41support="n/a"
    avxsupport="n/a"
    avx2support="n/a"
+   avx512support="n/a"
+   gfnisupport="n/a"
    padlocksupport="n/a"
    drngsupport="n/a"
 fi
@@ -18436,6 +18259,8 @@ if test "$mpi_cpu_arch" != "arm" ; then
    if test "$mpi_cpu_arch" != "aarch64" ; then
      neonsupport="n/a"
      armcryptosupport="n/a"
+     svesupport="n/a"
+     sve2support="n/a"
    fi
 fi
 
@@ -18456,6 +18281,35 @@ _gcc_cflags_save=$CFLAGS
 CFLAGS="$CFLAGS -Werror"
 
 
+#
+# Check whether compiler supports 'optimize' function attribute
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports 'optimize' function attribute" >&5
+printf %s "checking whether compiler supports 'optimize' function attribute... " >&6; }
+if test ${gcry_cv_gcc_attribute_optimize+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  gcry_cv_gcc_attribute_optimize=no
+        cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+int __attribute__ ((optimize("-O2"))) fn(int i){return i;}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+  gcry_cv_gcc_attribute_optimize=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_attribute_optimize" >&5
+printf "%s\n" "$gcry_cv_gcc_attribute_optimize" >&6; }
+if test "$gcry_cv_gcc_attribute_optimize" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_ATTRIBUTE_OPTIMIZE 1" >>confdefs.h
+
+fi
+
+
 #
 # Check whether compiler supports 'ms_abi' function attribute.
 #
@@ -18866,6 +18720,54 @@ printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AVX2 1" >>confdefs.h
 fi
 
 
+#
+# Check whether GCC inline assembler supports AVX512 instructions
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports AVX512 instructions" >&5
+printf %s "checking whether GCC inline assembler supports AVX512 instructions... " >&6; }
+if test ${gcry_cv_gcc_inline_asm_avx512+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_avx512="n/a"
+        else
+          gcry_cv_gcc_inline_asm_avx512=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+void a(void) {
+              __asm__("xgetbv; vpopcntq %%zmm7, %%zmm1%{%%k1%}%{z%};\n\t":::"cc");
+              __asm__("vpexpandb %%zmm3, %%zmm1;\n\t":::"cc");
+              __asm__("vpxorq %%xmm7, %%xmm7, %%xmm7;\n\t":::"cc");
+              __asm__("vpxorq %%ymm7, %%ymm7, %%ymm7;\n\t":::"cc");
+              __asm__("vpxorq (%%eax)%{1to8%}, %%zmm7, %%zmm7;\n\t":::"cc");
+            }
+int
+main (void)
+{
+ a();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_inline_asm_avx512=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_avx512" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_avx512" >&6; }
+if test "$gcry_cv_gcc_inline_asm_avx512" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AVX512 1" >>confdefs.h
+
+fi
+
+
 #
 # Check whether GCC inline assembler supports VAES and VPCLMUL instructions
 #
@@ -18913,6 +18815,52 @@ printf "%s\n" "#define HAVE_GCC_INLINE_ASM_VAES_VPCLMUL 1" >>confdefs.h
 fi
 
 
+#
+# Check whether GCC inline assembler supports GFNI instructions
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports GFNI instructions" >&5
+printf %s "checking whether GCC inline assembler supports GFNI instructions... " >&6; }
+if test ${gcry_cv_gcc_inline_asm_gfni+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_gfni="n/a"
+        else
+          gcry_cv_gcc_inline_asm_gfni=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+void a(void) {
+              __asm__("gf2p8affineqb \$123, %%xmm0, %%xmm0;\n\t":::"cc"); /* SSE */
+              __asm__("vgf2p8affineinvqb \$234, %%ymm1, %%ymm1, %%ymm1;\n\t":::"cc"); /* AVX */
+              __asm__("vgf2p8mulb (%%eax), %%zmm2, %%zmm2;\n\t":::"cc"); /* AVX512 */
+            }
+int
+main (void)
+{
+ a();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_inline_asm_gfni=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_gfni" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_gfni" >&6; }
+if test "$gcry_cv_gcc_inline_asm_gfni" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_GFNI 1" >>confdefs.h
+
+fi
+
+
 #
 # Check whether GCC inline assembler supports BMI2 instructions
 #
@@ -18964,6 +18912,67 @@ printf "%s\n" "#define HAVE_GCC_INLINE_ASM_BMI2 1" >>confdefs.h
 fi
 
 
+#
+# Check whether compiler supports x86/AVX512 intrinsics
+#
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -mavx512f"
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports x86/AVX512 intrinsics" >&5
+printf %s "checking whether compiler supports x86/AVX512 intrinsics... " >&6; }
+if test ${gcry_cv_cc_x86_avx512_intrinsics+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "x86" ||
+         test "$try_asm_modules" != "yes" ; then
+       gcry_cv_cc_x86_avx512_intrinsics="n/a"
+      else
+       gcry_cv_cc_x86_avx512_intrinsics=no
+       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+         __m512i fn(void *in, __m128i y)
+         {
+           __m512i x;
+            x = _mm512_loadu_epi32 (in); /* check the GCC bug 90980. */
+           x = _mm512_maskz_loadu_epi32(_cvtu32_mask16(0xfff0), in)
+                 ^ _mm512_castsi128_si512(y);
+           asm volatile ("vinserti32x4 \$3, %0, %%zmm6, %%zmm6;\n\t"
+                         "vpxord %%zmm6, %%zmm6, %%zmm6"
+                         ::"x"(y),"r"(in):"memory","xmm6");
+           return x;
+         }
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+  gcry_cv_cc_x86_avx512_intrinsics=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+      fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_cc_x86_avx512_intrinsics" >&5
+printf "%s\n" "$gcry_cv_cc_x86_avx512_intrinsics" >&6; }
+if test "$gcry_cv_cc_x86_avx512_intrinsics" = "yes" ; then
+
+printf "%s\n" "#define HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS 1" >>confdefs.h
+
+fi
+
+ if test "$gcry_cv_cc_x86_avx512_intrinsics" = "yes"; then
+  ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_TRUE=
+  ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_FALSE='#'
+else
+  ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_TRUE='#'
+  ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_FALSE=
+fi
+
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
 #
 # Check whether GCC assembler needs "-Wa,--divide" to correctly handle
 # constant division
 
 
 #
-# Check whether GCC assembler supports features needed for our amd64
+# Check whether GCC assembler supports features needed for our i386/amd64
 # implementations
 #
 if test $amd64_as_feature_detection = yes; then
-  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC assembler is compatible for amd64 assembly implementations" >&5
-printf %s "checking whether GCC assembler is compatible for amd64 assembly implementations... " >&6; }
-if test ${gcry_cv_gcc_amd64_platform_as_ok+y}
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC assembler is compatible for i386/amd64 assembly implementations" >&5
+printf %s "checking whether GCC assembler is compatible for i386/amd64 assembly implementations... " >&6; }
+if test ${gcry_cv_gcc_x86_platform_as_ok+y}
 then :
   printf %s "(cached) " >&6
 else $as_nop
   if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
-          gcry_cv_gcc_amd64_platform_as_ok="n/a"
+          gcry_cv_gcc_x86_platform_as_ok="n/a"
         else
-          gcry_cv_gcc_amd64_platform_as_ok=no
+          gcry_cv_gcc_x86_platform_as_ok=no
           cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 __asm__(
@@ -19081,20 +19090,27 @@ main (void)
 _ACEOF
 if ac_fn_c_try_link "$LINENO"
 then :
-  gcry_cv_gcc_amd64_platform_as_ok=yes
+  gcry_cv_gcc_x86_platform_as_ok=yes
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
         fi
 fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_amd64_platform_as_ok" >&5
-printf "%s\n" "$gcry_cv_gcc_amd64_platform_as_ok" >&6; }
-  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_x86_platform_as_ok" >&5
+printf "%s\n" "$gcry_cv_gcc_x86_platform_as_ok" >&6; }
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "yes" &&
+     test "$ac_cv_sizeof_unsigned_long" = "8"; then
 
 printf "%s\n" "#define HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS 1" >>confdefs.h
 
   fi
-  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" &&
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "yes" &&
+     test "$ac_cv_sizeof_unsigned_long" = "4"; then
+
+printf "%s\n" "#define HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS 1" >>confdefs.h
+
+  fi
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "no" &&
      test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" &&
      test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then
     { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC assembler is compatible for WIN64 assembly implementations" >&5
@@ -19134,6 +19150,47 @@ printf "%s\n" "$gcry_cv_gcc_win64_platform_as_ok" >&6; }
 
 printf "%s\n" "#define HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS 1" >>confdefs.h
 
+    fi
+  fi
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "no" &&
+     test "$ac_cv_sizeof_unsigned_long" = "4"; then
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC assembler is compatible for WIN32 assembly implementations" >&5
+printf %s "checking whether GCC assembler is compatible for WIN32 assembly implementations... " >&6; }
+if test ${gcry_cv_gcc_win32_platform_as_ok+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  gcry_cv_gcc_win32_platform_as_ok=no
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+__asm__(
+              ".text\n\t"
+              ".globl _asmfunc\n\t"
+              "_asmfunc:\n\t"
+              "xorl \$(1234), %ebp;\n\t"
+          );
+          void asmfunc(void);
+int
+main (void)
+{
+ asmfunc();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_win32_platform_as_ok=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_win32_platform_as_ok" >&5
+printf "%s\n" "$gcry_cv_gcc_win32_platform_as_ok" >&6; }
+    if test "$gcry_cv_gcc_win32_platform_as_ok" = "yes" ; then
+
+printf "%s\n" "#define HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS 1" >>confdefs.h
+
     fi
   fi
 fi
@@ -19264,10 +19321,10 @@ __asm__(
                 ".fpu neon\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
-                "vld1.64 {%q0-%q1}, [%r0]!;\n\t"
-                "vrev64.8 %q0, %q3;\n\t"
-                "vadd.u64 %q0, %q1;\n\t"
-                "vadd.s64 %d3, %d2, %d3;\n\t"
+                "vld1.64 {q0-q1}, [r0]!;\n\t"
+                "vrev64.8 q0, q3;\n\t"
+                "vadd.u64 q0, q1;\n\t"
+                "vadd.s64 d3, d2, d3;\n\t"
                 );
             void testfn(void);
 
@@ -19377,18 +19434,197 @@ then :
 else $as_nop
   if test "$mpi_cpu_arch" != "aarch64" ||
            test "$try_asm_modules" != "yes" ; then
-          gcry_cv_gcc_inline_asm_aarch64_neon="n/a"
+          gcry_cv_gcc_inline_asm_aarch64_neon="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_neon=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+__asm__(
+                ".cpu generic+simd\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+                "mov w0, \#42;\n\t"
+                "dup v0.8b, w0;\n\t"
+                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
+                );
+            void testfn(void);
+
+int
+main (void)
+{
+ testfn();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_inline_asm_aarch64_neon=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_neon" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_neon" >&6; }
+if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_NEON 1" >>confdefs.h
+
+fi
+
+
+#
+# Check whether GCC inline assembler supports AArch64 Crypto Extension instructions
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports AArch64 Crypto Extension instructions" >&5
+printf %s "checking whether GCC inline assembler supports AArch64 Crypto Extension instructions... " >&6; }
+if test ${gcry_cv_gcc_inline_asm_aarch64_crypto+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_crypto="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_crypto=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+__asm__(
+                ".cpu generic+simd+crypto\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+                "mov w0, \#42;\n\t"
+                "dup v0.8b, w0;\n\t"
+                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
+
+                "sha1h s0, s0;\n\t"
+                "sha1c q0, s0, v0.4s;\n\t"
+                "sha1p q0, s0, v0.4s;\n\t"
+                "sha1su0 v0.4s, v0.4s, v0.4s;\n\t"
+                "sha1su1 v0.4s, v0.4s;\n\t"
+
+                "sha256h q0, q0, v0.4s;\n\t"
+                "sha256h2 q0, q0, v0.4s;\n\t"
+                "sha1p q0, s0, v0.4s;\n\t"
+                "sha256su0 v0.4s, v0.4s;\n\t"
+                "sha256su1 v0.4s, v0.4s, v31.4s;\n\t"
+
+                "aese v0.16b, v0.16b;\n\t"
+                "aesd v0.16b, v0.16b;\n\t"
+                "aesmc v0.16b, v0.16b;\n\t"
+                "aesimc v0.16b, v0.16b;\n\t"
+
+                "pmull v0.1q, v0.1d, v31.1d;\n\t"
+                "pmull2 v0.1q, v0.2d, v31.2d;\n\t"
+                );
+            void testfn(void);
+
+int
+main (void)
+{
+ testfn();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_inline_asm_aarch64_crypto=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_crypto" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_crypto" >&6; }
+if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO 1" >>confdefs.h
+
+fi
+
+
+#
+# Check whether GCC inline assembler supports AArch64 SVE instructions
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports AArch64 SVE instructions" >&5
+printf %s "checking whether GCC inline assembler supports AArch64 SVE instructions... " >&6; }
+if test ${gcry_cv_gcc_inline_asm_aarch64_sve+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_sve="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_sve=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+__asm__(
+                ".cpu generic+simd+sve\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+                "mov x0, \#60;\n\t"
+                "whilelo p0.s, xzr, x0;\n\t"
+                "mov z0.s, p0/z, \#55;\n\t"
+                "ld1b {z0.b}, p0/z, [x1];\n\t"
+                );
+            void testfn(void);
+
+int
+main (void)
+{
+ testfn();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_inline_asm_aarch64_sve=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_sve" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_sve" >&6; }
+if test "$gcry_cv_gcc_inline_asm_aarch64_sve" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_SVE 1" >>confdefs.h
+
+fi
+
+
+#
+# Check whether GCC inline assembler supports AArch64 SVE2 instructions
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports AArch64 SVE2 instructions" >&5
+printf %s "checking whether GCC inline assembler supports AArch64 SVE2 instructions... " >&6; }
+if test ${gcry_cv_gcc_inline_asm_aarch64_sve2+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_sve2="n/a"
         else
-          gcry_cv_gcc_inline_asm_aarch64_neon=no
+          gcry_cv_gcc_inline_asm_aarch64_sve2=no
           cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 __asm__(
-                ".cpu generic+simd\n\t"
+                ".cpu generic+simd+sve2\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
-                "mov w0, \#42;\n\t"
-                "dup v0.8b, w0;\n\t"
-                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
+                ";\n\t"
+                "eor3 z0.d, z0.d, z1.d, z2.d;\n\t"
+                "ext z8.b, {z20.b, z21.b}, \#3;\n\t"
+                "adclt z0.d, z1.d, z2.d;\n\t"
+                "tbl z0.b, {z8.b, z9.b}, z1.b;\n\t"
+                "addhnb z16.s, z17.d, z18.d;\n\t"
+                "mov z0.s, p0/z, \#55;\n\t"
+                "ld1b {z0.b}, p0/z, [x1];\n\t"
                 );
             void testfn(void);
 
@@ -19402,64 +19638,66 @@ main (void)
 _ACEOF
 if ac_fn_c_try_link "$LINENO"
 then :
-  gcry_cv_gcc_inline_asm_aarch64_neon=yes
+  gcry_cv_gcc_inline_asm_aarch64_sve2=yes
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
         fi
 fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_neon" >&5
-printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_neon" >&6; }
-if test "$gcry_cv_gcc_inline_asm_aarch64_neon" = "yes" ; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_sve2" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_sve2" >&6; }
+if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then
 
-printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_NEON 1" >>confdefs.h
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_SVE2 1" >>confdefs.h
 
 fi
 
 
 #
-# Check whether GCC inline assembler supports AArch64 Crypto Extension instructions
+# Check whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions
 #
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports AArch64 Crypto Extension instructions" >&5
-printf %s "checking whether GCC inline assembler supports AArch64 Crypto Extension instructions... " >&6; }
-if test ${gcry_cv_gcc_inline_asm_aarch64_crypto+y}
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions" >&5
+printf %s "checking whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions... " >&6; }
+if test ${gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4+y}
 then :
   printf %s "(cached) " >&6
 else $as_nop
   if test "$mpi_cpu_arch" != "aarch64" ||
            test "$try_asm_modules" != "yes" ; then
-          gcry_cv_gcc_inline_asm_aarch64_crypto="n/a"
+          gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4="n/a"
         else
-          gcry_cv_gcc_inline_asm_aarch64_crypto=no
+          gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=no
           cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 __asm__(
-                ".cpu generic+simd+crypto\n\t"
+                ".arch armv8.2-a+sha3+sm4\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
-                "mov w0, \#42;\n\t"
-                "dup v0.8b, w0;\n\t"
-                "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t"
-
-                "sha1h s0, s0;\n\t"
-                "sha1c q0, s0, v0.4s;\n\t"
-                "sha1p q0, s0, v0.4s;\n\t"
-                "sha1su0 v0.4s, v0.4s, v0.4s;\n\t"
-                "sha1su1 v0.4s, v0.4s;\n\t"
-
-                "sha256h q0, q0, v0.4s;\n\t"
-                "sha256h2 q0, q0, v0.4s;\n\t"
-                "sha1p q0, s0, v0.4s;\n\t"
-                "sha256su0 v0.4s, v0.4s;\n\t"
-                "sha256su1 v0.4s, v0.4s, v31.4s;\n\t"
-
-                "aese v0.16b, v0.16b;\n\t"
-                "aesd v0.16b, v0.16b;\n\t"
-                "aesmc v0.16b, v0.16b;\n\t"
-                "aesimc v0.16b, v0.16b;\n\t"
 
-                "pmull v0.1q, v0.1d, v31.1d;\n\t"
-                "pmull2 v0.1q, v0.2d, v31.2d;\n\t"
+                /* Test for SHA512 instructions */
+                "sha512h q0, q0, v0.2d;\n\t"
+                "sha512h2 q0, q0, v0.2d;\n\t"
+                "sha512su0 v0.2d, v0.2d;\n\t"
+                "sha512su1 v0.2d, v0.2d, v31.2d;\n\t"
+
+                /* Test for SHA3 instructions */
+                "bcax v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+                "eor3 v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+                "rax1 v0.2d, v1.2d, v2.2d;\n\t"
+                "xar v0.2d, v1.2d, v2.2d, \#1;\n\t"
+
+                /* Test for SM3 instructions */
+                "sm3partw1 v0.4s, v1.4s, v2.4s;\n\t"
+                "sm3partw2 v0.4s, v1.4s, v2.4s;\n\t"
+                "sm3ss1 v0.4s, v1.4s, v2.4s, v3.4s;\n\t"
+                "sm3tt1a v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt1b v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt2a v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt2b v0.4s, v1.4s, v2.s[0];\n\t"
+
+                /* Test for SM4 instructions */
+                "sm4e v0.4s, v1.4s;\n\t"
+                "sm4ekey v0.4s, v1.4s, v2.4s;\n\t"
                 );
             void testfn(void);
 
@@ -19473,26 +19711,157 @@ main (void)
 _ACEOF
 if ac_fn_c_try_link "$LINENO"
 then :
-  gcry_cv_gcc_inline_asm_aarch64_crypto=yes
+  gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=yes
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
         fi
 fi
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_crypto" >&5
-printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_crypto" >&6; }
-if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" >&5
+printf "%s\n" "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" >&6; }
+if test "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" = "yes" ; then
 
-printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO 1" >>confdefs.h
+printf "%s\n" "#define HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4 1" >>confdefs.h
+
+fi
+
+
+#
+# Check whether compiler supports AArch64/NEON/crypto intrinsics
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports AArch64/NEON/crypto intrinsics" >&5
+printf %s "checking whether compiler supports AArch64/NEON/crypto intrinsics... " >&6; }
+if test ${gcry_cv_cc_aarch64_neon_intrinsics+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "aarch64" ||
+         test "$try_asm_modules" != "yes" ; then
+       gcry_cv_cc_aarch64_neon_intrinsics="n/a"
+      else
+       gcry_cv_cc_aarch64_neon_intrinsics=no
+       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+         #define __m128i uint64x2_t
+         #define vpsrldq128(s, a, o) \
+           ({ uint64x2_t __tmp = { 0, 0 }; \
+               o = (__m128i)vextq_u8((uint8x16_t)a, \
+                                     (uint8x16_t)__tmp, (s) & 15); })
+         #define vaesenclast128(a, b, o) \
+           (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+         #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+         static inline __attribute__((always_inline)) __m128i
+         fn2(__m128i a)
+         {
+           vpsrldq128(2, a, a);
+           return a;
+         }
+         __m128i fn(__m128i in)
+         {
+           __m128i x;
+           memory_barrier_with_vec(in);
+           x = fn2(in);
+           memory_barrier_with_vec(x);
+           vaesenclast128(in, x, in);
+           memory_barrier_with_vec(in);
+           return in;
+         }
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+  gcry_cv_cc_aarch64_neon_intrinsics=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+      fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_cc_aarch64_neon_intrinsics" >&5
+printf "%s\n" "$gcry_cv_cc_aarch64_neon_intrinsics" >&6; }
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then
+
+printf "%s\n" "#define HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS 1" >>confdefs.h
+
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto"
+
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" &&
+   test "$mpi_cpu_arch" = "aarch64" &&
+   test "$try_asm_modules" = "yes" ; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags" >&5
+printf %s "checking whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags... " >&6; }
+if test ${gcry_cv_cc_aarch64_neon_intrinsics_cflags+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  gcry_cv_cc_aarch64_neon_intrinsics_cflags=no
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+       #define __m128i uint64x2_t
+       #define vpsrldq128(s, a, o) \
+         ({ uint64x2_t __tmp = { 0, 0 }; \
+             o = (__m128i)vextq_u8((uint8x16_t)a, \
+                                   (uint8x16_t)__tmp, (s) & 15); })
+       #define vaesenclast128(a, b, o) \
+         (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+       #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+       static inline __attribute__((always_inline)) __m128i
+       fn2(__m128i a)
+       {
+         vpsrldq128(2, a, a);
+         return a;
+       }
+       __m128i fn(__m128i in)
+       {
+         __m128i x;
+         memory_barrier_with_vec(in);
+         x = fn2(in);
+         memory_barrier_with_vec(x);
+         vaesenclast128(in, x, in);
+         memory_barrier_with_vec(in);
+         return in;
+       }
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+  gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_cc_aarch64_neon_intrinsics_cflags" >&5
+printf "%s\n" "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" >&6; }
+  if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then
+
+printf "%s\n" "#define HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS 1" >>confdefs.h
+
+
+printf "%s\n" "#define HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS 1" >>confdefs.h
 
+  fi
+fi
+
+ if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes"; then
+  ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_TRUE=
+  ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_FALSE='#'
+else
+  ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_TRUE='#'
+  ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_FALSE=
 fi
 
 
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
 #
-# Check whether PowerPC AltiVec/VSX intrinsics
+# Check whether compiler supports PowerPC AltiVec/VSX intrinsics
 #
-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports PowerPC AltiVec/VSX intrinsics" >&5
-printf %s "checking whether compiler supports PowerPC AltiVec/VSX intrinsics... " >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics" >&5
+printf %s "checking whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics... " >&6; }
 if test ${gcry_cv_cc_ppc_altivec+y}
 then :
   printf %s "(cached) " >&6
@@ -19507,10 +19876,16 @@ else $as_nop
 #include <altivec.h>
          typedef vector unsigned char block;
          typedef vector unsigned int vecu32;
+         static inline __attribute__((always_inline)) vecu32
+         vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx)
+         {
+           return vec_sld (a, b, (4 * idx) & 15);
+         }
          block fn(block in)
          {
            block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
            vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
+           y = vec_sld_u32 (y, y, 3);
            return vec_cipher_be (t, in) ^ (block)y;
          }
 
@@ -19531,11 +19906,11 @@ printf "%s\n" "#define HAVE_COMPATIBLE_CC_PPC_ALTIVEC 1" >>confdefs.h
 fi
 
 _gcc_cflags_save=$CFLAGS
-CFLAGS="$CFLAGS -maltivec -mvsx -mcrypto"
+CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto"
 
 if test "$gcry_cv_cc_ppc_altivec" = "no" &&
-    test "$mpi_cpu_arch" = "ppc" &&
-    test "$try_asm_modules" == "yes" ; then
+   test "$mpi_cpu_arch" = "ppc" &&
+   test "$try_asm_modules" = "yes" ; then
   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags" >&5
 printf %s "checking whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags... " >&6; }
 if test ${gcry_cv_cc_ppc_altivec_cflags+y}
@@ -19548,12 +19923,19 @@ else $as_nop
 #include <altivec.h>
        typedef vector unsigned char block;
        typedef vector unsigned int vecu32;
+       static inline __attribute__((always_inline)) vecu32
+       vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx)
+       {
+         return vec_sld (a, b, (4 * idx) & 15);
+       }
        block fn(block in)
        {
          block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
          vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
+         y = vec_sld_u32 (y, y, 3);
          return vec_cipher_be (t, in) ^ (block)y;
        }
+
 _ACEOF
 if ac_fn_c_try_compile "$LINENO"
 then :
@@ -19689,6 +20071,96 @@ printf "%s\n" "#define HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00 1" >>confdefs.h
 fi
 
 
+#
+# Check whether compiler supports GCC PowerPC target attributes
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports GCC PowerPC target attributes" >&5
+printf %s "checking whether compiler supports GCC PowerPC target attributes... " >&6; }
+if test ${gcry_cv_gcc_attribute_ppc_target+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "ppc" ; then
+          gcry_cv_gcc_attribute_ppc_target="n/a"
+        else
+          gcry_cv_gcc_attribute_ppc_target=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+void __attribute__((always_inline)) inline aifn(void) {}
+            void __attribute__((target("cpu=power8"))) testfn8(void) {aifn();}
+            void __attribute__((target("cpu=power9"))) testfn9(void)
+            { testfn8(); aifn(); }
+
+int
+main (void)
+{
+ testfn9(); aifn();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_gcc_attribute_ppc_target=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_gcc_attribute_ppc_target" >&5
+printf "%s\n" "$gcry_cv_gcc_attribute_ppc_target" >&6; }
+if test "$gcry_cv_gcc_attribute_ppc_target" = "yes" ; then
+
+printf "%s\n" "#define HAVE_GCC_ATTRIBUTE_PPC_TARGET 1" >>confdefs.h
+
+fi
+
+
+#
+# Check whether compiler supports clang PowerPC target attributes
+#
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether compiler supports clang PowerPC target attributes" >&5
+printf %s "checking whether compiler supports clang PowerPC target attributes... " >&6; }
+if test ${gcry_cv_clang_attribute_ppc_target+y}
+then :
+  printf %s "(cached) " >&6
+else $as_nop
+  if test "$mpi_cpu_arch" != "ppc" ; then
+          gcry_cv_clang_attribute_ppc_target="n/a"
+        else
+          gcry_cv_clang_attribute_ppc_target=no
+          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+void __attribute__((always_inline)) inline aifn(void) {}
+            void __attribute__((target("arch=pwr8"))) testfn8(void) {aifn();}
+            void __attribute__((target("arch=pwr9"))) testfn9(void)
+            { testfn8(); aifn(); }
+
+int
+main (void)
+{
+ testfn9(); aifn();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"
+then :
+  gcry_cv_clang_attribute_ppc_target=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam \
+    conftest$ac_exeext conftest.$ac_ext
+        fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $gcry_cv_clang_attribute_ppc_target" >&5
+printf "%s\n" "$gcry_cv_clang_attribute_ppc_target" >&6; }
+if test "$gcry_cv_clang_attribute_ppc_target" = "yes" ; then
+
+printf "%s\n" "#define HAVE_CLANG_ATTRIBUTE_PPC_TARGET 1" >>confdefs.h
+
+fi
+
+
 #
 # Check whether GCC inline assembler supports zSeries instructions
 #
@@ -20028,6 +20500,12 @@ if test "x$ac_cv_func_getentropy" = xyes
 then :
   printf "%s\n" "#define HAVE_GETENTROPY 1" >>confdefs.h
 
+fi
+ac_fn_c_check_func "$LINENO" "sysctlbyname" "ac_cv_func_sysctlbyname"
+if test "x$ac_cv_func_sysctlbyname" = xyes
+then :
+  printf "%s\n" "#define HAVE_SYSCTLBYNAME 1" >>confdefs.h
+
 fi
 
 
@@ -20179,7 +20657,7 @@ else $as_nop
 #include <sys/types.h>
 #include <fcntl.h>
 
-int main()
+int main(void)
 {
     char *pool;
     int err;
@@ -20732,6 +21210,16 @@ if test x"$avx2support" = xyes ; then
     avx2support="no (unsupported by compiler)"
   fi
 fi
+if test x"$avx512support" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_avx512" != "yes" ; then
+    avx512support="no (unsupported by compiler)"
+  fi
+fi
+if test x"$gfnisupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_gfni" != "yes" ; then
+    gfnisupport="no (unsupported by compiler)"
+  fi
+fi
 if test x"$neonsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
     if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
 if test x"$armcryptosupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then
     if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then
-      neonsupport="no (unsupported by compiler)"
+      armcryptosupport="no (unsupported by compiler)"
+    fi
+  fi
+fi
+if test x"$svesupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_sve" != "yes" ; then
+    if test "$gcry_cv_gcc_inline_asm_aarch64_sve" != "yes" ; then
+      svesupport="no (unsupported by compiler)"
+    fi
+  fi
+fi
+if test x"$sve2support" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_sve2" != "yes" ; then
+    if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" != "yes" ; then
+      sve2support="no (unsupported by compiler)"
     fi
   fi
 fi
@@ -20776,6 +21278,16 @@ if test x"$avx2support" = xyes ; then
 
 printf "%s\n" "#define ENABLE_AVX2_SUPPORT 1" >>confdefs.h
 
+fi
+if test x"$avx512support" = xyes ; then
+
+printf "%s\n" "#define ENABLE_AVX512_SUPPORT 1" >>confdefs.h
+
+fi
+if test x"$gfnisupport" = xyes ; then
+
+printf "%s\n" "#define ENABLE_GFNI_SUPPORT 1" >>confdefs.h
+
 fi
 if test x"$neonsupport" = xyes ; then
 
@@ -20786,6 +21298,16 @@ if test x"$armcryptosupport" = xyes ; then
 
 printf "%s\n" "#define ENABLE_ARM_CRYPTO_SUPPORT 1" >>confdefs.h
 
+fi
+if test x"$svesupport" = xyes ; then
+
+printf "%s\n" "#define ENABLE_SVE_SUPPORT 1" >>confdefs.h
+
+fi
+if test x"$sve2support" = xyes ; then
+
+printf "%s\n" "#define ENABLE_SVE2_SUPPORT 1" >>confdefs.h
+
 fi
 if test x"$ppccryptosupport" = xyes ; then
 
@@ -21008,6 +21530,10 @@ printf "%s\n" "#define USE_AES 1" >>confdefs.h
 
          # Build with the Padlock implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-padlock.lo"
+
+         # Build with the VAES/AVX2 implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-i386.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-avx2-i386.lo"
       ;;
    esac
 fi
@@ -21079,6 +21605,11 @@ printf "%s\n" "#define USE_SERPENT 1" >>confdefs.h
       GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx2-amd64.lo"
    fi
 
+   if test x"$avx512support" = xyes ; then
+      # Build with the AVX512 implementation
+      GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx512-x86.lo"
+   fi
+
    if test x"$neonsupport" = xyes ; then
       # Build with the NEON implementation
       GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-armv7-neon.lo"
@@ -21146,6 +21677,12 @@ printf "%s\n" "#define USE_CAMELLIA 1" >>confdefs.h
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo"
+      ;;
+      powerpc64le-*-*)
+         # Build with the POWER vector implementations
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-ppc8le.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-ppc9le.lo"
       ;;
    esac
 
@@ -21163,6 +21700,12 @@ printf "%s\n" "#define USE_CAMELLIA 1" >>confdefs.h
 
         # Build with the VAES/AVX2 implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo"
+
+        # Build with the GFNI/AVX2 implementation
+        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo"
+
+        # Build with the GFNI/AVX512 implementation
+        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx512-amd64.lo"
       fi
    fi
 fi
@@ -21255,6 +21798,7 @@ printf "%s\n" "#define USE_CHACHA20 1" >>confdefs.h
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
@@ -21263,6 +21807,11 @@ printf "%s\n" "#define USE_CHACHA20 1" >>confdefs.h
       powerpc64le-*-*)
          # Build with the ppc8 vector implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo"
+         # Build with the assembly implementation
+         if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+            test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+            GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-p10le-8x.lo"
+         fi
       ;;
       powerpc64-*-*)
          # Build with the ppc8 vector implementation
@@ -21306,6 +21855,45 @@ printf "%s\n" "#define USE_SM4 1" >>confdefs.h
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx512-amd64.lo"
+      ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv9-aarch64-sve-ce.lo"
+      ;;
+      powerpc64le-*-*)
+         # Build with the ppc64le vector implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-ppc.lo"
+      ;;
+   esac
+fi
+
+
+name=aria
+list=$enabled_ciphers
+found=0
+
+for n in $list; do
+  if test "x$name" = "x$n"; then
+    found=1
+  fi
+done
+
+if test "$found" = "1" ; then
+   GCRYPT_CIPHERS="$GCRYPT_CIPHERS aria.lo"
+
+printf "%s\n" "#define USE_ARIA 1" >>confdefs.h
+
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx2-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-gfni-avx512-amd64.lo"
       ;;
    esac
 fi
@@ -21656,6 +22244,7 @@ printf "%s\n" "#define USE_SHA512 1" >>confdefs.h
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-amd64.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx-amd64.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx2-bmi2-amd64.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx512-amd64.lo"
       ;;
       i?86-*-*)
          # Build with the assembly implementation
@@ -21665,6 +22254,10 @@ printf "%s\n" "#define USE_SHA512 1" >>confdefs.h
          # Build with the assembly implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv8-aarch64-ce.lo"
+      ;;
       powerpc64le-*-*)
          # Build with the crypto extension implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
@@ -21706,7 +22299,7 @@ printf "%s\n" "#define USE_SHA3 1" >>confdefs.h
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
-         :
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-amd64-avx512.lo"
       ;;
    esac
 
@@ -21780,7 +22373,9 @@ printf "%s\n" "#define USE_BLAKE2 1" >>confdefs.h
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx512.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx512.lo"
       ;;
    esac
 fi
@@ -21810,6 +22405,7 @@ printf "%s\n" "#define USE_SM3 1" >>confdefs.h
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo"
+        GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo"
      ;;
    esac
 fi
@@ -21869,6 +22465,16 @@ case "${host}" in
   s390x-*-*)
     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo"
   ;;
+  x86_64-*-*)
+    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo"
+  ;;
+  powerpc64le-*-*)
+    # Build with the assembly implementation
+    if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+       test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-p10le.lo"
+    fi
+  ;;
 esac
 
 
 #
 # Provide information about the build.
 #
-BUILD_REVISION="aa161086"
+BUILD_REVISION="9d94d784"
 
 
 printf "%s\n" "#define BUILD_REVISION \"$BUILD_REVISION\"" >>confdefs.h
 
 
 BUILD_VERSION=`echo "$PACKAGE_VERSION" | sed 's/\([0-9.]*\).*/\1./'`
-BUILD_VERSION="${BUILD_VERSION}43542"
+BUILD_VERSION="${BUILD_VERSION}40340"
 BUILD_FILEVERSION=`echo "${BUILD_VERSION}" | tr . ,`
 
 
@@ -22155,6 +22761,8 @@ printf "%s\n" "#define BUILD_TIMESTAMP \"$BUILD_TIMESTAMP\"" >>confdefs.h
 # And create the files.
 ac_config_files="$ac_config_files Makefile m4/Makefile compat/Makefile mpi/Makefile cipher/Makefile random/Makefile doc/Makefile src/Makefile src/gcrypt.h src/libgcrypt-config src/libgcrypt.pc src/versioninfo.rc tests/Makefile"
 
+ac_config_files="$ac_config_files tests/hashtest-6g"
+
 ac_config_files="$ac_config_files tests/hashtest-256g"
 
 ac_config_files="$ac_config_files tests/basic-disable-all-hwf"
@@ -22313,10 +22921,6 @@ if test -z "${HAVE_W32CE_SYSTEM_TRUE}" && test -z "${HAVE_W32CE_SYSTEM_FALSE}";
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 
-if test -z "${ENABLE_RANDOM_DAEMON_TRUE}" && test -z "${ENABLE_RANDOM_DAEMON_FALSE}"; then
-  as_fn_error $? "conditional \"ENABLE_RANDOM_DAEMON\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${USE_HMAC_BINARY_CHECK_TRUE}" && test -z "${USE_HMAC_BINARY_CHECK_FALSE}"; then
   as_fn_error $? "conditional \"USE_HMAC_BINARY_CHECK\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -22333,6 +22937,10 @@ if test -z "${HAVE_LD_VERSION_SCRIPT_TRUE}" && test -z "${HAVE_LD_VERSION_SCRIPT
   as_fn_error $? "conditional \"HAVE_LD_VERSION_SCRIPT\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_GPGRT_CONFIG_TRUE}" && test -z "${USE_GPGRT_CONFIG_FALSE}"; then
+  as_fn_error $? "conditional \"USE_GPGRT_CONFIG\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${MPI_MOD_ASM_MPIH_ADD1_TRUE}" && test -z "${MPI_MOD_ASM_MPIH_ADD1_FALSE}"; then
   as_fn_error $? "conditional \"MPI_MOD_ASM_MPIH_ADD1\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -22405,6 +23013,14 @@ if test -z "${MPI_MOD_C_UDIV_QRNND_TRUE}" && test -z "${MPI_MOD_C_UDIV_QRNND_FAL
   as_fn_error $? "conditional \"MPI_MOD_C_UDIV_QRNND\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_TRUE}" && test -z "${ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS_FALSE}"; then
+  as_fn_error $? "conditional \"ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_TRUE}" && test -z "${ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS_FALSE}"; then
+  as_fn_error $? "conditional \"ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_TRUE}" && test -z "${ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_FALSE}"; then
   as_fn_error $? "conditional \"ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -22823,7 +23439,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by libgcrypt $as_me 1.10.3, which was
+This file was extended by libgcrypt $as_me 1.11.0, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22895,7 +23511,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-libgcrypt config.status 1.10.3
+libgcrypt config.status 1.11.0
 configured by $0, generated by GNU Autoconf 2.71,
   with options \\"\$ac_cs_config\\"
 
@@ -23404,6 +24020,7 @@ do
     "src/libgcrypt.pc") CONFIG_FILES="$CONFIG_FILES src/libgcrypt.pc" ;;
     "src/versioninfo.rc") CONFIG_FILES="$CONFIG_FILES src/versioninfo.rc" ;;
     "tests/Makefile") CONFIG_FILES="$CONFIG_FILES tests/Makefile" ;;
+    "tests/hashtest-6g") CONFIG_FILES="$CONFIG_FILES tests/hashtest-6g" ;;
     "tests/hashtest-256g") CONFIG_FILES="$CONFIG_FILES tests/hashtest-256g" ;;
     "tests/basic-disable-all-hwf") CONFIG_FILES="$CONFIG_FILES tests/basic-disable-all-hwf" ;;
 
@@ -24919,6 +25536,7 @@ _LT_EOF
     "gcrypt-conf":C)
 chmod +x src/libgcrypt-config
  ;;
+    "tests/hashtest-6g":F) chmod +x tests/hashtest-6g ;;
     "tests/hashtest-256g":F) chmod +x tests/hashtest-256g ;;
     "tests/basic-disable-all-hwf":F) chmod +x tests/basic-disable-all-hwf ;;
 
@@ -25101,12 +25719,24 @@ test -n "$detection_module" || detection_module="none"
      echo "        Try using Intel AVX2:      $avx2support" 1>&6
 
 
+     echo "        Try using Intel AVX512:    $avx512support" 1>&6
+
+
+     echo "        Try using Intel GFNI:      $gfnisupport" 1>&6
+
+
      echo "        Try using ARM NEON:        $neonsupport" 1>&6
 
 
      echo "        Try using ARMv8 crypto:    $armcryptosupport" 1>&6
 
 
+     echo "        Try using ARMv8 SVE:       $svesupport" 1>&6
+
+
+     echo "        Try using ARMv9 SVE2:      $sve2support" 1>&6
+
+
      echo "        Try using PPC crypto:      $ppccryptosupport" 1>&6
 
 
@@ -25131,11 +25761,3 @@ cat <<G10EOF
 G10EOF
 fi
 
-if test -n "$gpl"; then
-  echo "Please note that you are building a version of Libgcrypt with"
-  echo "  $gpl"
-  echo "included.  These parts are licensed under the GPL and thus the"
-  echo "use of this library has to comply with the conditions of the GPL."
-  echo ""
-fi
-
index 6aef89781e796806dadd3a0056d3c9becf5347af..1d06ca3b174c071ade9a361ed81c922140d32221 100644 (file)
@@ -1,7 +1,7 @@
 # Configure.ac script for Libgcrypt
 # Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006,
 #               2007, 2008, 2009, 2011 Free Software Foundation, Inc.
-# Copyright (C) 2012-2021  g10 Code GmbH
+# Copyright (C) 2012-2024  g10 Code GmbH
 #
 # This file is part of Libgcrypt.
 #
@@ -30,8 +30,8 @@ min_automake_version="1.14"
 # for the LT versions.
 m4_define([mym4_package],[libgcrypt])
 m4_define([mym4_major], [1])
-m4_define([mym4_minor], [10])
-m4_define([mym4_micro], [3])
+m4_define([mym4_minor], [11])
+m4_define([mym4_micro], [0])
 
 # Below is m4 magic to extract and compute the git revision number,
 # the decimalized short revision number, a beta version string and a
@@ -55,9 +55,9 @@ AC_INIT([mym4_package],[mym4_version],[https://bugs.gnupg.org])
 #   (Interfaces removed:    CURRENT++, AGE=0, REVISION=0)
 #   (Interfaces added:      CURRENT++, AGE++, REVISION=0)
 #   (No interfaces changed:                   REVISION++)
-LIBGCRYPT_LT_CURRENT=24
-LIBGCRYPT_LT_AGE=4
-LIBGCRYPT_LT_REVISION=3
+LIBGCRYPT_LT_CURRENT=25
+LIBGCRYPT_LT_AGE=5
+LIBGCRYPT_LT_REVISION=0
 ################################################
 
 AC_SUBST(LIBGCRYPT_LT_CURRENT)
@@ -72,7 +72,7 @@ LIBGCRYPT_CONFIG_API_VERSION=1
 
 # If you change the required gpg-error version, please remove
 # unnecessary error code defines in src/gcrypt-int.h.
-NEED_GPG_ERROR_VERSION=1.27
+NEED_GPG_ERROR_VERSION=1.49
 
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_SRCDIR([src/libgcrypt.vers])
@@ -150,6 +150,7 @@ AM_PROG_AS
 AC_SEARCH_LIBS([strerror],[cposix])
 AC_PROG_INSTALL
 AC_PROG_AWK
+AC_PROG_FGREP
 
 # Taken from mpfr-4.0.1, then modified for LDADD_FOR_TESTS_KLUDGE
 dnl Under Linux, make sure that the old dtags are used if LD_LIBRARY_PATH
@@ -211,7 +212,7 @@ LIBGCRYPT_CONFIG_HOST="$host"
 # Definitions for symmetric ciphers.
 available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed"
 available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20"
-available_ciphers="$available_ciphers sm4"
+available_ciphers="$available_ciphers sm4 aria"
 enabled_ciphers=""
 
 # Definitions for public-key ciphers.
@@ -359,6 +360,7 @@ AC_CHECK_SIZEOF(unsigned short, 2)
 AC_CHECK_SIZEOF(unsigned int, 4)
 AC_CHECK_SIZEOF(unsigned long, 4)
 AC_CHECK_SIZEOF(unsigned long long, 0)
+AC_CHECK_SIZEOF(unsigned __int128, 0)
 AC_CHECK_SIZEOF(void *, 0)
 
 AC_TYPE_UINTPTR_T
@@ -522,16 +524,6 @@ AC_DEFINE_UNQUOTED(EGD_SOCKET_NAME, "$egd_socket_name",
                    [Define if you don't want the default EGD socket name.
                     For details see cipher/rndegd.c])
 
-# Implementation of the --enable-random-daemon
-AC_MSG_CHECKING([whether the experimental random daemon is requested])
-AC_ARG_ENABLE([random-daemon],
-              AS_HELP_STRING([--enable-random-daemon],
-                             [Build the experimental gcryptrnd]),
-              [enable_random_daemon=$enableval],
-              [enable_random_daemon=no])
-AC_MSG_RESULT($enable_random_daemon)
-AM_CONDITIONAL(ENABLE_RANDOM_DAEMON, test x$enable_random_daemon = xyes)
-
 
 # Implementation of --disable-asm.
 AC_MSG_CHECKING([whether MPI and cipher assembler modules are requested])
@@ -545,17 +537,6 @@ if test "$try_asm_modules" != yes ; then
     AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure])
 fi
 
-# Implementation of the --enable-m-guard switch.
-AC_MSG_CHECKING([whether memory guard is requested])
-AC_ARG_ENABLE(m-guard,
-              AS_HELP_STRING([--enable-m-guard],
-                             [Enable memory guard facility]),
-              [use_m_guard=$enableval], [use_m_guard=no])
-AC_MSG_RESULT($use_m_guard)
-if test "$use_m_guard" = yes ; then
-    AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature])
-fi
-
 # Implementation of the --enable-large-data-tests switch.
 AC_MSG_CHECKING([whether to run large data tests])
 AC_ARG_ENABLE(large-data-tests,
@@ -686,6 +667,22 @@ AC_ARG_ENABLE(avx2-support,
              avx2support=$enableval,avx2support=yes)
 AC_MSG_RESULT($avx2support)
 
+# Implementation of the --disable-avx512-support switch.
+AC_MSG_CHECKING([whether AVX512 support is requested])
+AC_ARG_ENABLE(avx512-support,
+              AS_HELP_STRING([--disable-avx512-support],
+                 [Disable support for the Intel AVX512 instructions]),
+             avx512support=$enableval,avx512support=yes)
+AC_MSG_RESULT($avx512support)
+
+# Implementation of the --disable-gfni-support switch.
+AC_MSG_CHECKING([whether GFNI support is requested])
+AC_ARG_ENABLE(gfni-support,
+              AS_HELP_STRING([--disable-gfni-support],
+                 [Disable support for the Intel GFNI instructions]),
+             gfnisupport=$enableval,gfnisupport=yes)
+AC_MSG_RESULT($gfnisupport)
+
 # Implementation of the --disable-neon-support switch.
 AC_MSG_CHECKING([whether NEON support is requested])
 AC_ARG_ENABLE(neon-support,
@@ -702,6 +699,22 @@ AC_ARG_ENABLE(arm-crypto-support,
              armcryptosupport=$enableval,armcryptosupport=yes)
 AC_MSG_RESULT($armcryptosupport)
 
+# Implementation of the --disable-sve-support switch.
+AC_MSG_CHECKING([whether SVE support is requested])
+AC_ARG_ENABLE(sve-support,
+              AS_HELP_STRING([--disable-sve-support],
+                 [Disable support for the ARMv8 SVE instructions]),
+             svesupport=$enableval,svesupport=yes)
+AC_MSG_RESULT($svesupport)
+
+# Implementation of the --disable-sve2-support switch.
+AC_MSG_CHECKING([whether SVE2 support is requested])
+AC_ARG_ENABLE(sve2-support,
+              AS_HELP_STRING([--disable-sve2-support],
+                 [Disable support for the ARMv9 SVE2 instructions]),
+             sve2support=$enableval,sve2support=yes)
+AC_MSG_RESULT($sve2support)
+
 # Implementation of the --disable-ppc-crypto-support switch.
 AC_MSG_CHECKING([whether PPC crypto support is requested])
 AC_ARG_ENABLE(ppc-crypto-support,
@@ -822,43 +835,8 @@ fi
 AC_DEFINE(GPG_ERR_SOURCE_DEFAULT, GPG_ERR_SOURCE_GCRYPT,
           [The default error source for libgcrypt.])
 
-#
-# Check whether the GNU Pth library is available.  We require this
-# to build the optional gcryptrnd program.
-#
-AC_ARG_WITH(pth-prefix,
-            AS_HELP_STRING([--with-pth-prefix=PFX],
-                           [prefix where GNU Pth is installed (optional)]),
-     pth_config_prefix="$withval", pth_config_prefix="")
-if test x$pth_config_prefix != x ; then
-   PTH_CONFIG="$pth_config_prefix/bin/pth-config"
-fi
-if test "$enable_random_daemon" = "yes"; then
-  AC_PATH_PROG(PTH_CONFIG, pth-config, no)
-  if test "$PTH_CONFIG" = "no"; then
-    AC_MSG_WARN([[
-***
-*** To build the Libgcrypt's random number daemon
-*** we need the support of the GNU Portable Threads Library.
-*** Download it from ftp://ftp.gnu.org/gnu/pth/
-*** On a Debian GNU/Linux system you might want to try
-***   apt-get install libpth-dev
-***]])
-  else
-    GNUPG_PTH_VERSION_CHECK([1.3.7])
-    if test $have_pth = yes; then
-       PTH_CFLAGS=`$PTH_CONFIG --cflags`
-       PTH_LIBS=`$PTH_CONFIG --ldflags`
-       PTH_LIBS="$PTH_LIBS `$PTH_CONFIG --libs --all`"
-       AC_DEFINE(USE_GNU_PTH, 1,
-                [Defined if the GNU Portable Thread Library should be used])
-       AC_DEFINE(HAVE_PTH, 1,
-                [Defined if the GNU Pth is available])
-    fi
-  fi
-fi
-AC_SUBST(PTH_CFLAGS)
-AC_SUBST(PTH_LIBS)
+AM_CONDITIONAL(USE_GPGRT_CONFIG, [test -n "$GPGRT_CONFIG" \
+                                  -a "$ac_cv_path_GPG_ERROR_CONFIG" = no])
 
 #
 # Check whether pthreads is available
@@ -881,7 +859,7 @@ AC_SEARCH_LIBS(setsockopt, [nsl])
 #### Checks for header files. ####
 ##################################
 
-AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h)
+AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h sys/sysctl.h)
 
 
 ##########################################
@@ -896,8 +874,6 @@ AC_TYPE_PID_T
 
 AC_CHECK_TYPES([byte, ushort, u16, u32, u64])
 
-gl_TYPE_SOCKLEN_T
-
 #
 # Check for __builtin_bswap32 intrinsic.
 #
@@ -1242,7 +1218,7 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat
                 ".text\n\t"
                 /* Following causes error if assembler ignored '.syntax unified'.  */
                 "asmfunc:\n\t"
-                "add %r0, %r0, %r4, ror #12;\n\t"
+                "add r0, r0, r4, ror #12;\n\t"
 
                 /* Test if '.type' and '.size' are supported.  */
                 ".size asmfunc,.-asmfunc;\n\t"
@@ -1305,8 +1281,7 @@ AC_CACHE_CHECK([whether GCC assembler supports for CFI directives],
                 ".cfi_restore_state\n\t"
                 ".long 0\n\t"
                 ".cfi_endproc\n\t"
-            );
-            void asmfunc(void)]])],
+            );]])],
           [gcry_cv_gcc_asm_cfi_directives=yes])])
 if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then
    AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1,
@@ -1392,6 +1367,8 @@ if test "$mpi_cpu_arch" != "x86" ; then
    sse41support="n/a"
    avxsupport="n/a"
    avx2support="n/a"
+   avx512support="n/a"
+   gfnisupport="n/a"
    padlocksupport="n/a"
    drngsupport="n/a"
 fi
@@ -1400,6 +1377,8 @@ if test "$mpi_cpu_arch" != "arm" ; then
    if test "$mpi_cpu_arch" != "aarch64" ; then
      neonsupport="n/a"
      armcryptosupport="n/a"
+     svesupport="n/a"
+     sve2support="n/a"
    fi
 fi
 
@@ -1420,6 +1399,21 @@ _gcc_cflags_save=$CFLAGS
 CFLAGS="$CFLAGS -Werror"
 
 
+#
+# Check whether compiler supports 'optimize' function attribute
+#
+AC_CACHE_CHECK([whether compiler supports 'optimize' function attribute],
+       [gcry_cv_gcc_attribute_optimize],
+       [gcry_cv_gcc_attribute_optimize=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[int __attribute__ ((optimize("-O2"))) fn(int i){return i;}]])],
+          [gcry_cv_gcc_attribute_optimize=yes])])
+if test "$gcry_cv_gcc_attribute_optimize" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_ATTRIBUTE_OPTIMIZE,1,
+     [Defined if compiler supports "__attribute__ ((optimize))" function attribute])
+fi
+
+
 #
 # Check whether compiler supports 'ms_abi' function attribute.
 #
@@ -1642,6 +1636,32 @@ if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports AVX512 instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AVX512 instructions],
+       [gcry_cv_gcc_inline_asm_avx512],
+       [if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_avx512="n/a"
+        else
+          gcry_cv_gcc_inline_asm_avx512=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void a(void) {
+              __asm__("xgetbv; vpopcntq %%zmm7, %%zmm1%{%%k1%}%{z%};\n\t":::"cc");
+              __asm__("vpexpandb %%zmm3, %%zmm1;\n\t":::"cc");
+              __asm__("vpxorq %%xmm7, %%xmm7, %%xmm7;\n\t":::"cc");
+              __asm__("vpxorq %%ymm7, %%ymm7, %%ymm7;\n\t":::"cc");
+              __asm__("vpxorq (%%eax)%{1to8%}, %%zmm7, %%zmm7;\n\t":::"cc");
+            }]], [ a(); ] )],
+          [gcry_cv_gcc_inline_asm_avx512=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_avx512" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX512,1,
+     [Defined if inline assembler supports AVX512 instructions])
+fi
+
+
 #
 # Check whether GCC inline assembler supports VAES and VPCLMUL instructions
 #
@@ -1667,6 +1687,30 @@ if test "$gcry_cv_gcc_inline_asm_vaes_vpclmul" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports GFNI instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports GFNI instructions],
+       [gcry_cv_gcc_inline_asm_gfni],
+       [if test "$mpi_cpu_arch" != "x86" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_gfni="n/a"
+        else
+          gcry_cv_gcc_inline_asm_gfni=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void a(void) {
+              __asm__("gf2p8affineqb \$123, %%xmm0, %%xmm0;\n\t":::"cc"); /* SSE */
+              __asm__("vgf2p8affineinvqb \$234, %%ymm1, %%ymm1, %%ymm1;\n\t":::"cc"); /* AVX */
+              __asm__("vgf2p8mulb (%%eax), %%zmm2, %%zmm2;\n\t":::"cc"); /* AVX512 */
+            }]], [ a(); ] )],
+          [gcry_cv_gcc_inline_asm_gfni=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_gfni" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_GFNI,1,
+     [Defined if inline assembler supports GFNI instructions])
+fi
+
+
 #
 # Check whether GCC inline assembler supports BMI2 instructions
 #
@@ -1696,6 +1740,47 @@ if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then
 fi
 
 
+#
+# Check whether compiler supports x86/AVX512 intrinsics
+#
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -mavx512f"
+
+AC_CACHE_CHECK([whether compiler supports x86/AVX512 intrinsics],
+      [gcry_cv_cc_x86_avx512_intrinsics],
+      [if test "$mpi_cpu_arch" != "x86" ||
+         test "$try_asm_modules" != "yes" ; then
+       gcry_cv_cc_x86_avx512_intrinsics="n/a"
+      else
+       gcry_cv_cc_x86_avx512_intrinsics=no
+       AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+       [[#include <immintrin.h>
+         __m512i fn(void *in, __m128i y)
+         {
+           __m512i x;
+            x = _mm512_loadu_epi32 (in); /* check the GCC bug 90980. */
+           x = _mm512_maskz_loadu_epi32(_cvtu32_mask16(0xfff0), in)
+                 ^ _mm512_castsi128_si512(y);
+           asm volatile ("vinserti32x4 \$3, %0, %%zmm6, %%zmm6;\n\t"
+                         "vpxord %%zmm6, %%zmm6, %%zmm6"
+                         ::"x"(y),"r"(in):"memory","xmm6");
+           return x;
+         }
+         ]])],
+       [gcry_cv_cc_x86_avx512_intrinsics=yes])
+      fi])
+if test "$gcry_cv_cc_x86_avx512_intrinsics" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS,1,
+           [Defined if underlying compiler supports x86/AVX512 intrinsics])
+fi
+
+AM_CONDITIONAL(ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS,
+              test "$gcry_cv_cc_x86_avx512_intrinsics" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
 #
 # Check whether GCC assembler needs "-Wa,--divide" to correctly handle
 # constant division
@@ -1732,17 +1817,17 @@ fi
 
 
 #
-# Check whether GCC assembler supports features needed for our amd64
+# Check whether GCC assembler supports features needed for our i386/amd64
 # implementations
 #
 if test $amd64_as_feature_detection = yes; then
-  AC_CACHE_CHECK([whether GCC assembler is compatible for amd64 assembly implementations],
-       [gcry_cv_gcc_amd64_platform_as_ok],
+  AC_CACHE_CHECK([whether GCC assembler is compatible for i386/amd64 assembly implementations],
+       [gcry_cv_gcc_x86_platform_as_ok],
        [if test "$mpi_cpu_arch" != "x86" ||
            test "$try_asm_modules" != "yes" ; then
-          gcry_cv_gcc_amd64_platform_as_ok="n/a"
+          gcry_cv_gcc_x86_platform_as_ok="n/a"
         else
-          gcry_cv_gcc_amd64_platform_as_ok=no
+          gcry_cv_gcc_x86_platform_as_ok=no
           AC_LINK_IFELSE([AC_LANG_PROGRAM(
           [[__asm__(
                 /* Test if '.type' and '.size' are supported.  */
@@ -1758,13 +1843,19 @@ if test $amd64_as_feature_detection = yes; then
                  "xorl \$(123456789/12345678), %ebp;\n\t"
             );
             void asmfunc(void);]], [ asmfunc(); ])],
-          [gcry_cv_gcc_amd64_platform_as_ok=yes])
+          [gcry_cv_gcc_x86_platform_as_ok=yes])
         fi])
-  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
-     AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1,
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "yes" &&
+     test "$ac_cv_sizeof_unsigned_long" = "8"; then
+    AC_DEFINE(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS,1,
               [Defined if underlying assembler is compatible with amd64 assembly implementations])
   fi
-  if test "$gcry_cv_gcc_amd64_platform_as_ok" = "no" &&
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "yes" &&
+     test "$ac_cv_sizeof_unsigned_long" = "4"; then
+    AC_DEFINE(HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS,1,
+              [Defined if underlying assembler is compatible with i386 assembly implementations])
+  fi
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "no" &&
      test "$gcry_cv_gcc_attribute_sysv_abi" = "yes" &&
      test "$gcry_cv_gcc_default_abi_is_ms_abi" = "yes"; then
     AC_CACHE_CHECK([whether GCC assembler is compatible for WIN64 assembly implementations],
@@ -1784,6 +1875,25 @@ if test $amd64_as_feature_detection = yes; then
                 [Defined if underlying assembler is compatible with WIN64 assembly implementations])
     fi
   fi
+  if test "$gcry_cv_gcc_x86_platform_as_ok" = "no" &&
+     test "$ac_cv_sizeof_unsigned_long" = "4"; then
+    AC_CACHE_CHECK([whether GCC assembler is compatible for WIN32 assembly implementations],
+      [gcry_cv_gcc_win32_platform_as_ok],
+      [gcry_cv_gcc_win32_platform_as_ok=no
+      AC_LINK_IFELSE([AC_LANG_PROGRAM(
+        [[__asm__(
+              ".text\n\t"
+              ".globl _asmfunc\n\t"
+              "_asmfunc:\n\t"
+              "xorl \$(1234), %ebp;\n\t"
+          );
+          void asmfunc(void);]], [ asmfunc(); ])],
+        [gcry_cv_gcc_win32_platform_as_ok=yes])])
+    if test "$gcry_cv_gcc_win32_platform_as_ok" = "yes" ; then
+      AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS,1,
+                [Defined if underlying assembler is compatible with WIN32 assembly implementations])
+    fi
+  fi
 fi
 
 
@@ -1871,10 +1981,10 @@ AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions],
                 ".fpu neon\n\t"
                 ".text\n\t"
                 "testfn:\n\t"
-                "vld1.64 {%q0-%q1}, [%r0]!;\n\t"
-                "vrev64.8 %q0, %q3;\n\t"
-                "vadd.u64 %q0, %q1;\n\t"
-                "vadd.s64 %d3, %d2, %d3;\n\t"
+                "vld1.64 {q0-q1}, [r0]!;\n\t"
+                "vrev64.8 q0, q3;\n\t"
+                "vadd.u64 q0, q1;\n\t"
+                "vadd.s64 d3, d2, d3;\n\t"
                 );
             void testfn(void);
             ]], [ testfn(); ])],
@@ -2013,9 +2123,220 @@ fi
 
 
 #
-# Check whether PowerPC AltiVec/VSX intrinsics
+# Check whether GCC inline assembler supports AArch64 SVE instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE instructions],
+       [gcry_cv_gcc_inline_asm_aarch64_sve],
+       [if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_sve="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_sve=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[__asm__(
+                ".cpu generic+simd+sve\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+                "mov x0, \#60;\n\t"
+                "whilelo p0.s, xzr, x0;\n\t"
+                "mov z0.s, p0/z, \#55;\n\t"
+                "ld1b {z0.b}, p0/z, [x1];\n\t"
+                );
+            void testfn(void);
+            ]], [ testfn(); ])],
+          [gcry_cv_gcc_inline_asm_aarch64_sve=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_aarch64_sve" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE,1,
+     [Defined if inline assembler supports AArch64 SVE instructions])
+fi
+
+
+#
+# Check whether GCC inline assembler supports AArch64 SVE2 instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE2 instructions],
+       [gcry_cv_gcc_inline_asm_aarch64_sve2],
+       [if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_sve2="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_sve2=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[__asm__(
+                ".cpu generic+simd+sve2\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+                ";\n\t"
+                "eor3 z0.d, z0.d, z1.d, z2.d;\n\t"
+                "ext z8.b, {z20.b, z21.b}, \#3;\n\t"
+                "adclt z0.d, z1.d, z2.d;\n\t"
+                "tbl z0.b, {z8.b, z9.b}, z1.b;\n\t"
+                "addhnb z16.s, z17.d, z18.d;\n\t"
+                "mov z0.s, p0/z, \#55;\n\t"
+                "ld1b {z0.b}, p0/z, [x1];\n\t"
+                );
+            void testfn(void);
+            ]], [ testfn(); ])],
+          [gcry_cv_gcc_inline_asm_aarch64_sve2=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE2,1,
+     [Defined if inline assembler supports AArch64 SVE2 instructions])
+fi
+
+
+#
+# Check whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions],
+       [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4],
+       [if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[__asm__(
+                ".arch armv8.2-a+sha3+sm4\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+
+                /* Test for SHA512 instructions */
+                "sha512h q0, q0, v0.2d;\n\t"
+                "sha512h2 q0, q0, v0.2d;\n\t"
+                "sha512su0 v0.2d, v0.2d;\n\t"
+                "sha512su1 v0.2d, v0.2d, v31.2d;\n\t"
+
+                /* Test for SHA3 instructions */
+                "bcax v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+                "eor3 v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+                "rax1 v0.2d, v1.2d, v2.2d;\n\t"
+                "xar v0.2d, v1.2d, v2.2d, \#1;\n\t"
+
+                /* Test for SM3 instructions */
+                "sm3partw1 v0.4s, v1.4s, v2.4s;\n\t"
+                "sm3partw2 v0.4s, v1.4s, v2.4s;\n\t"
+                "sm3ss1 v0.4s, v1.4s, v2.4s, v3.4s;\n\t"
+                "sm3tt1a v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt1b v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt2a v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt2b v0.4s, v1.4s, v2.s[0];\n\t"
+
+                /* Test for SM4 instructions */
+                "sm4e v0.4s, v1.4s;\n\t"
+                "sm4ekey v0.4s, v1.4s, v2.4s;\n\t"
+                );
+            void testfn(void);
+            ]], [ testfn(); ])],
+          [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4,1,
+     [Defined if inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions])
+fi
+
+
+#
+# Check whether compiler supports AArch64/NEON/crypto intrinsics
 #
-AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX intrinsics],
+AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics],
+      [gcry_cv_cc_aarch64_neon_intrinsics],
+      [if test "$mpi_cpu_arch" != "aarch64" ||
+         test "$try_asm_modules" != "yes" ; then
+       gcry_cv_cc_aarch64_neon_intrinsics="n/a"
+      else
+       gcry_cv_cc_aarch64_neon_intrinsics=no
+       AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+       [[#include <arm_neon.h>
+         #define __m128i uint64x2_t
+         #define vpsrldq128(s, a, o) \
+           ({ uint64x2_t __tmp = { 0, 0 }; \
+               o = (__m128i)vextq_u8((uint8x16_t)a, \
+                                     (uint8x16_t)__tmp, (s) & 15); })
+         #define vaesenclast128(a, b, o) \
+           (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+         #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+         static inline __attribute__((always_inline)) __m128i
+         fn2(__m128i a)
+         {
+           vpsrldq128(2, a, a);
+           return a;
+         }
+         __m128i fn(__m128i in)
+         {
+           __m128i x;
+           memory_barrier_with_vec(in);
+           x = fn2(in);
+           memory_barrier_with_vec(x);
+           vaesenclast128(in, x, in);
+           memory_barrier_with_vec(in);
+           return in;
+         }
+         ]])],
+       [gcry_cv_cc_aarch64_neon_intrinsics=yes])
+      fi])
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+           [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto"
+
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" &&
+   test "$mpi_cpu_arch" = "aarch64" &&
+   test "$try_asm_modules" = "yes" ; then
+  AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags],
+    [gcry_cv_cc_aarch64_neon_intrinsics_cflags],
+    [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+      [[#include <arm_neon.h>
+       #define __m128i uint64x2_t
+       #define vpsrldq128(s, a, o) \
+         ({ uint64x2_t __tmp = { 0, 0 }; \
+             o = (__m128i)vextq_u8((uint8x16_t)a, \
+                                   (uint8x16_t)__tmp, (s) & 15); })
+       #define vaesenclast128(a, b, o) \
+         (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+       #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+       static inline __attribute__((always_inline)) __m128i
+       fn2(__m128i a)
+       {
+         vpsrldq128(2, a, a);
+         return a;
+       }
+       __m128i fn(__m128i in)
+       {
+         __m128i x;
+         memory_barrier_with_vec(in);
+         x = fn2(in);
+         memory_barrier_with_vec(x);
+         vaesenclast128(in, x, in);
+         memory_barrier_with_vec(in);
+         return in;
+       }
+       ]])],
+      [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])])
+  if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+             [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+    AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS,1,
+             [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags])
+  fi
+fi
+
+AM_CONDITIONAL(ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS,
+              test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
+#
+# Check whether compiler supports PowerPC AltiVec/VSX intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics],
       [gcry_cv_cc_ppc_altivec],
       [if test "$mpi_cpu_arch" != "ppc" ||
          test "$try_asm_modules" != "yes" ; then
@@ -2026,10 +2347,16 @@ AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX intrinsics],
        [[#include <altivec.h>
          typedef vector unsigned char block;
          typedef vector unsigned int vecu32;
+         static inline __attribute__((always_inline)) vecu32
+         vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx)
+         {
+           return vec_sld (a, b, (4 * idx) & 15);
+         }
          block fn(block in)
          {
            block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
            vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
+           y = vec_sld_u32 (y, y, 3);
            return vec_cipher_be (t, in) ^ (block)y;
          }
          ]])],
@@ -2041,11 +2368,11 @@ if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then
 fi
 
 _gcc_cflags_save=$CFLAGS
-CFLAGS="$CFLAGS -maltivec -mvsx -mcrypto"
+CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto"
 
 if test "$gcry_cv_cc_ppc_altivec" = "no" &&
-    test "$mpi_cpu_arch" = "ppc" &&
-    test "$try_asm_modules" == "yes" ; then
+   test "$mpi_cpu_arch" = "ppc" &&
+   test "$try_asm_modules" = "yes" ; then
   AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
     [gcry_cv_cc_ppc_altivec_cflags],
     [gcry_cv_cc_ppc_altivec_cflags=no
@@ -2053,12 +2380,19 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" &&
       [[#include <altivec.h>
        typedef vector unsigned char block;
        typedef vector unsigned int vecu32;
+       static inline __attribute__((always_inline)) vecu32
+       vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx)
+       {
+         return vec_sld (a, b, (4 * idx) & 15);
+       }
        block fn(block in)
        {
          block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
          vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
+         y = vec_sld_u32 (y, y, 3);
          return vec_cipher_be (t, in) ^ (block)y;
-       }]])],
+       }
+       ]])],
       [gcry_cv_cc_ppc_altivec_cflags=yes])])
   if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
     AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
@@ -2134,6 +2468,52 @@ if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
 fi
 
 
+#
+# Check whether compiler supports GCC PowerPC target attributes
+#
+AC_CACHE_CHECK([whether compiler supports GCC PowerPC target attributes],
+       [gcry_cv_gcc_attribute_ppc_target],
+       [if test "$mpi_cpu_arch" != "ppc" ; then
+          gcry_cv_gcc_attribute_ppc_target="n/a"
+        else
+          gcry_cv_gcc_attribute_ppc_target=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void __attribute__((always_inline)) inline aifn(void) {}
+            void __attribute__((target("cpu=power8"))) testfn8(void) {aifn();}
+            void __attribute__((target("cpu=power9"))) testfn9(void)
+            { testfn8(); aifn(); }
+            ]], [ testfn9(); aifn(); ])],
+          [gcry_cv_gcc_attribute_ppc_target=yes])
+        fi])
+if test "$gcry_cv_gcc_attribute_ppc_target" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_ATTRIBUTE_PPC_TARGET,1,
+     [Defined if compiler supports GCC PowerPC target attributes])
+fi
+
+
+#
+# Check whether compiler supports clang PowerPC target attributes
+#
+AC_CACHE_CHECK([whether compiler supports clang PowerPC target attributes],
+       [gcry_cv_clang_attribute_ppc_target],
+       [if test "$mpi_cpu_arch" != "ppc" ; then
+          gcry_cv_clang_attribute_ppc_target="n/a"
+        else
+          gcry_cv_clang_attribute_ppc_target=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[void __attribute__((always_inline)) inline aifn(void) {}
+            void __attribute__((target("arch=pwr8"))) testfn8(void) {aifn();}
+            void __attribute__((target("arch=pwr9"))) testfn9(void)
+            { testfn8(); aifn(); }
+            ]], [ testfn9(); aifn(); ])],
+          [gcry_cv_clang_attribute_ppc_target=yes])
+        fi])
+if test "$gcry_cv_clang_attribute_ppc_target" = "yes" ; then
+   AC_DEFINE(HAVE_CLANG_ATTRIBUTE_PPC_TARGET,1,
+     [Defined if compiler supports clang PowerPC target attributes])
+fi
+
+
 #
 # Check whether GCC inline assembler supports zSeries instructions
 #
@@ -2238,7 +2618,7 @@ AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise)
 AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4)
 AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog)
 AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info)
-AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy)
+AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy sysctlbyname)
 
 GNUPG_CHECK_MLOCK
 
@@ -2466,6 +2846,16 @@ if test x"$avx2support" = xyes ; then
     avx2support="no (unsupported by compiler)"
   fi
 fi
+if test x"$avx512support" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_avx512" != "yes" ; then
+    avx512support="no (unsupported by compiler)"
+  fi
+fi
+if test x"$gfnisupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_gfni" != "yes" ; then
+    gfnisupport="no (unsupported by compiler)"
+  fi
+fi
 if test x"$neonsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
     if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
@@ -2476,7 +2866,21 @@ fi
 if test x"$armcryptosupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_aarch32_crypto" != "yes" ; then
     if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" != "yes" ; then
-      neonsupport="no (unsupported by compiler)"
+      armcryptosupport="no (unsupported by compiler)"
+    fi
+  fi
+fi
+if test x"$svesupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_sve" != "yes" ; then
+    if test "$gcry_cv_gcc_inline_asm_aarch64_sve" != "yes" ; then
+      svesupport="no (unsupported by compiler)"
+    fi
+  fi
+fi
+if test x"$sve2support" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_sve2" != "yes" ; then
+    if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" != "yes" ; then
+      sve2support="no (unsupported by compiler)"
     fi
   fi
 fi
@@ -2505,6 +2909,14 @@ if test x"$avx2support" = xyes ; then
   AC_DEFINE(ENABLE_AVX2_SUPPORT,1,
             [Enable support for Intel AVX2 instructions.])
 fi
+if test x"$avx512support" = xyes ; then
+  AC_DEFINE(ENABLE_AVX512_SUPPORT,1,
+            [Enable support for Intel AVX512 instructions.])
+fi
+if test x"$gfnisupport" = xyes ; then
+  AC_DEFINE(ENABLE_GFNI_SUPPORT,1,
+            [Enable support for Intel GFNI instructions.])
+fi
 if test x"$neonsupport" = xyes ; then
   AC_DEFINE(ENABLE_NEON_SUPPORT,1,
             [Enable support for ARM NEON instructions.])
@@ -2513,6 +2925,14 @@ if test x"$armcryptosupport" = xyes ; then
   AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1,
             [Enable support for ARMv8 Crypto Extension instructions.])
 fi
+if test x"$svesupport" = xyes ; then
+  AC_DEFINE(ENABLE_SVE_SUPPORT,1,
+            [Enable support for ARMv8 SVE instructions.])
+fi
+if test x"$sve2support" = xyes ; then
+  AC_DEFINE(ENABLE_SVE2_SUPPORT,1,
+            [Enable support for ARMv9 SVE2 instructions.])
+fi
 if test x"$ppccryptosupport" = xyes ; then
   AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1,
             [Enable support for POWER 8 (PowerISA 2.07) crypto extension.])
@@ -2669,6 +3089,10 @@ if test "$found" = "1" ; then
 
          # Build with the Padlock implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-padlock.lo"
+
+         # Build with the VAES/AVX2 implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-i386.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vaes-avx2-i386.lo"
       ;;
    esac
 fi
@@ -2716,6 +3140,11 @@ if test "$found" = "1" ; then
       GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx2-amd64.lo"
    fi
 
+   if test x"$avx512support" = xyes ; then
+      # Build with the AVX512 implementation
+      GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-avx512-x86.lo"
+   fi
+
    if test x"$neonsupport" = xyes ; then
       # Build with the NEON implementation
       GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS serpent-armv7-neon.lo"
@@ -2747,6 +3176,12 @@ if test "$found" = "1" ; then
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo"
+      ;;
+      powerpc64le-*-*)
+         # Build with the POWER vector implementations
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-ppc8le.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-ppc9le.lo"
       ;;
    esac
 
@@ -2764,6 +3199,12 @@ if test "$found" = "1" ; then
 
         # Build with the VAES/AVX2 implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo"
+
+        # Build with the GFNI/AVX2 implementation
+        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo"
+
+        # Build with the GFNI/AVX512 implementation
+        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx512-amd64.lo"
       fi
    fi
 fi
@@ -2808,6 +3249,7 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
@@ -2816,6 +3258,11 @@ if test "$found" = "1" ; then
       powerpc64le-*-*)
          # Build with the ppc8 vector implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo"
+         # Build with the assembly implementation
+         if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+            test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+            GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-p10le-8x.lo"
+         fi
       ;;
       powerpc64-*-*)
          # Build with the ppc8 vector implementation
@@ -2847,6 +3294,33 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx512-amd64.lo"
+      ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv9-aarch64-sve-ce.lo"
+      ;;
+      powerpc64le-*-*)
+         # Build with the ppc64le vector implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-ppc.lo"
+      ;;
+   esac
+fi
+
+LIST_MEMBER(aria, $enabled_ciphers)
+if test "$found" = "1" ; then
+   GCRYPT_CIPHERS="$GCRYPT_CIPHERS aria.lo"
+   AC_DEFINE(USE_ARIA, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx2-amd64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-gfni-avx512-amd64.lo"
       ;;
    esac
 fi
@@ -3003,6 +3477,7 @@ if test "$found" = "1" ; then
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ssse3-amd64.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx-amd64.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx2-bmi2-amd64.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-avx512-amd64.lo"
       ;;
       i?86-*-*)
          # Build with the assembly implementation
@@ -3012,6 +3487,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv8-aarch64-ce.lo"
+      ;;
       powerpc64le-*-*)
          # Build with the crypto extension implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
@@ -3041,7 +3520,7 @@ if test "$found" = "1" ; then
    case "${host}" in
       x86_64-*-*)
          # Build with the assembly implementation
-         :
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-amd64-avx512.lo"
       ;;
    esac
 
@@ -3079,7 +3558,9 @@ if test "$found" = "1" ; then
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx512.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx512.lo"
       ;;
    esac
 fi
@@ -3097,6 +3578,7 @@ if test "$found" = "1" ; then
      aarch64-*-*)
         # Build with the assembly implementation
         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo"
+        GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo"
      ;;
    esac
 fi
@@ -3154,6 +3636,16 @@ case "${host}" in
   s390x-*-*)
     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo"
   ;;
+  x86_64-*-*)
+    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo"
+  ;;
+  powerpc64le-*-*)
+    # Build with the assembly implementation
+    if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+       test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-p10le.lo"
+    fi
+  ;;
 esac
 
 LIST_MEMBER(scrypt, $enabled_kdfs)
@@ -3332,6 +3824,7 @@ src/libgcrypt.pc
 src/versioninfo.rc
 tests/Makefile
 ])
+AC_CONFIG_FILES([tests/hashtest-6g], [chmod +x tests/hashtest-6g])
 AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g])
 AC_CONFIG_FILES([tests/basic-disable-all-hwf], [chmod +x tests/basic-disable-all-hwf])
 AC_OUTPUT
@@ -3362,8 +3855,12 @@ GCRY_MSG_SHOW([Try using Intel SSE4.1:   ],[$sse41support])
 GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
 GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
 GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
+GCRY_MSG_SHOW([Try using Intel AVX512:   ],[$avx512support])
+GCRY_MSG_SHOW([Try using Intel GFNI:     ],[$gfnisupport])
 GCRY_MSG_SHOW([Try using ARM NEON:       ],[$neonsupport])
 GCRY_MSG_SHOW([Try using ARMv8 crypto:   ],[$armcryptosupport])
+GCRY_MSG_SHOW([Try using ARMv8 SVE:      ],[$svesupport])
+GCRY_MSG_SHOW([Try using ARMv9 SVE2:     ],[$sve2support])
 GCRY_MSG_SHOW([Try using PPC crypto:     ],[$ppccryptosupport])
 GCRY_MSG_SHOW([],[])
 
@@ -3384,11 +3881,3 @@ cat <<G10EOF
 
 G10EOF
 fi
-
-if test -n "$gpl"; then
-  echo "Please note that you are building a version of Libgcrypt with"
-  echo "  $gpl"
-  echo "included.  These parts are licensed under the GPL and thus the"
-  echo "use of this library has to comply with the conditions of the GPL."
-  echo ""
-fi
index 706afdc9ffb8a6356ea17738a378d03d09f6c0c7..2501e5da0b3e4dadf97700c166421f3e13492b9e 100644 (file)
@@ -14,8 +14,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 
 EXTRA_DIST = README.apichanges HACKING DCO \
             libgcrypt-modules.png fips-fsm.png \
@@ -23,7 +23,7 @@ EXTRA_DIST = README.apichanges HACKING DCO \
             yat2m.c
 
 DISTCLEANFILES = gcrypt.cps yat2m-stamp.tmp yat2m-stamp $(myman_pages)
-CLEANFILES = yat2m
+CLEANFILES = yat2m$(EXEEXT_FOR_BUILD)
 
 BUILT_SOURCES = libgcrypt-modules.png fips-fsm.png \
                 libgcrypt-modules.pdf fips-fsm.pdf
@@ -39,7 +39,7 @@ myman_pages   = hmac256.1
 
 man_MANS = $(myman_pages)
 
-yat2m: yat2m.c
+yat2m$(EXEEXT_FOR_BUILD): yat2m.c
        $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
            $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/yat2m.c
 
@@ -55,15 +55,15 @@ yat2m: yat2m.c
 .fig.pdf:
        fig2dev -L pdf `test -f '$<' || echo '$(srcdir)/'`$< $@
 
-yat2m-stamp: $(myman_sources)
+yat2m-stamp: $(myman_sources) $(srcdir)/version.texi
        @rm -f yat2m-stamp.tmp
        @touch yat2m-stamp.tmp
        for file in $(myman_sources) ; do \
-              ./yat2m $(YAT2M_OPTIONS) --store \
+              ./yat2m$(EXEEXT_FOR_BUILD) $(YAT2M_OPTIONS) --store \
                  `test -f '$$file' || echo '$(srcdir)/'`$$file ; done
        @mv -f yat2m-stamp.tmp $@
 
-yat2m-stamp: yat2m
+yat2m-stamp: yat2m$(EXEEXT_FOR_BUILD)
 
 $(myman_pages) : yat2m-stamp
        @if test -f $@; then :; else \
index 0f2251092729f480af9d28139160c10c852f37b3..33c687b948824c3e265b8dc570a3b24859fedf6a 100644 (file)
@@ -29,8 +29,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 VPATH = @srcdir@
 am__is_gnu_make = { \
   if test -z '$(MAKELEVEL)'; then \
@@ -111,8 +111,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/version.texi \
@@ -316,9 +316,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -392,7 +389,7 @@ EXTRA_DIST = README.apichanges HACKING DCO \
             yat2m.c
 
 DISTCLEANFILES = gcrypt.cps yat2m-stamp.tmp yat2m-stamp $(myman_pages)
-CLEANFILES = yat2m
+CLEANFILES = yat2m$(EXEEXT_FOR_BUILD)
 BUILT_SOURCES = libgcrypt-modules.png fips-fsm.png \
                 libgcrypt-modules.pdf fips-fsm.pdf
 
@@ -916,7 +913,7 @@ uninstall-man: uninstall-man1
 .PRECIOUS: Makefile
 
 
-yat2m: yat2m.c
+yat2m$(EXEEXT_FOR_BUILD): yat2m.c
        $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
            $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/yat2m.c
 
@@ -932,15 +929,15 @@ yat2m: yat2m.c
 .fig.pdf:
        fig2dev -L pdf `test -f '$<' || echo '$(srcdir)/'`$< $@
 
-yat2m-stamp: $(myman_sources)
+yat2m-stamp: $(myman_sources) $(srcdir)/version.texi
        @rm -f yat2m-stamp.tmp
        @touch yat2m-stamp.tmp
        for file in $(myman_sources) ; do \
-              ./yat2m $(YAT2M_OPTIONS) --store \
+              ./yat2m$(EXEEXT_FOR_BUILD) $(YAT2M_OPTIONS) --store \
                  `test -f '$$file' || echo '$(srcdir)/'`$$file ; done
        @mv -f yat2m-stamp.tmp $@
 
-yat2m-stamp: yat2m
+yat2m-stamp: yat2m$(EXEEXT_FOR_BUILD)
 
 $(myman_pages) : yat2m-stamp
        @if test -f $@; then :; else \
index bbb6f6f7c87a406d305134f69e7b0e45c40e5c4f..0bd60d1e591d5b2edee1ab76b144f6e9d855b8da 100644 (file)
Binary files a/doc/fips-fsm.pdf and b/doc/fips-fsm.pdf differ
index 7db3f3c029d8c014300dfb260f3c444ddd130eba..cdb58d8d90c85266c6db5d8dca8939e652ea2cb8 100644 (file)
@@ -1,8 +1,7 @@
 This is gcrypt.info, produced by makeinfo version 6.8 from gcrypt.texi.
 
-This manual is for Libgcrypt version 1.10.3 and was last updated 19
-October 2023.  Libgcrypt is GNU's library of cryptographic building
-blocks.
+This manual is for Libgcrypt version 1.11.0 and was last updated 16 May
+2024.  Libgcrypt is GNU's library of cryptographic building blocks.
 
 Copyright (C) 2000, 2002, 2003, 2004, 2006, 2007, 2008, 2009, 2011, 2012
 Free Software Foundation, Inc.
@@ -21,118 +20,118 @@ END-INFO-DIR-ENTRY
 
 \1f
 Indirect:
-gcrypt.info-1: 862
-gcrypt.info-2: 310055
+gcrypt.info-1: 858
+gcrypt.info-2: 313767
 \1f
 Tag Table:
 (Indirect)
-Node: Top\7f862
-Node: Introduction\7f3415
-Node: Getting Started\7f3787
-Node: Features\7f4667
-Node: Overview\7f5452
-Node: Preparation\7f6075
-Node: Header\7f7060
-Node: Building sources\7f8131
-Node: Building sources using Automake\7f10059
-Node: Initializing the library\7f11359
-Ref: sample-use-suspend-secmem\7f14751
-Ref: sample-use-resume-secmem\7f15594
-Node: Multi-Threading\7f16497
-Ref: Multi-Threading-Footnote-1\7f17676
-Node: Enabling FIPS mode\7f18085
-Ref: enabling fips mode\7f18268
-Node: Disabling FIPS mode\7f19537
-Ref: disabling fips mode\7f19724
-Node: Hardware features\7f20175
-Ref: hardware features\7f20343
-Ref: Hardware features-Footnote-1\7f21667
-Node: Generalities\7f21825
-Node: Controlling the library\7f22084
-Node: Error Handling\7f44629
-Node: Error Values\7f47168
-Node: Error Sources\7f52108
-Node: Error Codes\7f54376
-Node: Error Strings\7f57852
-Node: Handler Functions\7f59036
-Node: Progress handler\7f59595
-Node: Allocation handler\7f61744
-Node: Error handler\7f63290
-Node: Logging handler\7f64856
-Node: Symmetric cryptography\7f65448
-Node: Available ciphers\7f66188
-Node: Available cipher modes\7f69336
-Node: Working with cipher handles\7f75471
-Node: General cipher functions\7f88171
-Node: Public Key cryptography\7f91690
-Node: Available algorithms\7f92529
-Node: Used S-expressions\7f92829
-Node: RSA key parameters\7f93955
-Node: DSA key parameters\7f95233
-Node: ECC key parameters\7f95888
-Ref: ecc_keyparam\7f96039
-Node: Cryptographic Functions\7f100076
-Node: Dedicated ECC Functions\7f114440
-Node: General public-key related Functions\7f115587
-Node: Hashing\7f130620
-Node: Available hash algorithms\7f131353
-Node: Working with hash algorithms\7f137707
-Node: Message Authentication Codes\7f151857
-Node: Available MAC algorithms\7f152525
-Node: Working with MAC algorithms\7f159286
-Node: Key Derivation\7f165278
-Node: Random Numbers\7f167679
-Node: Quality of random numbers\7f167962
-Node: Retrieving random numbers\7f168648
-Node: S-expressions\7f170137
-Node: Data types for S-expressions\7f170780
-Node: Working with S-expressions\7f171106
-Node: MPI library\7f186239
-Node: Data types\7f187261
-Node: Basic functions\7f187570
-Node: MPI formats\7f190591
-Node: Calculations\7f194197
-Node: Comparisons\7f196581
-Node: Bit manipulations\7f197589
-Node: EC functions\7f198911
-Ref: gcry_mpi_ec_new\7f201865
-Node: Miscellaneous\7f207434
-Node: Prime numbers\7f211580
-Node: Generation\7f211850
-Node: Checking\7f213141
-Node: Utilities\7f213551
-Node: Memory allocation\7f213923
-Node: Context management\7f215288
-Ref: gcry_ctx_release\7f215727
-Node: Buffer description\7f215888
-Node: Config reporting\7f216676
-Node: Tools\7f217638
-Node: hmac256\7f217805
-Node: Configuration\7f218810
-Node: Architecture\7f222026
-Ref: fig:subsystems\7f223552
-Ref: Architecture-Footnote-1\7f224640
-Ref: Architecture-Footnote-2\7f224702
-Node: Public-Key Subsystem Architecture\7f224786
-Node: Symmetric Encryption Subsystem Architecture\7f227070
-Node: Hashing and MACing Subsystem Architecture\7f228667
-Node: Multi-Precision-Integer Subsystem Architecture\7f230741
-Node: Prime-Number-Generator Subsystem Architecture\7f232179
-Ref: Prime-Number-Generator Subsystem Architecture-Footnote-1\7f234106
-Node: Random-Number Subsystem Architecture\7f234396
-Node: CSPRNG Description\7f237587
-Ref: CSPRNG Description-Footnote-1\7f239142
-Node: DRBG Description\7f239265
-Node: Self-Tests\7f240946
-Node: FIPS Mode\7f252763
-Ref: fig:fips-fsm\7f256947
-Ref: tbl:fips-states\7f257051
-Ref: tbl:fips-state-transitions\7f258300
-Node: Library Copying\7f261911
-Node: Copying\7f290017
-Node: Figures and Tables\7f310055
-Node: Concept Index\7f310480
-Node: Function and Data Index\7f323105
+Node: Top\7f858
+Node: Introduction\7f3407
+Node: Getting Started\7f3779
+Node: Features\7f4659
+Node: Overview\7f5444
+Node: Preparation\7f6067
+Node: Header\7f7052
+Node: Building sources\7f8123
+Node: Building sources using Automake\7f10051
+Node: Initializing the library\7f11441
+Ref: sample-use-suspend-secmem\7f14833
+Ref: sample-use-resume-secmem\7f15677
+Node: Multi-Threading\7f16580
+Ref: Multi-Threading-Footnote-1\7f17759
+Node: Enabling FIPS mode\7f18168
+Ref: enabling fips mode\7f18351
+Node: Disabling FIPS mode\7f19620
+Ref: disabling fips mode\7f19807
+Node: Hardware features\7f20258
+Ref: hardware features\7f20426
+Ref: Hardware features-Footnote-1\7f21898
+Node: Generalities\7f22056
+Node: Controlling the library\7f22315
+Node: Error Handling\7f44580
+Node: Error Values\7f47119
+Node: Error Sources\7f52059
+Node: Error Codes\7f54327
+Node: Error Strings\7f57803
+Node: Handler Functions\7f58987
+Node: Progress handler\7f59546
+Node: Allocation handler\7f61695
+Node: Error handler\7f63241
+Node: Logging handler\7f64807
+Node: Symmetric cryptography\7f65875
+Node: Available ciphers\7f66616
+Node: Available cipher modes\7f70069
+Node: Working with cipher handles\7f76204
+Node: General cipher functions\7f90086
+Node: Public Key cryptography\7f93605
+Node: Available algorithms\7f94444
+Node: Used S-expressions\7f94744
+Node: RSA key parameters\7f95910
+Node: DSA key parameters\7f97188
+Node: ECC key parameters\7f97843
+Ref: ecc_keyparam\7f97994
+Node: Cryptographic Functions\7f102031
+Node: Dedicated ECC Functions\7f116394
+Node: General public-key related Functions\7f117541
+Node: Hashing\7f132574
+Node: Available hash algorithms\7f133307
+Node: Working with hash algorithms\7f140354
+Node: Message Authentication Codes\7f154868
+Node: Available MAC algorithms\7f155536
+Node: Working with MAC algorithms\7f162905
+Node: Key Derivation\7f168897
+Node: Random Numbers\7f171296
+Node: Quality of random numbers\7f171579
+Node: Retrieving random numbers\7f172265
+Node: S-expressions\7f173754
+Node: Data types for S-expressions\7f174396
+Node: Working with S-expressions\7f174722
+Node: MPI library\7f189855
+Node: Data types\7f190877
+Node: Basic functions\7f191186
+Node: MPI formats\7f194207
+Node: Calculations\7f197813
+Node: Comparisons\7f200197
+Node: Bit manipulations\7f201204
+Node: EC functions\7f202526
+Ref: gcry_mpi_ec_new\7f205480
+Node: Miscellaneous\7f211049
+Node: Prime numbers\7f215195
+Node: Generation\7f215465
+Node: Checking\7f216756
+Node: Utilities\7f217166
+Node: Memory allocation\7f217538
+Node: Context management\7f218904
+Ref: gcry_ctx_release\7f219343
+Node: Buffer description\7f219504
+Node: Config reporting\7f220292
+Node: Tools\7f221255
+Node: hmac256\7f221422
+Node: Configuration\7f222427
+Node: Architecture\7f225643
+Ref: fig:subsystems\7f227169
+Ref: Architecture-Footnote-1\7f228257
+Ref: Architecture-Footnote-2\7f228319
+Node: Public-Key Subsystem Architecture\7f228403
+Node: Symmetric Encryption Subsystem Architecture\7f230687
+Node: Hashing and MACing Subsystem Architecture\7f232284
+Node: Multi-Precision-Integer Subsystem Architecture\7f234358
+Node: Prime-Number-Generator Subsystem Architecture\7f235796
+Ref: Prime-Number-Generator Subsystem Architecture-Footnote-1\7f237723
+Node: Random-Number Subsystem Architecture\7f238013
+Node: CSPRNG Description\7f241204
+Ref: CSPRNG Description-Footnote-1\7f242759
+Node: DRBG Description\7f242882
+Node: Self-Tests\7f244563
+Node: FIPS Mode\7f256380
+Ref: fig:fips-fsm\7f260564
+Ref: tbl:fips-states\7f260668
+Ref: tbl:fips-state-transitions\7f261917
+Node: Library Copying\7f265528
+Node: Copying\7f293656
+Node: Figures and Tables\7f313767
+Node: Concept Index\7f314192
+Node: Function and Data Index\7f326912
 \1f
 End Tag Table
 
index a9c4f3a60c19160c0c0b005ce6557b829b3039dd..b5a73fdd348abfb5fdf4423307956a5d0c94294e 100644 (file)
@@ -1,8 +1,7 @@
 This is gcrypt.info, produced by makeinfo version 6.8 from gcrypt.texi.
 
-This manual is for Libgcrypt version 1.10.3 and was last updated 19
-October 2023.  Libgcrypt is GNU's library of cryptographic building
-blocks.
+This manual is for Libgcrypt version 1.11.0 and was last updated 16 May
+2024.  Libgcrypt is GNU's library of cryptographic building blocks.
 
 Copyright (C) 2000, 2002, 2003, 2004, 2006, 2007, 2008, 2009, 2011, 2012
 Free Software Foundation, Inc.
@@ -25,9 +24,8 @@ File: gcrypt.info,  Node: Top,  Next: Introduction,  Up: (dir)
 The Libgcrypt Library
 *********************
 
-This manual is for Libgcrypt version 1.10.3 and was last updated 19
-October 2023.  Libgcrypt is GNU's library of cryptographic building
-blocks.
+This manual is for Libgcrypt version 1.11.0 and was last updated 16 May
+2024.  Libgcrypt is GNU's library of cryptographic building blocks.
 
 Copyright (C) 2000, 2002, 2003, 2004, 2006, 2007, 2008, 2009, 2011, 2012
 Free Software Foundation, Inc.
@@ -248,12 +246,14 @@ File: gcrypt.info,  Node: Building sources using Automake,  Next: Initializing t
 2.3 Building sources using Automake
 ===================================
 
-It is much easier if you use GNU Automake instead of writing your own
-Makefiles.  If you do that, you do not have to worry about finding and
-invoking the 'pkg-config' script at all.
+You can simply use 'PKG_CHECK_MODULES' macro with 'pkg-config':
 
-   You can use 'PKG_CHECK_MODULES' macro, or, libgcrypt also provides an
-extension to Automake that does all the work for you.
+     PKG_CHECK_MODULES([LIBGCRYPT], [libgcrypt >= 1.11])
+
+   Alternatively, instead of using 'pkg-config', for building on an
+environment with no pkg-config, libgcrypt also provides an extension to
+Automake that does all the work for you.  Please note that it is
+required to have gpgrt-config from libgpg-error installed in this case.
 
  -- Macro: AM_PATH_LIBGCRYPT ([MINIMUM-VERSION], [ACTION-IF-FOUND],
           [ACTION-IF-NOT-FOUND])
@@ -346,7 +346,7 @@ of used and freed memory, you need to initialize Libgcrypt this way:
            exit (2);
          }
 
-       /* We don't want to see any warnings, e.g. because we have not yet
+       /* We don't want to see any warnings, e.g., because we have not yet
           parsed program options which might be used to suppress such
           warnings. */
        gcry_control (GCRYCTL_SUSPEND_SECMEM_WARN);
@@ -439,7 +439,7 @@ Libgcrypt into this mode:
    * If the file '/proc/sys/crypto/fips_enabled' exists and contains a
      numeric value other than '0', Libgcrypt is put into FIPS mode at
      initialization time.  Obviously this works only on systems with a
-     'proc' file system (i.e.  GNU/Linux).
+     'proc' file system (i.e., GNU/Linux).
 
    * If the file '/etc/gcrypt/fips_enabled' exists, Libgcrypt is put
      into FIPS mode at initialization time.  Note that this filename is
@@ -450,7 +450,7 @@ Libgcrypt into this mode:
 
    * If the application requests FIPS mode using the control command
      'GCRYCTL_FORCE_FIPS_MODE'.  This must be done prior to any
-     initialization (i.e.  before 'gcry_check_version').
+     initialization (i.e., before 'gcry_check_version').
 
 \1f
 File: gcrypt.info,  Node: Disabling FIPS mode,  Next: Hardware features,  Prev: Enabling FIPS mode,  Up: Preparation
@@ -465,7 +465,7 @@ provided to switch Libgcrypt into non-FIPS mode:
 
    * If the application requests non-FIPS mode using the control command
      'GCRYCTL_NO_FIPS_MODE'.  This must be done prior to any
-     initialization (i.e.  before 'gcry_check_version').
+     initialization (i.e., before 'gcry_check_version').
 
 \1f
 File: gcrypt.info,  Node: Hardware features,  Prev: Disabling FIPS mode,  Up: Preparation
@@ -496,11 +496,23 @@ are
 'intel-rdtsc'
 'intel-shaext'
 'intel-vaes-vpclmul'
+'intel-avx512'
+'intel-gfni'
 'arm-neon'
 'arm-aes'
 'arm-sha1'
 'arm-sha2'
 'arm-pmull'
+'arm-sha3'
+'arm-sm3'
+'arm-sm4'
+'arm-sha512'
+'arm-sve'
+'arm-sve2'
+'arm-sveaes'
+'arm-svepmull'
+'arm-svesha3'
+'arm-svesm4'
 'ppc-vcrypto'
 'ppc-arch_3_00'
 'ppc-arch_2_07'
@@ -556,12 +568,8 @@ File: gcrypt.info,  Node: Controlling the library,  Next: Error Handling,  Up: G
      have to be provided.
 
      'GCRYCTL_ENABLE_M_GUARD; Arguments: none'
-          This command enables the built-in memory guard.  It must not
-          be used to activate the memory guard after the memory
-          management has already been used; therefore it can ONLY be
-          used before 'gcry_check_version'.  Note that the memory guard
-          is NOT used when the user of the library has set his own
-          memory management callbacks.
+          This command was used to enable the built-in memory guard;
+          it is no longer supported.
 
      'GCRYCTL_ENABLE_QUICK_RANDOM; Arguments: none'
           This command inhibits the use the very secure random quality
@@ -627,7 +635,7 @@ File: gcrypt.info,  Node: Controlling the library,  Next: Error Handling,  Up: G
      'GCRYCTL_INIT_SECMEM; Arguments: unsigned int nbytes'
           This command is used to allocate a pool of secure memory and
           thus enabling the use of secure memory.  It also drops all
-          extra privileges the process has (i.e.  if it is run as setuid
+          extra privileges the process has (i.e., if it is run as setuid
           (root)).  If the argument NBYTES is 0, secure memory will be
           disabled.  The minimum amount of secure memory allocated is
           currently 16384 bytes; you may thus use a value of 1 to
@@ -873,7 +881,7 @@ File: gcrypt.info,  Node: Controlling the library,  Next: Error Handling,  Up: G
           Libgcrypt detects certain features of the CPU at startup time.
           For performance tests it is sometimes required not to use such
           a feature.  This option may be used to disable a certain
-          feature; i.e.  Libgcrypt behaves as if this feature has not
+          feature; i.e., Libgcrypt behaves as if this feature has not
           been detected.  This call can be used several times to disable
           a set of features, or features may be given as a colon or
           comma delimited string.  The special feature "all" can be used
@@ -881,7 +889,7 @@ File: gcrypt.info,  Node: Controlling the library,  Next: Error Handling,  Up: G
 
           Note that the detection code might be run if the feature has
           been disabled.  This command must be used at initialization
-          time; i.e.  before calling 'gcry_check_version'.
+          time; i.e., before calling 'gcry_check_version'.
 
      'GCRYCTL_REINIT_SYSCALL_CLAMP; Arguments: none'
 
@@ -1492,15 +1500,23 @@ File: gcrypt.info,  Node: Logging handler,  Prev: Error handler,  Up: Handler Fu
 4.4 Logging handler
 ===================
 
+Libgcrypt provides a way to install a different log handler to be used
+instead of the internal one.  Only a few programs make use of it and
+thus it has been deprecated.  If a log handler is not installed, since
+version 1.11 Libgcrypt uses the logging facility of GpgRT (aka
+Libgpg-error).  They are more flexible than Libgcrypt's old logging
+functions and given that GpgRT is anyway a dependency of Libgcrypt it is
+better to always use the GpgRT logging facilities.
+
  -- Data type: gcry_handler_log_t
      This type is defined as: 'void (*gcry_handler_log_t) (void *, int,
      const char *, va_list)'
 
  -- Function: void gcry_set_log_handler (gcry_handler_log_t FUNC_LOG,
           void *CB_DATA)
-     This function registers FUNC_LOG as 'logging handler', which means
-     that it will be called in case Libgcrypt wants to log a message.
-     This function may and should be used prior to calling
+     This deprecated function registers FUNC_LOG as 'logging handler',
+     which means that it will be called in case Libgcrypt wants to log a
+     message.  This function may and should be used prior to calling
      'gcry_check_version'.
 
 \1f
@@ -1509,7 +1525,7 @@ File: gcrypt.info,  Node: Symmetric cryptography,  Next: Public Key cryptography
 5 Symmetric cryptography
 ************************
 
-The cipher functions are used for symmetrical cryptography, i.e.
+The cipher functions are used for symmetrical cryptography, i.e.,
 cryptography using a shared key.  The programming model follows an
 open/process/close paradigm and is in that similar to other building
 blocks provided by Libgcrypt.
@@ -1625,6 +1641,14 @@ File: gcrypt.info,  Node: Available ciphers,  Next: Available cipher modes,  Up:
      A 128 bit cipher by the State Cryptography Administration of China
      (SCA). See <https://tools.ietf.org/html/draft-ribose-cfrg-sm4-10>.
 
+'GCRY_CIPHER_ARIA128'
+'GCRY_CIPHER_ARIA192'
+'GCRY_CIPHER_ARIA256'
+     ARIA is a general-purpose block cipher algorithm developed by
+     Korean cryptographers in 2003.  It was established as a Korean
+     standard block cipher algorithm in 2004.  See
+     <https://www.rfc-editor.org/rfc/rfc5794.html>.
+
 \1f
 File: gcrypt.info,  Node: Available cipher modes,  Next: Working with cipher handles,  Prev: Available ciphers,  Up: Symmetric cryptography
 
@@ -1643,7 +1667,7 @@ File: gcrypt.info,  Node: Available cipher modes,  Next: Working with cipher han
 'GCRY_CIPHER_MODE_CFB'
 'GCRY_CIPHER_MODE_CFB8'
      Cipher Feedback mode.  For 'GCRY_CIPHER_MODE_CFB' the shift size
-     equals the block size of the cipher (e.g.  for AES it is CFB-128).
+     equals the block size of the cipher (e.g., for AES it is CFB-128).
      For 'GCRY_CIPHER_MODE_CFB8' the shift size is 8 bits but that
      variant is not yet available.
 
@@ -1742,7 +1766,7 @@ File: gcrypt.info,  Node: Available cipher modes,  Next: Working with cipher han
 
 'GCRY_CIPHER_MODE_GCM_SIV'
      This mode implements is GCM-SIV Authenticated Encryption with
-     Associated Data (AEAD) block cipher mode specified in RFC-5297
+     Associated Data (AEAD) block cipher mode specified in RFC-8452
      (AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption).
      This implementations works with block ciphers with block size of
      128 bits and uses tag length of 128 bits.  Supported key lengths by
@@ -1861,7 +1885,7 @@ To set the IV or CTR, use these functions:
      Set the counter vector used for encryption or decryption.  The
      counter is passed as the buffer C of length L bytes and copied to
      internal data structures.  The function checks that the counter
-     matches the requirement of the selected algorithm (i.e., it must
     matches the requirement of the selected algorithm (i.e., it must
      have the same size as the block size).
 
  -- Function: gcry_error_t gcry_cipher_reset (gcry_cipher_hd_t H)
@@ -1907,6 +1931,31 @@ authenticated data, which can be done by using the following functions:
      enforced: For GCM TAGLEN must either be 16 or one of the allowed
      truncated lengths (4, 8, 12, 13, 14, or 15).
 
+   For encryption of AEAD cipher modes, it should be possible to
+generate an initialization vector internally within libgcrypt
+implementation, in coordinated way, instead of calling
+'gcry_cipher_setiv' with arbitrary value, so that it can ensure the
+security properties of AEAD block cipher.  For this purpose, the
+following two functions are provided:
+
+ -- Function: gcry_error_t gcry_cipher_setup_geniv (gcry_cipher_hd_t H,
+          int METHOD, const void *FIXED_IV, size_t FIXED_IVLEN, const
+          void *DYN_IV, size_t DYN_IVLEN)
+
+     Set up an initialization vector generation for AEAD cipher modes.
+     Generation is specified by METHOD, fixed part of initialization
+     vector by FIXED_IV and FIXED_IVLEN, and dynamic part of
+     initialization vector by DYN_IV and DYN_IVLEN.  For METHOD, valid
+     values are 'GCRY_CIPHER_GENIV_METHOD_CONCAT' and
+     'GCRY_CIPHER_GENIV_METHOD_XOR'.
+
+ -- Function: gcry_error_t gcry_cipher_geniv (gcry_cipher_hd_t H, void
+          *IV, size_t IVLEN)
+
+     Generate the initialization vector into the output buffer IV with
+     length IVLEN.  The initialization vector will be used by following
+     'gcry_cipher_encrypt' call.
+
    The actual encryption and decryption is done by using one of the
 following functions.  They may be used as often as required to process
 all the data.
@@ -2143,8 +2192,9 @@ File: gcrypt.info,  Node: Used S-expressions,  Next: Cryptographic Functions,  P
 
 Libgcrypt's API for asymmetric cryptography is based on data structures
 called S-expressions (see
-<http://people.csail.mit.edu/rivest/sexp.html>) and does not work with
-contexts/handles as most of the other building blocks of Libgcrypt do.
+<https://web.archive.org/web/20230305073119/http://people.csail.mit.edu/rivest/sexp.html>)
+and does not work with contexts/handles as most of the other building
+blocks of Libgcrypt do.
 
 The following information are stored in S-expressions:
 
@@ -2562,7 +2612,7 @@ data.  There are 2 functions to do this:
      data to be encrypted can either be in the simple old format, which
      is a very simple S-expression consisting only of one MPI, or it may
      be a more complex S-expression which also allows to specify flags
-     for operation, like e.g.  padding rules.
+     for operation, like e.g., padding rules.
 
      If you don't want to let Libgcrypt handle the padding, you must
      pass an appropriate MPI using this expression for DATA:
@@ -2638,7 +2688,7 @@ data.  There are 2 functions to do this:
      The function returns 0 on success or an error code.  The variable
      at the address of R_PLAIN will be set to 'NULL' on error or receive
      the decrypted value on success.  The format of R_PLAIN is a simple
-     S-expression part (i.e.  not a valid one) with just one MPI if
+     S-expression part (i.e., not a valid one) with just one MPI if
      there was no 'flags' element in DATA; if at least an empty 'flags'
      is passed in DATA, the format is:
 
@@ -2725,8 +2775,8 @@ similar to the encryption functions:
      is used with "elg" replacing "dsa"; for ECDSA signing, the same
      format is used with "ecdsa" replacing "dsa".
 
-     For the EdDSA algorithm (cf.  Ed25515) the required input
-     parameters are:
+     For the EdDSA algorithm (cf. Ed25519) the required input parameters
+     are:
 
           (data
             (flags eddsa)
@@ -2754,7 +2804,7 @@ signature.  Libgcrypt provides this function:
      key and that no signature is created but a signature, in a format
      as created by 'gcry_pk_sign', is passed to the function in SIG.
 
-     The result is 0 for success (i.e.  the data matches the signature),
+     The result is 0 for success (i.e., the data matches the signature),
      or an error code where the most relevant code is
      'GCRY_ERR_BAD_SIGNATURE' to indicate that the signature does not
      match the provided data.
@@ -3004,7 +3054,7 @@ Libgcrypt also provides a function to generate public key pairs:
           If this parameter is not used, Libgcrypt uses for historic
           reasons 65537.  Note that the value must fit into a 32 bit
           unsigned variable and that the usual C prefixes are considered
-          (e.g.  017 gives 15).
+          (e.g., 017 gives 15).
 
      'qbits N'
          This is only meaningful for DSA keys.  If it is given, the DSA
@@ -3162,7 +3212,7 @@ Libgcrypt also provides a function to generate public key pairs:
 
      As you can see, some of the information is duplicated, but this
      provides an easy way to extract either the public or the private
-     key.  Note that the order of the elements is not defined, e.g.  the
+     key.  Note that the order of the elements is not defined, e.g., the
      private key may be stored before the public key.  N1 N2 ... NN is a
      list of prime numbers used to composite P-MPI; this is in general
      not a very useful information and only available if the key
@@ -3326,6 +3376,21 @@ File: gcrypt.info,  Node: Available hash algorithms,  Next: Working with hash al
      with 256 bit security strength.  See FIPS 202 for the
      specification.
 
+'GCRY_MD_CSHAKE128'
+     This is the cSHAKE128 extendable-output function (XOF) algorithm
+     with 128 bit security strength defined in NIST SP 800-185.  cSHAKE
+     takes two optional additional inputs N and S, which can be set by
+     using 'gcry_md_ctl' with the control commands
+     'GCRYCTL_MD_CUSTOMISE', and the argument 'struct
+     gcry_cshake_customization'.  The length of N or S is limited to
+     255 bytes.
+
+'GCRY_MD_CSHAKE256'
+     This is the cSHAKE256 extendable-output function (XOF) algorithm
+     with 256 bit security strength defined in NIST SP 800-185.
+     Regarding the usage of the optional additional inputs N and S, see
+     the above description of cSHAKE128.
+
 'GCRY_MD_CRC32'
      This is the ISO 3309 and ITU-T V.42 cyclic redundancy check.  It
      yields an output of 4 bytes.  Note that this is not a hash
@@ -3563,7 +3628,7 @@ function:
  -- Function: gpg_err_code_t gcry_md_extract (gcry_md_hd_t H, int ALGO,
           void *BUFFER, size_t LENGTH)
 
-     'gcry_mac_read' returns output from extendable-output function.
+     'gcry_md_extract' returns output from extendable-output function.
      This function may be used as often as required to generate more
      output byte stream from the algorithm.  Function extracts the new
      output bytes to BUFFER of the length LENGTH.  Buffer will be fully
@@ -3573,7 +3638,7 @@ function:
      requested algorithm has not been enabled.
 
    Because it is often necessary to get the message digest of blocks of
-memory, two fast convenience function are available for this task:
+memory, three fast convenience functions are available for this task:
 
  -- Function: gpg_err_code_t gcry_md_hash_buffers ( int ALGO,
           unsigned int FLAGS, void *DIGEST, const gcry_buffer_t *IOV,
@@ -3601,6 +3666,14 @@ memory, two fast convenience function are available for this task:
      On success the function returns 0 and stores the resulting hash or
      MAC at DIGEST.
 
+ -- Function: gpg_err_code_t gcry_md_hash_buffers_ext ( int ALGO,
+          unsigned int FLAGS, void *DIGEST, void *DIGESTLEN,
+          const gcry_buffer_t *IOV, int IOVCNT )
+
+     'gcry_md_hash_buffers_ext' is a variant of gcry_md_hash_buffers, so
+     that it can be used with extendable-output function.  It has an
+     additional argument for DIGESTLEN.
+
  -- Function: void gcry_md_hash_buffer (int ALGO, void *DIGEST, const
           void *BUFFER, size_t LENGTH);
 
@@ -3678,7 +3751,7 @@ that information:
  -- Function: int gcry_md_is_secure (gcry_md_hd_t H)
 
      This function returns true when the digest object H is allocated in
-     "secure memory"; i.e.  H was created with the
+     "secure memory"; i.e., H was created with the
      'GCRY_MD_FLAG_SECURE'.
 
  -- Function: int gcry_md_is_enabled (gcry_md_hd_t H, int ALGO)
@@ -3885,6 +3958,10 @@ File: gcrypt.info,  Node: Available MAC algorithms,  Next: Working with MAC algo
      This is CMAC message authentication algorithm based on the SM4
      block cipher algorithm.
 
+'GCRY_MAC_CMAC_ARIA'
+     This is CMAC message authentication algorithm based on the ARIA
+     block cipher algorithm.
+
 'GCRY_MAC_GMAC_AES'
      This is GMAC (GCM mode based MAC) message authentication algorithm
      based on the AES block cipher algorithm.
@@ -3905,6 +3982,14 @@ File: gcrypt.info,  Node: Available MAC algorithms,  Next: Working with MAC algo
      This is GMAC message authentication algorithm based on the SEED
      block cipher algorithm.
 
+'GCRY_MAC_GMAC_SM4'
+     This is GMAC message authentication algorithm based on the SM4
+     block cipher algorithm.
+
+'GCRY_MAC_GMAC_ARIA'
+     This is GMAC message authentication algorithm based on the ARIA
+     block cipher algorithm.
+
 'GCRY_MAC_POLY1305'
      This is plain Poly1305 message authentication algorithm, used with
      one-time key.
@@ -3929,6 +4014,14 @@ File: gcrypt.info,  Node: Available MAC algorithms,  Next: Working with MAC algo
      This is Poly1305-SEED message authentication algorithm, used with
      key and one-time nonce.
 
+'GCRY_MAC_POLY1305_SM4'
+     This is Poly1305-SM4 message authentication algorithm, used with
+     key and one-time nonce.
+
+'GCRY_MAC_POLY1305_ARIA'
+     This is Poly1305-ARIA message authentication algorithm, used with
+     key and one-time nonce.
+
 'GCRY_MAC_GOST28147_IMIT'
      This is MAC construction defined in GOST 28147-89 (see RFC 5830
      Section 8).
@@ -4113,21 +4206,21 @@ strings.
      Currently supported KDFs (parameter ALGO):
 
      'GCRY_KDF_SIMPLE_S2K'
-          The OpenPGP simple S2K algorithm (cf.  RFC4880).  Its use is
+          The OpenPGP simple S2K algorithm (cf. RFC4880).  Its use is
           strongly deprecated.  SALT and ITERATIONS are not needed and
           may be passed as 'NULL'/'0'.
 
      'GCRY_KDF_SALTED_S2K'
-          The OpenPGP salted S2K algorithm (cf.  RFC4880).  Usually not
+          The OpenPGP salted S2K algorithm (cf. RFC4880).  Usually not
           used.  ITERATIONS is not needed and may be passed as '0'.
           SALTLEN must be given as 8.
 
      'GCRY_KDF_ITERSALTED_S2K'
-          The OpenPGP iterated+salted S2K algorithm (cf.  RFC4880).
-          This is the default for most OpenPGP applications.  SALTLEN
-          must be given as 8.  Note that OpenPGP defines a special
-          encoding of the ITERATIONS; however this function takes the
-          plain decoded iteration count.
+          The OpenPGP iterated+salted S2K algorithm (cf. RFC4880).  This
+          is the default for most OpenPGP applications.  SALTLEN must be
+          given as 8.  Note that OpenPGP defines a special encoding of
+          the ITERATIONS; however this function takes the plain decoded
+          iteration count.
 
      'GCRY_KDF_PBKDF2'
           The PKCS#5 Passphrase Based Key Derivation Function number 2.
@@ -4214,7 +4307,7 @@ File: gcrypt.info,  Node: S-expressions,  Next: MPI library,  Prev: Random Numbe
 
 S-expressions are used by the public key functions to pass complex data
 structures around.  These LISP like objects are used by some
-cryptographic protocols (cf.  RFC-2692) and Libgcrypt provides functions
+cryptographic protocols (cf. RFC-2692) and Libgcrypt provides functions
 to parse and construct them.  For detailed information, see 'Ron Rivest,
 code and description of S-expressions,
 <http://theory.lcs.mit.edu/~rivest/sexp.html>'.
@@ -4860,7 +4953,7 @@ The next 2 functions are used to compare MPIs:
 
      Compare the multi-precision-integers number U and V, returning 0
      for equality, a positive value for U > V and a negative for U < V.
-     If both numbers are opaque values (cf.  'gcry_mpi_set_opaque'), the
+     If both numbers are opaque values (cf. 'gcry_mpi_set_opaque'), the
      comparison is done by checking the bit sizes using memcmp.  If only
      one number is an opaque value, the opaque value is less than the
      other number.
@@ -4998,7 +5091,7 @@ elliptic curve methods for which no explicit support is available.
      CURVENAME is given, the context is initialized for this named
      curve.
 
-     If a parameter specifying a point (e.g.  'g' or 'q') is not found,
+     If a parameter specifying a point (e.g., 'g' or 'q') is not found,
      the parser looks for a non-encoded point by appending '.x', '.y',
      and '.z' to the parameter name and looking them all up to create a
      point.  A parameter with the suffix '.z' is optional and defaults
@@ -5119,7 +5212,7 @@ Two functions implement this kludge:
           unsigned int NBITS)
 
      Store NBITS of the value P points to in A and mark A as an opaque
-     value (i.e.  an value that can't be used for any math calculation
+     value (i.e., a value that can't be used for any math calculation
      and is only used to store an arbitrary bit pattern in A).
      Ownership of P is taken by this function and thus the user may not
      dereference the passed value anymore.  It is required that the
@@ -5290,7 +5383,7 @@ File: gcrypt.info,  Node: Memory allocation,  Next: Context management,  Up: Uti
 
  -- Function: void * gcry_calloc (size_t N, size_t M)
 
-     This function allocates a cleared block of memory (i.e.
+     This function allocates a cleared block of memory (i.e.,
      initialized with zero bytes) long enough to contain a vector of N
      elements, each of size M bytes.  On success it returns a pointer to
      the memory block; in an out-of-core condition, it returns 'NULL'.
@@ -5365,7 +5458,7 @@ be accomplished by using this function:
      This function returns a malloced string with colon delimited
      configure options.  With a value of 0 for MODE this string
      resembles the output of 'GCRYCTL_PRINT_CONFIG'.  However, if WHAT
-     is not 'NULL', only the line where the first field (e.g.
+     is not 'NULL', only the line where the first field (e.g.,
      "cpu-arch") matches WHAT is returned.
 
      Other values than 0 for MODE are not defined.  The caller shall
@@ -6536,7 +6629,7 @@ GNU Lesser General Public License
                       Version 2.1, February 1999
 
      Copyright (C) 1991, 1999 Free Software Foundation, Inc.
-     59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 
      Everyone is permitted to copy and distribute verbatim copies
      of this license document, but changing it is not allowed.
@@ -6546,7 +6639,7 @@ GNU Lesser General Public License
      version number 2.1.]
 
 Preamble
-========
+--------
 
 The licenses for most software are designed to take away your freedom to
 share and change it.  By contrast, the GNU General Public Licenses are
@@ -6647,8 +6740,8 @@ modification follow.  Pay close attention to the difference between a
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
 
-                   GNU LESSER GENERAL PUBLIC LICENSE
-    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+---------------------------------------------------------------
 
   0. This License Agreement applies to any software library or other
      program which contains a notice placed by the copyright holder or
@@ -7001,10 +7094,11 @@ be combined with the library in order to run.
      OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN
      ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 
-                      END OF TERMS AND CONDITIONS
+END OF TERMS AND CONDITIONS
+---------------------------
 
 How to Apply These Terms to Your New Libraries
-==============================================
+----------------------------------------------
 
 If you develop a new library, and you want it to be of the greatest
 possible use to the public, we recommend making it free software that
@@ -7033,7 +7127,7 @@ found.
 
      You should have received a copy of the GNU Lesser General Public
      License along with this library; if not, write to the Free Software
-     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
+     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
      USA.
 
    Also add information on how to contact you by electronic and paper
@@ -7060,7 +7154,7 @@ GNU General Public License
                          Version 2, June 1991
 
      Copyright (C) 1989, 1991 Free Software Foundation, Inc.
-     59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+     51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 
      Everyone is permitted to copy and distribute verbatim copies
      of this license document, but changing it is not allowed.
@@ -7074,8 +7168,8 @@ intended to guarantee your freedom to share and change free software--to
 make sure the software is free for all its users.  This General Public
 License applies to most of the Free Software Foundation's software and
 to any other program whose authors commit to using it.  (Some other Free
-Software Foundation software is covered by the GNU Library General
-Public License instead.)  You can apply it to your programs, too.
+Software Foundation software is covered by the GNU Lesser General Public
+License instead.)  You can apply it to your programs, too.
 
    When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
@@ -7115,9 +7209,10 @@ patent must be licensed for everyone's free use or not licensed at all.
    The precise terms and conditions for copying, distribution and
 modification follow.
 
-    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+===============================================================
 
-  1. This License applies to any program or other work which contains a
+  0. This License applies to any program or other work which contains a
      notice placed by the copyright holder saying it may be distributed
      under the terms of this General Public License.  The "Program",
      below, refers to any such program or work, and a "work based on the
@@ -7135,7 +7230,7 @@ modification follow.
      the Program (independent of having been made by running the
      Program).  Whether that is true depends on what the Program does.
 
-  2. You may copy and distribute verbatim copies of the Program's source
+  1. You may copy and distribute verbatim copies of the Program's source
      code as you receive it, in any medium, provided that you
      conspicuously and appropriately publish on each copy an appropriate
      copyright notice and disclaimer of warranty; keep intact all the
@@ -7147,7 +7242,7 @@ modification follow.
      and you may at your option offer warranty protection in exchange
      for a fee.
 
-  3. You may modify your copy or copies of the Program or any portion of
+  2. You may modify your copy or copies of the Program or any portion of
      it, thus forming a work based on the Program, and copy and
      distribute such modifications or work under the terms of Section 1
      above, provided that you also meet all of these conditions:
@@ -7193,7 +7288,7 @@ modification follow.
      volume of a storage or distribution medium does not bring the other
      work under the scope of this License.
 
-  4. You may copy and distribute the Program (or a work based on it,
+  3. You may copy and distribute the Program (or a work based on it,
      under Section 2) in object code or executable form under the terms
      of Sections 1 and 2 above provided that you also do one of the
      following:
@@ -7233,7 +7328,7 @@ modification follow.
      distribution of the source code, even though third parties are not
      compelled to copy the source along with the object code.
 
-  5. You may not copy, modify, sublicense, or distribute the Program
+  4. You may not copy, modify, sublicense, or distribute the Program
      except as expressly provided under this License.  Any attempt
      otherwise to copy, modify, sublicense or distribute the Program is
      void, and will automatically terminate your rights under this
@@ -7241,7 +7336,7 @@ modification follow.
      from you under this License will not have their licenses terminated
      so long as such parties remain in full compliance.
 
-  6. You are not required to accept this License, since you have not
+  5. You are not required to accept this License, since you have not
      signed it.  However, nothing else grants you permission to modify
      or distribute the Program or its derivative works.  These actions
      are prohibited by law if you do not accept this License.
@@ -7250,7 +7345,7 @@ modification follow.
      to do so, and all its terms and conditions for copying,
      distributing or modifying the Program or works based on it.
 
-  7. Each time you redistribute the Program (or any work based on the
+  6. Each time you redistribute the Program (or any work based on the
      Program), the recipient automatically receives a license from the
      original licensor to copy, distribute or modify the Program subject
      to these terms and conditions.  You may not impose any further
@@ -7258,7 +7353,7 @@ modification follow.
      herein.  You are not responsible for enforcing compliance by third
      parties to this License.
 
-  8. If, as a consequence of a court judgment or allegation of patent
+  7. If, as a consequence of a court judgment or allegation of patent
      infringement or for any other reason (not limited to patent
      issues), conditions are imposed on you (whether by court order,
      agreement or otherwise) that contradict the conditions of this
@@ -7291,7 +7386,7 @@ modification follow.
      This section is intended to make thoroughly clear what is believed
      to be a consequence of the rest of this License.
 
-  9. If the distribution and/or use of the Program is restricted in
+  8. If the distribution and/or use of the Program is restricted in
      certain countries either by patents or by copyrighted interfaces,
      the original copyright holder who places the Program under this
      License may add an explicit geographical distribution limitation
@@ -7300,7 +7395,7 @@ modification follow.
      License incorporates the limitation as if written in the body of
      this License.
 
-  10. The Free Software Foundation may publish revised and/or new
+  9. The Free Software Foundation may publish revised and/or new
      versions of the General Public License from time to time.  Such new
      versions will be similar in spirit to the present version, but may
      differ in detail to address new problems or concerns.
@@ -7313,7 +7408,7 @@ modification follow.
      specify a version number of this License, you may choose any
      version ever published by the Free Software Foundation.
 
-  11. If you wish to incorporate parts of the Program into other free
+  10. If you wish to incorporate parts of the Program into other free
      programs whose distribution conditions are different, write to the
      author to ask for permission.  For software which is copyrighted by
      the Free Software Foundation, write to the Free Software
@@ -7324,7 +7419,7 @@ modification follow.
 
                               NO WARRANTY
 
-  12. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
      WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
      LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
      AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
@@ -7335,7 +7430,7 @@ modification follow.
      DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR
      OR CORRECTION.
 
-  13. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
      WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY
      MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE
      LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
@@ -7348,8 +7443,8 @@ modification follow.
 
                       END OF TERMS AND CONDITIONS
 
-How to Apply These Terms to Your New Programs
-=============================================
+Appendix: How to Apply These Terms to Your New Programs
+=======================================================
 
 If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
@@ -7361,22 +7456,22 @@ to attach them to the start of each source file to most effectively
 convey the exclusion of warranty; and each file should have at least the
 "copyright" line and a pointer to where the full notice is found.
 
-     ONE LINE TO GIVE THE PROGRAM'S NAME AND AN IDEA OF WHAT IT DOES.
-     Copyright (C) 19YY  NAME OF AUTHOR
+     ONE LINE TO GIVE THE PROGRAM'S NAME AND A BRIEF IDEA OF WHAT IT DOES.
+     Copyright (C) YYYY  NAME OF AUTHOR
 
-     This program is free software; you can redistribute it and/or
-     modify it under the terms of the GNU General Public License
-     as published by the Free Software Foundation; either version 2
-     of the License, or (at your option) any later version.
+     This program is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published by
+     the Free Software Foundation; either version 2 of the License, or
+     (at your option) any later version.
 
      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.
 
-     You should have received a copy of the GNU General Public License along
-     with this program; if not, write to the Free Software Foundation, Inc.,
-     59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+     You should have received a copy of the GNU General Public License
+     along with this program; if not, write to the Free Software
+     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 
    Also add information on how to contact you by electronic and paper
 mail.
@@ -7384,11 +7479,10 @@ mail.
    If the program is interactive, make it output a short notice like
 this when it starts in an interactive mode:
 
-     Gnomovision version 69, Copyright (C) 19YY NAME OF AUTHOR
-     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details
-     type `show w'.  This is free software, and you are welcome
-     to redistribute it under certain conditions; type `show c'
-     for details.
+     Gnomovision version 69, Copyright (C) YEAR NAME OF AUTHOR
+     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+     This is free software, and you are welcome to redistribute it
+     under certain conditions; type `show c' for details.
 
    The hypothetical commands 'show w' and 'show c' should show the
 appropriate parts of the General Public License.  Of course, the
@@ -7400,10 +7494,8 @@ program.
 your school, if any, to sign a "copyright disclaimer" for the program,
 if necessary.  Here is a sample; alter the names:
 
-     Yoyodyne, Inc., hereby disclaims all copyright
-     interest in the program `Gnomovision'
-     (which makes passes at compilers) written
-     by James Hacker.
+     Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+     `Gnomovision' (which makes passes at compilers) written by James Hacker.
 
      SIGNATURE OF TY COON, 1 April 1989
      Ty Coon, President of Vice
@@ -7412,5 +7504,5 @@ if necessary.  Here is a sample; alter the names:
 program into proprietary programs.  If your program is a subroutine
 library, you may consider it more useful to permit linking proprietary
 applications with the library.  If this is what you want to do, use the
-GNU Library General Public License instead of this License.
+GNU Lesser General Public License instead of this License.
 
index 1f15827ec43fbe3efac3bbb32af0aedc17933d27..7b9f78aaf7fde561ef057f18b133955ed07745f7 100644 (file)
Binary files a/doc/gcrypt.info-2 and b/doc/gcrypt.info-2 differ
index 1b5892d47563f9ed5bc479c7c1d0b936fb817eba..5d4287383d3840fdd2287559ce2ca802be1b1c35 100644 (file)
@@ -297,12 +297,17 @@ gcc -o foo foo.c `pkg-config --cflags --libs libgcrypt`
 @node Building sources using Automake
 @section Building sources using Automake
 
-It is much easier if you use GNU Automake instead of writing your own
-Makefiles.  If you do that, you do not have to worry about finding and
-invoking the @command{pkg-config} script at all.
+You can simply use @code{PKG_CHECK_MODULES} macro with @command{pkg-config}:
 
-You can use @code{PKG_CHECK_MODULES} macro, or, libgcrypt also
-provides an extension to Automake that does all the work for you.
+@example
+PKG_CHECK_MODULES([LIBGCRYPT], [libgcrypt >= 1.11])
+@end example
+
+Alternatively, instead of using @command{pkg-config}, for building on
+an environment with no pkg-config, libgcrypt also provides an
+extension to Automake that does all the work for you.  Please note
+that it is required to have gpgrt-config from libgpg-error installed
+in this case.
 
 @c A simple macro for optional variables.
 @macro ovar{varname}
@@ -408,7 +413,7 @@ and freed memory, you need to initialize Libgcrypt this way:
     @}
 
 @anchor{sample-use-suspend-secmem}
-  /* We don't want to see any warnings, e.g. because we have not yet
+  /* We don't want to see any warnings, e.g., because we have not yet
      parsed program options which might be used to suppress such
      warnings. */
   gcry_control (GCRYCTL_SUSPEND_SECMEM_WARN);
@@ -512,7 +517,7 @@ Libgcrypt into this mode:
 If the file @file{/proc/sys/crypto/fips_enabled} exists and contains a
 numeric value other than @code{0}, Libgcrypt is put into FIPS mode at
 initialization time.  Obviously this works only on systems with a
-@code{proc} file system (i.e. GNU/Linux).
+@code{proc} file system (i.e., GNU/Linux).
 
 @item
 If the file @file{/etc/gcrypt/fips_enabled} exists, Libgcrypt is put
@@ -526,7 +531,7 @@ Libgcrypt is put into FIPS mode at initialization time.
 @item
 If the application requests FIPS mode using the control command
 @code{GCRYCTL_FORCE_FIPS_MODE}.  This must be done prior to any
-initialization (i.e. before @code{gcry_check_version}).
+initialization (i.e., before @code{gcry_check_version}).
 
 @end itemize
 
@@ -546,7 +551,7 @@ is provided to switch Libgcrypt into non-FIPS mode:
 @item
 If the application requests non-FIPS mode using the control command
 @code{GCRYCTL_NO_FIPS_MODE}.  This must be done prior to any
-initialization (i.e. before @code{gcry_check_version}).
+initialization (i.e., before @code{gcry_check_version}).
 @end itemize
 
 
@@ -580,11 +585,23 @@ are
 @item intel-rdtsc
 @item intel-shaext
 @item intel-vaes-vpclmul
+@item intel-avx512
+@item intel-gfni
 @item arm-neon
 @item arm-aes
 @item arm-sha1
 @item arm-sha2
 @item arm-pmull
+@item arm-sha3
+@item arm-sm3
+@item arm-sm4
+@item arm-sha512
+@item arm-sve
+@item arm-sve2
+@item arm-sveaes
+@item arm-svepmull
+@item arm-svesha3
+@item arm-svesm4
 @item ppc-vcrypto
 @item ppc-arch_3_00
 @item ppc-arch_2_07
@@ -642,12 +659,8 @@ arguments can or have to be provided.
 
 @table @code
 @item GCRYCTL_ENABLE_M_GUARD; Arguments: none
-This command enables the built-in memory guard.  It must not be used
-to activate the memory guard after the memory management has already
-been used; therefore it can ONLY be used before
-@code{gcry_check_version}.  Note that the memory guard is NOT used
-when the user of the library has set his own memory management
-callbacks.
+This command used to enable the built-in memory guard; it is no longer
+supported.
 
 @item GCRYCTL_ENABLE_QUICK_RANDOM; Arguments: none
 This command inhibits the use the very secure random quality level
@@ -711,7 +724,7 @@ privileges.
 @item GCRYCTL_INIT_SECMEM; Arguments: unsigned int nbytes
 This command is used to allocate a pool of secure memory and thus
 enabling the use of secure memory.  It also drops all extra privileges
-the process has (i.e. if it is run as setuid (root)).  If the argument
+the process has (i.e., if it is run as setuid (root)).  If the argument
 @var{nbytes} is 0, secure memory will be disabled.  The minimum amount
 of secure memory allocated is currently 16384 bytes; you may thus use a
 value of 1 to request that default size.
@@ -939,7 +952,7 @@ success or an error code on failure.
 
 Libgcrypt detects certain features of the CPU at startup time.  For
 performance tests it is sometimes required not to use such a feature.
-This option may be used to disable a certain feature; i.e. Libgcrypt
+This option may be used to disable a certain feature; i.e., Libgcrypt
 behaves as if this feature has not been detected.  This call can be
 used several times to disable a set of features, or features may be
 given as a colon or comma delimited string.  The special feature
@@ -947,7 +960,7 @@ given as a colon or comma delimited string.  The special feature
 
 Note that the detection code might be run if the feature has been
 disabled.  This command must be used at initialization time;
-i.e. before calling @code{gcry_check_version}.
+i.e., before calling @code{gcry_check_version}.
 
 @item GCRYCTL_REINIT_SYSCALL_CLAMP; Arguments: none
 
@@ -1576,17 +1589,26 @@ which means that it will be called in error conditions.
 @node Logging handler
 @section Logging handler
 
+Libgcrypt provides a way to install a different log handler to be used
+instead of the internal one.  Only a few programs make use of it and
+thus it has been deprecated.  If no log handler is installed, Libgcrypt
+since version 1.11 uses the logging facility of GpgRT (aka
+Libgpg-error).  The GpgRT facilities are more flexible than Libgcrypt's
+old logging functions and, given that GpgRT is anyway a dependency of
+Libgcrypt, it is better to always use them.
+
 @deftp {Data type} gcry_handler_log_t
 This type is defined as: @code{void (*gcry_handler_log_t) (void *, int, const char *, va_list)}
 @end deftp
 
 @deftypefun void gcry_set_log_handler (gcry_handler_log_t @var{func_log}, void *@var{cb_data})
-This function registers @var{func_log} as `logging handler', which means
+This deprecated function registers @var{func_log} as `logging handler', which means
 that it will be called in case Libgcrypt wants to log a message.  This
 function may and should be used prior to calling
 @code{gcry_check_version}.
 @end deftypefun
 
+
 @c **********************************************************
 @c *******************  Ciphers  ****************************
 @c **********************************************************
@@ -1595,7 +1617,7 @@ function may and should be used prior to calling
 @chapter Symmetric cryptography
 
 The cipher functions are used for symmetrical cryptography,
-i.e. cryptography using a shared key.  The programming model follows
+i.e., cryptography using a shared key.  The programming model follows
 an open/process/close paradigm and is in that similar to other
 building blocks provided by Libgcrypt.
 
@@ -1732,6 +1754,15 @@ A 128 bit cipher by the State Cryptography Administration
 of China (SCA).  See
 @uref{https://tools.ietf.org/html/draft-ribose-cfrg-sm4-10}.
 
+@item  GCRY_CIPHER_ARIA128
+@itemx GCRY_CIPHER_ARIA192
+@itemx GCRY_CIPHER_ARIA256
+@cindex ARIA (cipher)
+ARIA is a general-purpose block cipher algorithm developed by
+Korean cryptographers in 2003.  It was established as a Korean
+standard block cipher algorithm in 2004.  See
+@uref{https://www.rfc-editor.org/rfc/rfc5794.html}.
+
 @end table
 
 @node Available cipher modes
@@ -1751,7 +1782,7 @@ Electronic Codebook mode.
 @item GCRY_CIPHER_MODE_CFB8
 @cindex CFB, Cipher Feedback mode
 Cipher Feedback mode.  For @code{GCRY_CIPHER_MODE_CFB} the shift size equals
-the block size of the cipher (e.g. for AES it is CFB-128).  For
+the block size of the cipher (e.g., for AES it is CFB-128).  For
 @code{GCRY_CIPHER_MODE_CFB8} the shift size is 8 bits but that variant is not
 yet available.
 
@@ -1862,7 +1893,7 @@ needs to be given to SIV mode before decryption using
 @item  GCRY_CIPHER_MODE_GCM_SIV
 @cindex GCM-SIV, GCM-SIV mode, AES-GCM-SIV
 This mode implements is GCM-SIV Authenticated Encryption with
-Associated Data (AEAD) block cipher mode specified in RFC-5297
+Associated Data (AEAD) block cipher mode specified in RFC-8452
 (AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption).
 This implementations works with block ciphers with block size of
 128 bits and uses tag length of 128 bits.  Supported key lengths
@@ -1987,7 +2018,7 @@ needs to be called after setting the key.
 Set the counter vector used for encryption or decryption. The counter
 is passed as the buffer @var{c} of length @var{l} bytes and copied to
 internal data structures.  The function checks that the counter
-matches the requirement of the selected algorithm (i.e., it must have
+matches the requirement of the selected algorithm (i.e., it must have
 the same size as the block size).
 @end deftypefun
 
@@ -2042,6 +2073,33 @@ truncated lengths (4, 8, 12, 13, 14, or 15).
 
 @end deftypefun
 
+For encryption with AEAD cipher modes, the initialization vector can be
+generated internally within the libgcrypt implementation, in a
+coordinated way, instead of calling @code{gcry_cipher_setiv} with an
+arbitrary value.  This ensures the security properties of the AEAD
+block cipher mode.  For this purpose, the following two functions are provided:
+
+@deftypefun {gcry_error_t} gcry_cipher_setup_geniv (gcry_cipher_hd_t @var{h}, @
+            int @var{method}, const void *@var{fixed_iv}, size_t @var{fixed_ivlen}, @
+            const void *@var{dyn_iv}, size_t @var{dyn_ivlen})
+
+Set up an initialization vector generation for AEAD cipher modes.
+Generation is specified by @var{method}, fixed part of initialization
+vector by @var{fixed_iv} and @var{fixed_ivlen}, and dynamic part of
+initialization vector by @var{dyn_iv} and @var{dyn_ivlen}.
+For @var{method}, valid values are @code{GCRY_CIPHER_GENIV_METHOD_CONCAT}
+and @code{GCRY_CIPHER_GENIV_METHOD_XOR}.
+@end deftypefun
+
+@deftypefun {gcry_error_t} gcry_cipher_geniv (gcry_cipher_hd_t @var{h}, @
+            void *@var{iv}, size_t @var{ivlen})
+
+Generate the initialization vector into the output buffer @var{iv}
+with length @var{ivlen}.  The initialization vector will be used by
+following @code{gcry_cipher_encrypt} call.
+@end deftypefun
+
+
 The actual encryption and decryption is done by using one of the
 following functions.  They may be used as often as required to process
 all the data.
@@ -2307,7 +2365,7 @@ as DSA (Digital Signature Algorithm), Elgamal, ECDSA, ECDH, and EdDSA.
 
 Libgcrypt's API for asymmetric cryptography is based on data structures
 called S-expressions (see
-@uref{http://people.csail.mit.edu/@/rivest/@/sexp.html}) and does not work
+@uref{https://web.archive.org@/web/20230305073119@/http://people.csail.mit.edu@/rivest/sexp.html}) and does not work
 with contexts/handles as most of the other building blocks of Libgcrypt do.
 
 @noindent
@@ -2774,7 +2832,7 @@ expected as an appropriate S-expression (see above) in @var{pkey}.
 The data to be encrypted can either be in the simple old format, which
 is a very simple S-expression consisting only of one MPI, or it may be
 a more complex S-expression which also allows to specify flags for
-operation, like e.g. padding rules.
+operation, e.g., padding rules.
 
 @noindent
 If you don't want to let Libgcrypt handle the padding, you must pass an
@@ -2871,7 +2929,7 @@ type 2 padding, or @code{oaep} for RSA-OAEP padding.
 The function returns 0 on success or an error code.  The variable at the
 address of @var{r_plain} will be set to @code{NULL} on error or receive the
 decrypted value on success.  The format of @var{r_plain} is a
-simple S-expression part (i.e. not a valid one) with just one MPI if
+simple S-expression part (i.e., not a valid one) with just one MPI if
 there was no @code{flags} element in @var{data}; if at least an empty
 @code{flags} is passed in @var{data}, the format is:
 
@@ -2982,7 +3040,7 @@ is not as secure as the other algorithms), the same format is used
 with "elg" replacing "dsa"; for ECDSA signing, the same format is used
 with "ecdsa" replacing "dsa".
 
-For the EdDSA algorithm (cf. Ed25515) the required input parameters are:
+For the EdDSA algorithm (cf.@: Ed25519) the required input parameters are:
 
 @example
 (data
@@ -3018,7 +3076,7 @@ signature, in a format as created by @code{gcry_pk_sign}, is passed to
 the function in @var{sig}.
 
 @noindent
-The result is 0 for success (i.e. the data matches the signature), or an
+The result is 0 for success (i.e., the data matches the signature), or an
 error code where the most relevant code is @code{GCRY_ERR_BAD_SIGNATURE}
 to indicate that the signature does not match the provided data.
 
@@ -3305,7 +3363,7 @@ Use the given value.
 @noindent
 If this parameter is not used, Libgcrypt uses for historic reasons
 65537.  Note that the value must fit into a 32 bit unsigned variable
-and that the usual C prefixes are considered (e.g. 017 gives 15).
+and that the usual C prefixes are considered (e.g., 017 gives 15).
 
 
 @item qbits @var{n}
@@ -3484,7 +3542,7 @@ elliptic curve key generation:
 @noindent
 As you can see, some of the information is duplicated, but this
 provides an easy way to extract either the public or the private key.
-Note that the order of the elements is not defined, e.g. the private
+Note that the order of the elements is not defined, e.g., the private
 key may be stored before the public key. @var{n1 n2 ... nn} is a list
 of prime numbers used to composite @var{p-mpi}; this is in general not
 a very useful information and only available if the key generation
@@ -3562,7 +3620,7 @@ are also supported.
 @c begin table of hash algorithms
 @cindex SHA-1
 @cindex SHA-224, SHA-256, SHA-384, SHA-512, SHA-512/224, SHA-512/256
-@cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256
+@cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256, cSHAKE128, cSHAKE256
 @cindex RIPE-MD-160
 @cindex MD2, MD4, MD5
 @cindex TIGER, TIGER1, TIGER2
@@ -3672,6 +3730,19 @@ This is the SHAKE256 extendable-output function (XOF) algorithm with 256 bit
 security strength.
 See FIPS 202 for the specification.
 
+@item GCRY_MD_CSHAKE128
+This is the cSHAKE128 extendable-output function (XOF) algorithm with
+128 bit security strength defined in NIST SP 800-185.  cSHAKE takes
+two optional additional inputs N and S, which can be set by using
+@code{gcry_md_ctl} with the control command @code{GCRYCTL_MD_CUSTOMISE},
+and the argument @code{struct gcry_cshake_customization}.  The length
+of N or S is limited to 255 bytes.
+
+@item GCRY_MD_CSHAKE256
+This is the cSHAKE256 extendable-output function (XOF) algorithm with 256 bit
+security strength defined in NIST SP 800-185.  Regarding the usage of the
+optional additional inputs N and S, see the above description of cSHAKE128.
+
 @item GCRY_MD_CRC32
 This is the ISO 3309 and ITU-T V.42 cyclic redundancy check.  It yields
 an output of 4 bytes.  Note that this is not a hash algorithm in the
@@ -3927,7 +3998,7 @@ function:
 @deftypefun gpg_err_code_t gcry_md_extract (gcry_md_hd_t @var{h}, @
   int @var{algo}, void *@var{buffer}, size_t @var{length})
 
-@code{gcry_mac_read} returns output from extendable-output function.
+@code{gcry_md_extract} returns output from extendable-output function.
 This function may be used as often as required to generate more output
 byte stream from the algorithm.  Function extracts the new output bytes
 to @var{buffer} of the length @var{length}.  Buffer will be fully
@@ -3938,7 +4009,7 @@ been enabled.
 @end deftypefun
 
 Because it is often necessary to get the message digest of blocks of
-memory, two fast convenience function are available for this task:
+memory, three fast convenience functions are available for this task:
 
 @deftypefun gpg_err_code_t gcry_md_hash_buffers ( @
   @w{int @var{algo}}, @w{unsigned int @var{flags}}, @
@@ -3968,6 +4039,16 @@ On success the function returns 0 and stores the resulting hash or MAC
 at @var{digest}.
 @end deftypefun
 
+@deftypefun gpg_err_code_t gcry_md_hash_buffers_ext ( @
+  @w{int @var{algo}}, @w{unsigned int @var{flags}}, @
+  @w{void *@var{digest}}, @w{void *@var{digestlen}}, @
+  @w{const gcry_buffer_t *@var{iov}}, @w{int @var{iovcnt}} )
+
+@code{gcry_md_hash_buffers_ext} is a variant of @code{gcry_md_hash_buffers},
+so that it can be used with extendable-output functions.  It has an
+additional argument for @var{digestlen}.
+@end deftypefun
+
 @deftypefun void gcry_md_hash_buffer (int @var{algo}, void *@var{digest}, const void *@var{buffer}, size_t @var{length});
 
 @code{gcry_md_hash_buffer} is a shortcut function to calculate a message
@@ -4058,7 +4139,7 @@ The following macro might also be useful:
 @deftypefun int gcry_md_is_secure (gcry_md_hd_t @var{h})
 
 This function returns true when the digest object @var{h} is allocated
-in "secure memory"; i.e. @var{h} was created with the
+in "secure memory"; i.e., @var{h} was created with the
 @code{GCRY_MD_FLAG_SECURE}.
 @end deftypefun
 
@@ -4283,6 +4364,10 @@ block cipher algorithm.
 This is CMAC message authentication algorithm based on the SM4
 block cipher algorithm.
 
+@item GCRY_MAC_CMAC_ARIA
+This is CMAC message authentication algorithm based on the ARIA
+block cipher algorithm.
+
 @item GCRY_MAC_GMAC_AES
 This is GMAC (GCM mode based MAC) message authentication algorithm based on
 the AES block cipher algorithm.
@@ -4303,6 +4388,14 @@ block cipher algorithm.
 This is GMAC message authentication algorithm based on the SEED
 block cipher algorithm.
 
+@item GCRY_MAC_GMAC_SM4
+This is GMAC message authentication algorithm based on the SM4
+block cipher algorithm.
+
+@item GCRY_MAC_GMAC_ARIA
+This is GMAC message authentication algorithm based on the ARIA
+block cipher algorithm.
+
 @item GCRY_MAC_POLY1305
 This is plain Poly1305 message authentication algorithm, used with
 one-time key.
@@ -4327,6 +4420,14 @@ key and one-time nonce.
 This is Poly1305-SEED message authentication algorithm, used with
 key and one-time nonce.
 
+@item GCRY_MAC_POLY1305_SM4
+This is Poly1305-SM4 message authentication algorithm, used with
+key and one-time nonce.
+
+@item GCRY_MAC_POLY1305_ARIA
+This is Poly1305-ARIA message authentication algorithm, used with
+key and one-time nonce.
+
 @item GCRY_MAC_GOST28147_IMIT
 This is MAC construction defined in GOST 28147-89 (see RFC 5830 Section 8).
 
@@ -4544,17 +4645,17 @@ Currently supported KDFs (parameter @var{algo}):
 
 @table @code
 @item GCRY_KDF_SIMPLE_S2K
-The OpenPGP simple S2K algorithm (cf. RFC4880).  Its use is strongly
+The OpenPGP simple S2K algorithm (cf.@: RFC4880).  Its use is strongly
 deprecated.  @var{salt} and @var{iterations} are not needed and may be
 passed as @code{NULL}/@code{0}.
 
 @item GCRY_KDF_SALTED_S2K
-The OpenPGP salted S2K algorithm (cf. RFC4880).  Usually not used.
+The OpenPGP salted S2K algorithm (cf.@: RFC4880).  Usually not used.
 @var{iterations} is not needed and may be passed as @code{0}.  @var{saltlen}
 must be given as 8.
 
 @item GCRY_KDF_ITERSALTED_S2K
-The OpenPGP iterated+salted S2K algorithm (cf. RFC4880).  This is the
+The OpenPGP iterated+salted S2K algorithm (cf.@: RFC4880).  This is the
 default for most OpenPGP applications.  @var{saltlen} must be given as
 8.  Note that OpenPGP defines a special encoding of the
 @var{iterations}; however this function takes the plain decoded
@@ -4650,7 +4751,7 @@ and does not drain the precious entropy pool.
 
 S-expressions are used by the public key functions to pass complex data
 structures around.  These LISP like objects are used by some
-cryptographic protocols (cf. RFC-2692) and Libgcrypt provides functions
+cryptographic protocols (cf.@: RFC-2692) and Libgcrypt provides functions
 to parse and construct them.  For detailed information, see
 @cite{Ron Rivest, code and description of S-expressions,
 @uref{http://theory.lcs.mit.edu/~rivest/sexp.html}}.
@@ -5346,7 +5447,7 @@ The next 2 functions are used to compare MPIs:
 Compare the multi-precision-integers number @var{u} and @var{v},
 returning 0 for equality, a positive value for @var{u} > @var{v} and a
 negative for @var{u} < @var{v}.  If both numbers are opaque values
-(cf. @code{gcry_mpi_set_opaque}), the comparison is done by checking the bit
+(cf.@: @code{gcry_mpi_set_opaque}), the comparison is done by checking the bit
 sizes using memcmp.  If only one number is an opaque value, the opaque
 value is less than the other number.
 @end deftypefun
@@ -5501,7 +5602,7 @@ reference, the string @var{curvename} is used to fill in missing
 parameters.  If only @var{curvename} is given, the context is
 initialized for this named curve.
 
-If a parameter specifying a point (e.g. @code{g} or @code{q}) is not
+If a parameter specifying a point (e.g., @code{g} or @code{q}) is not
 found, the parser looks for a non-encoded point by appending
 @code{.x}, @code{.y}, and @code{.z} to the parameter name and looking
 them all up to create a point.  A parameter with the suffix @code{.z}
@@ -5647,7 +5748,7 @@ value.  Two functions implement this kludge:
 @deftypefun gcry_mpi_t gcry_mpi_set_opaque (@w{gcry_mpi_t @var{a}}, @w{void *@var{p}}, @w{unsigned int @var{nbits}})
 
 Store @var{nbits} of the value @var{p} points to in @var{a} and mark
-@var{a} as an opaque value (i.e. an value that can't be used for any
+@var{a} as an opaque value (i.e., a value that can't be used for any
 math calculation and is only used to store an arbitrary bit pattern in
 @var{a}).  Ownership of @var{p} is taken by this function and thus the
 user may not dereference the passed value anymore.  It is required
@@ -5821,7 +5922,7 @@ Like @code{gcry_malloc}, but uses secure memory.
 
 @deftypefun {void *} gcry_calloc (size_t @var{n}, size_t @var{m})
 
-This function allocates a cleared block of memory (i.e. initialized with
+This function allocates a cleared block of memory (i.e., initialized with
 zero bytes) long enough to contain a vector of @var{n} elements, each of
 size @var{m} bytes.  On success it returns a pointer to the memory
 block; in an out-of-core condition, it returns @code{NULL}.
@@ -5900,7 +6001,7 @@ program.  This can be accomplished by using this function:
 This function returns a malloced string with colon delimited configure
 options.  With a value of 0 for @var{mode} this string resembles the
 output of @code{GCRYCTL_PRINT_CONFIG}.  However, if @var{what} is not
-@code{NULL}, only the line where the first field (e.g. "cpu-arch") matches
+@code{NULL}, only the line where the first field (e.g., "cpu-arch") matches
 @var{what} is returned.
 
 Other values than 0 for @var{mode} are not defined.  The caller shall
@@ -7259,8 +7360,16 @@ memory and thus also the encryption contexts with these keys.
 @c **********************************************************
 @c *************  Appendices (license etc.)  ****************
 @c **********************************************************
+@node Library Copying
+@unnumbered GNU Lesser General Public License
+
+@cindex LGPL, GNU Lesser General Public License
 @include lgpl.texi
 
+@node Copying
+@unnumbered GNU General Public License
+
+@cindex GPL, GNU General Public License
 @include gpl.texi
 
 @node Figures and Tables
index 6eb301e2b2c677930aed56c698b01108bbc629e9..38aa91822beb770d5ec041d8f7a208c0560b3bc8 100644 (file)
@@ -1,12 +1,12 @@
-@node Copying
-@unnumbered GNU General Public License
-
-@cindex GPL, GNU General Public License
+@c The GNU General Public License.
 @center Version 2, June 1991
 
+@c This file is intended to be included within another document,
+@c hence no sectioning command or @node.
+
 @display
 Copyright @copyright{} 1989, 1991 Free Software Foundation, Inc.
-59 Temple Place -- Suite 330, Boston, MA 02111-1307, USA
+51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
@@ -21,7 +21,7 @@ software---to make sure the software is free for all its users.  This
 General Public License applies to most of the Free Software
 Foundation's software and to any other program whose authors commit to
 using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
+the GNU Lesser General Public License instead.)  You can apply it to
 your programs, too.
 
   When we speak of free software, we are referring to freedom, not
@@ -62,14 +62,9 @@ patent must be licensed for everyone's free use or not licensed at all.
   The precise terms and conditions for copying, distribution and
 modification follow.
 
-@iftex
 @heading TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-@end iftex
-@ifinfo
-@center TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-@end ifinfo
 
-@enumerate
+@enumerate 0
 @item
 This License applies to any program or other work which contains
 a notice placed by the copyright holder saying it may be distributed
@@ -287,8 +282,14 @@ make exceptions for this.  Our decision will be guided by the two goals
 of preserving the free status of all derivatives of our free software and
 of promoting the sharing and reuse of software generally.
 
+@iftex
+@heading NO WARRANTY
+@end iftex
+@ifinfo
 @center NO WARRANTY
 
+@end ifinfo
+
 @item
 BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
@@ -317,10 +318,11 @@ POSSIBILITY OF SUCH DAMAGES.
 @end iftex
 @ifinfo
 @center END OF TERMS AND CONDITIONS
+
 @end ifinfo
 
 @page
-@heading How to Apply These Terms to Your New Programs
+@heading Appendix: How to Apply These Terms to Your New Programs
 
   If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
@@ -332,22 +334,22 @@ convey the exclusion of warranty; and each file should have at least
 the ``copyright'' line and a pointer to where the full notice is found.
 
 @smallexample
-@var{one line to give the program's name and an idea of what it does.}
-Copyright (C) 19@var{yy}  @var{name of author}
+@var{one line to give the program's name and a brief idea of what it does.}
+Copyright (C) @var{yyyy}  @var{name of author}
 
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 @end smallexample
 
 Also add information on how to contact you by electronic and paper mail.
@@ -356,11 +358,10 @@ If the program is interactive, make it output a short notice like this
 when it starts in an interactive mode:
 
 @smallexample
-Gnomovision version 69, Copyright (C) 19@var{yy} @var{name of author}
-Gnomovision comes with ABSOLUTELY NO WARRANTY; for details
-type `show w'.  This is free software, and you are welcome
-to redistribute it under certain conditions; type `show c'
-for details.
+Gnomovision version 69, Copyright (C) @var{year} @var{name of author}
+Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+This is free software, and you are welcome to redistribute it
+under certain conditions; type `show c' for details.
 @end smallexample
 
 The hypothetical commands @samp{show w} and @samp{show c} should show
@@ -373,20 +374,16 @@ You should also get your employer (if you work as a programmer) or your
 school, if any, to sign a ``copyright disclaimer'' for the program, if
 necessary.  Here is a sample; alter the names:
 
-@smallexample
-@group
-Yoyodyne, Inc., hereby disclaims all copyright
-interest in the program `Gnomovision'
-(which makes passes at compilers) written
-by James Hacker.
+@example
+Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+`Gnomovision' (which makes passes at compilers) written by James Hacker.
 
 @var{signature of Ty Coon}, 1 April 1989
 Ty Coon, President of Vice
-@end group
-@end smallexample
+@end example
 
 This General Public License does not permit incorporating your program into
 proprietary programs.  If your program is a subroutine library, you may
 consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
+library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.
index bbd18a006f3d4c781afc321b11587a24d4471e57..ab03d6cc37d67ebbe718452fe3b5124b77e52ba5 100644 (file)
@@ -1,12 +1,12 @@
-@node Library Copying
-@unnumbered GNU Lesser General Public License
-
-@cindex LGPL, GNU Lesser General Public License
+@c The GNU Lesser General Public License.
 @center Version 2.1, February 1999
 
+@c This file is intended to be included within another document,
+@c hence no sectioning command or @node.
+
 @display
 Copyright @copyright{} 1991, 1999 Free Software Foundation, Inc.
-59 Temple Place -- Suite 330, Boston, MA 02111-1307, USA
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
@@ -16,7 +16,7 @@ as the successor of the GNU Library Public License, version 2, hence the
 version number 2.1.]
 @end display
 
-@heading Preamble
+@subheading Preamble
 
   The licenses for most software are designed to take away your
 freedom to share and change it.  By contrast, the GNU General Public
@@ -118,13 +118,7 @@ modification follow.  Pay close attention to the difference between a
 former contains code derived from the library, whereas the latter must
 be combined with the library in order to run.
 
-@iftex
-@heading TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-@end iftex
-@ifinfo
-@center GNU LESSER GENERAL PUBLIC LICENSE
-@center TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-@end ifinfo
+@subheading TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 
 @enumerate 0
 @item
@@ -476,7 +470,7 @@ decision will be guided by the two goals of preserving the free status
 of all derivatives of our free software and of promoting the sharing
 and reuse of software generally.
 
-@center NO WARRANTY
+@center @b{NO WARRANTY}
 
 @item
 BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
@@ -502,15 +496,10 @@ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGES.
 @end enumerate
 
-@iftex
-@heading END OF TERMS AND CONDITIONS
-@end iftex
-@ifinfo
-@center END OF TERMS AND CONDITIONS
-@end ifinfo
+@subheading END OF TERMS AND CONDITIONS
 
 @page
-@heading How to Apply These Terms to Your New Libraries
+@subheading How to Apply These Terms to Your New Libraries
 
   If you develop a new library, and you want it to be of the greatest
 possible use to the public, we recommend making it free software that
@@ -539,7 +528,7 @@ Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
 USA.
 @end smallexample
 
index d50aecb7063ac0c1a7715a74755f795705c05da5..a53e7d1851c1a78bab0939da157fba43647cab2e 100644 (file)
Binary files a/doc/libgcrypt-modules.pdf and b/doc/libgcrypt-modules.pdf differ
index d7efe79125856974e307662f59a84304957039d5..963330b32efdf7788d9573b0c192927de71cd56d 100644 (file)
@@ -1,4 +1,4 @@
-@set UPDATED 19 October 2023
-@set UPDATED-MONTH October 2023
-@set EDITION 1.10.3
-@set VERSION 1.10.3
+@set UPDATED 16 May 2024
+@set UPDATED-MONTH May 2024
+@set EDITION 1.11.0
+@set VERSION 1.11.0
index d7efe79125856974e307662f59a84304957039d5..963330b32efdf7788d9573b0c192927de71cd56d 100644 (file)
@@ -1,4 +1,4 @@
-@set UPDATED 19 October 2023
-@set UPDATED-MONTH October 2023
-@set EDITION 1.10.3
-@set VERSION 1.10.3
+@set UPDATED 16 May 2024
+@set UPDATED-MONTH May 2024
+@set EDITION 1.11.0
+@set VERSION 1.11.0
index 3c7b3635583ed86946b51feeb2bb6d7820db62ab..c2806e39f51ede5c84b12220aec7a0d22f40bd32 100644 (file)
@@ -49,7 +49,7 @@
      .B whatever you want
       @end ifset
 
-    alternativly a special comment may be used:
+    alternatively a special comment may be used:
 
       @c man:.B whatever you want
 
 
 #if MY_GCC_VERSION >= 20500
 # define ATTR_PRINTF(f, a) __attribute__ ((format(printf,f,a)))
-# define ATTR_NR_PRINTF(f, a) __attribute__ ((noreturn, format(printf,f,a)))
+# define ATTR_NR_PRINTF(f, a) __attribute__ ((__noreturn__, format(printf,f,a)))
 #else
 # define ATTR_PRINTF(f, a)
 # define ATTR_NR_PRINTF(f, a)
 /* Number of allowed condition nestings.  */
 #define MAX_CONDITION_NESTING  10
 
+static char const default_css[] =
+  "<style type=\"text/css\">\n"
+  "  .y2m {\n"
+  "    font-family: monospace;\n"
+  "  }\n"
+  "  .y2m u {\n"
+  "    text-decoration: underline;\n"
+  "  }\n"
+  "  .y2m-sc {\n"
+  "    font-variant: small-caps;\n"
+  "  }\n"
+  "  .y2m li {\n"
+  "    margin-top: 1em;\n"
+  "  }\n"
+  "  .y2m-item {\n"
+  "     display: block;\n"
+  "     font-weight: bold;\n"
+  "  }\n"
+  "  .y2m-args {\n"
+  "     font-weight: normal;\n"
+  "  }\n"
+  "</style>\n";
+
+
+
 /* Option flags. */
 static int verbose;
 static int quiet;
 static int debug;
+static int htmlmode;
 static const char *opt_source;
 static const char *opt_release;
 static const char *opt_date;
@@ -679,6 +705,25 @@ start_page (char *name)
 }
 
 
+/* Write a character to FP.  */
+static void
+writechr (int c, FILE *fp)
+{
+  putc (c, fp);
+}
+
+
+/* Write depending on HTMLMODE either ROFF or HTML to FP.  */
+static void
+writestr (const char *roff, const char *html, FILE *fp)
+{
+  const char *s = htmlmode? html : roff;
+
+  if (s)
+    fputs (s, fp);
+}
+
+
 /* Write the .TH entry of the current page.  Return -1 if there is a
    problem with the page. */
 static int
@@ -686,7 +731,66 @@ write_th (FILE *fp)
 {
   char *name, *p;
 
-  fputs (".\\\" Created from Texinfo source by yat2m " VERSION "\n", fp);
+  writestr (".\\\" Created from Texinfo source by yat2m " VERSION "\n",
+            "<!-- Created from Texinfo source by yat2m " VERSION " -->\n",
+            fp);
+
+  name = ascii_strupr (xstrdup (thepage.name));
+  p = strrchr (name, '.');
+  if (!p || !p[1])
+    {
+      err ("no section name in man page '%s'", thepage.name);
+      free (name);
+      return -1;
+    }
+  *p++ = 0;
+
+  if (htmlmode)
+    {
+      fputs ("<html>\n"
+             "<head>\n", fp);
+      fprintf (fp, " <title>%s(%s)</title>\n", name, p);
+      fputs (default_css, fp);
+      fputs ("</head>\n"
+             "<body>\n", fp);
+      fputs ("<div class=\"y2m\">\n", fp);
+    }
+
+  /* This roff source
+   *   .TH GPG 1 2016-12-20 "GnuPG 2.1.17" "GNU Privacy Guard 2.1"
+   * is rendered by man like this:
+   *   GPG(1)         GNU Privacy Guard 2.1      GPG(1)
+   *   [...]
+   *   GnuPG 2.1.17        2016-12-20            GPG(1)
+   */
+  if (htmlmode)
+    {
+      fprintf (fp, "<p class=\"y2m y2m-top\">"
+               "<span class=\"y2m-left\">%s(%s)</span> "
+               "<span class=\"y2m-center\">%s</span> "
+               "<span class=\"y2m-right\">%s(%s)</span>"
+               "</p>\n",
+               name, p, opt_source, name, p);
+      /* Note that the HTML footer is written by write_bottom().  */
+
+    }
+  else
+    fprintf (fp, ".TH %s %s %s \"%s\" \"%s\"\n",
+             name, p, isodatestring (), opt_release, opt_source);
+
+  free (name);
+  return 0;
+}
+
+
+/* In HTML mode we need to render a footer.  */
+static int
+write_bottom (FILE *fp)
+{
+  char *name, *p;
+
+  if (!htmlmode)
+    return 0;
 
   name = ascii_strupr (xstrdup (thepage.name));
   p = strrchr (name, '.');
@@ -697,15 +801,102 @@ write_th (FILE *fp)
       return -1;
     }
   *p++ = 0;
-  fprintf (fp, ".TH %s %s %s \"%s\" \"%s\"\n",
-           name, p, isodatestring (), opt_release, opt_source);
+
+  /* This roff source
+   *   .TH GPG 1 2016-12-20 "GnuPG 2.1.17" "GNU Privacy Guard 2.1"
+   * is rendered by man to this footer:
+   *   GnuPG 2.1.17        2016-12-20            GPG(1)
+   */
+  fprintf (fp, "<p class=\"y2m y2m-footer\">"
+           "<span class=\"y2m-left\">%s</span> "
+           "<span class=\"y2m-center\">%s</span> "
+           "<span class=\"y2m-right\">%s(%s)</span>"
+           "</p>\n",
+           opt_release, isodatestring (), name, p);
+  fputs ("</div><!-- class y2m -->\n", fp);
+  fputs ("</body>\n"
+         "</html>\n", fp);
+
   free (name);
   return 0;
 }
 
 
+/* Write the .SH header.  With NULL passed for NAME just close a
+ * section in html mode if there is an open section. */
+static void
+write_sh (FILE *fp, const char *name)
+{
+  static int in_section;
+
+  if (htmlmode && in_section)
+    fprintf (fp, "</div>\n");
+  in_section = 0;
+
+  if (name)
+    {
+      if (htmlmode)
+        fprintf (fp,
+                 "<div class=\"y2m-section\">\n"
+                 "<p class=\"y2m-sh\">%s</p>\n", name);
+      else
+        fprintf (fp, ".SH %s\n", name);
+      in_section = 1;
+    }
+}
+
+/* Render a @item line to HTML.  (LINE,LEN) gives the arguments of
+ * @item.  Use NULL for LINE to close a possible open <li>.  ITEMX
+ * flags a @itemx line.  */
+static void
+write_html_item (FILE *fp, const char *line, size_t len, int itemx)
+{
+  static int in_li;
+  const char *rest;
+  size_t n, n0;
+  int eol_action = 0;
+  int table_level = 0;
+
+  if (!itemx && in_li)
+    {
+      fprintf (fp, "</li>\n");
+      in_li = 0;
+    }
+
+  if (line)
+    {
+      /* Trim the LF and skip leading spaces. */
+      if (len && line[len-1] == '\n')
+        len--;
+      for (; len && (*line == ' ' || *line == '\t'); len--, line++)
+        ;
+      if (len)
+        {
+          rest = line;
+          for (n=0; n < len && !(*rest == ' ' || *rest == '\t'); n++, rest++)
+            ;
+          n0 = n;
+          for (; n < len && (*rest == ' ' || *rest == '\t'); n++, rest++)
+            ;
+          len -= n;
+          /* Now the first word is (LINE,N0) and the args are (REST,LEN) */
+          fprintf (fp, "%s<span class=\"y2m-item\">%.*s",
+                   itemx? "    ":"<li>", (int)n0, line);
+          if (len)
+            {
+              fputs (" <span class=\"y2m-args\">", fp);
+              proc_texi_buffer (fp, rest, len, &table_level, &eol_action);
+              fputs ("</span>", fp);
+            }
+          fputs ("</span>\n", fp);
+          in_li = 1;
+        }
+    }
+}
+
+
 /* Process the texinfo command COMMAND (without the leading @) and
-   write output if needed to FP. REST is the remainer of the line
+   write output if needed to FP. REST is the remainder of the line
    which should either point to an opening brace or to a white space.
    The function returns the number of characters already processed
    from REST.  LEN is the usable length of REST.  TABLE_LEVEL is used to
@@ -719,20 +910,23 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
     int what;            /* What to do with this command. */
     const char *lead_in; /* String to print with a opening brace.  */
     const char *lead_out;/* String to print with the closing brace. */
+    const char *html_in; /* Same as LEAD_IN but for HTML.  */
+    const char *html_out;/* Same as LEAD_OUT but for HTML.  */
   } cmdtbl[] = {
-    { "command", 0, "\\fB", "\\fR" },
-    { "code",    0, "\\fB", "\\fR" },
-    { "url",     0, "\\fB", "\\fR" },
-    { "sc",      0, "\\fB", "\\fR" },
-    { "var",     0, "\\fI", "\\fR" },
-    { "samp",    0, "\\(aq", "\\(aq"  },
+    { "command", 0, "\\fB", "\\fR", "<i>", "</i>" },
+    { "code",    0, "\\fB", "\\fR", "<samp>", "</samp>" },
+    { "url",     0, "\\fB", "\\fR", "<strong>", "</strong>" },
+    { "sc",      0, "\\fB", "\\fR", "<span class=\"y2m-sc\">", "</span>" },
+    { "var",     0, "\\fI", "\\fR", "<u>", "</u>" },
+    { "samp",    0, "\\(oq", "\\(cq"  },
+    { "kbd",     0, "\\(oq", "\\(cq"  },
     { "file",    0, "\\(oq\\fI","\\fR\\(cq" },
     { "env",     0, "\\(oq\\fI","\\fR\\(cq" },
     { "acronym", 0 },
     { "dfn",     0 },
-    { "option",  0, "\\fB", "\\fR"   },
-    { "example", 1, ".RS 2\n.nf\n" },
-    { "smallexample", 1, ".RS 2\n.nf\n" },
+    { "option",  0, "\\fB", "\\fR", "<samp>", "</samp>" },
+    { "example", 1, ".RS 2\n.nf\n",      NULL, "\n<pre>\n", "\n</pre>\n" },
+    { "smallexample", 1, ".RS 2\n.nf\n", NULL, "\n<pre>\n", "\n</pre>\n" },
     { "asis",    7 },
     { "anchor",  7 },
     { "cartouche", 1 },
@@ -741,7 +935,7 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
     { "pxref",   0, "see: [", "]" },
     { "uref",    0, "(\\fB", "\\fR)" },
     { "footnote",0, " ([", "])" },
-    { "emph",    0, "\\fI", "\\fR" },
+    { "emph",    0, "\\fI", "\\fR", "<em>", "</em>" },
     { "w",       1 },
     { "c",       5 },
     { "efindex", 1 },
@@ -755,7 +949,7 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
     { "chapheading", 0},
     { "item",    2, ".TP\n.B " },
     { "itemx",   2, ".TQ\n.B " },
-    { "table",   3 },
+    { "table",   3, NULL, NULL, "<ul>\n", "</ul>\n" },
     { "itemize",   3 },
     { "bullet",  0, "* " },
     { "*",       0, "\n.br"},
@@ -769,26 +963,36 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
   int i;
   const char *s;
   const char *lead_out = NULL;
+  const char *html_out = NULL;
   int ignore_args = 0;
 
   for (i=0; cmdtbl[i].name && strcmp (cmdtbl[i].name, command); i++)
     ;
   if (cmdtbl[i].name)
     {
-      s = cmdtbl[i].lead_in;
-      if (s)
-        fputs (s, fp);
+      writestr (cmdtbl[i].lead_in, cmdtbl[i].html_in, fp);
       lead_out = cmdtbl[i].lead_out;
+      html_out = cmdtbl[i].html_out;
       switch (cmdtbl[i].what)
         {
         case 1: /* Throw away the entire line.  */
           s = memchr (rest, '\n', len);
           return s? (s-rest)+1 : len;
         case 2: /* Handle @item.  */
+          if (htmlmode)
+            {
+              s = memchr (rest, '\n', len);
+              n = s? (s-rest)+1 : len;
+              write_html_item (fp, rest, n, !strcmp(cmdtbl[i].name, "itemx"));
+              return n;
+            }
           break;
         case 3: /* Handle table.  */
           if (++(*table_level) > 1)
-            fputs (".RS\n", fp);
+            {
+              write_html_item (fp, NULL, 0, 0);
+              writestr (".RS\n", "<ul>\n", fp);
+            }
           /* Now throw away the entire line. */
           s = memchr (rest, '\n', len);
           return s? (s-rest)+1 : len;
@@ -799,25 +1003,27 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
           if (n >= 5 && !memcmp (s, "table", 5)
               && (!n || s[5] == ' ' || s[5] == '\t' || s[5] == '\n'))
             {
+              if (htmlmode)
+                write_html_item (fp, NULL, 0, 0);
               if ((*table_level)-- > 1)
-                fputs (".RE\n", fp);
+                writestr (".RE\n", "</ul>\n", fp);
               else
-                fputs (".P\n", fp);
+                writestr (".P\n", "</ul>\n", fp);
             }
           else if (n >= 7 && !memcmp (s, "example", 7)
               && (!n || s[7] == ' ' || s[7] == '\t' || s[7] == '\n'))
             {
-              fputs (".fi\n.RE\n", fp);
+              writestr (".fi\n.RE\n", "</pre>\n", fp);
             }
           else if (n >= 12 && !memcmp (s, "smallexample", 12)
               && (!n || s[12] == ' ' || s[12] == '\t' || s[12] == '\n'))
             {
-              fputs (".fi\n.RE\n", fp);
+              writestr (".fi\n.RE\n", "</pre>\n", fp);
             }
           else if (n >= 9 && !memcmp (s, "quotation", 9)
               && (!n || s[9] == ' ' || s[9] == '\t' || s[9] == '\n'))
             {
-              fputs ("\\fR\n.RE\n", fp);
+              writestr ("\\fR\n.RE\n", "xx", fp);
             }
           /* Now throw away the entire line. */
           s = memchr (rest, '\n', len);
@@ -827,9 +1033,22 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
             ;
           if (n >= 4 && !memcmp (s, "man:", 4))
             {
-              for (s+=4, n-=4; n && *s != '\n'; n--, s++)
-                putc (*s, fp);
-              putc ('\n', fp);
+              s += 4;
+              n -= 4;
+              if (htmlmode)
+                {
+                  if (!strncmp (s, ".RE\n", 4)
+                      || !strncmp (s, ".RS\n", 4))
+                    ;
+                  else
+                    inf ("unknown special comment \"man:\"");
+                }
+              else
+                {
+                  for (; n && *s != '\n'; n--, s++)
+                    writechr (*s, fp);
+                  writechr ('\n', fp);
+                }
             }
           /* Now throw away the entire line. */
           s = memchr (rest, '\n', len);
@@ -870,7 +1089,7 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
                         break;
                     }
                   if (m)
-                    fputs (m->value, fp);
+                    writestr (m->value, m->value, fp);
                   else
                     inf ("texinfo variable '%.*s' is not set",
                          (int)rlen, rest+1);
@@ -918,8 +1137,7 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len,
   else
     n = 0;
 
-  if (lead_out)
-    fputs (lead_out, fp);
+  writestr (lead_out, html_out, fp);
 
   return n;
 }
@@ -946,16 +1164,16 @@ proc_texi_buffer (FILE *fp, const char *line, size_t len,
               switch (*s)
                 {
                 case '@': case '{': case '}':
-                  putc (*s, fp); in_cmd = 0;
+                  writechr (*s, fp); in_cmd = 0;
                   break;
                 case ':': /* Not ending a sentence flag.  */
                   in_cmd = 0;
                   break;
                 case '.': case '!': case '?': /* Ending a sentence. */
-                  putc (*s, fp); in_cmd = 0;
+                  writechr (*s, fp); in_cmd = 0;
                   break;
                 case ' ': case '\t': case '\n': /* Non collapsing spaces.  */
-                  putc (*s, fp); in_cmd = 0;
+                  writechr (*s, fp); in_cmd = 0;
                   break;
                 default:
                   cmdidx = 0;
@@ -988,17 +1206,17 @@ proc_texi_buffer (FILE *fp, const char *line, size_t len,
           switch (*eol_action)
             {
             case 1: /* Create a dummy paragraph. */
-              fputs ("\n\\ \n", fp);
+              writestr ("\n\\ \n", "\n<-- dummy par -->\n", fp);
               break;
             default:
-              putc (*s, fp);
+              writechr (*s, fp);
             }
           *eol_action = 0;
         }
       else if (*s == '\\')
-        fputs ("\\\\", fp);
+        writestr ("\\\\", "\\\\", fp);
       else
-        putc (*s, fp);
+        writechr (*s, fp);
     }
 
   if (in_cmd > 1)
@@ -1022,12 +1240,13 @@ parse_texi_line (FILE *fp, const char *line, int *table_level)
   /* A quick test whether there are any texinfo commands.  */
   if (!strchr (line, '@'))
     {
-      fputs (line, fp);
-      putc ('\n', fp);
+      /* FIXME: In html mode escape HTML stuff. */
+      writestr (line, line, fp);
+      writechr ('\n', fp);
       return;
     }
   proc_texi_buffer (fp, line, strlen (line), table_level, &eol_action);
-  putc ('\n', fp);
+  writechr ('\n', fp);
 }
 
 
@@ -1042,8 +1261,10 @@ write_content (FILE *fp, line_buffer_t lines)
     {
       if (line->verbatim)
         {
-          fputs (line->line, fp);
-          putc ('\n', fp);
+          /* FIXME: IN HTML mode we need to employ a parser for roff
+           * markup.  */
+          writestr (line->line, line->line, fp);
+          writechr ('\n', fp);
         }
       else
         {
@@ -1102,7 +1323,8 @@ finish_page (void)
     }
   else if (opt_store)
     {
-      inf ("writing '%s'", thepage.name );
+      if (verbose)
+        inf ("writing '%s'", thepage.name );
       fp = fopen ( thepage.name, "w" );
       if (!fp)
         die ("failed to create '%s': %s\n", thepage.name, strerror (errno));
@@ -1126,7 +1348,7 @@ finish_page (void)
 
       if (sect)
         {
-          fprintf (fp, ".SH %s\n", sect->name);
+          write_sh (fp, sect->name);
           write_content (fp, sect->lines);
           /* Now continue with all non standard sections directly
              following this one. */
@@ -1137,7 +1359,7 @@ finish_page (void)
                 break;
               if (sect->name)
                 {
-                  fprintf (fp, ".SH %s\n", sect->name);
+                  write_sh (fp, sect->name);
                   write_content (fp, sect->lines);
                 }
             }
@@ -1145,6 +1367,9 @@ finish_page (void)
         }
     }
 
+  write_sh (fp, NULL);
+  if (write_bottom (fp))
+    goto leave;
 
  leave:
   if (fp != stdout)
@@ -1515,6 +1740,7 @@ main (int argc, char **argv)
           puts (
                 "Usage: " PGM " [OPTION] [FILE]\n"
                 "Extract man pages from a Texinfo source.\n\n"
+                "  --html           render output as HTML\n"
                 "  --source NAME    use NAME as source field\n"
                 "  --release STRING use STRING as the release field\n"
                 "  --date EPOCH     use EPOCH as publication date\n"
@@ -1524,7 +1750,7 @@ main (int argc, char **argv)
                 "  --debug          enable additional debug output\n"
                 "  --help           display this help and exit\n"
                 "  -I DIR           also search in include DIR\n"
-                "  -D gpgone        the only usable define\n\n"
+                "  -D MACRO         define MACRO to 1\n\n"
                 "With no FILE, or when FILE is -, read standard input.\n\n"
                 "Report bugs to <https://bugs.gnupg.org>.");
           exit (0);
@@ -1538,6 +1764,11 @@ main (int argc, char **argv)
                 "under certain conditions. See the file COPYING for details.");
           exit (0);
         }
+      else if (!strcmp (*argv, "--html"))
+        {
+          htmlmode = 1;
+          argc--; argv++;
+        }
       else if (!strcmp (*argv, "--verbose"))
         {
           verbose = 1;
index c33f100971903c495e40b231657131df550f6c29..53800d399887481da50696c8267d7097a21e43e0 100644 (file)
@@ -1,2 +1,2 @@
-EXTRA_DIST = libtool.m4 socklen.m4 noexecstack.m4
+EXTRA_DIST = libtool.m4 noexecstack.m4
 EXTRA_DIST += gpg-error.m4
index b457ecda557a7109bdee266a4d0a0e7a2ff4bc17..9da7134654dba3ada0c8b718a705ddfb2c4c1814 100644 (file)
@@ -93,8 +93,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
@@ -223,9 +223,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -293,7 +290,7 @@ target_alias = @target_alias@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-EXTRA_DIST = libtool.m4 socklen.m4 noexecstack.m4 gpg-error.m4
+EXTRA_DIST = libtool.m4 noexecstack.m4 gpg-error.m4
 all: all-am
 
 .SUFFIXES:
index 7fa52b12780f4fe71b66acb9e06dc5166afd88e9..2d24071af16bcaab28a390421762078e9b8566ee 100644 (file)
@@ -1,5 +1,5 @@
 # gpg-error.m4 - autoconf macro to detect libgpg-error.
-# Copyright (C) 2002, 2003, 2004, 2011, 2014, 2018, 2020, 2021, 2022
+# Copyright (C) 2002, 2003, 2004, 2011, 2014, 2018, 2020, 2021, 2022, 2024
 #               g10 Code GmbH
 #
 # This file is free software; as a special exception the author gives
@@ -10,7 +10,7 @@
 # WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
 # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 #
-# Last-changed: 2023-04-01
+# Last-changed: 2024-06-13
 
 dnl
 dnl Find gpg-error-config, for backward compatibility
@@ -125,17 +125,16 @@ AC_DEFUN([_AM_PATH_GPGRT_CONFIG],[dnl
   fi
 
   if test -n "$gpgrt_libdir"; then
+    # Add the --libdir option to GPGRT_CONFIG
     GPGRT_CONFIG="$GPGRT_CONFIG --libdir=$gpgrt_libdir"
-    if $GPGRT_CONFIG gpg-error >/dev/null 2>&1; then
-      GPG_ERROR_CONFIG="$GPGRT_CONFIG gpg-error"
-      AC_MSG_NOTICE([Use gpgrt-config with $gpgrt_libdir as gpg-error-config])
-      gpg_error_config_version=`$GPG_ERROR_CONFIG --modversion`
-    else
-      gpg_error_config_version=`$GPG_ERROR_CONFIG --version`
+    # Make sure if gpgrt-config really works, by testing config gpg-error
+    if ! $GPGRT_CONFIG gpg-error --exists; then
+      # If it doesn't work, clear the GPGRT_CONFIG variable.
       unset GPGRT_CONFIG
     fi
-  elif test "$GPG_ERROR_CONFIG" != "no"; then
-    gpg_error_config_version=`$GPG_ERROR_CONFIG --version`
+  else
+    # GPGRT_CONFIG found but no suitable dir for --libdir found.
+    # This is a failure.  Clear the GPGRT_CONFIG variable.
     unset GPGRT_CONFIG
   fi
 ])
@@ -145,17 +144,27 @@ dnl                   [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND ]]])
 dnl
 dnl Test for libgpg-error and define GPG_ERROR_CFLAGS, GPG_ERROR_LIBS,
 dnl GPG_ERROR_MT_CFLAGS, and GPG_ERROR_MT_LIBS.  The _MT_ variants are
-dnl used for programs requireing real multi thread support.
+dnl used for programs requiring real multi thread support.
 dnl
 dnl If a prefix option is not used, the config script is first
 dnl searched in $SYSROOT/bin and then along $PATH.  If the used
 dnl config script does not match the host specification the script
 dnl is added to the gpg_config_script_warn variable.
 dnl
-AC_DEFUN([AM_PATH_GPG_ERROR],[dnl
-AC_REQUIRE([AC_CANONICAL_HOST])dnl
-AC_REQUIRE([_AM_PATH_POSSIBLE_GPG_ERROR_CONFIG])dnl
-AC_REQUIRE([_AM_PATH_GPGRT_CONFIG])dnl
+AC_DEFUN([AM_PATH_GPG_ERROR],
+[ AC_REQUIRE([AC_CANONICAL_HOST])dnl
+  AC_REQUIRE([_AM_PATH_POSSIBLE_GPG_ERROR_CONFIG])dnl
+  AC_REQUIRE([_AM_PATH_GPGRT_CONFIG])dnl
+  if test x"$GPGRT_CONFIG" != x -a "$GPGRT_CONFIG" != "no"; then
+    GPG_ERROR_CONFIG="$GPGRT_CONFIG gpg-error"
+    AC_MSG_NOTICE([Use gpgrt-config with $gpgrt_libdir as gpg-error-config])
+    gpg_error_config_version=`$GPG_ERROR_CONFIG --modversion`
+  elif test x"$GPG_ERROR_CONFIG" != x -a "$GPG_ERROR_CONFIG" != "no"; then
+    gpg_error_config_version=`$GPG_ERROR_CONFIG --version`
+  else
+    gpg_error_config_version="0.0"
+  fi
+
   min_gpg_error_version=ifelse([$1], ,1.33,$1)
   ok=no
   if test "$GPG_ERROR_CONFIG" != "no"; then
index 85df0439c3fcda4bd35f51c637d2483293ee4cce..f3e7539e96bc84a44974d4b25398509b74178f0d 100644 (file)
@@ -12,8 +12,8 @@ dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 dnl Lesser General Public License for more details.
 dnl
 dnl You should have received a copy of the GNU Lesser General Public
-dnl License along with this library; if not, write to the Free Software
-dnl Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+dnl License along with this library; if not, see <https://www.gnu.org/licenses/>.
+dnl SPDX-License-Identifier: LGPL-2.1-or-later
 
 dnl Checks whether the stack can be marked nonexecutable by passing an
 dnl option to the C-compiler when acting on .s files.  Returns that
diff --git a/m4/socklen.m4 b/m4/socklen.m4
deleted file mode 100644 (file)
index 251960b..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-# socklen.m4 serial 11
-dnl Copyright (C) 2005-2007, 2009-2020 Free Software Foundation, Inc.
-dnl This file is free software; the Free Software Foundation
-dnl gives unlimited permission to copy and/or distribute it,
-dnl with or without modifications, as long as this notice is preserved.
-
-dnl From Albert Chin, Windows fixes from Simon Josefsson.
-
-dnl Check for socklen_t: historically on BSD it is an int, and in
-dnl POSIX 1g it is a type of its own, but some platforms use different
-dnl types for the argument to getsockopt, getpeername, etc.:
-dnl HP-UX 10.20, IRIX 6.5, OSF/1 4.0, Interix 3.5, BeOS.
-dnl So we have to test to find something that will work.
-
-AC_DEFUN([gl_TYPE_SOCKLEN_T],
-  [AC_REQUIRE([gl_CHECK_SOCKET_HEADERS])dnl
-   AC_CHECK_TYPE([socklen_t], ,
-     [AC_CACHE_CHECK([for socklen_t equivalent],
-        [gl_cv_socklen_t_equiv],
-        [# Systems have either "struct sockaddr *" or
-         # "void *" as the second argument to getpeername
-         gl_cv_socklen_t_equiv=
-         for arg2 in "struct sockaddr" void; do
-           for t in int size_t "unsigned int" "long int" "unsigned long int"; do
-             AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
-                 [[#include <sys/types.h>
-                   #include <sys/socket.h>
-
-                   int getpeername (int, $arg2 *, $t *);]],
-                 [[$t len;
-                  getpeername (0, 0, &len);]])],
-               [gl_cv_socklen_t_equiv="$t"])
-             test "$gl_cv_socklen_t_equiv" != "" && break
-           done
-           test "$gl_cv_socklen_t_equiv" != "" && break
-         done
-         if test "$gl_cv_socklen_t_equiv" = ""; then
-           AC_MSG_ERROR([Cannot find a type to use in place of socklen_t])
-         fi
-        ])
-      AC_DEFINE_UNQUOTED([socklen_t], [$gl_cv_socklen_t_equiv],
-        [type to use in place of socklen_t if not defined])],
-     [gl_SOCKET_HEADERS])])
-
-dnl On mingw32, socklen_t is in ws2tcpip.h ('int'), so we try to find
-dnl it there too.  But on Cygwin, wc2tcpip.h must not be included.  Users
-dnl of this module should use the same include pattern as gl_SOCKET_HEADERS.
-dnl When you change this macro, keep also in sync:
-dnl   - gl_CHECK_SOCKET_HEADERS,
-dnl   - the Include section of modules/socklen.
-AC_DEFUN([gl_SOCKET_HEADERS],
-[
-/* <sys/types.h> is not needed according to POSIX, but the
-   <sys/socket.h> in i386-unknown-freebsd4.10 and
-   powerpc-apple-darwin5.5 required it. */
-#include <sys/types.h>
-#if HAVE_SYS_SOCKET_H
-# include <sys/socket.h>
-#elif HAVE_WS2TCPIP_H
-# include <ws2tcpip.h>
-#endif
-])
-
-dnl Tests for the existence of the header for socket facilities.
-dnl Defines the C macros HAVE_SYS_SOCKET_H, HAVE_WS2TCPIP_H.
-dnl This macro must match gl_SOCKET_HEADERS.
-AC_DEFUN([gl_CHECK_SOCKET_HEADERS],
-  [AC_CHECK_HEADERS_ONCE([sys/socket.h])
-   if test $ac_cv_header_sys_socket_h = no; then
-     dnl We cannot use AC_CHECK_HEADERS_ONCE here, because that would make
-     dnl the check for those headers unconditional; yet cygwin reports
-     dnl that the headers are present but cannot be compiled (since on
-     dnl cygwin, all socket information should come from sys/socket.h).
-     AC_CHECK_HEADERS([ws2tcpip.h])
-   fi
-  ])
index e1362c888fb727b2c014ffaf2f3f4021d65a6761..6fe63fceba9ccb73d800710838531253265130ea 100644 (file)
@@ -14,8 +14,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 
 # 1.5 leads to a combinatorial explosion due to all the conditionals
 # I was not able to build it with 64Megs - 1.6 fixes this.
@@ -177,4 +177,7 @@ libmpi_la_SOURCES = longlong.h         \
              mpiutil.c         \
               ec.c ec-internal.h ec-ed25519.c ec-nist.c ec-inline.h \
               ec-hw-s390x.c
-EXTRA_libmpi_la_SOURCES = asm-common-aarch64.h asm-common-amd64.h
+EXTRA_libmpi_la_SOURCES = \
+              asm-common-aarch64.h \
+              asm-common-amd64.h \
+              asm-common-i386.h
index 64ac5148e0a2f779521bc1e4fedb277f032449fb..9ff469d73c8dbb5f40ce4193023603013b8efa6c 100644 (file)
@@ -29,8 +29,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 
 # 1.5 leads to a combinatorial explosion due to all the conditionals
 # I was not able to build it with 64Megs - 1.6 fixes this.
@@ -116,8 +116,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
@@ -354,9 +354,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -518,7 +515,11 @@ libmpi_la_SOURCES = longlong.h        \
               ec.c ec-internal.h ec-ed25519.c ec-nist.c ec-inline.h \
               ec-hw-s390x.c
 
-EXTRA_libmpi_la_SOURCES = asm-common-aarch64.h asm-common-amd64.h
+EXTRA_libmpi_la_SOURCES = \
+              asm-common-aarch64.h \
+              asm-common-amd64.h \
+              asm-common-i386.h
+
 all: all-am
 
 .SUFFIXES:
index 24859b1790f4e95135fa9760100b6150a8db814b..b4121bf30d35714107df1fc368180a03b979c4b1 100644 (file)
@@ -36,6 +36,7 @@
 
 .globl C_SYMBOL_NAME(_gcry_mpih_add_n)
 ELF(.type  C_SYMBOL_NAME(_gcry_mpih_add_n),%function)
+.align 4
 C_SYMBOL_NAME(_gcry_mpih_add_n):
        CFI_STARTPROC()
        and     w5, w3, #3;
index f34c13c573b7b41a09a0b3a5548812a0cb83566c..f7db416b2d1b2a6be99eaf3932f6b9e3eaed500f 100644 (file)
@@ -36,6 +36,7 @@
 
 .globl C_SYMBOL_NAME(_gcry_mpih_mul_1)
 ELF(.type  C_SYMBOL_NAME(_gcry_mpih_mul_1),%function)
+.align 4
 C_SYMBOL_NAME(_gcry_mpih_mul_1):
        CFI_STARTPROC()
        and     w5, w2, #3;
index 1880999d427104fe8b336d2be6b1c3738abfe385..6199522dbdbb0ecc2dabe81f541e2e42d1624bc6 100644 (file)
@@ -36,6 +36,7 @@
 
 .globl C_SYMBOL_NAME(_gcry_mpih_addmul_1)
 ELF(.type  C_SYMBOL_NAME(_gcry_mpih_addmul_1),%function)
+.align 4
 C_SYMBOL_NAME(_gcry_mpih_addmul_1):
        CFI_STARTPROC()
        and     w5, w2, #3;
index e5faeddcb8e9758f70115085d4caeadb4ecd2cd9..fdaf5dfef70c3945ecc8dc43f9e29ee4a4dfe527 100644 (file)
@@ -36,6 +36,7 @@
 
 .globl C_SYMBOL_NAME(_gcry_mpih_submul_1)
 ELF(.type  C_SYMBOL_NAME(_gcry_mpih_submul_1),%function)
+.align 4
 C_SYMBOL_NAME(_gcry_mpih_submul_1):
        CFI_STARTPROC()
        and     w5, w2, #3;
index 4690828638f818707d5d1c640029385efd0e0c8a..fe060611e89d3f7471af55facc29009509c20fc1 100644 (file)
@@ -36,6 +36,7 @@
 
 .globl C_SYMBOL_NAME(_gcry_mpih_sub_n)
 ELF(.type  C_SYMBOL_NAME(_gcry_mpih_sub_n),%function)
+.align 4
 C_SYMBOL_NAME(_gcry_mpih_sub_n):
        CFI_STARTPROC()
        and     w5, w3, #3;
index 50dbb2b9d23d7d8047306a2eae10f45d5a830f57..463c5c070ef5e88a81fe6ed79b6f270d36d62533 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index ded4b15c001a65cdf2c3c82a9df92b22f7ebc629..75ae698cc3167bfbc8b9d3453610056eb1869734 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index cd91b104993327e512b4db9e2b67160c943afbd0..030a288d2cb27e6230199a10290bf61c2629fabe 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 5eb6b98be418ff65b9f329f852100473977c2c01..566642eee9311c9d7f804b4b32c1528e71f324c8 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 7d5d2afe4102339c6a0f43f4d99f769f2ff2256c..520d1c62278070bbc107b55f2ca57ccd311755c4 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index f0c98143885f272e8a8eda3f9cfe252357eb0e27..85e0af53820fe8756224aefda9200de5c7e8c270 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 9a644468cde41af9ea6e7c4df684671ee211fdec..6896b5746097f6da995196963821735370f77f17 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index dd0c52d7ded430b5923cf8637e87ce96fef753c1..eb1602a9589e935a30d1eb06ea1f701382b31180 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 39c00c524209f86ad61541b5fa51f85fe7dc339f..b4f7489c61a23589a8f932aa5e526afc88b1a4e7 100644 (file)
@@ -3,6 +3,7 @@
  *
  *      Copyright (C) 1992, 1994, 1995, 1998, 
  *                    2001, 2002, 2006 Free Software Foundation, Inc.
+ *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -17,8 +18,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
  *                mpi_ptr_t s2_ptr,            rdx
  *                mpi_size_t size)             rcx
  */
-
-.text
+       TEXT
+       ALIGN(4)
        .globl C_SYMBOL_NAME(_gcry_mpih_add_n)
 C_SYMBOL_NAME(_gcry_mpih_add_n:)
        FUNC_ENTRY()
-       leaq    (%rsi,%rcx,8), %rsi
-       leaq    (%rdi,%rcx,8), %rdi
-       leaq    (%rdx,%rcx,8), %rdx
-       negq    %rcx
-       xorl    %eax, %eax              /* clear cy */
+       movl    %ecx, %r9d
+       andl    $3, %r9d
+       je      .Lprehandle0
+       cmpl    $2, %r9d
+       jb      .Lprehandle1
+       je      .Lprehandle2
+
+#define FIRST_ADD() \
+       movq    (%rsi), %rax; \
+       addq    (%rdx), %rax; \
+       movq    %rax, (%rdi)
+
+#define NEXT_ADD(offset) \
+       movq    offset(%rsi), %rax; \
+       adcq    offset(%rdx), %rax; \
+       movq    %rax, offset(%rdi)
+
+.Lprehandle3:
+       leaq    -2(%rcx), %rcx
+       FIRST_ADD();
+       NEXT_ADD(8);
+       NEXT_ADD(16);
+       decq    %rcx
+       je      .Lend
+       leaq    24(%rsi), %rsi
+       leaq    24(%rdx), %rdx
+       leaq    24(%rdi), %rdi
+       jmp     .Loop
+
+       ALIGN(3)
+.Lprehandle2:
+       leaq    -1(%rcx), %rcx
+       FIRST_ADD();
+       NEXT_ADD(8);
+       decq    %rcx
+       je      .Lend
+       leaq    16(%rsi), %rsi
+       leaq    16(%rdx), %rdx
+       leaq    16(%rdi), %rdi
+       jmp     .Loop
+
+       ALIGN(3)
+.Lprehandle1:
+       FIRST_ADD();
+       decq    %rcx
+       je      .Lend
+       leaq    8(%rsi), %rsi
+       leaq    8(%rdx), %rdx
+       leaq    8(%rdi), %rdi
+       jmp     .Loop
+
+       ALIGN(3)
+.Lprehandle0:
+       clc                             /* clear cy */
 
        ALIGN(4)                        /* minimal alignment for claimed speed */
-.Loop: movq    (%rsi,%rcx,8), %rax
-       movq    (%rdx,%rcx,8), %r10
-       adcq    %r10, %rax
-       movq    %rax, (%rdi,%rcx,8)
-       incq    %rcx
+.Loop: leaq    -3(%rcx), %rcx
+       NEXT_ADD(0);
+       NEXT_ADD(8);
+       NEXT_ADD(16);
+       NEXT_ADD(24);
+       leaq    32(%rsi), %rsi
+       leaq    32(%rdx), %rdx
+       leaq    32(%rdi), %rdi
+       decq    %rcx
        jne     .Loop
 
-       movq    %rcx, %rax              /* zero %rax */
-       adcq    %rax, %rax
+       ALIGN(2)
+.Lend:
+       movl    $0, %eax                /* zero %rax */
+       adcl    %eax, %eax
        FUNC_EXIT()
index a9c7d7e1eacfe6fdbc36c4f71f69db5120fdb5b5..e62946ae5ece27b4e91f62397629a953aca4ec59 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -39,7 +39,8 @@
  *                unsigned cnt)        rcx
  */
 
-.text
+       TEXT
+       ALIGN(4)
        .globl C_SYMBOL_NAME(_gcry_mpih_lshift)
 C_SYMBOL_NAME(_gcry_mpih_lshift:)
        FUNC_ENTRY()
@@ -51,7 +52,7 @@ C_SYMBOL_NAME(_gcry_mpih_lshift:)
        movd    %eax, %xmm0
        movdqa  %xmm4, %xmm3
        psrlq   %xmm0, %xmm4
-       movd    %xmm4, %rax
+       movq    %xmm4, %rax
        subq    $2, %rdx
        jl      .Lendo
 
index dacb9d870e1dfacdaf89aff293ba18ac5492aabf..7bd9ff29253623587b31a3445f8328aae301ed73 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -40,8 +40,7 @@
 
 
        TEXT
-       ALIGN(5)
-       .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+       ALIGN(4)
 
        GLOBL   C_SYMBOL_NAME(_gcry_mpih_mul_1)
 C_SYMBOL_NAME(_gcry_mpih_mul_1:)
@@ -53,6 +52,7 @@ C_SYMBOL_NAME(_gcry_mpih_mul_1:)
        negq    %r11
        xorl    %r8d, %r8d
 
+       ALIGN(4)
 .Loop: movq    (%rsi,%r11,8), %rax
        mulq    %rcx
        addq    %r8, %rax
index 07913586d691ad2f000be4f006f676ea00f1ec05..abd39230190c3dfc4c3d7e748c30973a90f3cf42 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -39,6 +39,7 @@
  *                  mpi_limb_t s2_limb)     (rcx)
  */
        TEXT
+       ALIGN(4)
        GLOBL   C_SYMBOL_NAME(_gcry_mpih_addmul_1)
 C_SYMBOL_NAME(_gcry_mpih_addmul_1:)
        FUNC_ENTRY()
@@ -49,7 +50,7 @@ C_SYMBOL_NAME(_gcry_mpih_addmul_1:)
        xorl    %r8d, %r8d
        xorl    %r10d, %r10d
 
-       ALIGN(3)                        /* minimal alignment for claimed speed */
+       ALIGN(4)                        /* minimal alignment for claimed speed */
 .Loop: movq    (%rsi,%r11,8), %rax
        mulq    %rcx
        addq    (%rdi,%r11,8), %rax
index f8889eb2ad2b02d6cb699eba5490c8d05d7b4362..d0e85ba9e1adae20ecef9734c637a29601d05efe 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -40,6 +40,7 @@
  *                  mpi_limb_t s2_limb)     (rcx)
  */
        TEXT
+       ALIGN(4)
        GLOBL   C_SYMBOL_NAME(_gcry_mpih_submul_1)
 C_SYMBOL_NAME(_gcry_mpih_submul_1:)
        FUNC_ENTRY()
@@ -49,7 +50,7 @@ C_SYMBOL_NAME(_gcry_mpih_submul_1:)
        negq    %r11
        xorl    %r8d, %r8d
 
-       ALIGN(3)                        /* minimal alignment for claimed speed */
+       ALIGN(4)                        /* minimal alignment for claimed speed */
 .Loop: movq    (%rsi,%r11,8), %rax
        movq    (%rdi,%r11,8), %r10
        mulq    %rcx
index 8ecf155f5220eacc3b715c6dd7e4d775b76333c9..e4e0c09045e3c2e0e662c720fe28aa9a870a9350 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -39,7 +39,8 @@
  *                unsigned cnt)        rcx
  */
 
-.text
+       TEXT
+       ALIGN(4)
        .globl C_SYMBOL_NAME(_gcry_mpih_rshift)
 C_SYMBOL_NAME(_gcry_mpih_rshift:)
        FUNC_ENTRY()
@@ -51,7 +52,7 @@ C_SYMBOL_NAME(_gcry_mpih_rshift:)
        movd    %eax, %xmm0
        movdqa  %xmm4, %xmm3
        psllq   %xmm0, %xmm4
-       movd    %xmm4, %rax
+       movq    %xmm4, %rax
        leaq    (%rsi,%rdx,8), %rsi
        leaq    (%rdi,%rdx,8), %rdi
        negq    %rdx
index d60b58a5b78d95dd8751274f05e79ba35ff01dc8..c2cc635b5ed13d25d50771631600219520f98522 100644 (file)
@@ -3,6 +3,7 @@
  *
  *      Copyright (C) 1992, 1994, 1995, 1998, 
  *                    2001, 2002, 2006 Free Software Foundation, Inc.
+ *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -17,8 +18,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
  *                mpi_ptr_t s2_ptr,            rdx
  *                mpi_size_t size)             rcx
  */
-.text
+       TEXT
+       ALIGN(4)
        .globl C_SYMBOL_NAME(_gcry_mpih_sub_n)
 C_SYMBOL_NAME(_gcry_mpih_sub_n:)
        FUNC_ENTRY()
-       leaq    (%rsi,%rcx,8), %rsi
-       leaq    (%rdi,%rcx,8), %rdi
-       leaq    (%rdx,%rcx,8), %rdx
-       negq    %rcx
-       xorl    %eax, %eax              /* clear cy */
+       movl    %ecx, %r9d
+       andl    $3, %r9d
+       je      .Lprehandle0
+       cmpl    $2, %r9d
+       jb      .Lprehandle1
+       je      .Lprehandle2
+
+#define FIRST_SUB() \
+       movq    (%rsi), %rax; \
+       subq    (%rdx), %rax; \
+       movq    %rax, (%rdi)
+
+#define NEXT_SUB(offset) \
+       movq    offset(%rsi), %rax; \
+       sbbq    offset(%rdx), %rax; \
+       movq    %rax, offset(%rdi)
+
+.Lprehandle3:
+       leaq    -2(%rcx), %rcx
+       FIRST_SUB();
+       NEXT_SUB(8);
+       NEXT_SUB(16);
+       decq    %rcx
+       je      .Lend
+       leaq    24(%rsi), %rsi
+       leaq    24(%rdx), %rdx
+       leaq    24(%rdi), %rdi
+       jmp     .Loop
+
+       ALIGN(3)
+.Lprehandle2:
+       leaq    -1(%rcx), %rcx
+       FIRST_SUB();
+       NEXT_SUB(8);
+       decq    %rcx
+       je      .Lend
+       leaq    16(%rsi), %rsi
+       leaq    16(%rdx), %rdx
+       leaq    16(%rdi), %rdi
+       jmp     .Loop
+
+       ALIGN(3)
+.Lprehandle1:
+       FIRST_SUB();
+       decq    %rcx
+       je      .Lend
+       leaq    8(%rsi), %rsi
+       leaq    8(%rdx), %rdx
+       leaq    8(%rdi), %rdi
+       jmp     .Loop
+
+       ALIGN(3)
+.Lprehandle0:
+       clc                             /* clear cy */
 
        ALIGN(4)                        /* minimal alignment for claimed speed */
-.Loop: movq    (%rsi,%rcx,8), %rax
-       movq    (%rdx,%rcx,8), %r10
-       sbbq    %r10, %rax
-       movq    %rax, (%rdi,%rcx,8)
-       incq    %rcx
+.Loop: leaq    -3(%rcx), %rcx
+       NEXT_SUB(0);
+       NEXT_SUB(8);
+       NEXT_SUB(16);
+       NEXT_SUB(24);
+       leaq    32(%rsi), %rsi
+       leaq    32(%rdx), %rdx
+       leaq    32(%rdi), %rdi
+       decq    %rcx
        jne     .Loop
 
-       movq    %rcx, %rax              /* zero %rax */
-       adcq    %rax, %rax
+       ALIGN(2)
+.Lend:
+       movl    $0, %eax                /* zero %rax */
+       adcl    %eax, %eax
        FUNC_EXIT()
index 09e8b3b2bb17ca807816cefe5f2dc61721bcbd9f..d59d3f3d01541d70446fe8b4bdc9748c3bc3be47 100644 (file)
 
 /*******************
  *  mpi_limb_t
- *  _gcry_mpih_add_n( mpi_ptr_t res_ptr,       %r0
- *                mpi_ptr_t s1_ptr,            %r1
- *                mpi_ptr_t s2_ptr,            %r2
- *                mpi_size_t size)             %r3
+ *  _gcry_mpih_add_n( mpi_ptr_t res_ptr,       r0
+ *                mpi_ptr_t s1_ptr,            r1
+ *                mpi_ptr_t s2_ptr,            r2
+ *                mpi_size_t size)             r3
  */
 
 .text
 .globl _gcry_mpih_add_n
 .type  _gcry_mpih_add_n,%function
 _gcry_mpih_add_n:
-       push    {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr};
-       cmn     %r0, #0; /* clear carry flag */
+       push    {r4, r5, r6, r7, r8, r9, r10, lr};
+       cmn     r0, #0; /* clear carry flag */
 
-       tst     %r3, #3;
+       tst     r3, #3;
        beq     .Large_loop;
 
 .Loop:
-       ldr     %r4, [%r1], #4;
-       sub     %r3, #1;
-       ldr     %lr, [%r2], #4;
-       adcs    %r4, %lr;
-       tst     %r3, #3;
-       str     %r4, [%r0], #4;
+       ldr     r4, [r1], #4;
+       sub     r3, #1;
+       ldr     lr, [r2], #4;
+       adcs    r4, lr;
+       tst     r3, #3;
+       str     r4, [r0], #4;
        bne     .Loop;
 
-       teq     %r3, #0;
+       teq     r3, #0;
        beq     .Lend;
 
 .Large_loop:
-       ldm     %r1!, {%r4, %r6, %r8, %r10};
-       ldm     %r2!, {%r5, %r7, %r9, %lr};
-       sub     %r3, #4;
-       adcs    %r4, %r5;
-       adcs    %r6, %r7;
-       adcs    %r8, %r9;
-       adcs    %r10, %lr;
-       teq     %r3, #0;
-       stm     %r0!, {%r4, %r6, %r8, %r10};
+       ldm     r1!, {r4, r6, r8, r10};
+       ldm     r2!, {r5, r7, r9, lr};
+       sub     r3, #4;
+       adcs    r4, r5;
+       adcs    r6, r7;
+       adcs    r8, r9;
+       adcs    r10, lr;
+       teq     r3, #0;
+       stm     r0!, {r4, r6, r8, r10};
        bne     .Large_loop;
 
 .Lend:
-       adc     %r0, %r3, #0;
-       pop     {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc};
+       adc     r0, r3, #0;
+       pop     {r4, r5, r6, r7, r8, r9, r10, pc};
 .size _gcry_mpih_add_n,.-_gcry_mpih_add_n;
index c2e2854bf1fa27625cedfd3e2a9962b285c8ce31..ea196e8bfc37ca1c77a8345db423ba216e9e6784 100644 (file)
 
 /*******************
  * mpi_limb_t
- * _gcry_mpih_mul_1( mpi_ptr_t res_ptr,                %r0
- *               mpi_ptr_t s1_ptr,             %r1
- *               mpi_size_t s1_size,           %r2
- *               mpi_limb_t s2_limb)           %r3
+ * _gcry_mpih_mul_1( mpi_ptr_t res_ptr,                r0
+ *               mpi_ptr_t s1_ptr,             r1
+ *               mpi_size_t s1_size,           r2
+ *               mpi_limb_t s2_limb)           r3
  */
 
 .text
 .globl _gcry_mpih_mul_1
 .type  _gcry_mpih_mul_1,%function
 _gcry_mpih_mul_1:
-       push    {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %lr};
-       mov     %r4, #0;
+       push    {r4, r5, r6, r7, r8, r9, r10, r11, lr};
+       mov     r4, #0;
 
-       tst     %r2, #3;
+       tst     r2, #3;
        beq     .Large_loop;
 
 .Loop:
-       ldr     %r5, [%r1], #4;
-       mov     %lr, #0;
-       umlal   %r4, %lr, %r5, %r3;
-       sub     %r2, #1;
-       str     %r4, [%r0], #4;
-       tst     %r2, #3;
-       mov     %r4, %lr;
+       ldr     r5, [r1], #4;
+       mov     lr, #0;
+       umlal   r4, lr, r5, r3;
+       sub     r2, #1;
+       str     r4, [r0], #4;
+       tst     r2, #3;
+       mov     r4, lr;
        bne     .Loop;
 
-       teq     %r2, #0;
+       teq     r2, #0;
        beq     .Lend;
 
 .Large_loop:
-       ldm     %r1!, {%r5, %r6, %r7, %r8};
-       mov     %r9, #0;
-       mov     %r10, #0;
-       umlal   %r4, %r9, %r5, %r3;
-       mov     %r11, #0;
-       umlal   %r9, %r10, %r6, %r3;
-       str     %r4, [%r0], #4;
-       mov     %r4, #0;
-       umlal   %r10, %r11, %r7, %r3;
-       subs    %r2, #4;
-       umlal   %r11, %r4, %r8, %r3;
-       stm     %r0!, {%r9, %r10, %r11};
+       ldm     r1!, {r5, r6, r7, r8};
+       mov     r9, #0;
+       mov     r10, #0;
+       umlal   r4, r9, r5, r3;
+       mov     r11, #0;
+       umlal   r9, r10, r6, r3;
+       str     r4, [r0], #4;
+       mov     r4, #0;
+       umlal   r10, r11, r7, r3;
+       subs    r2, #4;
+       umlal   r11, r4, r8, r3;
+       stm     r0!, {r9, r10, r11};
        bne     .Large_loop;
 
 .Lend:
-       mov     %r0, %r4;
-       pop     {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %pc};
+       mov     r0, r4;
+       pop     {r4, r5, r6, r7, r8, r9, r10, r11, pc};
 .size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1;
index bce932e9b00f75a7b952f039a16c368fc4ab79cb..8793b20f504ce5b8d0cbe10a7622fc54d3f6d7ec 100644 (file)
 
 /*******************
  * mpi_limb_t
- * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr,     %r0
- *                  mpi_ptr_t s1_ptr,          %r1
- *                  mpi_size_t s1_size,        %r2
- *                  mpi_limb_t s2_limb)        %r3
+ * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr,     r0
+ *                  mpi_ptr_t s1_ptr,          r1
+ *                  mpi_size_t s1_size,        r2
+ *                  mpi_limb_t s2_limb)        r3
  */
 
 .text
 .globl _gcry_mpih_addmul_1
 .type  _gcry_mpih_addmul_1,%function
 _gcry_mpih_addmul_1:
-       push    {%r4, %r5, %r6, %r8, %r10, %lr};
-       mov     %lr, #0;
-       cmn     %r0, #0; /* clear carry flag */
+       push    {r4, r5, r6, r8, r10, lr};
+       mov     lr, #0;
+       cmn     r0, #0; /* clear carry flag */
 
-       tst     %r2, #3;
+       tst     r2, #3;
        beq     .Large_loop;
 .Loop:
-       ldr     %r5, [%r1], #4;
-       ldr     %r4, [%r0];
-       sub     %r2, #1;
-       adcs    %r4, %lr;
-       mov     %lr, #0;
-       umlal   %r4, %lr, %r5, %r3;
-       tst     %r2, #3;
-       str     %r4, [%r0], #4;
+       ldr     r5, [r1], #4;
+       ldr     r4, [r0];
+       sub     r2, #1;
+       adcs    r4, lr;
+       mov     lr, #0;
+       umlal   r4, lr, r5, r3;
+       tst     r2, #3;
+       str     r4, [r0], #4;
        bne     .Loop;
 
-       teq     %r2, #0;
+       teq     r2, #0;
        beq     .Lend;
 
 .Large_loop:
-       ldr     %r5, [%r1], #4;
-       ldm     %r0, {%r4, %r6, %r8, %r10};
+       ldr     r5, [r1], #4;
+       ldm     r0, {r4, r6, r8, r10};
 
-       sub     %r2, #4;
-       adcs    %r4, %lr;
-       mov     %lr, #0;
-       umlal   %r4, %lr, %r5, %r3;
+       sub     r2, #4;
+       adcs    r4, lr;
+       mov     lr, #0;
+       umlal   r4, lr, r5, r3;
 
-       ldr     %r5, [%r1], #4;
-       adcs    %r6, %lr;
-       mov     %lr, #0;
-       umlal   %r6, %lr, %r5, %r3;
+       ldr     r5, [r1], #4;
+       adcs    r6, lr;
+       mov     lr, #0;
+       umlal   r6, lr, r5, r3;
 
-       ldr     %r5, [%r1], #4;
-       adcs    %r8, %lr;
-       mov     %lr, #0;
-       umlal   %r8, %lr, %r5, %r3;
+       ldr     r5, [r1], #4;
+       adcs    r8, lr;
+       mov     lr, #0;
+       umlal   r8, lr, r5, r3;
 
-       ldr     %r5, [%r1], #4;
-       adcs    %r10, %lr;
-       mov     %lr, #0;
-       umlal   %r10, %lr, %r5, %r3;
+       ldr     r5, [r1], #4;
+       adcs    r10, lr;
+       mov     lr, #0;
+       umlal   r10, lr, r5, r3;
 
-       teq     %r2, #0;
-       stm     %r0!, {%r4, %r6, %r8, %r10};
+       teq     r2, #0;
+       stm     r0!, {r4, r6, r8, r10};
        bne     .Large_loop;
 
 .Lend:
-       adc     %r0, %lr, #0;
-       pop     {%r4, %r5, %r6, %r8, %r10, %pc};
+       adc     r0, lr, #0;
+       pop     {r4, r5, r6, r8, r10, pc};
 .size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1;
index 33326c7873bc6fcd956ea1cc2383bd3ef26ffb9a..2477c08930b889d23896b2afdcfb825384130553 100644 (file)
 
 /*******************
  * mpi_limb_t
- * _gcry_mpih_submul_1( mpi_ptr_t res_ptr,     %r0
- *                  mpi_ptr_t s1_ptr,          %r1
- *                  mpi_size_t s1_size,        %r2
- *                  mpi_limb_t s2_limb)        %r3
+ * _gcry_mpih_submul_1( mpi_ptr_t res_ptr,     r0
+ *                  mpi_ptr_t s1_ptr,          r1
+ *                  mpi_size_t s1_size,        r2
+ *                  mpi_limb_t s2_limb)        r3
  */
 
 .text
 .globl _gcry_mpih_submul_1
 .type  _gcry_mpih_submul_1,%function
 _gcry_mpih_submul_1:
-       push    {%r4, %r5, %r6, %r8, %r9, %r10, %lr};
-       mov     %lr, #0;
-       cmp     %r0, #0; /* prepare carry flag for sbc */
+       push    {r4, r5, r6, r8, r9, r10, lr};
+       mov     lr, #0;
+       cmp     r0, #0; /* prepare carry flag for sbc */
 
-       tst     %r2, #3;
+       tst     r2, #3;
        beq     .Large_loop;
 .Loop:
-       ldr     %r5, [%r1], #4;
-       mov     %r4, %lr;
-       mov     %lr, #0;
-       ldr     %r6, [%r0];
-       umlal   %r4, %lr, %r5, %r3;
-       sub     %r2, #1;
-       sbcs    %r4, %r6, %r4;
-       tst     %r2, #3;
-       str     %r4, [%r0], #4;
+       ldr     r5, [r1], #4;
+       mov     r4, lr;
+       mov     lr, #0;
+       ldr     r6, [r0];
+       umlal   r4, lr, r5, r3;
+       sub     r2, #1;
+       sbcs    r4, r6, r4;
+       tst     r2, #3;
+       str     r4, [r0], #4;
        bne     .Loop;
 
-       teq     %r2, #0;
+       teq     r2, #0;
        beq     .Lend;
 
 .Large_loop:
-       ldr     %r5, [%r1], #4;
-       mov     %r9, #0;
-       ldr     %r4, [%r0, #0];
+       ldr     r5, [r1], #4;
+       mov     r9, #0;
+       ldr     r4, [r0, #0];
 
-       umlal   %lr, %r9, %r5, %r3;
-       ldr     %r6, [%r0, #4];
-       ldr     %r5, [%r1], #4;
-       sbcs    %r4, %r4, %lr;
+       umlal   lr, r9, r5, r3;
+       ldr     r6, [r0, #4];
+       ldr     r5, [r1], #4;
+       sbcs    r4, r4, lr;
 
-       mov     %lr, #0;
-       umlal   %r9, %lr, %r5, %r3;
-       ldr     %r8, [%r0, #8];
-       ldr     %r5, [%r1], #4;
-       sbcs    %r6, %r6, %r9;
+       mov     lr, #0;
+       umlal   r9, lr, r5, r3;
+       ldr     r8, [r0, #8];
+       ldr     r5, [r1], #4;
+       sbcs    r6, r6, r9;
 
-       mov     %r9, #0;
-       umlal   %lr, %r9, %r5, %r3;
-       ldr     %r10, [%r0, #12];
-       ldr     %r5, [%r1], #4;
-       sbcs    %r8, %r8, %lr;
+       mov     r9, #0;
+       umlal   lr, r9, r5, r3;
+       ldr     r10, [r0, #12];
+       ldr     r5, [r1], #4;
+       sbcs    r8, r8, lr;
 
-       mov     %lr, #0;
-       umlal   %r9, %lr, %r5, %r3;
-       sub     %r2, #4;
-       sbcs    %r10, %r10, %r9;
+       mov     lr, #0;
+       umlal   r9, lr, r5, r3;
+       sub     r2, #4;
+       sbcs    r10, r10, r9;
 
-       teq     %r2, #0;
-       stm     %r0!, {%r4, %r6, %r8, %r10};
+       teq     r2, #0;
+       stm     r0!, {r4, r6, r8, r10};
        bne     .Large_loop;
 
 .Lend:
        it      cc
-       movcc   %r2, #1;
-       add     %r0, %lr, %r2;
-       pop     {%r4, %r5, %r6, %r8, %r9, %r10, %pc};
+       movcc   r2, #1;
+       add     r0, lr, r2;
+       pop     {r4, r5, r6, r8, r9, r10, pc};
 .size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1;
index 593e3cded69571d5bd98f8bf7f0380bb0f2a435f..476d8a33f19912a90ab7a3e24e83458716fe1a83 100644 (file)
 
 /*******************
  *  mpi_limb_t
- *  _gcry_mpih_sub_n( mpi_ptr_t res_ptr,       %r0
- *                mpi_ptr_t s1_ptr,            %r1
- *                mpi_ptr_t s2_ptr,            %r2
- *                mpi_size_t size)             %r3
+ *  _gcry_mpih_sub_n( mpi_ptr_t res_ptr,       r0
+ *                mpi_ptr_t s1_ptr,            r1
+ *                mpi_ptr_t s2_ptr,            r2
+ *                mpi_size_t size)             r3
  */
 
 .text
 .globl _gcry_mpih_sub_n
 .type  _gcry_mpih_sub_n,%function
 _gcry_mpih_sub_n:
-       push    {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr};
-       cmp     %r0, #0; /* prepare carry flag for sub */
+       push    {r4, r5, r6, r7, r8, r9, r10, lr};
+       cmp     r0, #0; /* prepare carry flag for sub */
 
-       tst     %r3, #3;
+       tst     r3, #3;
        beq     .Large_loop;
 
 .Loop:
-       ldr     %r4, [%r1], #4;
-       sub     %r3, #1;
-       ldr     %lr, [%r2], #4;
-       sbcs    %r4, %lr;
-       tst     %r3, #3;
-       str     %r4, [%r0], #4;
+       ldr     r4, [r1], #4;
+       sub     r3, #1;
+       ldr     lr, [r2], #4;
+       sbcs    r4, lr;
+       tst     r3, #3;
+       str     r4, [r0], #4;
        bne     .Loop;
 
-       teq     %r3, #0;
+       teq     r3, #0;
        beq     .Lend;
 
 .Large_loop:
-       ldm     %r1!, {%r4, %r6, %r8, %r10};
-       sub     %r3, #4;
-       ldm     %r2!, {%r5, %r7, %r9, %lr};
-       sbcs    %r4, %r5;
-       sbcs    %r6, %r7;
-       sbcs    %r8, %r9;
-       sbcs    %r10, %lr;
-       teq     %r3, #0;
-       stm     %r0!, {%r4, %r6, %r8, %r10};
+       ldm     r1!, {r4, r6, r8, r10};
+       sub     r3, #4;
+       ldm     r2!, {r5, r7, r9, lr};
+       sbcs    r4, r5;
+       sbcs    r6, r7;
+       sbcs    r8, r9;
+       sbcs    r10, lr;
+       teq     r3, #0;
+       stm     r0!, {r4, r6, r8, r10};
        bne     .Large_loop;
 
 .Lend:
-       sbc     %r0, %r3, #0;
-       neg     %r0, %r0;
-       pop     {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc};
+       sbc     r0, r3, #0;
+       neg     r0, r0;
+       pop     {r4, r5, r6, r7, r8, r9, r10, pc};
 .size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n;
diff --git a/mpi/asm-common-i386.h b/mpi/asm-common-i386.h
new file mode 100644 (file)
index 0000000..9016645
--- /dev/null
@@ -0,0 +1,26 @@
+/* asm-common-i386.h  -  Common macros for i386 assembly
+ *
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MPI_ASM_COMMON_I386_H
+#define MPI_ASM_COMMON_I386_H
+
+#include "../cipher/asm-common-i386.h"
+
+#endif /* MPI_ASM_COMMON_I386_H */
index 8cd6657e6c07b2cc93499d734ee96bb1fd7b07a5..94b42e533d9fe79e558e395928ade7b2d48e2f95 100644 (file)
@@ -15,8 +15,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 #
 # sourced by ../configure to get the list of files to link
 # this should set $mpi_ln_list.
index acfe2a69f55d5d1f0e7273e0ca980e9b14249741..b52b339b1c598e81148898ed6ccd04472af68577 100644 (file)
@@ -27,7 +27,7 @@
 #include "g10lib.h"
 #include "context.h"
 #include "ec-context.h"
-
+#include "ec-internal.h"
 
 void
 _gcry_mpi_ec_ed25519_mod (gcry_mpi_t a)
index a07826e39c11b87a1b14fbe99b9986cdcd54a5d6..c24d5352f5a7dc095c1bb3899ce102b835241a64 100644 (file)
@@ -641,116 +641,192 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
 /* i386 addition/subtraction helpers.  */
 #if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4
 
-#define ADD4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) \
-  __asm__ ("addl %11, %3\n" \
-          "adcl %10, %2\n" \
-          "adcl %9, %1\n" \
-          "adcl %8, %0\n" \
-          : "=r" (a3), \
-            "=&r" (a2), \
+#define ADD2_LIMB32_CARRY_OUT(carry, a1, a0, b1, b0, c1, c0) \
+  __asm__ ("addl %7, %2\n" \
+          "adcl %6, %1\n" \
+          "sbbl %0, %0\n" \
+          : "=r" (carry), \
             "=&r" (a1), \
             "=&r" (a0) \
-          : "0" ((mpi_limb_t)(b3)), \
-            "1" ((mpi_limb_t)(b2)), \
-            "2" ((mpi_limb_t)(b1)), \
-            "3" ((mpi_limb_t)(b0)), \
-            "g" ((mpi_limb_t)(c3)), \
-            "g" ((mpi_limb_t)(c2)), \
-            "g" ((mpi_limb_t)(c1)), \
-            "g" ((mpi_limb_t)(c0)) \
+          : "0" ((mpi_limb_t)(0)), \
+            "1" ((mpi_limb_t)(b1)), \
+            "2" ((mpi_limb_t)(b0)), \
+            "re" ((mpi_limb_t)(c1)), \
+            "re" ((mpi_limb_t)(c0)) \
           : "cc")
 
+#define ADD2_LIMB32_CARRY_IN_OUT(a1, a0, b1, b0, c1, c0, carry) \
+  __asm__ ("addl $1, %0\n" \
+          "adcl %7, %2\n" \
+          "adcl %6, %1\n" \
+          "sbbl %0, %0\n" \
+          : "=r" (carry), \
+            "=&r" (a1), \
+            "=&r" (a0) \
+          : "0" ((mpi_limb_t)(carry)), \
+            "1" ((mpi_limb_t)(b1)), \
+            "2" ((mpi_limb_t)(b0)), \
+            "re" ((mpi_limb_t)(c1)), \
+            "re" ((mpi_limb_t)(c0)) \
+          : "cc")
+
+#define ADD2_LIMB32_CARRY_IN(a1, a0, b1, b0, c1, c0, carry) \
+    __asm__ ("addl $1, %2\n" \
+            "adcl %7, %1\n" \
+            "adcl %6, %0\n" \
+            : "=r" (a1), \
+              "=&r" (a0), \
+              "=&g" (carry) \
+            : "0" ((mpi_limb_t)(b1)), \
+              "1" ((mpi_limb_t)(b0)), \
+              "2" ((mpi_limb_t)(carry)), \
+              "re" ((mpi_limb_t)(c1)), \
+              "re" ((mpi_limb_t)(c0)) \
+          : "cc")
+
+#define ADD4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry4_32; \
+    ADD2_LIMB32_CARRY_OUT(__carry4_32, a1, a0, b1, b0, c1, c0); \
+    ADD2_LIMB32_CARRY_IN(a3, a2, b3, b2, c3, c2, __carry4_32); \
+  } while (0)
+
 #define ADD6_LIMB32(a5, a4, a3, a2, a1, a0, b5, b4, b3, b2, b1, b0, \
                    c5, c4, c3, c2, c1, c0) do { \
     mpi_limb_t __carry6_32; \
-    __asm__ ("addl %10, %3\n" \
-            "adcl %9, %2\n" \
-            "adcl %8, %1\n" \
-            "sbbl %0, %0\n" \
-            : "=r" (__carry6_32), \
-              "=&r" (a2), \
-              "=&r" (a1), \
-              "=&r" (a0) \
-            : "0" ((mpi_limb_t)(0)), \
-              "1" ((mpi_limb_t)(b2)), \
-              "2" ((mpi_limb_t)(b1)), \
-              "3" ((mpi_limb_t)(b0)), \
-              "g" ((mpi_limb_t)(c2)), \
-              "g" ((mpi_limb_t)(c1)), \
-              "g" ((mpi_limb_t)(c0)) \
-            : "cc"); \
-    __asm__ ("addl $1, %3\n" \
-            "adcl %10, %2\n" \
-            "adcl %9, %1\n" \
-            "adcl %8, %0\n" \
-            : "=r" (a5), \
-              "=&r" (a4), \
-              "=&r" (a3), \
-              "=&r" (__carry6_32) \
-            : "0" ((mpi_limb_t)(b5)), \
-              "1" ((mpi_limb_t)(b4)), \
-              "2" ((mpi_limb_t)(b3)), \
-              "3" ((mpi_limb_t)(__carry6_32)), \
-              "g" ((mpi_limb_t)(c5)), \
-              "g" ((mpi_limb_t)(c4)), \
-              "g" ((mpi_limb_t)(c3)) \
-          : "cc"); \
+    ADD2_LIMB32_CARRY_OUT(__carry6_32, a1, a0, b1, b0, c1, c0); \
+    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry6_32); \
+    ADD2_LIMB32_CARRY_IN(a5, a4, b5, b4, c5, c4, __carry6_32); \
+  } while (0)
+
+#define ADD8_LIMB32(a7, a6, a5, a4, a3, a2, a1, a0, \
+                   b7, b6, b5, b4, b3, b2, b1, b0, \
+                   c7, c6, c5, c4, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry8_32; \
+    ADD2_LIMB32_CARRY_OUT(__carry8_32, a1, a0, b1, b0, c1, c0); \
+    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry8_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry8_32); \
+    ADD2_LIMB32_CARRY_IN(a7, a6, b7, b6, c7, c6, __carry8_32); \
+  } while (0)
+
+#define ADD10_LIMB32(a9, a8, a7, a6, a5, a4, a3, a2, a1, a0, \
+                    b9, b8, b7, b6, b5, b4, b3, b2, b1, b0, \
+                    c9, c8, c7, c6, c5, c4, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry10_32; \
+    ADD2_LIMB32_CARRY_OUT(__carry10_32, a1, a0, b1, b0, c1, c0); \
+    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry10_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry10_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry10_32); \
+    ADD2_LIMB32_CARRY_IN(a9, a8, b9, b8, c9, c8, __carry10_32); \
   } while (0)
 
-#define SUB4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) \
-  __asm__ ("subl %11, %3\n" \
-          "sbbl %10, %2\n" \
-          "sbbl %9, %1\n" \
-          "sbbl %8, %0\n" \
-          : "=r" (a3), \
-            "=&r" (a2), \
+#define ADD14_LIMB32(a13, a12, a11, a10, a9, a8, a7, \
+                    a6, a5, a4, a3, a2, a1, a0, \
+                    b13, b12, b11, b10, b9, b8, b7, \
+                    b6, b5, b4, b3, b2, b1, b0, \
+                    c13, c12, c11, c10, c9, c8, c7, \
+                    c6, c5, c4, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry14_32; \
+    ADD2_LIMB32_CARRY_OUT(__carry14_32, a1, a0, b1, b0, c1, c0); \
+    ADD2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry14_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry14_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry14_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a9, a8, b9, b8, c9, c8, __carry14_32); \
+    ADD2_LIMB32_CARRY_IN_OUT(a11, a10, b11, b10, c11, c10, __carry14_32); \
+    ADD2_LIMB32_CARRY_IN(a13, a12, b13, b12, c13, c12, __carry14_32); \
+  } while (0)
+
+#define SUB2_LIMB32_CARRY_OUT(carry, a1, a0, b1, b0, c1, c0) \
+  __asm__ ("subl %7, %2\n" \
+          "sbbl %6, %1\n" \
+          "sbbl %0, %0\n" \
+          : "=r" (carry), \
+            "=&r" (a1), \
+            "=&r" (a0) \
+          : "0" ((mpi_limb_t)(0)), \
+            "1" ((mpi_limb_t)(b1)), \
+            "2" ((mpi_limb_t)(b0)), \
+            "re" ((mpi_limb_t)(c1)), \
+            "re" ((mpi_limb_t)(c0)) \
+          : "cc")
+
+#define SUB2_LIMB32_CARRY_IN_OUT(a1, a0, b1, b0, c1, c0, carry) \
+  __asm__ ("addl $1, %0\n" \
+          "sbbl %7, %2\n" \
+          "sbbl %6, %1\n" \
+          "sbbl %0, %0\n" \
+          : "=r" (carry), \
             "=&r" (a1), \
             "=&r" (a0) \
-          : "0" ((mpi_limb_t)(b3)), \
-            "1" ((mpi_limb_t)(b2)), \
-            "2" ((mpi_limb_t)(b1)), \
-            "3" ((mpi_limb_t)(b0)), \
-            "g" ((mpi_limb_t)(c3)), \
-            "g" ((mpi_limb_t)(c2)), \
-            "g" ((mpi_limb_t)(c1)), \
-            "g" ((mpi_limb_t)(c0)) \
+          : "0" ((mpi_limb_t)(carry)), \
+            "1" ((mpi_limb_t)(b1)), \
+            "2" ((mpi_limb_t)(b0)), \
+            "re" ((mpi_limb_t)(c1)), \
+            "re" ((mpi_limb_t)(c0)) \
           : "cc")
 
+#define SUB2_LIMB32_CARRY_IN(a1, a0, b1, b0, c1, c0, carry) \
+    __asm__ ("addl $1, %2\n" \
+            "sbbl %7, %1\n" \
+            "sbbl %6, %0\n" \
+            : "=r" (a1), \
+              "=&r" (a0), \
+              "=&g" (carry) \
+            : "0" ((mpi_limb_t)(b1)), \
+              "1" ((mpi_limb_t)(b0)), \
+              "2" ((mpi_limb_t)(carry)), \
+              "re" ((mpi_limb_t)(c1)), \
+              "re" ((mpi_limb_t)(c0)) \
+          : "cc")
+
+#define SUB4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry4_32; \
+    SUB2_LIMB32_CARRY_OUT(__carry4_32, a1, a0, b1, b0, c1, c0); \
+    SUB2_LIMB32_CARRY_IN(a3, a2, b3, b2, c3, c2, __carry4_32); \
+  } while (0)
+
 #define SUB6_LIMB32(a5, a4, a3, a2, a1, a0, b5, b4, b3, b2, b1, b0, \
                    c5, c4, c3, c2, c1, c0) do { \
-    mpi_limb_t __borrow6_32; \
-    __asm__ ("subl %10, %3\n" \
-            "sbbl %9, %2\n" \
-            "sbbl %8, %1\n" \
-            "sbbl %0, %0\n" \
-            : "=r" (__borrow6_32), \
-              "=&r" (a2), \
-              "=&r" (a1), \
-              "=&r" (a0) \
-            : "0" ((mpi_limb_t)(0)), \
-              "1" ((mpi_limb_t)(b2)), \
-              "2" ((mpi_limb_t)(b1)), \
-              "3" ((mpi_limb_t)(b0)), \
-              "g" ((mpi_limb_t)(c2)), \
-              "g" ((mpi_limb_t)(c1)), \
-              "g" ((mpi_limb_t)(c0)) \
-            : "cc"); \
-    __asm__ ("addl $1, %3\n" \
-            "sbbl %10, %2\n" \
-            "sbbl %9, %1\n" \
-            "sbbl %8, %0\n" \
-            : "=r" (a5), \
-              "=&r" (a4), \
-              "=&r" (a3), \
-              "=&r" (__borrow6_32) \
-            : "0" ((mpi_limb_t)(b5)), \
-              "1" ((mpi_limb_t)(b4)), \
-              "2" ((mpi_limb_t)(b3)), \
-              "3" ((mpi_limb_t)(__borrow6_32)), \
-              "g" ((mpi_limb_t)(c5)), \
-              "g" ((mpi_limb_t)(c4)), \
-              "g" ((mpi_limb_t)(c3)) \
-          : "cc"); \
+    mpi_limb_t __carry6_32; \
+    SUB2_LIMB32_CARRY_OUT(__carry6_32, a1, a0, b1, b0, c1, c0); \
+    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry6_32); \
+    SUB2_LIMB32_CARRY_IN(a5, a4, b5, b4, c5, c4, __carry6_32); \
+  } while (0)
+
+#define SUB8_LIMB32(a7, a6, a5, a4, a3, a2, a1, a0, \
+                   b7, b6, b5, b4, b3, b2, b1, b0, \
+                   c7, c6, c5, c4, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry8_32; \
+    SUB2_LIMB32_CARRY_OUT(__carry8_32, a1, a0, b1, b0, c1, c0); \
+    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry8_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry8_32); \
+    SUB2_LIMB32_CARRY_IN(a7, a6, b7, b6, c7, c6, __carry8_32); \
+  } while (0)
+
+#define SUB10_LIMB32(a9, a8, a7, a6, a5, a4, a3, a2, a1, a0, \
+                    b9, b8, b7, b6, b5, b4, b3, b2, b1, b0, \
+                    c9, c8, c7, c6, c5, c4, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry10_32; \
+    SUB2_LIMB32_CARRY_OUT(__carry10_32, a1, a0, b1, b0, c1, c0); \
+    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry10_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry10_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry10_32); \
+    SUB2_LIMB32_CARRY_IN(a9, a8, b9, b8, c9, c8, __carry10_32); \
+  } while (0)
+
+#define SUB14_LIMB32(a13, a12, a11, a10, a9, a8, a7, \
+                    a6, a5, a4, a3, a2, a1, a0, \
+                    b13, b12, b11, b10, b9, b8, b7, \
+                    b6, b5, b4, b3, b2, b1, b0, \
+                    c13, c12, c11, c10, c9, c8, c7, \
+                    c6, c5, c4, c3, c2, c1, c0) do { \
+    mpi_limb_t __carry14_32; \
+    SUB2_LIMB32_CARRY_OUT(__carry14_32, a1, a0, b1, b0, c1, c0); \
+    SUB2_LIMB32_CARRY_IN_OUT(a3, a2, b3, b2, c3, c2, __carry14_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a5, a4, b5, b4, c5, c4, __carry14_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a7, a6, b7, b6, c7, c6, __carry14_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a9, a8, b9, b8, c9, c8, __carry14_32); \
+    SUB2_LIMB32_CARRY_IN_OUT(a11, a10, b11, b10, c11, c10, __carry14_32); \
+    SUB2_LIMB32_CARRY_IN(a13, a12, b13, b12, c13, c12, __carry14_32); \
   } while (0)
 
 #endif /* __i386__ */
@@ -820,7 +896,6 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
             "Ir" ((mpi_limb_t)(C0)) \
           : "cc")
 
-
 #define SUB6_LIMB32(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \
                    C5, C4, C3, C2, C1, C0) do { \
     mpi_limb_t __borrow6_32; \
@@ -846,6 +921,46 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
 
 #endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
 
+#if defined (__hppa) && __GNUC__ >= 4
+#define ADD4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+  __asm__ ("add %7,%11,%3\n\t" \
+          "addc %6,%10,%2\n\t" \
+          "addc %5,%9,%1\n\t" \
+          "addc %4,%8,%0" \
+          : "=r" (A3), \
+            "=&r" (A2), \
+            "=&r" (A1), \
+            "=&r" (A0) \
+          : "rM" ((mpi_limb_t)(B3)), \
+            "rM" ((mpi_limb_t)(B2)), \
+            "rM" ((mpi_limb_t)(B1)), \
+            "rM" ((mpi_limb_t)(B0)), \
+            "rM" ((mpi_limb_t)(C3)), \
+            "rM" ((mpi_limb_t)(C2)), \
+            "rM" ((mpi_limb_t)(C1)), \
+            "rM" ((mpi_limb_t)(C0)) \
+          : "cc")
+
+#define SUB4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+  __asm__ ("sub %7,%11,%3\n\t" \
+          "subb %6,%10,%2\n\t" \
+          "subb %5,%9,%1\n\t" \
+          "subb %4,%8,%0\n\t" \
+          : "=r" (A3), \
+            "=&r" (A2), \
+            "=&r" (A1), \
+            "=&r" (A0) \
+          : "rM" ((mpi_limb_t)(B3)), \
+            "rM" ((mpi_limb_t)(B2)), \
+            "rM" ((mpi_limb_t)(B1)), \
+            "rM" ((mpi_limb_t)(B0)), \
+            "rM" ((mpi_limb_t)(C3)), \
+            "rM" ((mpi_limb_t)(C2)), \
+            "rM" ((mpi_limb_t)(C1)), \
+            "rM" ((mpi_limb_t)(C0)) \
+          : "cc")
+
+#endif /* __hppa */
 
 /* Common 32-bit arch addition/subtraction macros.  */
 
@@ -875,7 +990,13 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
                    C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
 #endif
 
-#if defined(ADD6_LIMB32)
+#if defined(ADD8_LIMB32)
+/* A[0..3] = B[0..3] + C[0..3] */
+#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+    ADD8_LIMB32(A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
+               B3.hi, B3.lo, B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
+               C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
+#elif defined(ADD6_LIMB32)
 /* A[0..3] = B[0..3] + C[0..3] */
 #define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
     mpi_limb_t __carry4; \
@@ -888,6 +1009,28 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
   } while (0)
 #endif
 
+#if defined(ADD10_LIMB32)
+/* A[0..4] = B[0..4] + C[0..4] */
+#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+                   C4, C3, C2, C1, C0) \
+    ADD10_LIMB32(A4.hi, A4.lo, A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, \
+                A0.hi, A0.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
+                B1.hi, B1.lo, B0.hi, B0.lo, C4.hi, C4.lo, C3.hi, C3.lo, \
+                C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
+#endif
+
+#if defined(ADD14_LIMB32)
+/* A[0..6] = B[0..6] + C[0..6] */
+#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+                   C6, C5, C4, C3, C2, C1, C0) \
+    ADD14_LIMB32(A6.hi, A6.lo, A5.hi, A5.lo, A4.hi, A4.lo, A3.hi, A3.lo, \
+                A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, B6.hi, B6.lo, \
+                B5.hi, B5.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
+                B1.hi, B1.lo, B0.hi, B0.lo, C6.hi, C6.lo, C5.hi, C5.lo, \
+                C4.hi, C4.lo, C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, \
+                C0.hi, C0.lo)
+#endif
+
 #if defined(SUB4_LIMB32)
 /* A[0..1] = B[0..1] - C[0..1] */
 #define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \
@@ -914,7 +1057,13 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
                    C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
 #endif
 
-#if defined(SUB6_LIMB32)
+#if defined(SUB8_LIMB32)
+/* A[0..3] = B[0..3] - C[0..3] */
+#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+    SUB8_LIMB32(A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, \
+               B3.hi, B3.lo, B2.hi, B2.lo, B1.hi, B1.lo, B0.hi, B0.lo, \
+               C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
+#elif defined(SUB6_LIMB32)
 /* A[0..3] = B[0..3] - C[0..3] */
 #define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
     mpi_limb_t __borrow4; \
@@ -927,6 +1076,28 @@ LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
   } while (0)
 #endif
 
+#if defined(SUB10_LIMB32)
+/* A[0..4] = B[0..4] - C[0..4] */
+#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+                   C4, C3, C2, C1, C0) \
+    SUB10_LIMB32(A4.hi, A4.lo, A3.hi, A3.lo, A2.hi, A2.lo, A1.hi, A1.lo, \
+                A0.hi, A0.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
+                B1.hi, B1.lo, B0.hi, B0.lo, C4.hi, C4.lo, C3.hi, C3.lo, \
+                C2.hi, C2.lo, C1.hi, C1.lo, C0.hi, C0.lo)
+#endif
+
+#if defined(SUB14_LIMB32)
+/* A[0..6] = B[0..6] - C[0..6] */
+#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+                   C6, C5, C4, C3, C2, C1, C0) \
+    SUB14_LIMB32(A6.hi, A6.lo, A5.hi, A5.lo, A4.hi, A4.lo, A3.hi, A3.lo, \
+                A2.hi, A2.lo, A1.hi, A1.lo, A0.hi, A0.lo, B6.hi, B6.lo, \
+                B5.hi, B5.lo, B4.hi, B4.lo, B3.hi, B3.lo, B2.hi, B2.lo, \
+                B1.hi, B1.lo, B0.hi, B0.lo, C6.hi, C6.lo, C5.hi, C5.lo, \
+                C4.hi, C4.lo, C3.hi, C3.lo, C2.hi, C2.lo, C1.hi, C1.lo, \
+                C0.hi, C0.lo)
+#endif
+
 #endif /* BYTES_PER_MPI_LIMB == 4 */
 
 
index f792405c7febf20c06c6debdcdb0495e65eeca36..e03bd0e046cb7004be88ae98495ac826de79d33a 100644 (file)
@@ -88,9 +88,9 @@ _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx)
   };
   const mpi_limb64_t zero = LIMB_TO64(0);
   mpi_ptr_t wp;
-  mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64;
-  mpi_limb64_t s[wsize + 1];
-  mpi_limb64_t o[wsize + 1];
+  mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1];
+  mpi_limb64_t o[DIM(s)];
+  const mpi_size_t wsize = DIM(s) - 1;
   mpi_limb_t mask1;
   mpi_limb_t mask2;
   mpi_limb_t s_is_negative;
@@ -180,10 +180,10 @@ _gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx)
   };
   const mpi_limb64_t zero = LIMB_TO64(0);
   mpi_ptr_t wp;
-  mpi_size_t wsize = (224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64;
+  mpi_limb64_t s[(224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64];
+  mpi_limb64_t d[DIM(s)];
+  const mpi_size_t wsize = DIM(s);
   mpi_size_t psize = ctx->p->nlimbs;
-  mpi_limb64_t s[wsize];
-  mpi_limb64_t d[wsize];
   mpi_limb_t mask1;
   mpi_limb_t mask2;
   mpi_limb_t s_is_negative;
@@ -339,12 +339,12 @@ _gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx)
   };
   const mpi_limb64_t zero = LIMB_TO64(0);
   mpi_ptr_t wp;
-  mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64;
+  mpi_limb64_t s[(256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
+  mpi_limb64_t t[DIM(s)];
+  mpi_limb64_t d[DIM(s)];
+  mpi_limb64_t e[DIM(s)];
+  const mpi_size_t wsize = DIM(s) - 1;
   mpi_size_t psize = ctx->p->nlimbs;
-  mpi_limb64_t s[wsize + 1];
-  mpi_limb64_t t[wsize + 1];
-  mpi_limb64_t d[wsize + 1];
-  mpi_limb64_t e[wsize + 1];
   mpi_limb_t mask1;
   mpi_limb_t mask2;
   mpi_limb_t mask3;
@@ -471,11 +471,15 @@ _gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx)
 
   carry = LO32_LIMB64(s[4]);
 
+  /* Load values to stack to ease register pressure on i386. */
+  e[0] = p_mult[carry + 4][0];
+  e[1] = p_mult[carry + 4][1];
+  e[2] = p_mult[carry + 4][2];
+  e[3] = p_mult[carry + 4][3];
+  e[4] = p_mult[carry + 4][4];
   SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
-              p_mult[carry + 4][4], p_mult[carry + 4][3],
-              p_mult[carry + 4][2], p_mult[carry + 4][1],
-              p_mult[carry + 4][0]);
+              e[4], e[3], e[2], e[1], e[0]);
 
   /* Add 1*P */
   ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
@@ -589,15 +593,15 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
   };
   const mpi_limb64_t zero = LIMB_TO64(0);
   mpi_ptr_t wp;
-  mpi_size_t wsize = (384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64;
-  mpi_size_t psize = ctx->p->nlimbs;
+  mpi_limb64_t s[(384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
+  mpi_limb64_t t[DIM(s)];
+  mpi_limb64_t d[DIM(s)];
+  mpi_limb64_t x[DIM(s)];
 #if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
-  mpi_limb_t wp_shr32[wsize * LIMBS_PER_LIMB64];
+  mpi_limb_t wp_shr32[(DIM(s) - 1) * LIMBS_PER_LIMB64];
 #endif
-  mpi_limb64_t s[wsize + 1];
-  mpi_limb64_t t[wsize + 1];
-  mpi_limb64_t d[wsize + 1];
-  mpi_limb64_t x[wsize + 1];
+  const mpi_size_t wsize = DIM(s) - 1;
+  mpi_size_t psize = ctx->p->nlimbs;
   mpi_limb_t mask1;
   mpi_limb_t mask2;
   mpi_limb_t s_is_negative;
@@ -749,12 +753,17 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
 
   carry = LO32_LIMB64(s[6]);
 
+  /* Load values to stack to ease register pressure on i386. */
+  x[0] = p_mult[carry + 3][0];
+  x[1] = p_mult[carry + 3][1];
+  x[2] = p_mult[carry + 3][2];
+  x[3] = p_mult[carry + 3][3];
+  x[4] = p_mult[carry + 3][4];
+  x[5] = p_mult[carry + 3][5];
+  x[6] = p_mult[carry + 3][6];
   SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
-              p_mult[carry + 3][6], p_mult[carry + 3][5],
-              p_mult[carry + 3][4], p_mult[carry + 3][3],
-              p_mult[carry + 3][2], p_mult[carry + 3][1],
-              p_mult[carry + 3][0]);
+              x[6], x[5], x[4], x[3], x[2], x[1], x[0]);
 
   ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
@@ -785,8 +794,8 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
 void
 _gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx)
 {
-  mpi_size_t wsize = (521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB;
-  mpi_limb_t s[wsize];
+  mpi_limb_t s[(521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB];
+  const mpi_size_t wsize = DIM(s);
   mpi_limb_t cy;
   mpi_ptr_t wp;
 
index e8233ae89a9cd6c74a55adcdc069289360b4ff68..2f8a25a44ef9ba8fe66d74c28a3e6e071f83dd05 100644 (file)
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -581,9 +581,9 @@ ec_pow2_448 (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx)
 static void
 ec_secp256k1_mod (gcry_mpi_t w, mpi_ec_t ctx)
 {
-  mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB;
-  mpi_limb_t n[wsize + 1];
-  mpi_limb_t s[wsize + 1];
+  mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1];
+  mpi_limb_t n[DIM(s)];
+  const mpi_size_t wsize = DIM(s) - 1;
   mpi_limb_t cy, borrow;
   mpi_ptr_t wp;
 
@@ -857,7 +857,7 @@ ec_p_init (mpi_ec_t ctx, enum gcry_mpi_ec_models model,
           if (!match_p)
             continue;
 
-          for (j=0; i< DIM(ctx->t.scratch) && bad_points_table[i][j]; j++)
+          for (j=0; j < DIM(ctx->t.scratch) && bad_points_table[i][j]; j++)
             ctx->t.scratch[j] = scanval (bad_points_table[i][j]);
         }
     }
@@ -1025,7 +1025,7 @@ _gcry_mpi_ec_p_new (gcry_ctx_t *r_ctx,
   if (!p || !a)
     return GPG_ERR_EINVAL;
 
-  ctx = _gcry_ctx_alloc (CONTEXT_TYPE_EC, sizeof *ec, ec_deinit);
+  ctx = _gcry_ctx_alloc (CONTEXT_TYPE_EC, sizeof *ec, ec_deinit, NULL);
   if (!ctx)
     return gpg_err_code_from_syserror ();
   ec = _gcry_ctx_get_pointer (ctx, CONTEXT_TYPE_EC);
index 4a84df64d82dbef905863889ce5a79bad483401b..0a51f06ccdc170af08ae287dd60ca746e9550d6c 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index f48c12cd02c5f3d12d6e500fe66d70d98279a9af..8a9c1257f4bf36faa67d49a0214d2b3c6cb1e7ca 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 0e8197d88a3b47d45b72f71d52db3027abc61380..e88be914570df41dd0d8bd8a87f6077ed07ebc15 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 3b7549605dfcd26c377281aace4c012c33573a6d..bc925e0194f2b43d2051bc9045df1c375e71225a 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 5e84f94f31621d7f7db5ac8c374f8c6b6c901b42..4ecd2c6be9997650c621c43d7fe4e05a8174d27b 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index e40794fcf26ae213c4e24352134371634f2746be..156337b38978a56d01b2c14bafdde21233709125 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index e88821bfb4c56c1b2fe6efee0fc3a45305c4d090..b8eff0dbbba6fae8b0b43a8412171e15524e9652 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index e80d98bc54444912af8350908bb724fba82cd5c8..719d182694a764deb0ce5feb2606b6f16ae69543 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 3bc0e5e19658495fb9e3889f8e63200caa478321..8001bda3917b427d160c1144aaba8a687c190ca5 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 91b29bb6e7b400eab70213e1f9cf3ff6f39dae4d..ba478ff2efd5d3bfacf9b3edf6fa85bc1bf9deb6 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 37a9d4ef92b5bd346f91411e1e6041c15dd41661..c486fc75c4e6071bd5781203d17ca73b8a641e4d 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 8d197e412ab454a27105887399b30b48fc10cb15..2505632890c8eb606822a8fa424d34b2c96d68d7 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 59ebf7a0020b952c62cece339ea4ac59fa524db2..06f06d15e2d3d20e8d5ff7b0c28b8f4671f819c2 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 45926dd7b52a00b41aa04d42489e794d0be394a5..b474f27c445bcb869f458a8297d4d65744d9e2c7 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 1047ab5649cebd0ea37e79449eb3827a7035fe7c..4ba897145d391b09511ab1b46ca943a0c6161dfe 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 632adf1eec7e74da98d50c87e5c792f9e446ff81..d7cff2345cc5134325b16b89a800ea8e9755d2ba 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 3f28b7b64dac5c3dc06ad5e3fb82bbd5eca64d40..ebb28d6c5b3a4e3b1c915f48c34a5ec03415920d 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 95a75890cb27889c701700bbec2cb5ee641f01b4..7b458fc613530b5c4c87be12e4477fc42135d2d1 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 3404cf557c7f9a952dccb64b6b9c28cde7abc4f2..409c6c5995ddfd6a648b825389ae6436474689d1 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index a672d052f6be207f66dd1e24ad30bda53f360f3e..431730ce73cafe65f9a0f54e2edb5a8f41d21cba 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index e09c3f7c85c4c5c05de6156826727862ab3f8ab6..58a0d6e37cf41ad92afc0be51de675a6c77438d1 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 4112c69973bb224f531e5059c2eb22b01aca6aca..327e6ac894af620fed7f1f7bb1090d66c6609c27 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 5d34696ca193bc8a37afb4a4431fb6071f903dbc..18c36c0c4c376d28a0d14125f0f3d8a6b38b352f 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 49477ae34ab686700932bf10b7352f46bfa7092a..afd4c0770f569c8836a0802219190ace3a7c8ddf 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index af4d9e805d4b6b671f00cca40eb8d3982f463366..2383ebd3eeca9c76359efd5ffcffb7803243f1e4 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
 #include <config.h>
 
 #ifdef __i386__
-#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
-# define CFI_STARTPROC()            .cfi_startproc
-# define CFI_ENDPROC()              .cfi_endproc
-# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
-# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
-# define CFI_RESTORE(reg)           .cfi_restore reg
-
-# define CFI_PUSH(reg) \
-       CFI_ADJUST_CFA_OFFSET(4); CFI_REL_OFFSET(reg, 0)
-# define CFI_POP(reg) \
-       CFI_ADJUST_CFA_OFFSET(-4); CFI_RESTORE(reg)
-#else
-# define CFI_STARTPROC()
-# define CFI_ENDPROC()
-# define CFI_ADJUST_CFA_OFFSET(off)
-# define CFI_REL_OFFSET(reg,off)
-# define CFI_RESTORE(reg)
-
-# define CFI_PUSH(reg)
-# define CFI_POP(reg)
-#endif
+#include "asm-common-i386.h"
 #endif
 
 #undef ALIGN
index c299534c34c00037482a42fd4e723784fcb3a7ce..21bd1a7efbf403022e595e73f998ee562e6c7391 100644 (file)
@@ -16,22 +16,32 @@ or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 License for more details.
 
 You should have received a copy of the GNU Library General Public License
-along with this file; see the file COPYING.LIB.  If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
+along with this file; see the file COPYING.LIB.  If not, see <https://www.gnu.org/licenses/>.
+SPDX-License-Identifier: LGPL-2.1-or-later
+*/
+
+/* On 32-bit, use 64-bit 'unsigned long long' for UDWtype, if available. */
+#if !defined (UDWtype) && SIZEOF_UNSIGNED_LONG_LONG * 8 == W_TYPE_SIZE * 2
+#  define UDWtype unsigned long long
+#endif
+
+/* On 64-bit, use 128-bit 'unsigned __int128' for UDWtype, if available. */
+#if !defined (UDWtype) && SIZEOF_UNSIGNED___INT128 * 8 == W_TYPE_SIZE * 2
+#  define UDWtype unsigned __int128
+#endif
 
 /* You have to define the following before including this file:
 
-   UWtype -- An unsigned type, default type for operations (typically a "word")
+   UWtype -- An unsigned type, default type for operations (typically a "word").
    UHWtype -- An unsigned type, at least half the size of UWtype.
-   UDWtype -- An unsigned type, at least twice as large a UWtype
-   W_TYPE_SIZE -- size in bits of UWtype
+   UDWtype -- An unsigned type, at least twice as large a UWtype.
+   W_TYPE_SIZE -- size in bits of UWtype.
 
    SItype, USItype -- Signed and unsigned 32 bit types.
    DItype, UDItype -- Signed and unsigned 64 bit types.
 
-   On a 32 bit machine UWtype should typically be USItype;
-   on a 64 bit machine, UWtype should typically be UDItype.
+   On a 32 bit machine UWtype should typically be USItype.
+   On a 64 bit machine, UWtype should typically be UDItype.
 */
 
 #define __BITS4 (W_TYPE_SIZE / 4)
@@ -394,23 +404,23 @@ extern UDItype __udiv_qrnnd ();
  ***************************************/
 #if defined (__hppa) && W_TYPE_SIZE == 32
 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
-  __asm__ ("   add %4,%5,%1\n"                                             \
-          "    addc %2,%3,%0"                                              \
+  __asm__ ("add %4,%5,%1\n\t"                                           \
+          "addc %2,%3,%0"                                              \
           : "=r" ((USItype)(sh)),                                      \
             "=&r" ((USItype)(sl))                                      \
           : "%rM" ((USItype)(ah)),                                     \
             "rM" ((USItype)(bh)),                                      \
             "%rM" ((USItype)(al)),                                     \
-            "rM" ((USItype)(bl)))
+            "rM" ((USItype)(bl)) __CLOBBER_CC)
 # define sub_ddmmss(sh, sl, ah, al, bh, bl) \
-  __asm__ ("   sub %4,%5,%1\n"                                             \
-          "    subb %2,%3,%0"                                              \
+  __asm__ ("sub %4,%5,%1\n\t"                                           \
+          "subb %2,%3,%0"                                              \
           : "=r" ((USItype)(sh)),                                      \
             "=&r" ((USItype)(sl))                                      \
           : "rM" ((USItype)(ah)),                                      \
             "rM" ((USItype)(bh)),                                      \
             "rM" ((USItype)(al)),                                      \
-            "rM" ((USItype)(bl)))
+            "rM" ((USItype)(bl)) __CLOBBER_CC)
 # if defined (_PA_RISC1_1)
 #  define umul_ppmm(wh, wl, u, v) \
   do {                                                                 \
@@ -555,7 +565,7 @@ extern USItype __udiv_qrnnd ();
     (count) = __cbtmp ^ 31;                                            \
   } while (0)
 # define count_trailing_zeros(count, x) \
-  __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)) __CLOBBER_CC)
+  __asm__ ("rep;bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)) __CLOBBER_CC)
 # ifndef UMUL_TIME
 #  define UMUL_TIME 40
 # endif
@@ -614,7 +624,7 @@ extern USItype __udiv_qrnnd ();
 # define count_trailing_zeros(count, x) \
   do {                                                                  \
     UDItype __cbtmp;                                                    \
-    __asm__ ("bsfq %1,%0"                                               \
+    __asm__ ("rep;bsfq %1,%0"                                           \
              : "=r" (__cbtmp) : "rm" ((UDItype)(x))                     \
              __CLOBBER_CC);                                             \
     (count) = __cbtmp;                                                  \
@@ -969,180 +979,130 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
 /***************************************
  **************  PPC  ******************
  ***************************************/
-#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32
+/* Powerpc 32 bit support taken from GCC longlong.h. */
+#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 32
 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   do {                                                                 \
-    if (__builtin_constant_p (bh) && (bh) == 0)                        \
-      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"           \
-            : "=r" ((sh)),                                             \
-              "=&r" ((sl))                                             \
-            : "%r" ((USItype)(ah)),                                    \
-              "%r" ((USItype)(al)),                                    \
-              "rI" ((USItype)(bl)));                                   \
-    else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0)         \
-      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"           \
-            : "=r" ((sh)),                                             \
-              "=&r" ((sl))                                             \
-            : "%r" ((USItype)(ah)),                                    \
-              "%r" ((USItype)(al)),                                    \
-              "rI" ((USItype)(bl)));                                   \
+    if (__builtin_constant_p (bh) && (bh) == 0)                                \
+      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                       \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
+              __CLOBBER_CC);                                           \
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)                \
+      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                       \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
+              __CLOBBER_CC);                                           \
     else                                                               \
-      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"          \
-            : "=r" ((sh)),                                             \
-              "=&r" ((sl))                                             \
-            : "%r" ((USItype)(ah)),                                    \
-              "r" ((USItype)(bh)),                                     \
-              "%r" ((USItype)(al)),                                    \
-              "rI" ((USItype)(bl)));                                   \
+      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                     \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)              \
+              __CLOBBER_CC);                                           \
   } while (0)
 # define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   do {                                                                 \
-    if (__builtin_constant_p (ah) && (ah) == 0)                        \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"       \
-              : "=r" ((sh)),                                           \
-                "=&r" ((sl))                                           \
-              : "r" ((USItype)(bh)),                                   \
-                "rI" ((USItype)(al)),                                  \
-                "r" ((USItype)(bl)));                                  \
-    else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0)         \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"       \
-              : "=r" ((sh)),                                  \
-                "=&r" ((sl))                                  \
-              : "r" ((USItype)(bh)),                                   \
-                "rI" ((USItype)(al)),                                  \
-                "r" ((USItype)(bl)));                                  \
+    if (__builtin_constant_p (ah) && (ah) == 0)                                \
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                     \
+              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
+    else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)                \
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                     \
+              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
     else if (__builtin_constant_p (bh) && (bh) == 0)                   \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"         \
-              : "=r" ((sh)),                                           \
-                "=&r" ((sl))                                           \
-              : "r" ((USItype)(ah)),                                   \
-                "rI" ((USItype)(al)),                                  \
-                "r" ((USItype)(bl)));                                  \
-    else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0)         \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"         \
-              : "=r" ((sh)),                                           \
-                "=&r" ((sl))                                           \
-              : "r" ((USItype)(ah)),                                   \
-                "rI" ((USItype)(al)),                                  \
-                "r" ((USItype)(bl)));                                  \
+      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                      \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
+    else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)                \
+      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                      \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
     else                                                               \
-      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"      \
-              : "=r" ((sh)),                                           \
-                "=&r" ((sl))                                           \
-              : "r" ((USItype)(ah)),                                   \
-                "r" ((USItype)(bh)),                                   \
-                "rI" ((USItype)(al)),                                  \
-                "r" ((USItype)(bl)));                                  \
+      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                   \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "r" (ah), "r" (bh), "rI" (al), "r" (bl)                        \
+              __CLOBBER_CC);                                           \
   } while (0)
 # define count_leading_zeros(count, x) \
-  __asm__ ("{cntlz|cntlzw} %0,%1"                                       \
-          : "=r" ((count))                                             \
-          : "r" ((USItype)(x)))
+  __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
 # define COUNT_LEADING_ZEROS_0 32
-# if defined (_ARCH_PPC)
-#  define umul_ppmm(ph, pl, m0, m1) \
+# define umul_ppmm(ph, pl, m0, m1) \
   do {                                                                 \
     USItype __m0 = (m0), __m1 = (m1);                                  \
-    __asm__ ("mulhwu %0,%1,%2"                                          \
-            : "=r" (ph)                                                \
-            : "%r" (__m0),                                             \
-              "r" (__m1));                                             \
-    (pl) = __m0 * __m1;                                                \
+    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));     \
+    (pl) = __m0 * __m1;                                                        \
   } while (0)
-#  define UMUL_TIME 15
-#  define smul_ppmm(ph, pl, m0, m1) \
+# define UMUL_TIME 15
+# define smul_ppmm(ph, pl, m0, m1) \
   do {                                                                 \
     SItype __m0 = (m0), __m1 = (m1);                                   \
-    __asm__ ("mulhw %0,%1,%2"                                           \
-            : "=r" ((SItype) ph)                                       \
-            : "%r" (__m0),                                             \
-              "r" (__m1));                                             \
-    (pl) = __m0 * __m1;                                                \
-  } while (0)
-#  define SMUL_TIME 14
-#  define UDIV_TIME 120
-# else
-#  define umul_ppmm(xh, xl, m0, m1) \
-  do {                                                                 \
-    USItype __m0 = (m0), __m1 = (m1);                                  \
-    __asm__ ("mul %0,%2,%3"                                             \
-            : "=r" ((xh)),                                             \
-              "=q" ((xl))                                              \
-            : "r" (__m0),                                              \
-              "r" (__m1));                                             \
-    (xh) += ((((SItype) __m0 >> 31) & __m1)                            \
-            + (((SItype) __m1 >> 31) & __m0));                         \
+    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));      \
+    (pl) = __m0 * __m1;                                                        \
   } while (0)
-#  define UMUL_TIME 8
-#  define smul_ppmm(xh, xl, m0, m1) \
-  __asm__ ("mul %0,%2,%3"                                               \
-          : "=r" ((SItype)(xh)),                                       \
-            "=q" ((SItype)(xl))                                        \
-          : "r" (m0),                                                  \
-            "r" (m1))
-#  define SMUL_TIME 4
-#  define sdiv_qrnnd(q, r, nh, nl, d) \
-  __asm__ ("div %0,%2,%4"                                               \
-          : "=r" ((SItype)(q)), "=q" ((SItype)(r))                     \
-          : "r" ((SItype)(nh)), "1" ((SItype)(nl)), "r" ((SItype)(d)))
-#  define UDIV_TIME 100
-# endif
-#endif /* Power architecture variants. */
+# define SMUL_TIME 14
+# define UDIV_TIME 120
+#endif /* 32-bit POWER architecture variants.  */
 
-/* Powerpc 64 bit support taken from gmp-4.1.2. */
+/* Powerpc 64 bit support taken from GCC longlong.h. */
 /* We should test _IBMR2 here when we add assembly support for the system
    vendor compilers.  */
-#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 64
-#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+#if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64
+# define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   do {                                                                 \
     if (__builtin_constant_p (bh) && (bh) == 0)                                \
-      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2"          \
-            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+      __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                       \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
+              __CLOBBER_CC);                                           \
     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)                \
-      __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2"          \
-            : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\
+      __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                       \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
+              __CLOBBER_CC);                                           \
     else                                                               \
-      __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3"         \
-            : "=r" (sh), "=&r" (sl)                                    \
-            : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl));              \
+      __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                     \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)              \
+              __CLOBBER_CC);                                           \
   } while (0)
-#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+# define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   do {                                                                 \
     if (__builtin_constant_p (ah) && (ah) == 0)                                \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2"      \
-              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                     \
+              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
     else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)                \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2"      \
-              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+      __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                     \
+              : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
     else if (__builtin_constant_p (bh) && (bh) == 0)                   \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2"                \
-              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                      \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)                \
-      __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2"                \
-              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+      __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                      \
+              : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)  \
+              __CLOBBER_CC);                                           \
     else                                                               \
-      __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2"     \
-              : "=r" (sh), "=&r" (sl)                                  \
-              : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
+      __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                   \
+              : "=r" (sh), "=&r" (sl)                                  \
+              : "r" (ah), "r" (bh), "rI" (al), "r" (bl)                        \
+              __CLOBBER_CC);                                           \
   } while (0)
-#define count_leading_zeros(count, x) \
+# define count_leading_zeros(count, x) \
   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
-#define COUNT_LEADING_ZEROS_0 64
-#define umul_ppmm(ph, pl, m0, m1) \
+# define COUNT_LEADING_ZEROS_0 64
+# define umul_ppmm(ph, pl, m0, m1) \
   do {                                                                 \
     UDItype __m0 = (m0), __m1 = (m1);                                  \
     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));     \
     (pl) = __m0 * __m1;                                                        \
   } while (0)
-#define UMUL_TIME 15
-#define smul_ppmm(ph, pl, m0, m1) \
+# define UMUL_TIME 15
+# define smul_ppmm(ph, pl, m0, m1) \
   do {                                                                 \
     DItype __m0 = (m0), __m1 = (m1);                                   \
     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));      \
     (pl) = __m0 * __m1;                                                        \
   } while (0)
-#define SMUL_TIME 14  /* ??? */
-#define UDIV_TIME 120 /* ??? */
+# define SMUL_TIME 14  /* ??? */
+# define UDIV_TIME 120 /* ??? */
 #endif /* 64-bit PowerPC.  */
 
 /***************************************
@@ -1617,7 +1577,21 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
 
 /* If this machine has no inline assembler, use C macros.  */
 
-#if !defined (add_ssaaaa)
+#if !defined (add_ssaaaa) && defined (UDWtype)
+/* Use double word type when available. */
+#  define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    UDWtype __audw = (ah);                                             \
+    UDWtype __budw = (bh);                                             \
+    __audw <<= W_TYPE_SIZE;                                            \
+    __audw |= (al);                                                    \
+    __budw <<= W_TYPE_SIZE;                                            \
+    __budw |= (bl);                                                    \
+    __audw += __budw;                                                  \
+    (sh) = (UWtype)(__audw >> W_TYPE_SIZE);                            \
+    (sl) = (UWtype)(__audw);                                           \
+  } while (0)
+#elif !defined (add_ssaaaa)
 #  define add_ssaaaa(sh, sl, ah, al, bh, bl) \
   do {                                                                 \
     UWtype __x;                                                        \
@@ -1627,7 +1601,21 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
   } while (0)
 #endif
 
-#if !defined (sub_ddmmss)
+#if !defined (sub_ddmmss) && defined (UDWtype)
+/* Use double word type when available. */
+#  define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {                                                                 \
+    UDWtype __audw = (ah);                                             \
+    UDWtype __budw = (bh);                                             \
+    __audw <<= W_TYPE_SIZE;                                            \
+    __audw |= (al);                                                    \
+    __budw <<= W_TYPE_SIZE;                                            \
+    __budw |= (bl);                                                    \
+    __audw -= __budw;                                                  \
+    (sh) = (UWtype)(__audw >> W_TYPE_SIZE);                            \
+    (sl) = (UWtype)(__audw);                                           \
+  } while (0)
+#elif !defined (sub_ddmmss)
 #  define sub_ddmmss(sh, sl, ah, al, bh, bl) \
   do {                                                                 \
     UWtype __x;                                                        \
@@ -1637,7 +1625,15 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
   } while (0)
 #endif
 
-#if !defined (umul_ppmm)
+#if !defined (umul_ppmm) && defined (UDWtype)
+#  define umul_ppmm(w1, w0, u, v)                                      \
+  do {                                                                 \
+    UDWtype __x = (u);                                                 \
+    __x *= (v);                                                                \
+    (w1) = (UWtype)(__x >> W_TYPE_SIZE);                               \
+    (w0) = (UWtype)(__x);                                              \
+  } while (0)
+#elif !defined (umul_ppmm)
 #  define umul_ppmm(w1, w0, u, v)                                      \
   do {                                                                 \
     UWtype __x0, __x1, __x2, __x3;                                     \
@@ -1664,7 +1660,7 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
   } while (0)
 #endif
 
-#if !defined (umul_ppmm)
+#if !defined (smul_ppmm)
 #  define smul_ppmm(w1, w0, u, v)                                      \
   do {                                                                 \
     UWtype __w1;                                                       \
@@ -1712,6 +1708,19 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
     (r) = __r0;                                                        \
   } while (0)
 
+/* Use double word type if available. */
+#if !defined (udiv_qrnnd) && defined (UDWtype)
+#  define udiv_qrnnd(q, r, nh, nl, d) \
+  do {                                                                 \
+    UWtype __d = (d);                                                  \
+    UDWtype __nudw = (nh);                                             \
+    __nudw <<= W_TYPE_SIZE;                                            \
+    __nudw |= (nl);                                                    \
+    (q) = (UWtype)(__nudw / __d);                                      \
+    (r) = (UWtype)(__nudw % __d);                                      \
+  } while (0)
+#endif
+
 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
index 007c94c6db90ebb69928fdfa7e9e59d251ed35e9..1b55097adcbfff220ce2e56defc05b81f22a87e5 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 44baa8d8877f0bc67859bafd02bae3ea84356576..321d9ac70978cfd36097a64c7b551e54813607cf 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index e958ef611711d7a3c850dc5d2ef56bb88d43ae62..708fdc939f206febaa31e41e2d818d9d6a4b58ff 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 8182d21a32b662a4e6aeb377032c3455f366539a..951735a3d5a6f33f8b305a9f6d7158061d18eba1 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 133d1aae3d30d63b465ae9d37f449373ccabd516..540567c285585fd648a0a8301ff2c4917b8626f3 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index be9f43502f30c04eab0d7485b86e0d5fdfe610de..964c141fb0c07776a1729534212c582c868368bd 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index ee7555f8972c0070cc6505d06785413d8e284930..f81569ceb9f47b362cd51130d8ac646d8012d91d 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index e27de98b4fa8b3b120d44a7d164615a28fb5e604..1c528db7eb57f69ac823826a018c3048a5631933 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index f3db029de484cf1ea92e992fb54c639387d6a292..ba5ac1956be629988216d0ca87d4fc7aa3705cd0 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 084c109b24a4427a0a1f78de0b60e415e93c4191..013ee4a763562ab288678e14e4d17412db84b1ef 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 /*******************
index 6c0099de3f95007a420aa764545898cb9763144e..b2a6d1614efd0f39d70608ba4375d5ac5d430ba6 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index ca8276388fdfe58f655cae5d6b00ab4c58ed5b8f..49ccce21a3a7b1f268da002b0ebc8096790cc47e 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index be421a68eecd0a6ef982b6a3cae624c6826172ef..a951ba69475f2fbbe6b098c61433dc4f07942684 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index e7e035a034bcc25db0fcc97578e642c454126c23..a8d39854d40f1778cbd5f7c68e340da039bff07a 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 /*******************
index 9fac67439941e08b1ea61c520c7d804fec7e6f82..ae444b1742ef49d9b9848a8121506eb451e7c6b0 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 38dd352f81e50661b1078cd0257b882588f807bc..51dc71b71be57879b8713cdeceed370ff00ed9eb 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -84,8 +84,8 @@ _gcry_mpi_add_ui (gcry_mpi_t w, gcry_mpi_t u, unsigned long v )
 }
 
 
-void
-_gcry_mpi_add(gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v)
+static void
+_gcry_mpi_add_inv_sign(gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, int inv_v_sign)
 {
     mpi_ptr_t wp, up, vp;
     mpi_size_t usize, vsize, wsize;
@@ -93,7 +93,7 @@ _gcry_mpi_add(gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v)
 
     if( u->nlimbs < v->nlimbs ) { /* Swap U and V. */
        usize = v->nlimbs;
-       usign = v->sign;
+       usign = v->sign ^ inv_v_sign;
        vsize = u->nlimbs;
        vsign = u->sign;
        wsize = usize + 1;
@@ -106,7 +106,7 @@ _gcry_mpi_add(gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v)
        usize = u->nlimbs;
        usign = u->sign;
        vsize = v->nlimbs;
-       vsign = v->sign;
+       vsign = v->sign ^ inv_v_sign;
        wsize = usize + 1;
        RESIZE_IF_NEEDED(w, wsize);
        /* These must be after realloc (u or v may be the same as w).  */
@@ -211,26 +211,51 @@ _gcry_mpi_sub_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned long v )
     w->sign   = wsign;
 }
 
+void
+_gcry_mpi_add(gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v)
+{
+  _gcry_mpi_add_inv_sign (w, u, v, 0);
+}
+
 void
 _gcry_mpi_sub(gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v)
 {
-  gcry_mpi_t vv = mpi_copy (v);
-  vv->sign = ! vv->sign;
-  mpi_add (w, u, vv);
-  mpi_free (vv);
+  _gcry_mpi_add_inv_sign (w, u, v, 1);
 }
 
 
 void
 _gcry_mpi_addm( gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, gcry_mpi_t m)
 {
+  gcry_mpi_t temp_m = NULL;
+
+  if (w == m)
+    {
+      temp_m = mpi_copy (m);
+      m = temp_m;
+    }
+
   mpi_add (w, u, v);
   mpi_mod (w, w, m);
+
+  if (temp_m)
+    mpi_free(temp_m);
 }
 
 void
 _gcry_mpi_subm( gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, gcry_mpi_t m)
 {
+  gcry_mpi_t temp_m = NULL;
+
+  if (w == m)
+    {
+      temp_m = mpi_copy (m);
+      m = temp_m;
+    }
+
   mpi_sub (w, u, v);
   mpi_mod (w, w, m);
+
+  if (temp_m)
+    mpi_free(temp_m);
 }
index e2170401e41b3fddb5dc5932844aa6da7eef8b6f..9cd408188d79ac556f18f14e17121bf01b425a8b 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
@@ -251,10 +251,11 @@ _gcry_mpi_rshift_limbs( gcry_mpi_t a, unsigned int count )
 void
 _gcry_mpi_rshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
 {
-  mpi_size_t xsize;
-  unsigned int i;
   unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
   unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+  unsigned int i;
+  mpi_size_t alimbs;
+  mpi_ptr_t xp, ap;
 
   if (mpi_is_immutable (x))
     {
@@ -262,75 +263,42 @@ _gcry_mpi_rshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
       return;
     }
 
-  if ( x == a )
-    {
-      /* In-place operation.  */
-      if ( nlimbs >= x->nlimbs )
-        {
-          x->nlimbs = 0;
-          return;
-        }
+  alimbs = a->nlimbs;
 
-      if (nlimbs)
-        {
-          for (i=0; i < x->nlimbs - nlimbs; i++ )
-            x->d[i] = x->d[i+nlimbs];
-          x->d[i] = 0;
-          x->nlimbs -= nlimbs;
-
-        }
-      if ( x->nlimbs && nbits )
-        _gcry_mpih_rshift ( x->d, x->d, x->nlimbs, nbits );
-    }
-  else if ( nlimbs )
+  if (x != a)
     {
-      /* Copy and shift by more or equal bits than in a limb. */
-      xsize = a->nlimbs;
+      RESIZE_IF_NEEDED (x, alimbs);
+      x->nlimbs = alimbs;
+      x->flags = a->flags;
       x->sign = a->sign;
-      RESIZE_IF_NEEDED (x, xsize);
-      x->nlimbs = xsize;
-      for (i=0; i < a->nlimbs; i++ )
-        x->d[i] = a->d[i];
-      x->nlimbs = i;
-
-      if ( nlimbs >= x->nlimbs )
-        {
-          x->nlimbs = 0;
-          return;
-        }
+    }
+
+  /* In-place operation.  */
+  if (nlimbs >= alimbs)
+    {
+      x->nlimbs = 0;
+      return;
+    }
+
+  xp = x->d;
+  ap = a->d;
 
+  if (alimbs && nbits)
+    {
+      _gcry_mpih_rshift (xp, ap + nlimbs, alimbs - nlimbs, nbits);
       if (nlimbs)
-        {
-          for (i=0; i < x->nlimbs - nlimbs; i++ )
-            x->d[i] = x->d[i+nlimbs];
-          x->d[i] = 0;
-          x->nlimbs -= nlimbs;
-        }
-
-      if ( x->nlimbs && nbits )
-        _gcry_mpih_rshift ( x->d, x->d, x->nlimbs, nbits );
+       xp[alimbs - nlimbs] = 0;
+      x->nlimbs -= nlimbs;
     }
-  else
+  else if (nlimbs || (x != a))
     {
-      /* Copy and shift by less than bits in a limb.  */
-      xsize = a->nlimbs;
-      x->sign = a->sign;
-      RESIZE_IF_NEEDED (x, xsize);
-      x->nlimbs = xsize;
-
-      if ( xsize )
-        {
-          if (nbits )
-            _gcry_mpih_rshift (x->d, a->d, x->nlimbs, nbits );
-          else
-            {
-              /* The rshift helper function is not specified for
-                 NBITS==0, thus we do a plain copy here. */
-              for (i=0; i < x->nlimbs; i++ )
-                x->d[i] = a->d[i];
-            }
-        }
+      for (i = 0; i < alimbs - nlimbs; i++ )
+       xp[i] = ap[i + nlimbs];
+      if (nlimbs)
+       xp[i] = 0;
+      x->nlimbs -= nlimbs;
     }
+
   MPN_NORMALIZE (x->d, x->nlimbs);
 }
 
@@ -368,6 +336,9 @@ _gcry_mpi_lshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
 {
   unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
   unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+  mpi_size_t alimbs;
+  mpi_ptr_t xp, ap;
+  int i;
 
   if (mpi_is_immutable (x))
     {
@@ -378,34 +349,27 @@ _gcry_mpi_lshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
   if (x == a && !n)
     return;  /* In-place shift with an amount of zero.  */
 
-  if ( x != a )
-    {
-      /* Copy A to X.  */
-      unsigned int alimbs = a->nlimbs;
-      int asign  = a->sign;
-      mpi_ptr_t xp, ap;
-
-      RESIZE_IF_NEEDED (x, alimbs+nlimbs+1);
-      xp = x->d;
-      ap = a->d;
-      MPN_COPY (xp, ap, alimbs);
-      x->nlimbs = alimbs;
-      x->flags = a->flags;
-      x->sign = asign;
-    }
+  /* Note: might be in-place operation, so a==x or a!=x. */
+
+  alimbs = a->nlimbs;
 
-  if (nlimbs && !nbits)
+  RESIZE_IF_NEEDED (x, alimbs + nlimbs + 1);
+  xp = x->d;
+  ap = a->d;
+  if (nbits && alimbs)
     {
-      /* Shift a full number of limbs.  */
-      _gcry_mpi_lshift_limbs (x, nlimbs);
+      x->nlimbs = alimbs + nlimbs + 1;
+      xp[alimbs + nlimbs] = _gcry_mpih_lshift (xp + nlimbs, ap, alimbs, nbits);
     }
-  else if (n)
+  else
     {
-      /* We use a very dump approach: Shift left by the number of
-         limbs plus one and than fix it up by an rshift.  */
-      _gcry_mpi_lshift_limbs (x, nlimbs+1);
-      mpi_rshift (x, x, BITS_PER_MPI_LIMB - nbits);
+      x->nlimbs = alimbs + nlimbs;
+      for (i = alimbs - 1; i >= 0; i--)
+       xp[i + nlimbs] = ap[i];
     }
-
+  for (i = 0; i < nlimbs; i++)
+    xp[i] = 0;
+  x->flags = a->flags;
+  x->sign = a->sign;
   MPN_NORMALIZE (x->d, x->nlimbs);
 }
index 8927fa0ecb0b82213c2a5c0a42dc993737bfeb9e..bf2c338f1f55570d104538dce50e9f274437b51f 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 166ab87519bdc5076bc992e95e301757bafa3c83..8d2a85738435bdff9ba9b018f4c550ddd653a753 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 77ca05a6fc03579a9131012b10830e08da15a892..9cf3062ee43f4510d2320ee445e00504c83f562d 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 39e22224796c4e5561384b68fdee359915b81f8e..c6c66f3fa5f8d991d8bf6464e8da535604afe08f 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 94e2aec8a1bd77a61e69300dcff7c72695a55cb8..090e8a94a59bcde9617acd65fbd36657d5e6f831 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 58dc503a83152bb6c8a0d534bec6e2bccbb90d15..935bf3e1123e76bf7c967ae3de80356ba858e5c5 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -140,7 +140,7 @@ typedef int mpi_size_t;        /* (must be a signed type) */
            mul_n_basecase (prodp, up, vp, size);       \
        else                                            \
            mul_n (prodp, up, vp, size, tspace);        \
-    } while (0);
+    } while (0)
 
 
 /* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
index 88624720c21c83bb0fc069048e48e2a9fbd5359e..92448a739a43ea27cd864728bece47565fd7aef5 100644 (file)
@@ -1,23 +1,23 @@
 /* mpi-mod.c -  Modular reduction
  Copyright (C) 1998, 1999, 2001, 2002, 2003,
                2007  Free Software Foundation, Inc.
-
  This file is part of Libgcrypt.
-
  Libgcrypt is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as
  published by the Free Software Foundation; either version 2.1 of
  the License, or (at your option) any later version.
-
  Libgcrypt is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU Lesser General Public License for more details.
-
  You should have received a copy of the GNU Lesser General Public
-   License along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  USA.  */
* Copyright (C) 1998, 1999, 2001, 2002, 2003,
*               2007  Free Software Foundation, Inc.
+ *
* This file is part of Libgcrypt.
+ *
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
+ *
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser General Public License for more details.
+ *
* You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
 
 
 #include <config.h>
index 43bd641fb5c1c52749a3a828b29383e8abee78e4..bbd102e3114ad06e6db0ab78cf361e02e5228e28 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 4f4d7096a7836cd05d10206ee036887a88ead7e1..e8e574750301d19718afb44020f1ccea786110a9 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -207,6 +207,17 @@ _gcry_mpi_mul (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v)
 void
 _gcry_mpi_mulm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, gcry_mpi_t m)
 {
+  gcry_mpi_t temp_m = NULL;
+
+  if (w == m)
+    {
+      temp_m = mpi_copy (m);
+      m = temp_m;
+    }
+
   mpi_mul (w, u, v);
   _gcry_mpi_tdiv_r (w, w, m);
+
+  if (temp_m)
+    mpi_free(temp_m);
 }
index 62b4a808302f532040cc2e52f560136aa754ba93..defd675ea5a228f5a1a3f9759b440abd22947a18 100644 (file)
@@ -545,7 +545,7 @@ _gcry_mpi_powm (gcry_mpi_t res,
   {
     mpi_size_t i, j, k;
     mpi_ptr_t xp;
-    mpi_size_t xsize;
+    mpi_size_t xsize = 0;
     int c;
     mpi_limb_t e;
     mpi_limb_t carry_limb;
index e27f7faa991a043b7ca124006328fcfd0d67a8c5..a93bd607f0adfc1f638579abe6efe1c16fb9b24c 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index e54224091d368c073d680388068eb320644de7c3..876e03c248c164252d675a384ed66376eff1e826 100644 (file)
@@ -36,6 +36,7 @@ mpih_ct_limb_greater_than (mpi_limb_t x, mpi_limb_t y)
 {
   mpi_limb_t diff_hi, diff_lo;
   sub_ddmmss (diff_hi, diff_lo, 0, y, 0, x);
+  (void)diff_lo;
   return diff_hi >> (BITS_PER_MPI_LIMB - 1);
 }
 
index 57c1b58487921bfb4e3de93ebc14509b52ab7ec9..0f3849d6459f838bc2e275c79eda75bc1480975f 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 8b6f06a30ae02b032e1b61ed9bd0a98e430053d6..6c51533fc0a894500fc3c32a30866c3c6c4348e5 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -39,7 +39,7 @@
            mul_n_basecase (prodp, up, vp, size);       \
        else                                            \
            mul_n (prodp, up, vp, size, tspace);        \
-    } while (0);
+    } while (0)
 
 #define MPN_SQR_N_RECURSE(prodp, up, size, tspace) \
     do {                                           \
@@ -47,7 +47,7 @@
            _gcry_mpih_sqr_n_basecase (prodp, up, size);         \
        else                                        \
            _gcry_mpih_sqr_n (prodp, up, size, tspace);  \
-    } while (0);
+    } while (0)
 
 
 
index 3a372374f2b87801343c93ee1868ca6608accd75..07cef2573489e5b502b6488d3b7d1233015edefe 100644 (file)
@@ -105,13 +105,6 @@ _gcry_mpi_alloc( unsigned nlimbs )
     return a;
 }
 
-void
-_gcry_mpi_m_check( gcry_mpi_t a )
-{
-    _gcry_check_heap(a);
-    _gcry_check_heap(a->d);
-}
-
 gcry_mpi_t
 _gcry_mpi_alloc_secure( unsigned nlimbs )
 {
index 8ade19643ee2a2070fafa51e8c859d0ae8aa7ed0..9bf6f728bb9a128aaa16fe285732b329ba516197 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 0624202725b6bbb2f68d0a2701274f7d9c022ddb..d0ceb8b1af67f696330c1661edb44484a96119b2 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 876b56c6644faa033b2247e45d01dc590b8b62ac..b992f15823c33aa91f9bbb7238a19ed6185a14d3 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index d9e42daf817d770ac682c21f0667e74d12bd0571..1fcd1891caa7ce86cac41d18404a42571db102e7 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index 35034fa4083dc7b410e5ccd03ad37c277232039f..5876443c67f6b062744ce37f5836a5ce5522546e 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index d056e8f3c22b251f8480f2ca5c828286cae5c011..5b91101e537946662a4fe41ca47ce176537993e3 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index 8bc317b763e21e4ec518fb59570d44b20ff3fbd7..8d54e84e47768af28d23972f6d8e1961679e8f94 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index f131a86d7b78e3fce4b2df35200c5f1ab7aaa85e..aca51abf1a8ff4c8eb391151cfd35286ee616498 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index 02748fc55668896d9b7964a54cfa76f7761a3249..03a8d9da607759bd46ec021b8b4ba8e342d0da3f 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
index 1661f5e679e4d9a30ec7f4377d8364721aad998d..cd5929433bd6d39474bfcf292881d02399bf358f 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
@@ -78,8 +78,9 @@ Lend: stw     7,4(3)          # store ultimate result limb
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2.1-or-later
+*/
 
 #include "sysdep.h"
 #include "asm-syntax.h"
index 6231095dc21181da37a3e05957be683ba1e6c040..0f66a4030e7af42447967e447336ec0cbe1a55cb 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
@@ -90,8 +90,8 @@ Lend2:        slw     0,10,6
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
+   see <https://www.gnu.org/licenses/>.
+*/
 
 /* mp_limb_t mpn_lshift (mp_ptr wp, mp_srcptr up, mp_size_t usize,
                        unsigned int cnt)  */
index bd418f7e3a50e6b40c12e4f4db58abb241a1715c..75ff66af6389b53fae4a96382d361d88d89d0c9d 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
@@ -88,9 +88,9 @@ Lend: stw     7,4(3)
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
-
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2.1-or-later
+*/
 
 /* mp_limb_t mpn_mul_1 (mp_ptr res_ptr, mp_srcptr s1_ptr,
                        mp_size_t s1_size, mp_limb_t s2_limb)
index 1d97b81a4d7e27946d96f7e5c93f234052848a4a..bb3c81d5d6ff66042160c2fee49d2a6bb2cff68d 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
@@ -92,9 +92,9 @@ Lend: stw     8,4(3)
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
-
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2.1-or-later
+*/
 
 /* mp_limb_t mpn_addmul_1 (mp_ptr res_ptr, mp_srcptr s1_ptr,
                           mp_size_t s1_size, mp_limb_t s2_limb)
index c410dbb02ed4150116f0b6377d5fe14b1110ba3d..196fca1cb6ecec3258bd40b13841ea62ee1e1a52 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
@@ -93,8 +93,9 @@ Lend: stw     8,4(3)
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2.1-or-later
+*/
 
 /* mp_limb_t mpn_submul_1 (mp_ptr res_ptr, mp_srcptr s1_ptr,
                           mp_size_t s1_size, mp_limb_t s2_limb)
index 98349edb5b3a52f68b816a5f6a456e5b0c1ce36b..f6702ad1744a64f9888ade67a1616ba346f07d4d 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
@@ -89,9 +89,9 @@ Lend2:        srw     0,10,6
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
-
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2.1-or-later
+*/
 
 /* INPUT PARAMETERS
    res_ptr     r3
index d612ea890a8ec55761c742519f6a9ea63b851c89..42672423b14348288aa203619292b450559c3959 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include "sysdep.h"
@@ -80,8 +80,9 @@ Lend: stw     7,4(3)          # store ultimate result limb
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2.1-or-later
+*/
 
 /* mp_limb_t mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr,
                        mp_size_t size)
index 5d4af9f0ae3b8d11ed8ccbf964437709446e661d..5f4e6cf6cd84a3a92e79ebb92ac703d7df433987 100644 (file)
@@ -16,8 +16,9 @@
 
    You should have received a copy of the GNU Library General Public
    License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA. */
+   see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: LGPL-2-or-later
+*/
 
 
 #define USE_PPC_PATCHES 1
index 61a80ca320b773c601fdf496f5a962288e3d46fe..d3488f514d724df7ba105c855547577d5f95efad 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index 3422ab04e531e93576b7b17d26cf3581ceca9e2d..4d544ede222bbd828177e17b99e3f0e2d290cbff 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 
index cd3db41df3aa49be2df874d2e048659c12fd0875..a8552d79c87306dc1c27729e369a81d2256fd5cb 100644 (file)
@@ -16,8 +16,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 ! INPUT PARAMETERS
index 006b5c125c8f422568489538b2af31199da65285..04841fffc52d9e07bd3596dfc8f6189a7e64b91e 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 03fcddab0e76cb47f5c395ccf261c57feff1d3bc..162416b719f2554eeac919a508d6ba6b25e7f1b4 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 6f5cc436a7c90e8e0c21679ced5267421a108ed4..a9409836303e9f5063e89ef62edaa3606384bcbe 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 93bb19433daf4a11651b6838530d754e0e930976..467484962f7e0b801910f0f5dc9da176ad36d3e5 100644 (file)
@@ -17,8 +17,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 79e506a11f8caae4b089492ff7126c4e0e1b0a26..2582e358aa2f4a5a0b62d8d43ac281c209b0175c 100644 (file)
@@ -18,8 +18,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
index 0c935a05952b5f1d91ef5992ec090912a77b03d2..c7100ef8b896f321c9f85249184a8e4a2ca38a1e 100644 (file)
@@ -56,7 +56,7 @@ jitterentropy-base.c jitterentropy.h jitterentropy-base-user.h
 
 # The rndjent module needs to be compiled without optimization.  */
 if ENABLE_O_FLAG_MUNGING
-o_flag_munging = sed -e 's/-O\([1-9sgz][1-9sgz]*\)/-O0/g' -e 's/-Ofast/-O0/g'
+o_flag_munging = sed -e 's/[[:blank:]]-O\([1-9sgz][1-9sgz]*\)/ -O0 /' -e 's/[[:blank:]]-Ofast/ -O0 /g'
 else
 o_flag_munging = cat
 endif
index 5bdb38f707a3b49ca9d797b60c9fb86c116a1cb2..b72a4cb371cf251c3424912a366e694ef5754648 100644 (file)
@@ -114,8 +114,8 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
@@ -304,9 +304,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -410,7 +407,7 @@ jitterentropy-base.c jitterentropy.h jitterentropy-base-user.h
 @ENABLE_O_FLAG_MUNGING_FALSE@o_flag_munging = cat
 
 # The rndjent module needs to be compiled without optimization.  */
-@ENABLE_O_FLAG_MUNGING_TRUE@o_flag_munging = sed -e 's/-O\([1-9sgz][1-9sgz]*\)/-O0/g' -e 's/-Ofast/-O0/g'
+@ENABLE_O_FLAG_MUNGING_TRUE@o_flag_munging = sed -e 's/[[:blank:]]-O\([1-9sgz][1-9sgz]*\)/ -O0 /' -e 's/[[:blank:]]-Ofast/ -O0 /g'
 all: all-am
 
 .SUFFIXES:
index 2d2b8909b20d2a175b601d0e9351549086ab69d2..3d7a582ff4fbee43cebafb2f689fab5673f750bd 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 85d11789391318cb16803a5c6e0e918ff1a4b398..4f34acc0cc08eeb80b7ddbfcd87cadecd8db9ff1 100644 (file)
@@ -1131,7 +1131,7 @@ add_randomness (const void *buffer, size_t length, enum random_origins origin)
 
 
 static void
-random_poll()
+random_poll (void)
 {
   rndstats.slowpolls++;
   read_random_source (RANDOM_ORIGIN_SLOWPOLL, POOLSIZE/5, GCRY_STRONG_RANDOM);
index af49a5a53e7b4edc91b1f34f87965efa13281fae..cad364abf5dd0b57ebe80550bee0ebf75c88ebf0 100644 (file)
@@ -291,7 +291,7 @@ struct drbg_state_ops_s
 struct drbg_test_data_s
 {
   drbg_string_t *testentropy;  /* TEST PARAMETER: test entropy */
-  int fail_seed_source:1;      /* If set, the seed function will
+  unsigned int fail_seed_source:1; /* If set, the seed function will
                                  * return an error. */
 };
 
@@ -308,8 +308,8 @@ struct drbg_state_s
                                 * operation -- allocated during init */
   void *priv_data;             /* Cipher handle */
   gcry_cipher_hd_t ctr_handle; /* CTR mode cipher handle */
-  int seeded:1;                        /* DRBG fully seeded? */
-  int pr:1;                    /* Prediction resistance enabled? */
+  unsigned int seeded:1;       /* DRBG fully seeded? */
+  unsigned int pr:1;           /* Prediction resistance enabled? */
   /* Taken from libgcrypt ANSI X9.31 DRNG: We need to keep track of the
    * process which did the initialization so that we can detect a fork.
    * The volatile modifier is required so that the compiler does not
index eedf1d0fcb9819c571773b6d4f68da60df421e0e..dee1f9c6189616cdbd1f97ef267e429b4b4b7685 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 #ifndef G10_RANDOM_H
 #define G10_RANDOM_H
index b87115f2342ae6be4102d314ba790d3c32260e96..2645695669f5170a6ecd2721856ba4ccd8c7d628 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index b3f63d2072491cfa991582cb6772bfbb14dac2ad..fd979ab97c6197d3279c980f9d5d79c38b92f403 100644 (file)
@@ -264,16 +264,16 @@ init_system_rng (void)
   if (!hAdvAPI32)
     return;
 
-  pCryptAcquireContext = (CRYPTACQUIRECONTEXT)
+  pCryptAcquireContext = (CRYPTACQUIRECONTEXT)(void *)
     GetProcAddress (hAdvAPI32, "CryptAcquireContextA");
-  pCryptGenRandom = (CRYPTGENRANDOM)
+  pCryptGenRandom = (CRYPTGENRANDOM)(void *)
     GetProcAddress (hAdvAPI32, "CryptGenRandom");
-  pCryptReleaseContext = (CRYPTRELEASECONTEXT)
+  pCryptReleaseContext = (CRYPTRELEASECONTEXT)(void *)
     GetProcAddress (hAdvAPI32, "CryptReleaseContext");
 
   /* Get a pointer to the native randomness function if it's available.
      This isn't exported by name, so we have to get it by ordinal.  */
-  pRtlGenRandom = (RTLGENRANDOM)
+  pRtlGenRandom = (RTLGENRANDOM)(void *)
     GetProcAddress (hAdvAPI32, "SystemFunction036");
 
   /* Try and connect to the PIII RNG CSP.  The AMD 768 southbridge (from
@@ -536,11 +536,11 @@ slow_gatherer ( void (*add)(const void*, size_t, enum random_origins),
         {
           if (debug_me)
             log_debug ("rndw32#slow_gatherer: netapi32 loaded\n" );
-          pNetStatisticsGet = (NETSTATISTICSGET)
+          pNetStatisticsGet = (NETSTATISTICSGET)(void *)
             GetProcAddress (hNetAPI32, "NetStatisticsGet");
-          pNetApiBufferSize = (NETAPIBUFFERSIZE)
+          pNetApiBufferSize = (NETAPIBUFFERSIZE)(void *)
             GetProcAddress (hNetAPI32, "NetApiBufferSize");
-          pNetApiBufferFree = (NETAPIBUFFERFREE)
+          pNetApiBufferFree = (NETAPIBUFFERFREE)(void *)
             GetProcAddress (hNetAPI32, "NetApiBufferFree");
 
           if (!pNetStatisticsGet || !pNetApiBufferSize || !pNetApiBufferFree)
@@ -556,11 +556,11 @@ slow_gatherer ( void (*add)(const void*, size_t, enum random_origins),
       if (hNTAPI)
         {
           /* Get a pointer to the NT native information query functions */
-          pNtQuerySystemInformation = (NTQUERYSYSTEMINFORMATION)
+          pNtQuerySystemInformation = (NTQUERYSYSTEMINFORMATION)(void *)
             GetProcAddress (hNTAPI, "NtQuerySystemInformation");
-          pNtQueryInformationProcess = (NTQUERYINFORMATIONPROCESS)
+          pNtQueryInformationProcess = (NTQUERYINFORMATIONPROCESS)(void *)
             GetProcAddress (hNTAPI, "NtQueryInformationProcess");
-          pNtPowerInformation = (NTPOWERINFORMATION)
+          pNtPowerInformation = (NTPOWERINFORMATION)(void *)
             GetProcAddress(hNTAPI, "NtPowerInformation");
 
           if (!pNtQuerySystemInformation || !pNtQueryInformationProcess)
@@ -624,8 +624,15 @@ slow_gatherer ( void (*add)(const void*, size_t, enum random_origins),
         }
       else
         {
-          log_info ("NOTE: you should run 'diskperf -y' "
-                    "to enable the disk statistics\n");
+          DWORD err_code = GetLastError ();
+
+          if (err_code == 0x32)
+            /* If it's ERROR_NOT_SUPPORTED, skip the message, as it
+               won't work.  */
+            ;
+          else
+            log_info ("NOTE: you should run 'diskperf -y' "
+                      "to enable the disk statistics\n");
         }
       CloseHandle (hDevice);
     }
index ea265fc244789c723ecfa289ca1af08a63e55ab0..f6191bc8ed39c089b378002370e4868dae4fac43 100644 (file)
@@ -26,17 +26,17 @@ pkgconfig_DATA = libgcrypt.pc
 EXTRA_DIST = libgcrypt-config.in libgcrypt.m4 libgcrypt.vers \
              gcrypt.h.in libgcrypt.def libgcrypt.pc.in gen-note-integrity.sh
 
+if USE_GPGRT_CONFIG
+noinst_SCRIPTS = libgcrypt-config
+else
 bin_SCRIPTS = libgcrypt-config
+endif
 m4datadir = $(datadir)/aclocal
 m4data_DATA = libgcrypt.m4
 nodist_include_HEADERS = gcrypt.h
 
 lib_LTLIBRARIES = libgcrypt.la
 bin_PROGRAMS = dumpsexp hmac256 mpicalc
-if ENABLE_RANDOM_DAEMON
-sbin_PROGRAMS = gcryptrnd
-bin_PROGRAMS += getrandom
-endif ENABLE_RANDOM_DAEMON
 
 # Depending on the architecture some targets require libgpg-error.
 if HAVE_W32CE_SYSTEM
@@ -142,14 +142,6 @@ hmac256_CFLAGS = -DSTANDALONE @DEF_HMAC_BINARY_CHECK@ \
        $(arch_gpg_error_cflags)
 hmac256_LDADD = $(arch_gpg_error_libs)
 
-if ENABLE_RANDOM_DAEMON
-gcryptrnd_SOURCES = gcryptrnd.c
-gcryptrnd_CFLAGS = $(GPG_ERROR_CFLAGS) $(PTH_CFLAGS)
-gcryptrnd_LDADD = libgcrypt.la $(GPG_ERROR_LIBS) $(PTH_LIBS)
-
-getrandom_SOURCES = getrandom.c
-endif ENABLE_RANDOM_DAEMON
-
 CLEANFILES = libgcrypt.la.done
 if USE_HMAC_BINARY_CHECK
 CLEANFILES += libgcrypt.so.hmac
index 2db6b3c57a3476458d90be88760e1e33e48fcfd2..48f08f03d03b69cef278dc089190e4b3905c9653 100644 (file)
@@ -111,19 +111,16 @@ PRE_UNINSTALL = :
 POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
-bin_PROGRAMS = dumpsexp$(EXEEXT) hmac256$(EXEEXT) mpicalc$(EXEEXT) \
-       $(am__EXEEXT_1)
-@ENABLE_RANDOM_DAEMON_TRUE@sbin_PROGRAMS = gcryptrnd$(EXEEXT)
-@ENABLE_RANDOM_DAEMON_TRUE@am__append_1 = getrandom
-@USE_HMAC_BINARY_CHECK_TRUE@am__append_2 = libgcrypt.so.hmac
+bin_PROGRAMS = dumpsexp$(EXEEXT) hmac256$(EXEEXT) mpicalc$(EXEEXT)
+@USE_HMAC_BINARY_CHECK_TRUE@am__append_1 = libgcrypt.so.hmac
 subdir = src
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
@@ -132,12 +129,10 @@ CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES = gcrypt.h libgcrypt-config libgcrypt.pc \
        versioninfo.rc
 CONFIG_CLEAN_VPATH_FILES =
-@ENABLE_RANDOM_DAEMON_TRUE@am__EXEEXT_1 = getrandom$(EXEEXT)
-am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" \
-       "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" \
-       "$(DESTDIR)$(m4datadir)" "$(DESTDIR)$(pkgconfigdir)" \
-       "$(DESTDIR)$(includedir)"
-PROGRAMS = $(bin_PROGRAMS) $(sbin_PROGRAMS)
+am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(libdir)" \
+       "$(DESTDIR)$(bindir)" "$(DESTDIR)$(m4datadir)" \
+       "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(includedir)"
+PROGRAMS = $(bin_PROGRAMS)
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
 am__vpath_adj = case $$p in \
     $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
@@ -189,20 +184,6 @@ dumpsexp_DEPENDENCIES = $(am__DEPENDENCIES_3)
 dumpsexp_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
        $(LIBTOOLFLAGS) --mode=link $(CCLD) $(dumpsexp_CFLAGS) \
        $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-am__gcryptrnd_SOURCES_DIST = gcryptrnd.c
-@ENABLE_RANDOM_DAEMON_TRUE@am_gcryptrnd_OBJECTS =  \
-@ENABLE_RANDOM_DAEMON_TRUE@    gcryptrnd-gcryptrnd.$(OBJEXT)
-gcryptrnd_OBJECTS = $(am_gcryptrnd_OBJECTS)
-@ENABLE_RANDOM_DAEMON_TRUE@gcryptrnd_DEPENDENCIES = libgcrypt.la \
-@ENABLE_RANDOM_DAEMON_TRUE@    $(am__DEPENDENCIES_2) \
-@ENABLE_RANDOM_DAEMON_TRUE@    $(am__DEPENDENCIES_2)
-gcryptrnd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-       $(LIBTOOLFLAGS) --mode=link $(CCLD) $(gcryptrnd_CFLAGS) \
-       $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-am__getrandom_SOURCES_DIST = getrandom.c
-@ENABLE_RANDOM_DAEMON_TRUE@am_getrandom_OBJECTS = getrandom.$(OBJEXT)
-getrandom_OBJECTS = $(am_getrandom_OBJECTS)
-getrandom_LDADD = $(LDADD)
 am_hmac256_OBJECTS = hmac256-hmac256.$(OBJEXT)
 hmac256_OBJECTS = $(am_hmac256_OBJECTS)
 hmac256_DEPENDENCIES = $(am__DEPENDENCIES_3)
@@ -215,7 +196,7 @@ mpicalc_DEPENDENCIES = libgcrypt.la $(am__DEPENDENCIES_2)
 mpicalc_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
        $(LIBTOOLFLAGS) --mode=link $(CCLD) $(mpicalc_CFLAGS) \
        $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-SCRIPTS = $(bin_SCRIPTS)
+SCRIPTS = $(bin_SCRIPTS) $(noinst_SCRIPTS)
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -232,7 +213,6 @@ DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/build-aux/depcomp
 am__maybe_remake_depfiles = depfiles
 am__depfiles_remade = ./$(DEPDIR)/dumpsexp-dumpsexp.Po \
-       ./$(DEPDIR)/gcryptrnd-gcryptrnd.Po ./$(DEPDIR)/getrandom.Po \
        ./$(DEPDIR)/hmac256-hmac256.Po \
        ./$(DEPDIR)/libgcrypt_la-const-time.Plo \
        ./$(DEPDIR)/libgcrypt_la-context.Plo \
@@ -270,12 +250,9 @@ am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
 am__v_CCLD_0 = @echo "  CCLD    " $@;
 am__v_CCLD_1 = 
 SOURCES = $(libgcrypt_la_SOURCES) $(EXTRA_libgcrypt_la_SOURCES) \
-       $(dumpsexp_SOURCES) $(gcryptrnd_SOURCES) $(getrandom_SOURCES) \
-       $(hmac256_SOURCES) $(mpicalc_SOURCES)
+       $(dumpsexp_SOURCES) $(hmac256_SOURCES) $(mpicalc_SOURCES)
 DIST_SOURCES = $(libgcrypt_la_SOURCES) $(EXTRA_libgcrypt_la_SOURCES) \
-       $(dumpsexp_SOURCES) $(am__gcryptrnd_SOURCES_DIST) \
-       $(am__getrandom_SOURCES_DIST) $(hmac256_SOURCES) \
-       $(mpicalc_SOURCES)
+       $(dumpsexp_SOURCES) $(hmac256_SOURCES) $(mpicalc_SOURCES)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -403,9 +380,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -478,7 +452,8 @@ pkgconfig_DATA = libgcrypt.pc
 EXTRA_DIST = libgcrypt-config.in libgcrypt.m4 libgcrypt.vers \
              gcrypt.h.in libgcrypt.def libgcrypt.pc.in gen-note-integrity.sh
 
-bin_SCRIPTS = libgcrypt-config
+@USE_GPGRT_CONFIG_TRUE@noinst_SCRIPTS = libgcrypt-config
+@USE_GPGRT_CONFIG_FALSE@bin_SCRIPTS = libgcrypt-config
 m4datadir = $(datadir)/aclocal
 m4data_DATA = libgcrypt.m4
 nodist_include_HEADERS = gcrypt.h
@@ -552,11 +527,7 @@ hmac256_CFLAGS = -DSTANDALONE @DEF_HMAC_BINARY_CHECK@ \
        $(arch_gpg_error_cflags)
 
 hmac256_LDADD = $(arch_gpg_error_libs)
-@ENABLE_RANDOM_DAEMON_TRUE@gcryptrnd_SOURCES = gcryptrnd.c
-@ENABLE_RANDOM_DAEMON_TRUE@gcryptrnd_CFLAGS = $(GPG_ERROR_CFLAGS) $(PTH_CFLAGS)
-@ENABLE_RANDOM_DAEMON_TRUE@gcryptrnd_LDADD = libgcrypt.la $(GPG_ERROR_LIBS) $(PTH_LIBS)
-@ENABLE_RANDOM_DAEMON_TRUE@getrandom_SOURCES = getrandom.c
-CLEANFILES = libgcrypt.la.done $(am__append_2)
+CLEANFILES = libgcrypt.la.done $(am__append_1)
 all: all-am
 
 .SUFFIXES:
@@ -647,55 +618,6 @@ clean-binPROGRAMS:
        list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
        echo " rm -f" $$list; \
        rm -f $$list
-install-sbinPROGRAMS: $(sbin_PROGRAMS)
-       @$(NORMAL_INSTALL)
-       @list='$(sbin_PROGRAMS)'; test -n "$(sbindir)" || list=; \
-       if test -n "$$list"; then \
-         echo " $(MKDIR_P) '$(DESTDIR)$(sbindir)'"; \
-         $(MKDIR_P) "$(DESTDIR)$(sbindir)" || exit 1; \
-       fi; \
-       for p in $$list; do echo "$$p $$p"; done | \
-       sed 's/$(EXEEXT)$$//' | \
-       while read p p1; do if test -f $$p \
-        || test -f $$p1 \
-         ; then echo "$$p"; echo "$$p"; else :; fi; \
-       done | \
-       sed -e 'p;s,.*/,,;n;h' \
-           -e 's|.*|.|' \
-           -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \
-       sed 'N;N;N;s,\n, ,g' | \
-       $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \
-         { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \
-           if ($$2 == $$4) files[d] = files[d] " " $$1; \
-           else { print "f", $$3 "/" $$4, $$1; } } \
-         END { for (d in files) print "f", d, files[d] }' | \
-       while read type dir files; do \
-           if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \
-           test -z "$$files" || { \
-           echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(sbindir)$$dir'"; \
-           $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(sbindir)$$dir" || exit $$?; \
-           } \
-       ; done
-
-uninstall-sbinPROGRAMS:
-       @$(NORMAL_UNINSTALL)
-       @list='$(sbin_PROGRAMS)'; test -n "$(sbindir)" || list=; \
-       files=`for p in $$list; do echo "$$p"; done | \
-         sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \
-             -e 's/$$/$(EXEEXT)/' \
-       `; \
-       test -n "$$list" || exit 0; \
-       echo " ( cd '$(DESTDIR)$(sbindir)' && rm -f" $$files ")"; \
-       cd "$(DESTDIR)$(sbindir)" && rm -f $$files
-
-clean-sbinPROGRAMS:
-       @list='$(sbin_PROGRAMS)'; test -n "$$list" || exit 0; \
-       echo " rm -f" $$list; \
-       rm -f $$list || exit $$?; \
-       test -n "$(EXEEXT)" || exit 0; \
-       list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
-       echo " rm -f" $$list; \
-       rm -f $$list
 
 install-libLTLIBRARIES: $(lib_LTLIBRARIES)
        @$(NORMAL_INSTALL)
@@ -739,14 +661,6 @@ dumpsexp$(EXEEXT): $(dumpsexp_OBJECTS) $(dumpsexp_DEPENDENCIES) $(EXTRA_dumpsexp
        @rm -f dumpsexp$(EXEEXT)
        $(AM_V_CCLD)$(dumpsexp_LINK) $(dumpsexp_OBJECTS) $(dumpsexp_LDADD) $(LIBS)
 
-gcryptrnd$(EXEEXT): $(gcryptrnd_OBJECTS) $(gcryptrnd_DEPENDENCIES) $(EXTRA_gcryptrnd_DEPENDENCIES) 
-       @rm -f gcryptrnd$(EXEEXT)
-       $(AM_V_CCLD)$(gcryptrnd_LINK) $(gcryptrnd_OBJECTS) $(gcryptrnd_LDADD) $(LIBS)
-
-getrandom$(EXEEXT): $(getrandom_OBJECTS) $(getrandom_DEPENDENCIES) $(EXTRA_getrandom_DEPENDENCIES) 
-       @rm -f getrandom$(EXEEXT)
-       $(AM_V_CCLD)$(LINK) $(getrandom_OBJECTS) $(getrandom_LDADD) $(LIBS)
-
 hmac256$(EXEEXT): $(hmac256_OBJECTS) $(hmac256_DEPENDENCIES) $(EXTRA_hmac256_DEPENDENCIES) 
        @rm -f hmac256$(EXEEXT)
        $(AM_V_CCLD)$(hmac256_LINK) $(hmac256_OBJECTS) $(hmac256_LDADD) $(LIBS)
@@ -797,8 +711,6 @@ distclean-compile:
        -rm -f *.tab.c
 
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dumpsexp-dumpsexp.Po@am__quote@ # am--include-marker
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gcryptrnd-gcryptrnd.Po@am__quote@ # am--include-marker
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getrandom.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hmac256-hmac256.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgcrypt_la-const-time.Plo@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgcrypt_la-context.Plo@am__quote@ # am--include-marker
@@ -963,20 +875,6 @@ dumpsexp-dumpsexp.obj: dumpsexp.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dumpsexp_CFLAGS) $(CFLAGS) -c -o dumpsexp-dumpsexp.obj `if test -f 'dumpsexp.c'; then $(CYGPATH_W) 'dumpsexp.c'; else $(CYGPATH_W) '$(srcdir)/dumpsexp.c'; fi`
 
-gcryptrnd-gcryptrnd.o: gcryptrnd.c
-@am__fastdepCC_TRUE@   $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gcryptrnd_CFLAGS) $(CFLAGS) -MT gcryptrnd-gcryptrnd.o -MD -MP -MF $(DEPDIR)/gcryptrnd-gcryptrnd.Tpo -c -o gcryptrnd-gcryptrnd.o `test -f 'gcryptrnd.c' || echo '$(srcdir)/'`gcryptrnd.c
-@am__fastdepCC_TRUE@   $(AM_V_at)$(am__mv) $(DEPDIR)/gcryptrnd-gcryptrnd.Tpo $(DEPDIR)/gcryptrnd-gcryptrnd.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@      $(AM_V_CC)source='gcryptrnd.c' object='gcryptrnd-gcryptrnd.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gcryptrnd_CFLAGS) $(CFLAGS) -c -o gcryptrnd-gcryptrnd.o `test -f 'gcryptrnd.c' || echo '$(srcdir)/'`gcryptrnd.c
-
-gcryptrnd-gcryptrnd.obj: gcryptrnd.c
-@am__fastdepCC_TRUE@   $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gcryptrnd_CFLAGS) $(CFLAGS) -MT gcryptrnd-gcryptrnd.obj -MD -MP -MF $(DEPDIR)/gcryptrnd-gcryptrnd.Tpo -c -o gcryptrnd-gcryptrnd.obj `if test -f 'gcryptrnd.c'; then $(CYGPATH_W) 'gcryptrnd.c'; else $(CYGPATH_W) '$(srcdir)/gcryptrnd.c'; fi`
-@am__fastdepCC_TRUE@   $(AM_V_at)$(am__mv) $(DEPDIR)/gcryptrnd-gcryptrnd.Tpo $(DEPDIR)/gcryptrnd-gcryptrnd.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@      $(AM_V_CC)source='gcryptrnd.c' object='gcryptrnd-gcryptrnd.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gcryptrnd_CFLAGS) $(CFLAGS) -c -o gcryptrnd-gcryptrnd.obj `if test -f 'gcryptrnd.c'; then $(CYGPATH_W) 'gcryptrnd.c'; else $(CYGPATH_W) '$(srcdir)/gcryptrnd.c'; fi`
-
 hmac256-hmac256.o: hmac256.c
 @am__fastdepCC_TRUE@   $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hmac256_CFLAGS) $(CFLAGS) -MT hmac256-hmac256.o -MD -MP -MF $(DEPDIR)/hmac256-hmac256.Tpo -c -o hmac256-hmac256.o `test -f 'hmac256.c' || echo '$(srcdir)/'`hmac256.c
 @am__fastdepCC_TRUE@   $(AM_V_at)$(am__mv) $(DEPDIR)/hmac256-hmac256.Tpo $(DEPDIR)/hmac256-hmac256.Po
@@ -1164,10 +1062,8 @@ all-am: Makefile $(PROGRAMS) $(LTLIBRARIES) $(SCRIPTS) $(DATA) \
                $(HEADERS)
 install-binPROGRAMS: install-libLTLIBRARIES
 
-install-sbinPROGRAMS: install-libLTLIBRARIES
-
 installdirs:
-       for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(sbindir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(m4datadir)" "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(includedir)"; do \
+       for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(m4datadir)" "$(DESTDIR)$(pkgconfigdir)" "$(DESTDIR)$(includedir)"; do \
          test -z "$$dir" || $(MKDIR_P) "$$dir"; \
        done
 install: install-am
@@ -1204,12 +1100,10 @@ maintainer-clean-generic:
 clean: clean-am
 
 clean-am: clean-binPROGRAMS clean-generic clean-libLTLIBRARIES \
-       clean-libtool clean-sbinPROGRAMS mostlyclean-am
+       clean-libtool mostlyclean-am
 
 distclean: distclean-am
                -rm -f ./$(DEPDIR)/dumpsexp-dumpsexp.Po
-       -rm -f ./$(DEPDIR)/gcryptrnd-gcryptrnd.Po
-       -rm -f ./$(DEPDIR)/getrandom.Po
        -rm -f ./$(DEPDIR)/hmac256-hmac256.Po
        -rm -f ./$(DEPDIR)/libgcrypt_la-const-time.Plo
        -rm -f ./$(DEPDIR)/libgcrypt_la-context.Plo
@@ -1251,7 +1145,7 @@ install-dvi: install-dvi-am
 install-dvi-am:
 
 install-exec-am: install-binPROGRAMS install-binSCRIPTS \
-       install-libLTLIBRARIES install-sbinPROGRAMS
+       install-libLTLIBRARIES
 
 install-html: install-html-am
 
@@ -1275,8 +1169,6 @@ installcheck-am:
 
 maintainer-clean: maintainer-clean-am
                -rm -f ./$(DEPDIR)/dumpsexp-dumpsexp.Po
-       -rm -f ./$(DEPDIR)/gcryptrnd-gcryptrnd.Po
-       -rm -f ./$(DEPDIR)/getrandom.Po
        -rm -f ./$(DEPDIR)/hmac256-hmac256.Po
        -rm -f ./$(DEPDIR)/libgcrypt_la-const-time.Plo
        -rm -f ./$(DEPDIR)/libgcrypt_la-context.Plo
@@ -1312,31 +1204,29 @@ ps-am:
 
 uninstall-am: uninstall-binPROGRAMS uninstall-binSCRIPTS \
        uninstall-libLTLIBRARIES uninstall-local uninstall-m4dataDATA \
-       uninstall-nodist_includeHEADERS uninstall-pkgconfigDATA \
-       uninstall-sbinPROGRAMS
+       uninstall-nodist_includeHEADERS uninstall-pkgconfigDATA
 
 .MAKE: install-am install-strip
 
 .PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
        clean-binPROGRAMS clean-generic clean-libLTLIBRARIES \
-       clean-libtool clean-sbinPROGRAMS cscopelist-am ctags ctags-am \
-       distclean distclean-compile distclean-generic \
-       distclean-libtool distclean-tags distdir dvi dvi-am html \
-       html-am info info-am install install-am install-binPROGRAMS \
-       install-binSCRIPTS install-data install-data-am \
-       install-data-local install-dvi install-dvi-am install-exec \
-       install-exec-am install-html install-html-am install-info \
-       install-info-am install-libLTLIBRARIES install-m4dataDATA \
-       install-man install-nodist_includeHEADERS install-pdf \
-       install-pdf-am install-pkgconfigDATA install-ps install-ps-am \
-       install-sbinPROGRAMS install-strip installcheck \
-       installcheck-am installdirs maintainer-clean \
+       clean-libtool cscopelist-am ctags ctags-am distclean \
+       distclean-compile distclean-generic distclean-libtool \
+       distclean-tags distdir dvi dvi-am html html-am info info-am \
+       install install-am install-binPROGRAMS install-binSCRIPTS \
+       install-data install-data-am install-data-local install-dvi \
+       install-dvi-am install-exec install-exec-am install-html \
+       install-html-am install-info install-info-am \
+       install-libLTLIBRARIES install-m4dataDATA install-man \
+       install-nodist_includeHEADERS install-pdf install-pdf-am \
+       install-pkgconfigDATA install-ps install-ps-am install-strip \
+       installcheck installcheck-am installdirs maintainer-clean \
        maintainer-clean-generic mostlyclean mostlyclean-compile \
        mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
        tags tags-am uninstall uninstall-am uninstall-binPROGRAMS \
        uninstall-binSCRIPTS uninstall-libLTLIBRARIES uninstall-local \
        uninstall-m4dataDATA uninstall-nodist_includeHEADERS \
-       uninstall-pkgconfigDATA uninstall-sbinPROGRAMS
+       uninstall-pkgconfigDATA
 
 .PRECIOUS: Makefile
 
index 36729165d4c4e0bae3c57a7c34b356f3a3a6ddbe..b23eb9cafc67aa6d3d10c929e00022db00a1ff6b 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -24,9 +24,6 @@
 #define G10_CIPHER_PROTO_H
 
 
-enum pk_encoding;
-
-
 /* Definition of a function used to report selftest failures.
    DOMAIN is a string describing the function block:
           "cipher", "digest", "pubkey or "random",
@@ -219,7 +216,8 @@ typedef void (*gcry_md_final_t) (void *c);
 typedef unsigned char *(*gcry_md_read_t) (void *c);
 
 /* Type for the md_extract function.  */
-typedef void (*gcry_md_extract_t) (void *c, void *outbuf, size_t nbytes);
+typedef gpg_err_code_t (*gcry_md_extract_t) (void *c, void *outbuf,
+                                            size_t nbytes);
 
 /* Type for the md_hash_buffers function. */
 typedef void (*gcry_md_hash_buffers_t) (void *outbuf, size_t nbytes,
index 87f8c4d046abf2590803c3292f04669ea55ae70c..0a2551fe019db07aa7be24a32771c4358da67b20 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 #ifndef G10_CIPHER_H
 #define G10_CIPHER_H
@@ -119,6 +119,8 @@ void _gcry_sha1_hash_buffer (void *outbuf,
                              const void *buffer, size_t length);
 
 /*-- blake2.c --*/
+gcry_err_code_t blake2b_vl_hash (const void *in, size_t inlen,
+                                 size_t outputlen, void *output);
 gcry_err_code_t _gcry_blake2_init_with_key(void *ctx, unsigned int flags,
                                           const unsigned char *key,
                                           size_t keylen, int algo);
@@ -140,6 +142,10 @@ void _gcry_register_pk_ecc_progress (gcry_handler_progress_t cbc,
 void _gcry_register_primegen_progress (gcry_handler_progress_t cb,
                                        void *cb_data);
 
+/*-- keccak.c --*/
+gpg_err_code_t _gcry_cshake_customize (void *context,
+                                       struct gcry_cshake_customization *p);
+
 /*-- pubkey.c --*/
 
 /* Declarations for the cipher specifications.  */
@@ -169,6 +175,9 @@ extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147;
 extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147_mesh;
 extern gcry_cipher_spec_t _gcry_cipher_spec_chacha20;
 extern gcry_cipher_spec_t _gcry_cipher_spec_sm4;
+extern gcry_cipher_spec_t _gcry_cipher_spec_aria128;
+extern gcry_cipher_spec_t _gcry_cipher_spec_aria192;
+extern gcry_cipher_spec_t _gcry_cipher_spec_aria256;
 
 /* Declarations for the digest specifications.  */
 extern const gcry_md_spec_t _gcry_digest_spec_crc32;
@@ -195,6 +204,8 @@ extern const gcry_md_spec_t _gcry_digest_spec_sha3_512;
 extern const gcry_md_spec_t _gcry_digest_spec_sha3_384;
 extern const gcry_md_spec_t _gcry_digest_spec_shake128;
 extern const gcry_md_spec_t _gcry_digest_spec_shake256;
+extern const gcry_md_spec_t _gcry_digest_spec_cshake128;
+extern const gcry_md_spec_t _gcry_digest_spec_cshake256;
 extern const gcry_md_spec_t _gcry_digest_spec_tiger;
 extern const gcry_md_spec_t _gcry_digest_spec_tiger1;
 extern const gcry_md_spec_t _gcry_digest_spec_tiger2;
@@ -212,9 +223,9 @@ extern const gcry_md_spec_t _gcry_digest_spec_sm3;
 /* Declarations for the pubkey cipher specifications.  */
 extern gcry_pk_spec_t _gcry_pubkey_spec_rsa;
 extern gcry_pk_spec_t _gcry_pubkey_spec_elg;
-extern gcry_pk_spec_t _gcry_pubkey_spec_elg_e;
 extern gcry_pk_spec_t _gcry_pubkey_spec_dsa;
 extern gcry_pk_spec_t _gcry_pubkey_spec_ecc;
+extern gcry_pk_spec_t _gcry_pubkey_spec_kem;
 
 
 #endif /*G10_CIPHER_H*/
index da9948a66f4cc5c25a339f5da93f1461e56cee7b..de0a183d332bf2ee5ebef8cb067f6ef16caba0fa 100644 (file)
@@ -36,6 +36,7 @@
    gcry_ctx_t is used to access it.  */
 struct gcry_context
 {
+  struct gcry_context *next;
   char magic[CTX_MAGIC_LEN]; /* Magic value to cross check that this
                                 is really a context object. */
   char type;     /* The type of the context (CONTEXT_TYPE_foo).  */
@@ -51,14 +52,15 @@ struct gcry_context
    NULL if de-initialization is not required.  Returns NULL and sets
    ERRNO if memory allocation failed.  */
 gcry_ctx_t
-_gcry_ctx_alloc (int type, size_t length, void (*deinit)(void*))
+_gcry_ctx_alloc (int type, size_t length, void (*deinit)(void*),
+                 gcry_ctx_t next)
 {
   gcry_ctx_t ctx;
 
   switch (type)
     {
     case CONTEXT_TYPE_EC:
-    case CONTEXT_TYPE_RANDOM_OVERRIDE:
+    case CONTEXT_TYPE_SINGLE_DATA:
       break;
     default:
       log_bug ("bad context type %d given to _gcry_ctx_alloc\n", type);
@@ -74,6 +76,7 @@ _gcry_ctx_alloc (int type, size_t length, void (*deinit)(void*))
   memcpy (ctx->magic, CTX_MAGIC, CTX_MAGIC_LEN);
   ctx->type = type;
   ctx->deinit = deinit;
+  ctx->next = next;
 
   return ctx;
 }
@@ -83,12 +86,18 @@ _gcry_ctx_alloc (int type, size_t length, void (*deinit)(void*))
    the requested context type.  Using an explicit type allows to cross
    check the type and eventually allows to store several private
    contexts in one context object.  The function does not return an
-   error but aborts if the provided CTX is not valid.  */
+   error but aborts if the provided CTX is not valid.
+   Special usage: using TYPE with 0, which returns CTX->NEXT.
+  */
 void *
 _gcry_ctx_get_pointer (gcry_ctx_t ctx, int type)
 {
   if (!ctx || memcmp (ctx->magic, CTX_MAGIC, CTX_MAGIC_LEN))
     log_fatal ("bad pointer %p passed to _gcry_ctx_get_pointer\n", ctx);
+
+  if (type == 0)
+    return ctx->next;
+
   if (ctx->type != type)
     log_fatal ("wrong context type %d request for context %p of type %d\n",
                type, ctx, ctx->type);
@@ -119,6 +128,9 @@ _gcry_ctx_find_pointer (gcry_ctx_t ctx, int type)
 void
 _gcry_ctx_release (gcry_ctx_t ctx)
 {
+  gcry_ctx_t ctx_next;
+
+ again:
   if (!ctx)
     return;
   if (memcmp (ctx->magic, CTX_MAGIC, CTX_MAGIC_LEN))
@@ -126,7 +138,7 @@ _gcry_ctx_release (gcry_ctx_t ctx)
   switch (ctx->type)
     {
     case CONTEXT_TYPE_EC:
-    case CONTEXT_TYPE_RANDOM_OVERRIDE:
+    case CONTEXT_TYPE_SINGLE_DATA:
       break;
     default:
       log_fatal ("bad context type %d detected in gcry_ctx_relase\n",
@@ -135,5 +147,8 @@ _gcry_ctx_release (gcry_ctx_t ctx)
     }
   if (ctx->deinit)
     ctx->deinit (&ctx->u);
+  ctx_next = ctx->next;
   xfree (ctx);
+  ctx = ctx_next;
+  goto again;
 }
index 5be367b2ff9b2f8286b6ff28a6c744a8b3a5849c..21a1f5dba8fdeb94768ecee4f0dc7c78680b66f8 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 
 /* Context types as used in struct gcry_context.  */
 #define CONTEXT_TYPE_EC 1  /* The context is used with EC functions.  */
-#define CONTEXT_TYPE_RANDOM_OVERRIDE 2  /* Used with pubkey functions.  */
+#define CONTEXT_TYPE_SINGLE_DATA   2  /* Used with pubkey functions.  */
 
-gcry_ctx_t _gcry_ctx_alloc (int type, size_t length, void (*deinit)(void*));
+gcry_ctx_t _gcry_ctx_alloc (int type, size_t length, void (*deinit)(void*),
+                            gcry_ctx_t next);
 void *_gcry_ctx_get_pointer (gcry_ctx_t ctx, int type);
 void *_gcry_ctx_find_pointer (gcry_ctx_t ctx, int type);
 
index cedc4f4b4742c769168a6683c6f293dd119b459d..ae6563591f7c8ccc526c56e7ad59dbbb908a63f2 100644 (file)
@@ -414,6 +414,7 @@ parse_and_print (FILE *fp)
     }
   state = INIT_STATE;
 
+  (void)level; /* The value is not used, but useful in debug session.  */
 
   while ((c = my_getc (fp)) != EOF )
     {
index 479862f6fa265961d3b020ea8ec0f889b287c417..655d98f02c4e6444b3fd94a20d45afb875f42a8b 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 5d71b208ec1088feec778544d78bdebddaf6b7c2..cf91baa8c80dc061f6fe82cd1e95f94be2b139f6 100644 (file)
@@ -365,6 +365,7 @@ _gcry_fips_indicator_cipher (va_list arg_ptr)
         case GCRY_CIPHER_MODE_OFB:
         case GCRY_CIPHER_MODE_CTR:
         case GCRY_CIPHER_MODE_CCM:
+        case GCRY_CIPHER_MODE_GCM:
         case GCRY_CIPHER_MODE_XTS:
         case GCRY_CIPHER_MODE_AESWRAP:
           return GPG_ERR_NO_ERROR;
@@ -421,6 +422,8 @@ _gcry_fips_indicator_md (va_list arg_ptr)
     case GCRY_MD_SHA3_512:
     case GCRY_MD_SHAKE128:
     case GCRY_MD_SHAKE256:
+    case GCRY_MD_CSHAKE128:
+    case GCRY_MD_CSHAKE256:
       return GPG_ERR_NO_ERROR;
     default:
       return GPG_ERR_NOT_SUPPORTED;
@@ -463,6 +466,7 @@ static const char *valid_string_in_sexp[] = {
   "data",
   "e",
   "ecdsa",
+  "eddsa",
   "flags",
   "genkey",
   "hash",
@@ -668,8 +672,12 @@ run_pubkey_selftests (int extended)
 {
   static int algos[] =
     {
+#if USE_RSA
       GCRY_PK_RSA,
+#endif /* USE_RSA */
+#if USE_ECC
       GCRY_PK_ECC,
+#endif /* USE_ECC */
       0
     };
   int idx;
index a7aee80d93d6a3cec2dc924ebed51e50dea2155f..adf03e86177860bb73e8196c6a42f3b5514c8a9a 100644 (file)
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 
 #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 5 )
 #define JNLIB_GCC_M_FUNCTION 1
-#define JNLIB_GCC_A_NR              __attribute__ ((noreturn))
+#define JNLIB_GCC_A_NR              __attribute__ ((__noreturn__))
 #define JNLIB_GCC_A_PRINTF( f, a )  __attribute__ ((format (printf,f,a)))
 #define JNLIB_GCC_A_NR_PRINTF( f, a ) \
-                           __attribute__ ((noreturn, format (printf,f,a)))
+                           __attribute__ ((__noreturn__, format (printf,f,a)))
 #define GCC_ATTR_NORETURN  __attribute__ ((__noreturn__))
 #else
 #define JNLIB_GCC_A_NR
 extern int _gcry_global_any_init_done;
 int _gcry_global_is_operational (void);
 gcry_err_code_t _gcry_vcontrol (enum gcry_ctl_cmds cmd, va_list arg_ptr);
-void _gcry_check_heap (const void *a);
 void _gcry_pre_syscall (void);
 void _gcry_post_syscall (void);
 int _gcry_get_debug_flag (unsigned int mask);
@@ -167,8 +166,9 @@ void _gcry_divide_by_zero (void) JNLIB_GCC_A_NR;
 
 const char *_gcry_gettext (const char *key) GCC_ATTR_FORMAT_ARG(1);
 void _gcry_fatal_error(int rc, const char *text ) JNLIB_GCC_A_NR;
-void _gcry_logv (int level,
-                 const char *fmt, va_list arg_ptr) JNLIB_GCC_A_PRINTF(2,0);
+void _gcry_set_gpgrt_post_log_handler (void);
+void _gcry_logv (int level, const char *fmt,
+                 va_list arg_ptr) JNLIB_GCC_A_PRINTF(2,0);
 void _gcry_log( int level, const char *fmt, ... ) JNLIB_GCC_A_PRINTF(2,3);
 void _gcry_log_bug( const char *fmt, ... )   JNLIB_GCC_A_NR_PRINTF(1,2);
 void _gcry_log_fatal( const char *fmt, ... ) JNLIB_GCC_A_NR_PRINTF(1,2);
@@ -238,6 +238,8 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 #define HWF_INTEL_RDTSC         (1 << 15)
 #define HWF_INTEL_SHAEXT        (1 << 16)
 #define HWF_INTEL_VAES_VPCLMUL  (1 << 17)
+#define HWF_INTEL_AVX512        (1 << 18)
+#define HWF_INTEL_GFNI          (1 << 19)
 
 #elif defined(HAVE_CPU_ARCH_ARM)
 
@@ -246,6 +248,16 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 #define HWF_ARM_SHA1            (1 << 2)
 #define HWF_ARM_SHA2            (1 << 3)
 #define HWF_ARM_PMULL           (1 << 4)
+#define HWF_ARM_SHA3            (1 << 5)
+#define HWF_ARM_SM3             (1 << 6)
+#define HWF_ARM_SM4             (1 << 7)
+#define HWF_ARM_SHA512          (1 << 8)
+#define HWF_ARM_SVE             (1 << 9)
+#define HWF_ARM_SVE2            (1 << 10)
+#define HWF_ARM_SVEAES          (1 << 11)
+#define HWF_ARM_SVEPMULL        (1 << 12)
+#define HWF_ARM_SVESHA3         (1 << 13)
+#define HWF_ARM_SVESM4          (1 << 14)
 
 #elif defined(HAVE_CPU_ARCH_PPC)
 
@@ -419,6 +431,7 @@ gcry_err_code_t _gcry_sexp_vbuild (gcry_sexp_t *retsexp, size_t *erroff,
 char *_gcry_sexp_nth_string (const gcry_sexp_t list, int number);
 gpg_err_code_t _gcry_sexp_vextract_param (gcry_sexp_t sexp, const char *path,
                                           const char *list, va_list arg_ptr);
+void *_gcry_hex2buffer (const char *string, size_t *r_length);
 
 
 /*-- fips.c --*/
index 3071b421e28c1a33806d681f35d3696078854c68..1b449281825e21bba6161b068c8a51fdfd1384d9 100644 (file)
@@ -69,6 +69,12 @@ gcry_err_code_t _gcry_cipher_setkey (gcry_cipher_hd_t hd,
                                      const void *key, size_t keylen);
 gcry_err_code_t _gcry_cipher_setiv (gcry_cipher_hd_t hd,
                                     const void *iv, size_t ivlen);
+gcry_err_code_t _gcry_cipher_setup_geniv (gcry_cipher_hd_t hd, int method,
+                                          const void *fixed_iv,
+                                          size_t fixed_ivlen,
+                                          const void *dyn_iv, size_t dyn_ivlen);
+gcry_err_code_t _gcry_cipher_geniv (gcry_cipher_hd_t hd,
+                                    void *iv, size_t ivlen);
 gpg_err_code_t _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf,
                                           size_t abuflen);
 gpg_err_code_t _gcry_cipher_gettag (gcry_cipher_hd_t hd, void *outtag,
@@ -110,21 +116,33 @@ gcry_sexp_t _gcry_pk_get_param (int algo, const char *name);
 gpg_err_code_t _gcry_pubkey_get_sexp (gcry_sexp_t *r_sexp,
                                       int mode, gcry_ctx_t ctx);
 unsigned int _gcry_ecc_get_algo_keylen (int algo);
-gpg_error_t _gcry_ecc_mul_point (int algo, unsigned char *result,
-                                 const unsigned char *scalar,
-                                 const unsigned char *point);
+gpg_err_code_t _gcry_ecc_curve_keypair (const char *curve,
+                                        unsigned char *pubkey,
+                                        size_t pubkey_len,
+                                        unsigned char *seckey,
+                                        size_t seckey_len);
+gpg_err_code_t _gcry_ecc_curve_mul_point (const char *curve,
+                                          unsigned char *result,
+                                          size_t result_len,
+                                          const unsigned char *scalar,
+                                          size_t scalar_len,
+                                          const unsigned char *point,
+                                          size_t point_len);
+gpg_err_code_t _gcry_ecc_mul_point (int algo, unsigned char *result,
+                                    const unsigned char *scalar,
+                                    const unsigned char *point);
 gcry_err_code_t _gcry_pk_sign_md (gcry_sexp_t *r_sig, const char *tmpl,
                                   gcry_md_hd_t hd, gcry_sexp_t s_skey,
                                   gcry_ctx_t ctx);
 gcry_err_code_t _gcry_pk_verify_md (gcry_sexp_t s_sig, const char *tmpl,
                                     gcry_md_hd_t hd, gcry_sexp_t s_pkey,
                                     gcry_ctx_t ctx);
-gpg_err_code_t _gcry_pk_random_override_new (gcry_ctx_t *r_ctx,
-                                             const unsigned char *p,
-                                             size_t len);
-gpg_err_code_t _gcry_pk_get_random_override (gcry_ctx_t ctx,
-                                             const unsigned char **r_p,
-                                             size_t *r_len);
+gpg_err_code_t _gcry_pk_single_data_push (gcry_ctx_t *r_ctx,
+                                          const unsigned char *p,
+                                          size_t len);
+gpg_err_code_t _gcry_pk_get_single_data (gcry_ctx_t *r_ctx,
+                                         const unsigned char **r_p,
+                                         size_t *r_len);
 \f
 gpg_err_code_t _gcry_md_open (gcry_md_hd_t *h, int algo, unsigned int flags);
 void _gcry_md_close (gcry_md_hd_t hd);
@@ -219,6 +237,20 @@ gcry_err_code_t _gcry_kdf_compute (gcry_kdf_hd_t h,
 gpg_err_code_t _gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result);
 void _gcry_kdf_close (gcry_kdf_hd_t h);
 
+\f
+gcry_err_code_t _gcry_kem_keypair (int algo,
+                                   void *pubkey, size_t pubkey_len,
+                                   void *seckey, size_t seckey_len);
+gcry_err_code_t _gcry_kem_encap (int algo,
+                                 const void *pubkey, size_t pubkey_len,
+                                 void *ciphertext, size_t ciphertext_len,
+                                 void *shared, size_t shared_len,
+                                 const void *optional, size_t optional_len);
+gcry_err_code_t _gcry_kem_decap (int algo,
+                                 const void *seckey, size_t seckey_len,
+                                 const void *ciphertext, size_t ciphertext_len,
+                                 void *shared, size_t shared_len,
+                                 const void *optional, size_t optional_len);
 \f
 gpg_err_code_t _gcry_prime_generate (gcry_mpi_t *prime,
                                      unsigned int prime_bits,
index 0417754f4dbd18b099466138c4945fedcf94f8c1..f5608084d579440b4fb515336a00bfe6b8569eef 100644 (file)
@@ -41,6 +41,7 @@
 /* For use with gcry_cipher_ctl:  */
 #define PRIV_CIPHERCTL_DISABLE_WEAK_KEY   61
 #define PRIV_CIPHERCTL_GET_INPUT_VECTOR   62
+#define PRIV_CIPHERCTL_GET_COUNTER        63
 
 
 /* Private interfaces for testing of random-drbg.c. */
index 58581daab19b615f73ee5227b47e93718b05e1a5..9cad7a46a577b9a8f5c9455608bbd12ed58cf483 100644 (file)
@@ -1,7 +1,6 @@
 /* gcrypt.h -  GNU Cryptographic Library Interface              -*- c -*-
- * Copyright (C) 2012-2023 g10 Code GmbH
- * Copyright (C) 2013-2023 Jussi Kivilinna
  * Copyright (C) 1998-2018 Free Software Foundation, Inc.
+ * Copyright (C) 2012-2024 g10 Code GmbH
  *
  * This file is part of Libgcrypt.
  *
@@ -16,7 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * File: @configure_input@
  */
@@ -207,7 +207,7 @@ struct gcry_thread_cbs
        Bits  7 - 0  are used for the thread model
        Bits 15 - 8  are used for the version number.  */
   unsigned int option;
-} _GCRY_ATTR_INTERNAL;
+} _GCRY_GCC_ATTR_DEPRECATED;
 
 #define GCRY_THREAD_OPTION_PTH_IMPL                                     \
   static struct gcry_thread_cbs gcry_threads_pth = {                    \
@@ -334,7 +334,8 @@ enum gcry_ctl_cmds
     GCRYCTL_FIPS_SERVICE_INDICATOR_FUNCTION = 84,
     GCRYCTL_FIPS_SERVICE_INDICATOR_MAC = 85,
     GCRYCTL_FIPS_SERVICE_INDICATOR_MD = 86,
-    GCRYCTL_FIPS_SERVICE_INDICATOR_PK_FLAGS = 87
+    GCRYCTL_FIPS_SERVICE_INDICATOR_PK_FLAGS = 87,
+    GCRYCTL_MD_CUSTOMIZE = 88
   };
 
 /* Perform various operations defined by CMD. */
@@ -947,7 +948,10 @@ enum gcry_cipher_algos
     GCRY_CIPHER_GOST28147   = 315,
     GCRY_CIPHER_CHACHA20    = 316,
     GCRY_CIPHER_GOST28147_MESH   = 317, /* With CryptoPro key meshing.  */
-    GCRY_CIPHER_SM4         = 318
+    GCRY_CIPHER_SM4         = 318,
+    GCRY_CIPHER_ARIA128     = 319,
+    GCRY_CIPHER_ARIA192     = 320,
+    GCRY_CIPHER_ARIA256     = 321
   };
 
 /* The Rijndael algorithm is basically AES, so provide some macros. */
@@ -990,6 +994,13 @@ enum gcry_cipher_flags
     GCRY_CIPHER_EXTENDED    = 16  /* Enable extended AES-WRAP.  */
   };
 
+/* Methods used for AEAD IV generation. */
+enum gcry_cipher_geniv_methods
+  {
+    GCRY_CIPHER_GENIV_METHOD_CONCAT = 1,
+    GCRY_CIPHER_GENIV_METHOD_XOR = 2
+  };
+
 /* GCM works only with blocks of 128 bits */
 #define GCRY_GCM_BLOCK_LEN  (128 / 8)
 
@@ -1061,6 +1072,15 @@ gcry_error_t gcry_cipher_setkey (gcry_cipher_hd_t hd,
 gcry_error_t gcry_cipher_setiv (gcry_cipher_hd_t hd,
                                 const void *iv, size_t ivlen);
 
+/* Initialization vector generation setup for AEAD modes/ciphers.  */
+gcry_error_t gcry_cipher_setup_geniv (gcry_cipher_hd_t hd, int method,
+                                      const void *fixed_iv, size_t fixed_ivlen,
+                                      const void *dyn_iv, size_t dyn_ivlen);
+
+/* Initialization vector generation for AEAD modes/ciphers.  */
+gcry_error_t gcry_cipher_geniv (gcry_cipher_hd_t hd,
+                                void *iv, size_t ivlen);
+
 /* Provide additional authentication data for AEAD modes/ciphers.  */
 gcry_error_t gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf,
                                        size_t abuflen);
@@ -1132,7 +1152,8 @@ enum gcry_pk_algos
     GCRY_PK_ELG   = 20,     /* Elgamal       */
     GCRY_PK_ECDSA = 301,    /* (only for external use).  */
     GCRY_PK_ECDH  = 302,    /* (only for external use).  */
-    GCRY_PK_EDDSA = 303     /* (only for external use).  */
+    GCRY_PK_EDDSA = 303,    /* (only for external use).  */
+    GCRY_PK_KEM   = 333     /* Pseudo ID for KEM algos.  */
   };
 
 /* Flags describing usage capabilities of a PK algorithm. */
@@ -1286,7 +1307,9 @@ enum gcry_md_algos
     GCRY_MD_BLAKE2S_128   = 325,
     GCRY_MD_SM3           = 326,
     GCRY_MD_SHA512_256    = 327,
-    GCRY_MD_SHA512_224    = 328
+    GCRY_MD_SHA512_224    = 328,
+    GCRY_MD_CSHAKE128     = 329,
+    GCRY_MD_CSHAKE256     = 330
   };
 
 /* Flags used with the open function.  */
@@ -1368,6 +1391,12 @@ void gcry_md_hash_buffer (int algo, void *digest,
 gpg_error_t gcry_md_hash_buffers (int algo, unsigned int flags, void *digest,
                                   const gcry_buffer_t *iov, int iovcnt);
 
+/* Convenience function to hash multiple buffers.
+   Algorithm can be 'expendable-output function'.  */
+gpg_error_t gcry_md_hash_buffers_ext (int algo, unsigned int flags,
+                                      void *digest, int digestlen,
+                                      const gcry_buffer_t *iov, int iovcnt);
+
 /* Retrieve the algorithm used with HD.  This does not work reliable
    if more than one algorithm is enabled in HD. */
 int gcry_md_get_algo (gcry_md_hd_t hd);
@@ -1436,6 +1465,14 @@ void gcry_md_debug (gcry_md_hd_t hd, const char *suffix);
 #define gcry_md_get_asnoid(a,b,n) \
             gcry_md_algo_info((a), GCRYCTL_GET_ASNOID, (b), (n))
 
+struct gcry_cshake_customization
+{
+  const void *n;
+  unsigned int n_len;
+  const void *s;
+  unsigned int s_len;
+};
+
 \f
 
 /**********************************************
@@ -1498,19 +1535,24 @@ enum gcry_mac_algos
     GCRY_MAC_CMAC_IDEA          = 210,
     GCRY_MAC_CMAC_GOST28147     = 211,
     GCRY_MAC_CMAC_SM4           = 212,
+    GCRY_MAC_CMAC_ARIA          = 213,
 
     GCRY_MAC_GMAC_AES           = 401,
     GCRY_MAC_GMAC_CAMELLIA      = 402,
     GCRY_MAC_GMAC_TWOFISH       = 403,
     GCRY_MAC_GMAC_SERPENT       = 404,
     GCRY_MAC_GMAC_SEED          = 405,
+    GCRY_MAC_GMAC_SM4           = 406,
+    GCRY_MAC_GMAC_ARIA          = 407,
 
     GCRY_MAC_POLY1305           = 501,
     GCRY_MAC_POLY1305_AES       = 502,
     GCRY_MAC_POLY1305_CAMELLIA  = 503,
     GCRY_MAC_POLY1305_TWOFISH   = 504,
     GCRY_MAC_POLY1305_SERPENT   = 505,
-    GCRY_MAC_POLY1305_SEED      = 506
+    GCRY_MAC_POLY1305_SEED      = 506,
+    GCRY_MAC_POLY1305_SM4       = 507,
+    GCRY_MAC_POLY1305_ARIA      = 508
   };
 
 /* Flags used with the open function.  */
@@ -1598,8 +1640,22 @@ enum gcry_kdf_algos
     GCRY_KDF_PBKDF1 = 33,
     GCRY_KDF_PBKDF2 = 34,
     GCRY_KDF_SCRYPT = 48,
+    /**/
     GCRY_KDF_ARGON2   = 64,
-    GCRY_KDF_BALLOON  = 65
+    GCRY_KDF_BALLOON  = 65,
+    /**/
+    /* In the original SP 800-56A, it's called
+     * "Concatenation Key Derivation Function".
+     * Now (as of 2022), it's defined in SP 800-56C rev.2, as
+     * "One-Step Key Derivation".
+     */
+    GCRY_KDF_ONESTEP_KDF = 96, /* One-Step Key Derivation with hash */
+    GCRY_KDF_ONESTEP_KDF_MAC = 97, /* One-Step Key Derivation with MAC */
+    GCRY_KDF_HKDF = 98,
+    /* Two-Step Key Derivation with HMAC */
+    /* Two-Step Key Derivation with CMAC */
+    /* KDF PRF in SP 800-108r1 */
+    GCRY_KDF_X963_KDF = 101
   };
 
 enum gcry_kdf_subalgo_argon2
@@ -1643,6 +1699,124 @@ gcry_error_t gcry_kdf_compute (gcry_kdf_hd_t h,
                                const gcry_kdf_thread_ops_t *ops);
 gcry_error_t gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result);
 void gcry_kdf_close (gcry_kdf_hd_t h);
+
+\f
+/**********************************
+ *                                *
+ *  Key Encapsulation Mechanisms  *
+ *                                *
+ **********************************/
+
+/* Algorithm IDs for the KEMs.  */
+enum gcry_kem_algos
+  {
+    GCRY_KEM_NONE = 0,
+    GCRY_KEM_SNTRUP761  = 1,
+    GCRY_KEM_CM6688128F = 2,    /* Classic McEliece */
+    GCRY_KEM_MLKEM512   = 3,    /* aka Kyber512  */
+    GCRY_KEM_MLKEM768   = 4,    /* aka Kyber768  */
+    GCRY_KEM_MLKEM1024  = 5,    /* aka Kyber1024 */
+    /* From here, ECC KEMs */
+    GCRY_KEM_RAW_X25519 =31,    /* Using X25519 with Identity KDF */
+    GCRY_KEM_RAW_X448   =32,    /* Using X448 with Identity KDF */
+    GCRY_KEM_RAW_BP256  =33,
+    GCRY_KEM_RAW_BP384  =34,
+    GCRY_KEM_RAW_BP512  =35,
+    GCRY_KEM_RAW_P256R1 =36,
+    GCRY_KEM_RAW_P384R1 =37,
+    GCRY_KEM_RAW_P521R1 =38,
+    GCRY_KEM_DHKEM25519 =41,    /* DHKEM with X25519, HKDF, and SHA256 */
+    GCRY_KEM_DHKEM448 =  42,    /* DHKEM with X448, HKDF, and SHA512 */
+    GCRY_KEM_DHKEMP256R1=43,
+    GCRY_KEM_DHKEMP384R1=44,
+    GCRY_KEM_DHKEMP521R1=45
+  };
+
+/*
+ * Before C99, limitation is 31 significant initial characters in a
+ * macro name
+ *
+ *      1 ...                        31
+ *      |                             |
+ *      v                             v
+ *      _______________________________
+ */
+#define GCRY_KEM_SNTRUP761_SECKEY_LEN   1763
+#define GCRY_KEM_SNTRUP761_PUBKEY_LEN   1158
+#define GCRY_KEM_SNTRUP761_ENCAPS_LEN   1039
+#define GCRY_KEM_SNTRUP761_CIPHER_LEN   GCRY_KEM_SNTRUP761_ENCAPS_LEN
+#define GCRY_KEM_SNTRUP761_SHARED_LEN   32
+
+#define GCRY_KEM_CM6688128F_SECKEY_LEN  13932
+#define GCRY_KEM_CM6688128F_PUBKEY_LEN  1044992
+#define GCRY_KEM_CM6688128F_ENCAPS_LEN  208
+#define GCRY_KEM_CM6688128F_CIPHER_LEN  GCRY_KEM_CM6688128F_ENCAPS_LEN
+#define GCRY_KEM_CM6688128F_SHARED_LEN  32
+
+#define GCRY_KEM_MLKEM512_SECKEY_LEN    (2*384+2*384+32+2*32)  /* 1632 */
+#define GCRY_KEM_MLKEM512_PUBKEY_LEN    (2*384+32)             /*  800 */
+#define GCRY_KEM_MLKEM512_ENCAPS_LEN    (128+2*320)            /*  768 */
+#define GCRY_KEM_MLKEM512_CIPHER_LEN    GCRY_KEM_MLKEM512_ENCAPS_LEN
+#define GCRY_KEM_MLKEM512_SHARED_LEN    32
+
+#define GCRY_KEM_MLKEM768_SECKEY_LEN    (3*384+3*384+32+2*32)  /* 2400 */
+#define GCRY_KEM_MLKEM768_PUBKEY_LEN    (3*384+32)             /* 1184 */
+#define GCRY_KEM_MLKEM768_ENCAPS_LEN    (128+3*320)            /* 1088 */
+#define GCRY_KEM_MLKEM768_CIPHER_LEN    GCRY_KEM_MLKEM768_ENCAPS_LEN
+#define GCRY_KEM_MLKEM768_SHARED_LEN    32
+
+#define GCRY_KEM_MLKEM1024_SECKEY_LEN   (4*384+4*384+32+2*32)  /* 3168 */
+#define GCRY_KEM_MLKEM1024_PUBKEY_LEN   (4*384+32)             /* 1568 */
+#define GCRY_KEM_MLKEM1024_ENCAPS_LEN   (160+4*352)            /* 1568 */
+#define GCRY_KEM_MLKEM1024_CIPHER_LEN   GCRY_KEM_MLKEM1024_ENCAPS_LEN
+#define GCRY_KEM_MLKEM1024_SHARED_LEN   32
+
+/* For ECC, seckey, pubkey, and ciphertext is defined by the curve.  */
+#define GCRY_KEM_ECC_X25519_SECKEY_LEN  32
+#define GCRY_KEM_ECC_X25519_PUBKEY_LEN  32
+#define GCRY_KEM_ECC_X25519_ENCAPS_LEN  32
+#define GCRY_KEM_ECC_X25519_CIPHER_LEN  GCRY_KEM_ECC_X25519_ENCAPS_LEN
+/* And shared secret is specific to the protocol.  */
+#define GCRY_KEM_RAW_X25519_SHARED_LEN  32
+
+#define GCRY_KEM_DHKEM25519_SECKEY_LEN  GCRY_KEM_ECC_X25519_SECKEY_LEN
+#define GCRY_KEM_DHKEM25519_PUBKEY_LEN  GCRY_KEM_ECC_X25519_PUBKEY_LEN
+#define GCRY_KEM_DHKEM25519_ENCAPS_LEN  GCRY_KEM_ECC_X25519_ENCAPS_LEN
+#define GCRY_KEM_DHKEM25519_CIPHER_LEN  GCRY_KEM_DHKEM25519_ENCAPS_LEN
+#define GCRY_KEM_DHKEM25519_SHARED_LEN  32
+
+#define GCRY_KEM_ECC_BP256_SECKEY_LEN   32
+#define GCRY_KEM_ECC_BP256_PUBKEY_LEN   (1+32+32)
+#define GCRY_KEM_ECC_BP256_ENCAPS_LEN   (1+32+32)
+#define GCRY_KEM_ECC_BP256_CIPHER_LEN   GCRY_KEM_ECC_BP256_ENCAPS_LEN
+#define GCRY_KEM_RAW_BP256_SHARED_LEN   (1+32+32)
+
+#define GCRY_KEM_ECC_BP384_SECKEY_LEN   48
+#define GCRY_KEM_ECC_BP384_PUBKEY_LEN   (1+48+48)
+#define GCRY_KEM_ECC_BP384_ENCAPS_LEN   (1+48+48)
+#define GCRY_KEM_ECC_BP384_CIPHER_LEN   GCRY_KEM_ECC_BP384_ENCAPS_LEN
+#define GCRY_KEM_RAW_BP384_SHARED_LEN   (1+48+48)
+
+/* Generate a new key pair with ALGO.  */
+gcry_error_t gcry_kem_keypair (int algo,
+                               void *pubkey, size_t pubkey_len,
+                               void *seckey, size_t seckey_len);
+
+/* With ALGO, for a PUBKEY, generate SHARED secret and encapsulate
+   it into CIPHERTEXT.  */
+gcry_error_t gcry_kem_encap (int algo,
+                             const void *pubkey, size_t pubkey_len,
+                             void *ciphertext, size_t ciphertext_len,
+                             void *shared, size_t shared_len,
+                             const void *optional, size_t optional_len);
+
+/* With ALGO, for a SECKEY and CIPHERTEXT, compute its SHARED secret.  */
+gcry_error_t gcry_kem_decap (int algo,
+                             const void *seckey, size_t seckey_len,
+                             const void *ciphertext, size_t ciphertext_len,
+                             void *shared, size_t shared_len,
+                             const void *optional, size_t optional_len);
+
 \f
 /************************************
  *                                  *
@@ -1823,7 +1997,7 @@ typedef int (*gcry_handler_no_mem_t) (void *, size_t, unsigned int);
 /* Type for fatal error handlers.  */
 typedef void (*gcry_handler_error_t) (void *, int, const char *);
 
-/* Type for logging handlers.  */
+/* Type for the deprecated log handler.  */
 typedef void (*gcry_handler_log_t) (void *, int, const char *, va_list);
 
 /* Certain operations can provide progress information.  This function
@@ -1847,9 +2021,9 @@ void gcry_set_outofcore_handler (gcry_handler_no_mem_t h, void *opaque);
    handler. */
 void gcry_set_fatalerror_handler (gcry_handler_error_t fnc, void *opaque);
 
-/* Register a function used instead of the internal logging
-   facility. */
-void gcry_set_log_handler (gcry_handler_log_t f, void *opaque);
+/* This function has no more effect.  */
+void gcry_set_log_handler (gcry_handler_log_t f,
+                           void *opaque) _GCRY_ATTR_INTERNAL;
 
 /* Reserved for future use. */
 void gcry_set_gettext_handler (const char *(*f)(const char*));
@@ -1902,6 +2076,7 @@ gcry_error_t gcry_pk_hash_verify (gcry_sexp_t sigval,
 
 gcry_error_t gcry_pk_random_override_new (gcry_ctx_t *r_ctx,
                                           const unsigned char *p, size_t len);
+#define gcry_pk_input_data_push gcry_pk_random_override_new
 
 #if 0 /* (Keep Emacsens' auto-indent happy.) */
 {
diff --git a/src/gcryptrnd.c b/src/gcryptrnd.c
deleted file mode 100644 (file)
index b13931b..0000000
+++ /dev/null
@@ -1,680 +0,0 @@
-/* gcryptrnd.c - Libgcrypt Random Number Daemon
- * Copyright (C) 2006 Free Software Foundation, Inc.
- *
- * Gcryptend is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License,
- * or (at your option) any later version.
- *
- * Gcryptrnd is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-/* We require vsyslog pth
-   We need to test for:  setrlimit
-
-   We should also prioritize requests.  This is best done by putting
-   the requests into queues and have a main thread processing these
-   queues.
-
- */
-
-#include <config.h>
-#include <stdio.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <time.h>
-#include <sys/times.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <stdarg.h>
-#include <syslog.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <unistd.h>
-#include <errno.h>
-#include <pth.h>
-#include <gcrypt.h>
-
-#define PGM "gcryptrnd"
-#define MYVERSION_LINE PGM " (Libgcrypt) " VERSION
-#define BUGREPORT_LINE "\nReport bugs to <bug-libgcrypt@gnupg.org>.\n"
-
-/* Pth wrapper function definitions. */
-GCRY_THREAD_OPTION_PTH_IMPL;
-
-
-/* Flag set to true if we have been daemonized. */
-static int running_detached;
-/* Flag indicating that a shutdown has been requested.  */
-static int shutdown_pending;
-/* Counter for active connections.  */
-static int active_connections;
-
-
-
-/* Local prototypes.  */
-static void serve (int listen_fd);
-
-
-
-\f
-
-/* To avoid that a compiler optimizes certain memset calls away, these
-   macros may be used instead. */
-#define wipememory2(_ptr,_set,_len) do { \
-              volatile char *_vptr=(volatile char *)(_ptr); \
-              size_t _vlen=(_len); \
-              while(_vlen) { *_vptr=(_set); _vptr++; _vlen--; } \
-                  } while(0)
-#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
-
-
-
-
-/* Error printing utility.  PRIORITY should be one of syslog's
-   priority levels.  This functions prints to the stderr or syslog
-   depending on whether we are already daemonized. */
-static void
-logit (int priority, const char *format, ...)
-{
-  va_list arg_ptr;
-
-  va_start (arg_ptr, format) ;
-  if (running_detached)
-    {
-      vsyslog (priority, format, arg_ptr);
-    }
-  else
-    {
-      fputs (PGM ": ", stderr);
-      vfprintf (stderr, format, arg_ptr);
-      putc ('\n', stderr);
-    }
-  va_end (arg_ptr);
-}
-
-/* Callback used by libgcrypt for logging. */
-static void
-my_gcry_logger (void *dummy, int level, const char *format, va_list arg_ptr)
-{
-  (void)dummy;
-
-  /* Map the log levels. */
-  switch (level)
-    {
-    case GCRY_LOG_CONT: level = LOG_INFO /* FIXME */; break;
-    case GCRY_LOG_INFO: level = LOG_INFO; break;
-    case GCRY_LOG_WARN: level = LOG_WARNING; break;
-    case GCRY_LOG_ERROR:level = LOG_ERR; break;
-    case GCRY_LOG_FATAL:level = LOG_CRIT; break;
-    case GCRY_LOG_BUG:  level = LOG_CRIT; break;
-    case GCRY_LOG_DEBUG:level = LOG_DEBUG; break;
-    default:            level = LOG_ERR; break;
-    }
-  if (running_detached)
-    {
-      vsyslog (level, format, arg_ptr);
-    }
-  else
-    {
-      fputs (PGM ": ", stderr);
-      vfprintf (stderr, format, arg_ptr);
-      if (!*format || format[strlen (format)-1] != '\n')
-        putc ('\n', stderr);
-    }
-}
-
-
-/* The cleanup handler - used to wipe out the secure memory. */
-static void
-cleanup (void)
-{
-  gcry_control (GCRYCTL_TERM_SECMEM );
-}
-
-
-/* Make us a daemon and open the syslog. */
-static void
-daemonize (void)
-{
-  int i;
-  pid_t pid;
-
-  fflush (NULL);
-
-  pid = fork ();
-  if (pid == (pid_t)-1)
-    {
-      logit (LOG_CRIT, "fork failed: %s", strerror (errno));
-      exit (1);
-    }
-  if (pid)
-    exit (0);
-
-  if (setsid() == -1)
-    {
-      logit (LOG_CRIT, "setsid() failed: %s", strerror(errno));
-      exit (1);
-    }
-
-  signal (SIGHUP, SIG_IGN);
-
-  pid = fork ();
-  if (pid == (pid_t)-1)
-    {
-      logit (LOG_CRIT, PGM ": second fork failed: %s", strerror (errno));
-      exit (1);
-    }
-  if (pid)
-    exit (0); /* First child exits. */
-
-  running_detached = 1;
-
-  if (chdir("/"))
-    {
-      logit (LOG_CRIT, "chdir(\"/\") failed: %s", strerror (errno));
-      exit (1);
-    }
-  umask (0);
-
-  for (i=0; i <= 2; i++)
-    close (i);
-
-  openlog (PGM, LOG_PID, LOG_DAEMON);
-}
-
-
-static void
-disable_core_dumps (void)
-{
-#ifdef HAVE_SETRLIMIT
-  struct rlimit limit;
-
-  if (getrlimit (RLIMIT_CORE, &limit))
-    limit.rlim_max = 0;
-  limit.rlim_cur = 0;
-  if( !setrlimit (RLIMIT_CORE, &limit) )
-    return 0;
-  if (errno != EINVAL && errno != ENOSYS)
-    logit (LOG_ERR, "can't disable core dumps: %s\n", strerror (errno));
-#endif /* HAVE_SETRLIMIT */
-}
-
-
-
-static void
-print_version (int with_help)
-{
-  fputs (MYVERSION_LINE "\n"
-         "Copyright (C) 2006 Free Software Foundation, Inc.\n"
-         "License GPLv2+: GNU GPL version 2 or later "
-         "<http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>\n"
-         "This is free software: you are free to change and redistribute it.\n"
-         "There is NO WARRANTY, to the extent permitted by law.\n",
-         stdout);
-
-  if (with_help)
-    fputs ("\n"
-           "Usage: " PGM " [OPTIONS] [SOCKETNAME]\n"
-           "Start Libgcrypt's random number daemon listening"
-           " on socket SOCKETNAME\n"
-           "SOCKETNAME defaults to XXX\n"
-           "\n"
-           "  --no-detach   do not deatach from the console\n"
-           "  --version     print version of the program and exit\n"
-           "  --help        display this help and exit\n"
-           BUGREPORT_LINE, stdout );
-
-  exit (0);
-}
-
-static int
-print_usage (void)
-{
-  fputs ("usage: " PGM " [OPTIONS] [SOCKETNAME]\n", stderr);
-  fputs ("       (use --help to display options)\n", stderr);
-  exit (1);
-}
-
-
-int
-main (int argc, char **argv)
-{
-  int no_detach = 0;
-  gpg_error_t err;
-  struct sockaddr_un *srvr_addr;
-  socklen_t addrlen;
-  int fd;
-  int rc;
-  const char *socketname = "/var/run/libgcrypt/S.gcryptrnd";
-
-
-  if (argc)
-    {
-      argc--; argv++;
-    }
-  while (argc && **argv == '-' && (*argv)[1] == '-')
-    {
-      if (!(*argv)[2])
-        {
-          argc--; argv++;
-          break;
-        }
-      else if (!strcmp (*argv, "--version"))
-        print_version (0);
-      else if (!strcmp (*argv, "--help"))
-        print_version (1);
-      else if (!strcmp (*argv, "--no-detach"))
-        {
-          no_detach = 1;
-          argc--; argv++;
-        }
-      else
-        print_usage ();
-    }
-
-  if (argc == 1)
-    socketname = argv[0];
-  else if (argc > 1)
-    print_usage ();
-
-  if (!no_detach)
-    daemonize ();
-
-  signal (SIGPIPE, SIG_IGN);
-
-  logit (LOG_NOTICE, "started version " VERSION );
-
-  /* Libgcrypt requires us to register the threading model before we
-     do anything else with it. Note that this also calls pth_init.  We
-     do the initialization while already running as a daemon to avoid
-     overhead with double initialization of Libgcrypt. */
-  err = gcry_control (GCRYCTL_SET_THREAD_CBS, &gcry_threads_pth);
-  if (err)
-    {
-      logit (LOG_CRIT, "can't register GNU Pth with Libgcrypt: %s",
-             gpg_strerror (err));
-      exit (1);
-    }
-
-  /* Check that the libgcrypt version is sufficient.  */
-  if (!gcry_check_version (VERSION) )
-    {
-      logit (LOG_CRIT, "libgcrypt is too old (need %s, have %s)",
-             VERSION, gcry_check_version (NULL) );
-      exit (1);
-    }
-
-  /* Register the logging callback and tell Libcgrypt to put the
-     random pool into secure memory. */
-  gcry_set_log_handler (my_gcry_logger, NULL);
-  gcry_control (GCRYCTL_USE_SECURE_RNDPOOL);
-
-  /* Obviously we don't want to allow any core dumps. */
-  disable_core_dumps ();
-
-  /* Initialize the secure memory stuff which will also drop any extra
-     privileges we have. */
-  gcry_control (GCRYCTL_INIT_SECMEM, 16384, 0);
-
-  /* Register a cleanup handler. */
-  atexit (cleanup);
-
-  /* Create and listen on the socket. */
-  fd = socket (AF_UNIX, SOCK_STREAM, 0);
-  if (fd == -1)
-    {
-      logit (LOG_CRIT, "can't create socket: %s", strerror (errno));
-      exit (1);
-    }
-  srvr_addr = gcry_xmalloc (sizeof *srvr_addr);
-  memset (srvr_addr, 0, sizeof *srvr_addr);
-  srvr_addr->sun_family = AF_UNIX;
-  if (strlen (socketname) + 1 >= sizeof (srvr_addr->sun_path))
-    {
-      logit (LOG_CRIT, "socket name `%s' too long", socketname);
-      exit (1);
-    }
-  strcpy (srvr_addr->sun_path, socketname);
-  addrlen = (offsetof (struct sockaddr_un, sun_path)
-             + strlen (srvr_addr->sun_path) + 1);
-  rc = bind (fd, (struct sockaddr*) srvr_addr, addrlen);
-  if (rc == -1 && errno == EADDRINUSE)
-    {
-      remove (socketname);
-      rc = bind (fd, (struct sockaddr*) srvr_addr, addrlen);
-    }
-  if (rc == -1)
-    {
-      logit (LOG_CRIT, "error binding socket to `%s': %s",
-             srvr_addr->sun_path, strerror (errno));
-      close (fd);
-      exit (1);
-    }
-
-  if (listen (fd, 5 ) == -1)
-    {
-      logit (LOG_CRIT, "listen() failed: %s", strerror (errno));
-      close (fd);
-      exit (1);
-    }
-
-  logit (LOG_INFO, "listening on socket `%s', fd=%d",
-         srvr_addr->sun_path, fd);
-
-  serve (fd);
-  close (fd);
-
-  logit (LOG_NOTICE, "stopped version " VERSION );
-  return 0;
-}
-
-
-/* Send LENGTH bytes of BUFFER to file descriptor FD.  Returns 0 on
-   success or another value on write error. */
-static int
-writen (int fd, const void *buffer, size_t length)
-{
-  while (length)
-    {
-      ssize_t n = pth_write (fd, buffer, length);
-      if (n < 0)
-         {
-           logit (LOG_ERR, "connection %d: write error: %s",
-                  fd, strerror (errno));
-           return -1; /* write error */
-         }
-      length -= n;
-      buffer = (const char*)buffer + n;
-    }
-  return 0;  /* Okay */
-}
-
-
-/* Send an error response back.  Returns 0 on success. */
-static int
-send_error (int fd, int errcode)
-{
-  unsigned char buf[2];
-
-  buf[0] = errcode;
-  buf[1] = 0;
-  return writen (fd, buf, 2 );
-}
-
-/* Send a pong response back.  Returns 0 on success or another value
-   on write error.  */
-static int
-send_pong (int fd)
-{
-  return writen (fd, "\x00\x04pong", 6);
-}
-
-/* Send a nonce of size LENGTH back. Return 0 on success. */
-static int
-send_nonce (int fd, int length)
-{
-  unsigned char buf[2+255];
-  int rc;
-
-  assert (length >= 0 && length <= 255);
-  buf[0] = 0;
-  buf[1] = length;
-  gcry_create_nonce (buf+2, length);
-  rc = writen (fd, buf, 2+length );
-  wipememory (buf+2, length);
-  return rc;
-}
-
-/* Send a random of size LENGTH with quality LEVEL back. Return 0 on
-   success. */
-static int
-send_random (int fd, int length, int level)
-{
-  unsigned char buf[2+255];
-  int rc;
-
-  assert (length >= 0 && length <= 255);
-  assert (level == GCRY_STRONG_RANDOM || level == GCRY_VERY_STRONG_RANDOM);
-  buf[0] = 0;
-  buf[1] = length;
-  /* Note that we don't bother putting the random stuff into secure
-     memory because this daemon is anyway intended to be run under
-     root and it is questionable whether the kernel buffers etc. are
-     equally well protected. */
-  gcry_randomize (buf+2, length, level);
-  rc = writen (fd, buf, 2+length );
-  wipememory (buf+2, length);
-  return rc;
-}
-
-/* Main processing loop for a connection.
-
-   A request is made up of:
-
-    1 byte  Total length of request; must be 3
-    1 byte  Command
-            0   = Ping
-            10  = GetNonce
-            11  = GetStrongRandom
-            12  = GetVeryStrongRandom
-            (all other values are reserved)
-    1 byte  Number of requested bytes.
-            This is ignored for command Ping.
-
-   A response is made up of:
-
-    1 byte  Error Code
-            0    = Everything is fine
-            1    = Bad Command
-            0xff = Other error.
-            (For a bad request the connection will simply be closed)
-    1 byte  Length of data
-    n byte  data
-
-   The requests are read as long as the connection is open.
-
-
- */
-static void
-connection_loop (int fd)
-{
-  unsigned char request[3];
-  unsigned char *p;
-  int nleft, n;
-  int rc;
-
-  for (;;)
-    {
-      for (nleft=3, p=request; nleft > 0; )
-        {
-          n = pth_read (fd, p, nleft);
-          if (!n && p == request)
-            return; /* Client terminated connection. */
-          if (n <= 0)
-            {
-              logit (LOG_ERR, "connection %d: read error: %s",
-                     fd, n? strerror (errno) : "Unexpected EOF");
-              return;
-            }
-          p += n;
-          nleft -= n;
-        }
-      if (request[0] != 3)
-        {
-          logit (LOG_ERR, "connection %d: invalid length (%d) of request",
-                 fd, request[0]);
-          return;
-        }
-
-      switch (request[1])
-        {
-        case 0: /* Ping */
-          rc = send_pong (fd);
-          break;
-        case 10: /* GetNonce */
-          rc = send_nonce (fd, request[2]);
-          break;
-        case 11: /* GetStrongRandom */
-          rc = send_random (fd, request[2], GCRY_STRONG_RANDOM);
-          break;
-        case 12: /* GetVeryStrongRandom */
-          rc = send_random (fd, request[2], GCRY_VERY_STRONG_RANDOM);
-          break;
-
-        default: /* Invalid command */
-          rc = send_error (fd, 1);
-          break;
-        }
-      if (rc)
-        break; /* A write error occurred while sending the response. */
-    }
-}
-
-
-
-/* Entry point for a connection's thread. */
-static void *
-connection_thread (void *arg)
-{
-  int fd = (int)arg;
-
-  active_connections++;
-  logit (LOG_INFO, "connection handler for fd %d started", fd);
-
-  connection_loop (fd);
-
-  close (fd);
-  logit (LOG_INFO, "connection handler for fd %d terminated", fd);
-  active_connections--;
-
-  return NULL;
-}
-
-
-/* This signal handler is called from the main loop between acepting
-   connections.  It is called on the regular stack, thus no special
-   caution needs to be taken.  It returns true to indicate that the
-   process should terminate. */
-static int
-handle_signal (int signo)
-{
-  switch (signo)
-    {
-    case SIGHUP:
-      logit (LOG_NOTICE, "SIGHUP received - re-reading configuration");
-      break;
-
-    case SIGUSR1:
-      logit (LOG_NOTICE, "SIGUSR1 received - no action defined");
-      break;
-
-    case SIGUSR2:
-      logit (LOG_NOTICE, "SIGUSR2 received - no action defined");
-      break;
-
-    case SIGTERM:
-      if (!shutdown_pending)
-        logit (LOG_NOTICE, "SIGTERM received - shutting down ...");
-      else
-        logit (LOG_NOTICE, "SIGTERM received - still %d active connections",
-               active_connections);
-      shutdown_pending++;
-      if (shutdown_pending > 2)
-        {
-          logit (LOG_NOTICE, "shutdown forced");
-          return 1;
-       }
-      break;
-
-    case SIGINT:
-      logit (LOG_NOTICE, "SIGINT received - immediate shutdown");
-      return 1;
-
-    default:
-      logit (LOG_NOTICE, "signal %d received - no action defined\n", signo);
-    }
-  return 0;
-}
-
-
-
-/* Main server loop.  This is called with the FD of the listening
-   socket. */
-static void
-serve (int listen_fd)
-{
-  pth_attr_t tattr;
-  pth_event_t ev;
-  sigset_t sigs;
-  int signo;
-  struct sockaddr_un paddr;
-  socklen_t plen = sizeof (paddr);
-  int fd;
-
-  tattr = pth_attr_new();
-  pth_attr_set (tattr, PTH_ATTR_JOINABLE, 0);
-  pth_attr_set (tattr, PTH_ATTR_STACK_SIZE, 256*1024);
-  pth_attr_set (tattr, PTH_ATTR_NAME, "connection");
-
-  sigemptyset (&sigs);
-  sigaddset (&sigs, SIGHUP);
-  sigaddset (&sigs, SIGUSR1);
-  sigaddset (&sigs, SIGUSR2);
-  sigaddset (&sigs, SIGINT);
-  sigaddset (&sigs, SIGTERM);
-  ev = pth_event (PTH_EVENT_SIGS, &sigs, &signo);
-
-  for (;;)
-    {
-      if (shutdown_pending)
-        {
-          if (!active_connections)
-            break; /* Ready. */
-
-          /* Do not accept anymore connections but wait for existing
-             connections to terminate.  */
-          signo = 0;
-          pth_wait (ev);
-          if (pth_event_occurred (ev) && signo)
-            if (handle_signal (signo))
-              break; /* Stop the loop. */
-          continue;
-       }
-
-      gcry_fast_random_poll ();
-      fd = pth_accept_ev (listen_fd, (struct sockaddr *)&paddr, &plen, ev);
-      if (fd == -1)
-        {
-          if (pth_event_occurred (ev))
-            {
-              if (handle_signal (signo))
-                break; /* Stop the loop. */
-              continue;
-           }
-          logit (LOG_WARNING, "accept failed: %s - waiting 1s\n",
-                 strerror (errno));
-          gcry_fast_random_poll ();
-          pth_sleep (1);
-          continue;
-       }
-
-      if (!pth_spawn (tattr, connection_thread, (void*)fd))
-        {
-          logit (LOG_ERR, "error spawning connection handler: %s\n",
-                 strerror (errno) );
-          close (fd);
-       }
-    }
-
-  pth_event_free (ev, PTH_FREE_ALL);
-}
diff --git a/src/getrandom.c b/src/getrandom.c
deleted file mode 100644 (file)
index f9bb5c0..0000000
+++ /dev/null
@@ -1,326 +0,0 @@
-/* getrandom.c - Libgcrypt Random Number client
- * Copyright (C) 2006 Free Software Foundation, Inc.
- *
- * Getrandom is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published
- * by the Free Software Foundation; either version 2 of the License,
- * or (at your option) any later version.
- *
- * Getrandom is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <config.h>
-#include <stdio.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <stdarg.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <unistd.h>
-#include <errno.h>
-
-#define PGM "getrandom"
-#define MYVERSION_LINE PGM " (Libgcrypt) " VERSION
-#define BUGREPORT_LINE "\nReport bugs to <bug-libgcrypt@gnupg.org>.\n"
-
-
-static void
-logit (const char *format, ...)
-{
-  va_list arg_ptr;
-
-  va_start (arg_ptr, format) ;
-  fputs (PGM ": ", stderr);
-  vfprintf (stderr, format, arg_ptr);
-  putc ('\n', stderr);
-  va_end (arg_ptr);
-}
-
-
-/* Send LENGTH bytes of BUFFER to file descriptor FD.  Returns 0 on
-   success or another value on write error. */
-static int
-writen (int fd, const void *buffer, size_t length)
-{
-  while (length)
-    {
-      ssize_t n;
-
-      do
-        n = write (fd, buffer, length);
-      while (n < 0 && errno == EINTR);
-      if (n < 0)
-         {
-           logit ("write error: %s", strerror (errno));
-           return -1; /* write error */
-         }
-      length -= n;
-      buffer = (const char *)buffer + n;
-    }
-  return 0;  /* Okay */
-}
-
-
-
-
-static void
-print_version (int with_help)
-{
-  fputs (MYVERSION_LINE "\n"
-         "Copyright (C) 2006 Free Software Foundation, Inc.\n"
-         "License GPLv2+: GNU GPL version 2 or later "
-         "<http://www.gnu.org/licenses/old-licenses/gpl-2.0.html>\n"
-         "This is free software: you are free to change and redistribute it.\n"
-         "There is NO WARRANTY, to the extent permitted by law.\n",
-         stdout);
-
-  if (with_help)
-    fputs ("\n"
-           "Usage: " PGM " [OPTIONS] NBYTES\n"
-           "Connect to libgcrypt's random number daemon and "
-           "return random numbers"
-           "\n"
-           "  --nonce       Return weak random suitable for a nonce\n"
-           "  --very-strong Return very strong random\n"
-           "  --ping        Send a ping\n"
-           "  --socket NAME Name of sockket to connect to\n"
-           "  --hex         Return result as a hex dump\n"
-           "  --verbose     Show what we are doing\n"
-           "  --version     Print version of the program and exit\n"
-           "  --help        Display this help and exit\n"
-           BUGREPORT_LINE, stdout );
-
-  exit (0);
-}
-
-static int
-print_usage (void)
-{
-  fputs ("usage: " PGM " [OPTIONS] NBYTES\n", stderr);
-  fputs ("       (use --help to display options)\n", stderr);
-  exit (1);
-}
-
-
-int
-main (int argc, char **argv)
-{
-  struct sockaddr_un *srvr_addr;
-  socklen_t addrlen;
-  int fd;
-  int rc;
-  unsigned char buffer[300];
-  int nleft, nread;
-  const char *socketname = "/var/run/libgcrypt/S.gcryptrnd";
-  int do_ping = 0;
-  int get_nonce = 0;
-  int get_very_strong = 0;
-  int req_nbytes, nbytes, n;
-  int verbose = 0;
-  int fail = 0;
-  int do_hex = 0;
-
-  if (argc)
-    {
-      argc--; argv++;
-    }
-  while (argc && **argv == '-' && (*argv)[1] == '-')
-    {
-      if (!(*argv)[2])
-        {
-          argc--; argv++;
-          break;
-        }
-      else if (!strcmp (*argv, "--version"))
-        print_version (0);
-      else if (!strcmp (*argv, "--help"))
-        print_version (1);
-      else if (!strcmp (*argv, "--socket") && argc > 1 )
-        {
-          argc--; argv++;
-          socketname = *argv;
-          argc--; argv++;
-        }
-      else if (!strcmp (*argv, "--nonce"))
-        {
-          argc--; argv++;
-          get_nonce = 1;
-        }
-      else if (!strcmp (*argv, "--very-strong"))
-        {
-          argc--; argv++;
-          get_very_strong = 1;
-        }
-      else if (!strcmp (*argv, "--ping"))
-        {
-          argc--; argv++;
-          do_ping = 1;
-        }
-      else if (!strcmp (*argv, "--hex"))
-        {
-          argc--; argv++;
-          do_hex = 1;
-        }
-      else if (!strcmp (*argv, "--verbose"))
-        {
-          argc--; argv++;
-          verbose = 1;
-        }
-      else
-        print_usage ();
-    }
-
-
-  if (!argc && do_ping)
-    ; /* This is allowed. */
-  else if (argc != 1)
-    print_usage ();
-  req_nbytes = argc? atoi (*argv) : 0;
-
-  if (req_nbytes < 0)
-    print_usage ();
-
-  /* Create a socket. */
-  fd = socket (AF_UNIX, SOCK_STREAM, 0);
-  if (fd == -1)
-    {
-      logit ("can't create socket: %s", strerror (errno));
-      exit (1);
-    }
-  srvr_addr = malloc (sizeof *srvr_addr);
-  if (!srvr_addr)
-    {
-      logit ("malloc failed: %s", strerror (errno));
-      exit (1);
-    }
-  memset (srvr_addr, 0, sizeof *srvr_addr);
-  srvr_addr->sun_family = AF_UNIX;
-  if (strlen (socketname) + 1 >= sizeof (srvr_addr->sun_path))
-    {
-      logit ("socket name `%s' too long", socketname);
-      exit (1);
-    }
-  strcpy (srvr_addr->sun_path, socketname);
-  addrlen = (offsetof (struct sockaddr_un, sun_path)
-             + strlen (srvr_addr->sun_path) + 1);
-  rc = connect (fd, (struct sockaddr*) srvr_addr, addrlen);
-  if (rc == -1)
-    {
-      logit ("error connecting socket `%s': %s",
-             srvr_addr->sun_path, strerror (errno));
-      close (fd);
-      exit (1);
-    }
-
-  do
-    {
-      nbytes = req_nbytes > 255? 255 : req_nbytes;
-      req_nbytes -= nbytes;
-
-      buffer[0] = 3;
-      if (do_ping)
-        buffer[1] = 0;
-      else if (get_nonce)
-        buffer[1] = 10;
-      else if (get_very_strong)
-        buffer[1] = 12;
-      else
-        buffer[1] = 11;
-      buffer[2] = nbytes;
-      if (writen (fd, buffer, 3))
-        fail = 1;
-      else
-        {
-          for (nleft=2, nread=0; nleft > 0; )
-            {
-              do
-                n = read (fd, buffer+nread, nleft);
-              while (n < 0 && errno == EINTR);
-              if (n < 0)
-                {
-                  logit ("read error: %s", strerror (errno));
-                  exit (1);
-                }
-              nleft -= n;
-              nread += n;
-              if (nread && buffer[0])
-                {
-                  logit ("server returned error code %d", buffer[0]);
-                  exit (1);
-                }
-            }
-          if (verbose)
-            logit ("received response with %d bytes of data", buffer[1]);
-          if (buffer[1] < nbytes)
-            {
-              logit ("warning: server returned less bytes than requested");
-              fail = 1;
-            }
-          else if (buffer[1] > nbytes && !do_ping)
-            {
-              logit ("warning: server returned more bytes than requested");
-              fail = 1;
-            }
-          nbytes = buffer[1];
-          if (nbytes > sizeof buffer)
-            {
-              logit ("buffer too short to receive data");
-              exit (1);
-            }
-
-          for (nleft=nbytes, nread=0; nleft > 0; )
-            {
-              do
-                n = read (fd, buffer+nread, nleft);
-              while (n < 0 && errno == EINTR);
-              if (n < 0)
-                {
-                  logit ("read error: %s", strerror (errno));
-                  exit (1);
-                }
-              nleft -= n;
-              nread += n;
-            }
-
-          if (do_hex)
-            {
-              for (n=0; n < nbytes; n++)
-                {
-                  if (!n)
-                    ;
-                  else if (!(n % 16))
-                    putchar ('\n');
-                  else
-                    putchar (' ');
-                  printf ("%02X", buffer[n]);
-                }
-              if (nbytes)
-                putchar ('\n');
-            }
-          else
-            {
-              if (fwrite (buffer, nbytes, 1, stdout) != 1)
-                {
-                  logit ("error writing to stdout: %s", strerror (errno));
-                  fail = 1;
-                }
-            }
-        }
-    }
-  while (!fail && req_nbytes);
-
-  close (fd);
-  free (srvr_addr);
-  return fail? 1 : 0;
-}
index 9a356a04162524a1111d1a4b66bfb5f8ae785b9b..593ea406fac57889f9c2f56f309d20285394b80e 100644 (file)
@@ -7,7 +7,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -102,6 +102,9 @@ global_init (void)
   if (!pre_syscall_func)
     gpgrt_get_syscall_clamp (&pre_syscall_func, &post_syscall_func);
 
+  /* Add a handler to be called after log_fatal and log_debug.  */
+  _gcry_set_gpgrt_post_log_handler ();
+
   /* See whether the system is in FIPS mode.  This needs to come as
      early as possible but after ATH has been initialized.  */
   _gcry_initialize_fips_mode (force_fips_mode);
@@ -307,12 +310,7 @@ print_config (const char *what, gpgrt_stream_t fp)
   if (!what || !strcmp (what, "cc"))
     {
       gpgrt_fprintf (fp, "cc:%d:%s:\n",
-#if GPGRT_VERSION_NUMBER >= 0x011b00 /* 1.27 */
-                     GPGRT_GCC_VERSION
-#else
-                     _GPG_ERR_GCC_VERSION /* Due to a bug in gpg-error.h.  */
-#endif
-                     ,
+                     GPGRT_GCC_VERSION,
 #ifdef __clang__
                      "clang:" __VERSION__
 #elif __GNUC__
@@ -356,6 +354,11 @@ print_config (const char *what, gpgrt_stream_t fp)
       gpgrt_fprintf (fp, "cpu-arch:"
 #if defined(HAVE_CPU_ARCH_X86)
                      "x86"
+#              ifdef __x86_64__
+                     ":amd64"
+#              else
+                     ":i386"
+#              endif
 #elif defined(HAVE_CPU_ARCH_ALPHA)
                      "alpha"
 #elif defined(HAVE_CPU_ARCH_SPARC)
@@ -523,7 +526,7 @@ _gcry_vcontrol (enum gcry_ctl_cmds cmd, va_list arg_ptr)
   switch (cmd)
     {
     case GCRYCTL_ENABLE_M_GUARD:
-      _gcry_private_enable_m_guard ();
+      rc = GPG_ERR_NOT_SUPPORTED;
       break;
 
     case GCRYCTL_ENABLE_QUICK_RANDOM:
@@ -1059,20 +1062,6 @@ _gcry_is_secure (const void *a)
   return _gcry_private_is_secure (a);
 }
 
-void
-_gcry_check_heap( const void *a )
-{
-  (void)a;
-
-    /* FIXME: implement this*/
-#if 0
-    if( some_handler )
-       some_handler(a)
-    else
-       _gcry_private_check_heap(a)
-#endif
-}
-
 static void *
 _gcry_realloc_core (void *a, size_t n, int xhint)
 {
index 899e6d1589b2045c47179b821cd3001383dcdbfc..962de85e29b33c8276918b4501355e9a2b21e9ba 100644 (file)
@@ -35,7 +35,7 @@
      WORDS_BIGENDIAN       Defined to 1 on big endian systems.
      inline                If defined, it should yield the keyword used
                            to inline a function.
-     HAVE_TYPE_U32         Defined if the u32 type is available.
+     HAVE_U32              Defined if the u32 type is available.
      SIZEOF_UNSIGNED_INT   Defined to the size in bytes of an unsigned int.
      SIZEOF_UNSIGNED_LONG  Defined to the size in bytes of an unsigned long.
 
@@ -50,7 +50,7 @@
 # define KEY_FOR_BINARY_CHECK "What am I, a doctor or a moonshuttle conductor?"
 # endif
 #include <stdint.h>
-#define HAVE_TYPE_U32 1
+#define HAVE_U32 1
 typedef uint32_t u32;
 #define VERSION "standalone"
 /* For GCC, we can detect endianness.  If not GCC, please define manually.  */
@@ -82,7 +82,7 @@ typedef uint32_t u32;
 
 
 
-#ifndef HAVE_TYPE_U32
+#ifndef HAVE_U32
 # undef u32 /* Undef a possible macro with that name.  */
 # if SIZEOF_UNSIGNED_INT == 4
    typedef unsigned int u32;
@@ -91,7 +91,7 @@ typedef uint32_t u32;
 # else
 #  error no typedef for u32
 # endif
-# define HAVE_TYPE_U32
+# define HAVE_U32
 #endif
 
 
@@ -103,8 +103,8 @@ struct hmac256_context
   u32  h0, h1, h2, h3, h4, h5, h6, h7;
   u32  nblocks;
   int  count;
-  int  finalized:1;
-  int  use_hmac:1;
+  unsigned int  finalized:1;
+  unsigned int  use_hmac:1;
   unsigned char buf[64];
   unsigned char opad[64];
 };
index df28e72721fc9c1bbe68ee88f780edf1bd1b2447..05700e0e0e66d19160f9b1521fec148a1889f030 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index 41188583456cfb57d3defecde20a3329895d08f0..500cd97a7179d38a103776159fb3983d76ee21da 100644 (file)
@@ -1,5 +1,5 @@
 /* hwf-arm.c - Detect hardware features - ARM part
- * Copyright (C) 2013,2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013,2019,2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
     defined(HAVE_ELF_AUX_INFO))
 #include <sys/auxv.h>
 #endif
+#if defined(__APPLE__) && defined(HAVE_SYS_SYSCTL_H) && \
+    defined(HAVE_SYSCTLBYNAME)
+#include <sys/sysctl.h>
+#endif
 
 #include "g10lib.h"
 #include "hwf-common.h"
@@ -137,6 +141,37 @@ static const struct feature_map_s arm_features[] =
 #ifndef HWCAP_SHA2
 # define HWCAP_SHA2  64
 #endif
+#ifndef HWCAP_SHA3
+# define HWCAP_SHA3  (1 << 17)
+#endif
+#ifndef HWCAP_SM3
+# define HWCAP_SM3   (1 << 18)
+#endif
+#ifndef HWCAP_SM4
+# define HWCAP_SM4   (1 << 19)
+#endif
+#ifndef HWCAP_SHA512
+# define HWCAP_SHA512 (1 << 21)
+#endif
+#ifndef HWCAP_SVE
+# define HWCAP_SVE    (1 << 22)
+#endif
+
+#ifndef HWCAP2_SVE2
+# define HWCAP2_SVE2        (1 << 1)
+#endif
+#ifndef HWCAP2_SVEAES
+# define HWCAP2_SVEAES      (1 << 2)
+#endif
+#ifndef HWCAP2_SVEPMULL
+# define HWCAP2_SVEPMULL    (1 << 3)
+#endif
+#ifndef HWCAP2_SVESHA3
+# define HWCAP2_SVESHA3     (1 << 5)
+#endif
+#ifndef HWCAP2_SVESM4
+# define HWCAP2_SVESM4      (1 << 6)
+#endif
 
 static const struct feature_map_s arm_features[] =
   {
@@ -148,6 +183,18 @@ static const struct feature_map_s arm_features[] =
     { HWCAP_SHA1, 0, " sha1", HWF_ARM_SHA1 },
     { HWCAP_SHA2, 0, " sha2", HWF_ARM_SHA2 },
     { HWCAP_PMULL, 0, " pmull", HWF_ARM_PMULL },
+    { HWCAP_SHA3, 0, " sha3",  HWF_ARM_SHA3 },
+    { HWCAP_SM3, 0, " sm3",  HWF_ARM_SM3 },
+    { HWCAP_SM4, 0, " sm4",  HWF_ARM_SM4 },
+    { HWCAP_SHA512, 0, " sha512",  HWF_ARM_SHA512 },
+#endif
+#ifdef ENABLE_SVE_SUPPORT
+    { HWCAP_SVE, 0, " sve",  HWF_ARM_SVE },
+    { 0, HWCAP2_SVE2, " sve2",  HWF_ARM_SVE2 },
+    { 0, HWCAP2_SVEAES, " sveaes",  HWF_ARM_SVEAES },
+    { 0, HWCAP2_SVEPMULL, " svepmull",  HWF_ARM_SVEPMULL },
+    { 0, HWCAP2_SVESHA3, " svesha3",  HWF_ARM_SVESHA3 },
+    { 0, HWCAP2_SVESM4, " svesm4",  HWF_ARM_SVESM4 },
 #endif
   };
 
@@ -369,6 +416,128 @@ detect_arm_proc_cpuinfo(unsigned int *broken_hwfs)
 
 #endif /* __linux__ */
 
+
+#undef HAS_APPLE_SYSCTLBYNAME
+#if defined(__APPLE__) && defined(HAVE_SYS_SYSCTL_H) && \
+    defined(HAVE_SYSCTLBYNAME)
+#define HAS_APPLE_SYSCTLBYNAME 1
+
+static unsigned int
+detect_arm_apple_sysctlbyname (void)
+{
+  static const struct
+  {
+    const char *feat_name;
+    unsigned int hwf_flag;
+  } hw_optional_arm_features[] =
+    {
+#ifdef ENABLE_NEON_SUPPORT
+      { "hw.optional.neon",            HWF_ARM_NEON },
+      { "hw.optional.AdvSIMD",         HWF_ARM_NEON },
+#endif
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+      { "hw.optional.arm.FEAT_AES",    HWF_ARM_AES },
+      { "hw.optional.arm.FEAT_SHA1",   HWF_ARM_SHA1 },
+      { "hw.optional.arm.FEAT_SHA256", HWF_ARM_SHA2 },
+      { "hw.optional.arm.FEAT_PMULL",  HWF_ARM_PMULL },
+      { "hw.optional.arm.FEAT_SHA3",   HWF_ARM_SHA3 },
+      { "hw.optional.armv8_2_sha3",    HWF_ARM_SHA3 },
+      { "hw.optional.arm.FEAT_SHA512", HWF_ARM_SHA512 },
+      { "hw.optional.armv8_2_sha512",  HWF_ARM_SHA512 },
+#endif
+    };
+  unsigned int i;
+  unsigned int hwf = 0;
+
+  for (i = 0; i < DIM(hw_optional_arm_features); i++)
+    {
+      const char *name = hw_optional_arm_features[i].feat_name;
+      int sysctl_value = 0;
+      size_t value_size = sizeof(sysctl_value);
+
+      if (sysctlbyname (name, &sysctl_value, &value_size, NULL, 0) != 0)
+        continue;
+
+      if (value_size != sizeof(sysctl_value))
+        continue;
+
+      if (sysctl_value == 1)
+        {
+          hwf |= hw_optional_arm_features[i].hwf_flag;
+        }
+    }
+
+  return hwf;
+}
+
+#endif /* __APPLE__ */
+
+
+static unsigned int
+detect_arm_hwf_by_toolchain (void)
+{
+  unsigned int ret = 0;
+
+  /* Detect CPU features required by toolchain.
+   * This allows detection of ARMv8 crypto extension support,
+   * for example, on macOS/aarch64.
+   */
+
+#if __GNUC__ >= 4
+
+#if defined(__ARM_NEON) && defined(ENABLE_NEON_SUPPORT)
+  ret |= HWF_ARM_NEON;
+
+#ifdef HAVE_GCC_INLINE_ASM_NEON
+  /* Early test for NEON instruction to detect faulty toolchain
+   * configuration. */
+  asm volatile ("veor q15, q15, q15":::"q15");
+#endif
+
+#ifdef HAVE_GCC_INLINE_ASM_AARCH64_NEON
+  /* Early test for NEON instruction to detect faulty toolchain
+   * configuration. */
+  asm volatile ("eor v31.16b, v31.16b, v31.16b":::"v31");
+#endif
+
+#endif /* __ARM_NEON */
+
+#if defined(__ARM_FEATURE_CRYPTO) && defined(ENABLE_ARM_CRYPTO_SUPPORT)
+  /* ARMv8 crypto extensions include support for PMULL, AES, SHA1 and SHA2
+   * instructions. */
+  ret |= HWF_ARM_PMULL;
+  ret |= HWF_ARM_AES;
+  ret |= HWF_ARM_SHA1;
+  ret |= HWF_ARM_SHA2;
+
+#ifdef HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO
+  /* Early test for CE instructions to detect faulty toolchain
+   * configuration. */
+  asm volatile ("vmull.p64 q0, d0, d0;\n\t"
+               "aesimc.8 q7, q0;\n\t"
+               "sha1su1.32 q0, q0;\n\t"
+               "sha256su1.32 q0, q7, q15;\n\t"
+               :::
+               "q0", "q7", "q15");
+#endif
+
+#ifdef HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO
+  /* Early test for CE instructions to detect faulty toolchain
+   * configuration. */
+  asm volatile ("pmull2 v0.1q, v0.2d, v31.2d;\n\t"
+               "aesimc v15.16b, v0.16b;\n\t"
+               "sha1su1 v0.4s, v0.4s;\n\t"
+               "sha256su1 v0.4s, v15.4s, v31.4s;\n\t"
+               :::
+               "v0", "v15", "v31");
+#endif
+#endif
+
+#endif
+
+  return ret;
+}
+
 unsigned int
 _gcry_hwf_detect_arm (void)
 {
@@ -383,10 +552,12 @@ _gcry_hwf_detect_arm (void)
   ret |= detect_arm_proc_cpuinfo (&broken_hwfs);
 #endif
 
-#if defined(__ARM_NEON) && defined(ENABLE_NEON_SUPPORT)
-  ret |= HWF_ARM_NEON;
+#if defined (HAS_APPLE_SYSCTLBYNAME)
+  ret |= detect_arm_apple_sysctlbyname ();
 #endif
 
+  ret |= detect_arm_hwf_by_toolchain ();
+
   ret &= ~broken_hwfs;
 
   return ret;
index b10f86be6b69cfedf80e29042d5e3bfd410189c9..ebd045c53245e2f7e651da812fe47c3733e4f453 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
index a1aa02e7813db5bcb70ced812261ea658cfbbba5..bda14d9d6dd389f3fcd1689b7aab1c36f5df2e9a 100644 (file)
 #if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__)
 # define HAS_X86_CPUID 1
 
-#if _GCRY_GCC_VERSION >= 40700 /* 4.7 */
-# define FORCE_FUNC_FRAME_POINTER \
-       __attribute__ ((optimize("no-omit-frame-pointer")))
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+# define CFI_ADJUST_CFA_OFFSET(off) ".cfi_adjust_cfa_offset " #off "\n\t"
+# define CFI_PUSH4 CFI_ADJUST_CFA_OFFSET(4)
+# define CFI_POP4 CFI_ADJUST_CFA_OFFSET(-4)
 #else
-# define FORCE_FUNC_FRAME_POINTER
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_PUSH4
+# define CFI_POP4
 #endif
 
-static FORCE_FUNC_FRAME_POINTER int
+static int
 is_cpuid_available(void)
 {
   int has_cpuid = 0;
@@ -55,15 +58,23 @@ is_cpuid_available(void)
      vs 32 bit pushf/popf). */
   asm volatile
     ("pushf\n\t"                 /* Copy flags to EAX.  */
+     CFI_PUSH4
      "popl %%eax\n\t"
+     CFI_POP4
      "movl %%eax, %%ecx\n\t"     /* Save flags into ECX.  */
      "xorl $0x200000, %%eax\n\t" /* Toggle ID bit and copy it to the flags.  */
      "pushl %%eax\n\t"
+     CFI_PUSH4
      "popf\n\t"
+     CFI_POP4
      "pushf\n\t"                 /* Copy changed flags again to EAX.  */
+     CFI_PUSH4
      "popl %%eax\n\t"
+     CFI_POP4
      "pushl %%ecx\n\t"           /* Restore flags from ECX.  */
+     CFI_PUSH4
      "popf\n\t"
+     CFI_POP4
      "xorl %%eax, %%ecx\n\t"     /* Compare flags against saved flags.  */
      "jz .Lno_cpuid%=\n\t"       /* Toggling did not work, thus no CPUID.  */
      "movl $1, %0\n"             /* Worked. true -> HAS_CPUID.  */
@@ -182,12 +193,15 @@ detect_x86_gnuc (void)
   } vendor_id;
   unsigned int features, features2;
   unsigned int os_supports_avx_avx2_registers = 0;
+  unsigned int os_supports_avx512_registers = 0;
   unsigned int max_cpuid_level;
   unsigned int fms, family, model;
   unsigned int result = 0;
   unsigned int avoid_vpgather = 0;
+  unsigned int is_amd_cpu = 0;
 
   (void)os_supports_avx_avx2_registers;
+  (void)os_supports_avx512_registers;
 
   if (!is_cpuid_available())
     return 0;
@@ -240,6 +254,7 @@ detect_x86_gnuc (void)
   else if (!strcmp (vendor_id.c, "AuthenticAMD"))
     {
       /* This is an AMD CPU.  */
+      is_amd_cpu = 1;
     }
 
   /* Detect Intel features, that might also be supported by other
@@ -251,77 +266,6 @@ detect_x86_gnuc (void)
   family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20);
   model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12);
 
-  if ((result & HWF_INTEL_CPU) && family == 6)
-    {
-      /* These Intel Core processor models have SHLD/SHRD instruction that
-       * can do integer rotation faster actual ROL/ROR instructions. */
-      switch (model)
-       {
-       case 0x2A:
-       case 0x2D:
-       case 0x3A:
-       case 0x3C:
-       case 0x3F:
-       case 0x45:
-       case 0x46:
-       case 0x3D:
-       case 0x4F:
-       case 0x56:
-       case 0x47:
-       case 0x4E:
-       case 0x5E:
-       case 0x8E:
-       case 0x9E:
-       case 0x55:
-       case 0x66:
-         result |= HWF_INTEL_FAST_SHLD;
-         break;
-       }
-
-      /* These Intel Core processors that have AVX2 have slow VPGATHER and
-       * should be avoided for table-lookup use. */
-      switch (model)
-       {
-       case 0x3C:
-       case 0x3F:
-       case 0x45:
-       case 0x46:
-         /* Haswell */
-         avoid_vpgather |= 1;
-         break;
-       }
-    }
-  else
-    {
-      /* Avoid VPGATHER for non-Intel CPUs as testing is needed to
-       * make sure it is fast enough. */
-
-      avoid_vpgather |= 1;
-    }
-
-#ifdef ENABLE_FORCE_SOFT_HWFEATURES
-  /* Soft HW features mark functionality that is available on all systems
-   * but not feasible to use because of slow HW implementation. */
-
-  /* SHLD is faster at rotating register than actual ROR/ROL instructions
-   * on older Intel systems (~sandy-bridge era). However, SHLD is very
-   * slow on almost anything else and later Intel processors have faster
-   * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled
-   * only for those Intel processors that benefit from the SHLD
-   * instruction. Enabled here unconditionally as requested. */
-  result |= HWF_INTEL_FAST_SHLD;
-
-  /* VPGATHER instructions are used for look-up table based
-   * implementations which require VPGATHER to be fast enough to beat
-   * regular parallelized look-up table implementations (see Twofish).
-   * So far, only Intel processors beginning with skylake have had
-   * VPGATHER fast enough to be enabled. AMD Zen3 comes close to
-   * being feasible, but not quite (where twofish-avx2 is few percent
-   * slower than twofish-3way). Enable VPGATHER here unconditionally
-   * as requested. */
-  avoid_vpgather = 0;
-#endif
-
 #ifdef ENABLE_PCLMUL_SUPPORT
   /* Test bit 1 for PCLMUL.  */
   if (features & 0x00000002)
@@ -338,13 +282,22 @@ detect_x86_gnuc (void)
   if (features & 0x02000000)
      result |= HWF_INTEL_AESNI;
 #endif /*ENABLE_AESNI_SUPPORT*/
-#if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT)
-  /* Test bit 27 for OSXSAVE (required for AVX/AVX2).  */
+#if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT) \
+    || defined(ENABLE_AVX512_SUPPORT)
+  /* Test bit 27 for OSXSAVE (required for AVX/AVX2/AVX512).  */
   if (features & 0x08000000)
     {
+      unsigned int xmm_ymm_mask = (1 << 2) | (1 << 1);
+      unsigned int zmm15_ymm31_k7_mask = (1 << 7) | (1 << 6) | (1 << 5);
+      unsigned int xgetbv = get_xgetbv();
+
       /* Check that OS has enabled both XMM and YMM state support.  */
-      if ((get_xgetbv() & 0x6) == 0x6)
+      if ((xgetbv & xmm_ymm_mask) == xmm_ymm_mask)
         os_supports_avx_avx2_registers = 1;
+
+      /* Check that OS has enabled full AVX512 state support.  */
+      if ((xgetbv & zmm15_ymm31_k7_mask) == zmm15_ymm31_k7_mask)
+        os_supports_avx512_registers = 1;
     }
 #endif
 #ifdef ENABLE_AVX_SUPPORT
@@ -381,9 +334,6 @@ detect_x86_gnuc (void)
       if (features & 0x00000020)
         if (os_supports_avx_avx2_registers)
           result |= HWF_INTEL_AVX2;
-
-      if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
-        result |= HWF_INTEL_FAST_VPGATHER;
 #endif /*ENABLE_AVX_SUPPORT*/
 
       /* Test bit 29 for SHA Extensions. */
@@ -392,12 +342,160 @@ detect_x86_gnuc (void)
 
 #if defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) && \
     defined(ENABLE_PCLMUL_SUPPORT)
-      /* Test bit 9 for VAES and bit 10 for VPCLMULDQD */
+      /* Test features2 bit 9 for VAES and features2 bit 10 for VPCLMULDQD */
       if ((features2 & 0x00000200) && (features2 & 0x00000400))
         result |= HWF_INTEL_VAES_VPCLMUL;
 #endif
+
+#ifdef ENABLE_AVX512_SUPPORT
+      /* Test for AVX512 features. List of features is selected so that
+       * supporting CPUs are new enough not to suffer from reduced clock
+       * frequencies when AVX512 is used, which was issue on early AVX512
+       * capable CPUs.
+       *  - AVX512F (features bit 16)
+       *  - AVX512DQ (features bit 17)
+       *  - AVX512IFMA (features bit 21)
+       *  - AVX512CD (features bit 28)
+       *  - AVX512BW (features bit 30)
+       *  - AVX512VL (features bit 31)
+       *  - AVX512_VBMI (features2 bit 1)
+       *  - AVX512_VBMI2 (features2 bit 6)
+       *  - AVX512_VNNI (features2 bit 11)
+       *  - AVX512_BITALG (features2 bit 12)
+       *  - AVX512_VPOPCNTDQ (features2 bit 14)
+       */
+      if (os_supports_avx512_registers
+         && (features & (1 << 16))
+         && (features & (1 << 17))
+         && (features & (1 << 21))
+         && (features & (1 << 28))
+         && (features & (1 << 30))
+         && (features & (1U << 31))
+         && (features2 & (1 << 1))
+         && (features2 & (1 << 6))
+         && (features2 & (1 << 11))
+         && (features2 & (1 << 12))
+         && (features2 & (1 << 14)))
+       result |= HWF_INTEL_AVX512;
+#endif
+
+      /* Test features2 bit 6 for GFNI (Galois field new instructions).
+       * These instructions are available for SSE/AVX/AVX2/AVX512. */
+      if (features2 & (1 << 6))
+        result |= HWF_INTEL_GFNI;
     }
 
+  if ((result & HWF_INTEL_CPU) && family == 6)
+    {
+      /* These Intel Core processor models have SHLD/SHRD instruction that
+       * can do integer rotation faster actual ROL/ROR instructions. */
+      switch (model)
+       {
+       case 0x2A:
+       case 0x2D:
+       case 0x3A:
+       case 0x3C:
+       case 0x3F:
+       case 0x45:
+       case 0x46:
+       case 0x3D:
+       case 0x4F:
+       case 0x56:
+       case 0x47:
+       case 0x4E:
+       case 0x5E:
+       case 0x8E:
+       case 0x9E:
+       case 0x55:
+       case 0x66:
+         result |= HWF_INTEL_FAST_SHLD;
+         break;
+       }
+
+      /* These Intel Core processors that have AVX2 have slow VPGATHER and
+       * should be avoided for table-lookup use. */
+      switch (model)
+       {
+       case 0x3C:
+       case 0x3F:
+       case 0x45:
+       case 0x46:
+         /* Haswell */
+         avoid_vpgather |= 1;
+         break;
+       }
+
+      /* These Intel Core processors (skylake to tigerlake) have slow VPGATHER
+       * because of mitigation introduced by new microcode (2023-08-08) for
+       * "Downfall" speculative execution vulnerability. */
+      switch (model)
+       {
+       /* Skylake, Cascade Lake, Cooper Lake */
+       case 0x4E:
+       case 0x5E:
+       case 0x55:
+       /* Kaby Lake, Coffee Lake, Whiskey Lake, Amber Lake */
+       case 0x8E:
+       case 0x9E:
+       /* Cannon Lake */
+       case 0x66:
+       /* Comet Lake */
+       case 0xA5:
+       case 0xA6:
+       /* Ice Lake */
+       case 0x7E:
+       case 0x6A:
+       case 0x6C:
+       /* Tiger Lake */
+       case 0x8C:
+       case 0x8D:
+       /* Rocket Lake */
+       case 0xA7:
+         avoid_vpgather |= 1;
+         break;
+       }
+    }
+  else if (is_amd_cpu)
+    {
+      /* Non-AVX512 AMD CPUs (pre-Zen4) have slow VPGATHER and should be
+       * avoided for table-lookup use. */
+      avoid_vpgather |= !(result & HWF_INTEL_AVX512);
+    }
+  else
+    {
+      /* Avoid VPGATHER for non-Intel/non-AMD CPUs as testing is needed to
+       * make sure it is fast enough. */
+      avoid_vpgather |= 1;
+    }
+
+#ifdef ENABLE_FORCE_SOFT_HWFEATURES
+  /* Soft HW features mark functionality that is available on all systems
+   * but not feasible to use because of slow HW implementation. */
+
+  /* Some implementations are disabled for non-Intel CPUs. Mark
+   * current CPU as Intel one to enable those implementations. */
+  result |= HWF_INTEL_CPU;
+
+  /* SHLD is faster at rotating register than actual ROR/ROL instructions
+   * on older Intel systems (~sandy-bridge era). However, SHLD is very
+   * slow on almost anything else and later Intel processors have faster
+   * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled
+   * only for those Intel processors that benefit from the SHLD
+   * instruction. Enabled here unconditionally as requested. */
+  result |= HWF_INTEL_FAST_SHLD;
+
+  /* VPGATHER instructions are used for look-up table based
+   * implementations which require VPGATHER to be fast enough to beat
+   * regular parallelized look-up table implementations (see Twofish).
+   * So far, only Intel processors beginning with Skylake and AMD
+   * processors starting with Zen4 have had VPGATHER fast enough to be
+   * enabled. Enable VPGATHER here unconditionally as requested. */
+  avoid_vpgather = 0;
+#endif
+
+  if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
+    result |= HWF_INTEL_FAST_VPGATHER;
+
   return result;
 }
 #endif /* HAS_X86_CPUID */
index 97e67b3c0f4b26d5707353f630715f3433fc2a2c..b11cadefa9ef346b12202c9311977d3f7804465e 100644 (file)
@@ -62,12 +62,24 @@ static struct
     { HWF_INTEL_RDTSC,         "intel-rdtsc" },
     { HWF_INTEL_SHAEXT,        "intel-shaext" },
     { HWF_INTEL_VAES_VPCLMUL,  "intel-vaes-vpclmul" },
+    { HWF_INTEL_AVX512,        "intel-avx512" },
+    { HWF_INTEL_GFNI,          "intel-gfni" },
 #elif defined(HAVE_CPU_ARCH_ARM)
     { HWF_ARM_NEON,            "arm-neon" },
     { HWF_ARM_AES,             "arm-aes" },
     { HWF_ARM_SHA1,            "arm-sha1" },
     { HWF_ARM_SHA2,            "arm-sha2" },
     { HWF_ARM_PMULL,           "arm-pmull" },
+    { HWF_ARM_SHA3,            "arm-sha3" },
+    { HWF_ARM_SM3,             "arm-sm3" },
+    { HWF_ARM_SM4,             "arm-sm4" },
+    { HWF_ARM_SHA512,          "arm-sha512" },
+    { HWF_ARM_SVE,             "arm-sve" },
+    { HWF_ARM_SVE2,            "arm-sve2" },
+    { HWF_ARM_SVEAES,          "arm-sveaes" },
+    { HWF_ARM_SVEPMULL,        "arm-svepmull" },
+    { HWF_ARM_SVESHA3,         "arm-svesha3" },
+    { HWF_ARM_SVESM4,          "arm-svesm4" },
 #elif defined(HAVE_CPU_ARCH_PPC)
     { HWF_PPC_VCRYPTO,         "ppc-vcrypto" },
     { HWF_PPC_ARCH_3_00,       "ppc-arch_3_00" },
index 6b3b356773330dd8ad890d1d0f28bbbbde338ca3..2456436ab84fa0d1a9403abc97281b0cb796b2f4 100644 (file)
@@ -154,7 +154,7 @@ if test "$echo_cflags" = "yes"; then
 
     tmp=""
     for i in $includes $cflags_final; do
-       if echo "$tmp" | fgrep -v -- "$i" >/dev/null; then
+       if echo "$tmp" | @FGREP@ -v -- "$i" >/dev/null; then
            tmp="$tmp $i"
        fi
     done
@@ -175,7 +175,7 @@ if test "$echo_libs" = "yes"; then
 
     tmp=""
     for i in $libdirs $libs_final; do
-       if echo "$tmp" | fgrep -v -- "$i" >/dev/null; then
+       if echo "$tmp" | @FGREP@ -v -- "$i" >/dev/null; then
            tmp="$tmp $i"
        fi
     done
index d6de731f7ab7e477869b4da9e614f846a81730e7..51f52509c870575a8c2c03e1d3ac249a11a22c05 100644 (file)
@@ -14,8 +14,8 @@
 ;; GNU Lesser General Public License for more details.
 ;;
 ;; You should have received a copy of the GNU Lesser General Public
-;; License along with this program; if not, write to the Free Software
-;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+;; License along with this program; if not, see <https://www.gnu.org/licenses/>.
+;; SPDX-License-Identifier: LGPL-2.1-or-later
 ;;
 
 ;; Note: This file should be updated manually and the ordinals shall
@@ -298,4 +298,13 @@ EXPORTS
       gcry_kdf_final            @260
       gcry_kdf_close            @261
 
+      gcry_cipher_setup_geniv   @262
+      gcry_cipher_geniv         @263
+
+      gcry_kem_keypair          @264
+      gcry_kem_encap            @265
+      gcry_kem_decap            @266
+
+      gcry_md_hash_buffers_ext  @267
+
 ;; end of file with public symbols for Windows.
index cd4249e87b621a7f48a1c2390f83b3a9c450afde..353df81b6d739da5b2a8caeaa95b139e3dd32ebc 100644 (file)
@@ -1,5 +1,6 @@
 # libgcrypt.m4 - Autoconf macros to detect libgcrypt
-# Copyright (C) 2002, 2003, 2004, 2011, 2014, 2018, 2020 g10 Code GmbH
+# Copyright (C) 2002, 2003, 2004, 2011, 2014, 2018, 2020,
+#               2024 g10 Code GmbH
 #
 # This file is free software; as a special exception the author gives
 # unlimited permission to copy and/or distribute it, with or without
@@ -9,9 +10,94 @@
 # WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
 # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 #
-# Last-changed: 2022-11-01
+# Last-changed: 2024-06-13
 
 
+dnl
+dnl Find gpgrt-config, which uses .pc file
+dnl (minimum pkg-config functionality, supporting cross build)
+dnl
+dnl _AM_PATH_GPGRT_CONFIG
+AC_DEFUN([_AM_PATH_GPGRT_CONFIG],[dnl
+  AC_PATH_PROG(GPGRT_CONFIG, gpgrt-config, no, [$prefix/bin:$PATH])
+  if test "$GPGRT_CONFIG" != "no"; then
+    # Determine gpgrt_libdir
+    #
+    # Get the prefix of gpgrt-config assuming it's something like:
+    #   <PREFIX>/bin/gpgrt-config
+    gpgrt_prefix=${GPGRT_CONFIG%/*/*}
+    possible_libdir1=${gpgrt_prefix}/lib
+    # Determine by using system libdir-format with CC, it's like:
+    #   Normal style: /usr/lib
+    #   GNU cross style: /usr/<triplet>/lib
+    #   Debian style: /usr/lib/<multiarch-name>
+    #   Fedora/openSUSE style: /usr/lib, /usr/lib32 or /usr/lib64
+    # It is assumed that CC is specified to the one of host on cross build.
+    if libdir_candidates=$(${CC:-cc} -print-search-dirs | \
+          sed -n -e "/^libraries/{s/libraries: =//;s/:/\\
+/g;p;}"); then
+      # From the output of -print-search-dirs, select valid pkgconfig dirs.
+      libdir_candidates=$(for dir in $libdir_candidates; do
+        if p=$(cd $dir 2>/dev/null && pwd); then
+          test -d "$p/pkgconfig" && echo $p;
+        fi
+      done)
+
+      for possible_libdir0 in $libdir_candidates; do
+        # possible_libdir0:
+        #   Fallback candidate, the one of system-installed (by $CC)
+        #   (/usr/<triplet>/lib, /usr/lib/<multiarch-name> or /usr/lib32)
+        # possible_libdir1:
+        #   Another candidate, user-locally-installed
+        #   (<gpgrt_prefix>/lib)
+        # possible_libdir2
+        #   Most preferred
+        #   (<gpgrt_prefix>/<triplet>/lib,
+        #    <gpgrt_prefix>/lib/<multiarch-name> or <gpgrt_prefix>/lib32)
+        if test "${possible_libdir0##*/}" = "lib"; then
+          possible_prefix0=${possible_libdir0%/lib}
+          possible_prefix0_triplet=${possible_prefix0##*/}
+          if test -z "$possible_prefix0_triplet"; then
+            continue
+          fi
+          possible_libdir2=${gpgrt_prefix}/$possible_prefix0_triplet/lib
+        else
+          possible_prefix0=${possible_libdir0%%/lib*}
+          possible_libdir2=${gpgrt_prefix}${possible_libdir0#$possible_prefix0}
+        fi
+        if test -f ${possible_libdir2}/pkgconfig/gpg-error.pc; then
+          gpgrt_libdir=${possible_libdir2}
+        elif test -f ${possible_libdir1}/pkgconfig/gpg-error.pc; then
+          gpgrt_libdir=${possible_libdir1}
+        elif test -f ${possible_libdir0}/pkgconfig/gpg-error.pc; then
+          gpgrt_libdir=${possible_libdir0}
+        fi
+        if test -n "$gpgrt_libdir"; then break; fi
+      done
+    fi
+    if test -z "$gpgrt_libdir"; then
+      # No valid pkgconfig dir in any of the system directories, fallback
+      gpgrt_libdir=${possible_libdir1}
+    fi
+  else
+    unset GPGRT_CONFIG
+  fi
+
+  if test -n "$gpgrt_libdir"; then
+    # Add the --libdir option to GPGRT_CONFIG
+    GPGRT_CONFIG="$GPGRT_CONFIG --libdir=$gpgrt_libdir"
+    # Make sure if gpgrt-config really works, by testing config gpg-error
+    if ! $GPGRT_CONFIG gpg-error --exists; then
+      # If it doesn't work, clear the GPGRT_CONFIG variable.
+      unset GPGRT_CONFIG
+    fi
+  else
+    # GPGRT_CONFIG found but no suitable dir for --libdir found.
+    # This is a failure.  Clear the GPGRT_CONFIG variable.
+    unset GPGRT_CONFIG
+  fi
+])
+
 dnl AM_PATH_LIBGCRYPT([MINIMUM-VERSION,
 dnl                   [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND ]]])
 dnl Test for libgcrypt and define LIBGCRYPT_CFLAGS and LIBGCRYPT_LIBS.
@@ -28,7 +114,8 @@ dnl config script does not match the host specification the script
 dnl is added to the gpg_config_script_warn variable.
 dnl
 AC_DEFUN([AM_PATH_LIBGCRYPT],
-[ AC_REQUIRE([AC_CANONICAL_HOST])
+[ AC_REQUIRE([AC_CANONICAL_HOST])dnl
+  AC_REQUIRE([_AM_PATH_GPGRT_CONFIG])dnl
   AC_ARG_WITH(libgcrypt-prefix,
             AS_HELP_STRING([--with-libgcrypt-prefix=PFX],
                            [prefix where LIBGCRYPT is installed (optional)]),
index 2e274f6007b2b4c68eb3f465195042ae6f657b84..7a37c81ed5125454a25aa17a4f4bd578244402f8 100644 (file)
@@ -4,7 +4,7 @@
 # This file is part of Libgcrypt.
 #
 # Libgcrypt is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser general Public License as
+# it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation; either version 2.1 of
 # the License, or (at your option) any later version.
 #
@@ -14,8 +14,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 
 # NOTE: When adding new functions, please make sure to add them to
 # visibility.h and libgcrypt.def as well.
@@ -37,7 +37,7 @@ GCRYPT_1.6 {
     gcry_xmalloc_secure; gcry_xrealloc; gcry_xstrdup;
 
     gcry_md_algo_info; gcry_md_algo_name; gcry_md_close;
-    gcry_md_copy; gcry_md_ctl; gcry_md_enable; gcry_md_get;
+    gcry_md_copy; gcry_md_ctl; gcry_md_enable;
     gcry_md_get_algo; gcry_md_get_algo_dlen; gcry_md_hash_buffer;
     gcry_md_hash_buffers;
     gcry_md_info; gcry_md_is_enabled; gcry_md_is_secure;
@@ -61,7 +61,7 @@ GCRYPT_1.6 {
     gcry_pk_algo_info; gcry_pk_algo_name; gcry_pk_ctl;
     gcry_pk_decrypt; gcry_pk_encrypt; gcry_pk_genkey;
     gcry_pk_get_keygrip; gcry_pk_get_nbits;
-    gcry_pk_map_name; gcry_pk_register; gcry_pk_sign;
+    gcry_pk_map_name; gcry_pk_sign;
     gcry_pk_testkey; gcry_pk_verify;
     gcry_pk_get_curve; gcry_pk_get_param;
 
@@ -125,6 +125,13 @@ GCRYPT_1.6 {
     gcry_pk_hash_sign; gcry_pk_hash_verify; gcry_pk_random_override_new;
 
     gcry_kdf_open; gcry_kdf_compute; gcry_kdf_final; gcry_kdf_close;
+
+    gcry_cipher_setup_geniv; gcry_cipher_geniv;
+
+    gcry_kem_keypair; gcry_kem_encap; gcry_kem_decap;
+
+    gcry_md_hash_buffers_ext;
+
   local:
     *;
 
index 4db2d9a4dd9cc8cb80c77c4cd941942d5917a6f4..b1e8eb1ceda40444b4a5e5b6fe9f7f59a6a03bf4 100644 (file)
@@ -42,7 +42,6 @@ static void (*fatal_error_handler)(void*,int, const char*) = NULL;
 static void *fatal_error_handler_value = 0;
 static void (*log_handler)(void*,int, const char*, va_list) = NULL;
 static void *log_handler_value = 0;
-
 static const char *(*user_gettext_handler)( const char * ) = NULL;
 
 void
@@ -97,6 +96,10 @@ _gcry_fatal_error (int rc, const char *text)
   abort ();
 }
 
+/* This is deprecated but very few open source software still uses
+ * this.  However there is more than open source out there and thus we
+ * need to keep the handler specific for Libgcrypt and can't add a
+ * general handler to gpgrt.  */
 void
 _gcry_set_log_handler (void (*f)(void*,int, const char*, va_list), void *opaque)
 {
@@ -116,48 +119,78 @@ _gcry_log_verbosity( int level )
     return verbosity_level >= level;
 }
 
-/****************
- * This is our log function which prints all log messages to stderr or
- * using the function defined with gcry_set_log_handler().
- */
+
+/* This handler is called after log_fatal and log_info to do what we
+ * used to do in our former own logging functions.  */
+static void
+my_gpgrt_post_fatal_handler (int level)
+{
+  static volatile int done;
+
+  if (!done && (level == GPGRT_LOGLVL_FATAL || level == GPGRT_LOGLVL_BUG))
+    {
+      done = 1;
+      fips_signal_fatal_error ("internal error (fatal or bug)");
+      _gcry_secmem_term ();
+    }
+}
+
+
+void
+_gcry_set_gpgrt_post_log_handler (void)
+{
+  gpgrt_add_post_log_func (my_gpgrt_post_fatal_handler);
+}
+
+
+static enum gpgrt_log_levels
+map_log_level (int level)
+{
+  switch (level)
+    {
+    case GCRY_LOG_CONT:  return GPGRT_LOGLVL_CONT;
+    case GCRY_LOG_INFO:  return GPGRT_LOGLVL_INFO;
+    case GCRY_LOG_WARN:  return GPGRT_LOGLVL_WARN;
+    case GCRY_LOG_ERROR: return GPGRT_LOGLVL_ERROR;
+    case GCRY_LOG_FATAL: return GPGRT_LOGLVL_FATAL;
+    case GCRY_LOG_BUG:   return GPGRT_LOGLVL_BUG;
+    case GCRY_LOG_DEBUG:
+    default:             return GPGRT_LOGLVL_DEBUG;
+    }
+}
+
+
 void
-_gcry_logv( int level, const char *fmt, va_list arg_ptr )
+_gcry_logv (int level, const char *fmt, va_list arg_ptr)
 {
   if (log_handler)
-    log_handler (log_handler_value, level, fmt, arg_ptr);
-  else
     {
-      switch (level)
+      /* The deprecated log handler has been registered.  */
+      log_handler (log_handler_value, level, fmt, arg_ptr);
+      if (level == GCRY_LOG_FATAL || level == GCRY_LOG_BUG)
         {
-        case GCRY_LOG_CONT:  break;
-        case GCRY_LOG_INFO:  break;
-        case GCRY_LOG_WARN:  break;
-        case GCRY_LOG_ERROR: break;
-        case GCRY_LOG_FATAL: fputs("Fatal: ",stderr ); break;
-        case GCRY_LOG_BUG:   fputs("Ohhhh jeeee: ", stderr); break;
-        case GCRY_LOG_DEBUG: fputs("DBG: ", stderr ); break;
-        default: fprintf(stderr,"[Unknown log level %d]: ", level ); break;
-       }
-      vfprintf(stderr,fmt,arg_ptr) ;
+          fips_signal_fatal_error ("internal error (fatal or bug)"
+                                   " [legacy bug handler]");
+          _gcry_secmem_term ();
+          abort ();
+        }
     }
-
-  if ( level == GCRY_LOG_FATAL || level == GCRY_LOG_BUG )
+  else
     {
-      fips_signal_fatal_error ("internal error (fatal or bug)");
-      _gcry_secmem_term ();
-      abort ();
+      gpgrt_logv_domain ("gcrypt", map_log_level (level), NULL, NULL, 0,
+                         fmt, arg_ptr);
     }
 }
 
 
 void
-_gcry_log( int level, const char *fmt, ... )
+_gcry_log (int level, const char *fmt, ...)
 {
-    va_list arg_ptr ;
+  va_list arg_ptr;
 
-    va_start( arg_ptr, fmt ) ;
-    _gcry_logv( level, fmt, arg_ptr );
-    va_end(arg_ptr);
+  va_start( arg_ptr, fmt ) ;
+  _gcry_logv (level, fmt, arg_ptr);
+  va_end(arg_ptr);
 }
 
 
@@ -165,7 +198,7 @@ _gcry_log( int level, const char *fmt, ... )
 void
 _gcry_bug( const char *file, int line, const char *func )
 {
-    _gcry_log( GCRY_LOG_BUG,
+    _gcry_log (GCRY_LOG_BUG,
             ("... this is a bug (%s:%d:%s)\n"), file, line, func );
     abort(); /* never called, but it makes the compiler happy */
 }
@@ -181,7 +214,7 @@ _gcry_assert_failed (const char *expr, const char *file, int line,
 void
 _gcry_bug( const char *file, int line )
 {
-    _gcry_log( GCRY_LOG_BUG,
+    gpgrt_log( GCRY_LOG_BUG,
             _("you found a bug ... (%s:%d)\n"), file, line);
     abort(); /* never called, but it makes the compiler happy */
 }
@@ -200,7 +233,7 @@ _gcry_log_info( const char *fmt, ... )
     va_list arg_ptr ;
 
     va_start( arg_ptr, fmt ) ;
-    _gcry_logv( GCRY_LOG_INFO, fmt, arg_ptr );
+    _gcry_logv (GCRY_LOG_INFO, fmt, arg_ptr);
     va_end(arg_ptr);
 }
 
@@ -210,7 +243,7 @@ _gcry_log_error( const char *fmt, ... )
     va_list arg_ptr ;
 
     va_start( arg_ptr, fmt ) ;
-    _gcry_logv( GCRY_LOG_ERROR, fmt, arg_ptr );
+    _gcry_logv (GCRY_LOG_ERROR, fmt, arg_ptr);
     va_end(arg_ptr);
 }
 
@@ -221,7 +254,7 @@ _gcry_log_fatal( const char *fmt, ... )
     va_list arg_ptr ;
 
     va_start( arg_ptr, fmt ) ;
-    _gcry_logv( GCRY_LOG_FATAL, fmt, arg_ptr );
+    _gcry_logv (GCRY_LOG_FATAL, fmt, arg_ptr);
     va_end(arg_ptr);
     abort(); /* never called, but it makes the compiler happy */
 }
@@ -232,7 +265,7 @@ _gcry_log_bug( const char *fmt, ... )
     va_list arg_ptr ;
 
     va_start( arg_ptr, fmt ) ;
-    _gcry_logv( GCRY_LOG_BUG, fmt, arg_ptr );
+    _gcry_logv (GCRY_LOG_BUG, fmt, arg_ptr);
     va_end(arg_ptr);
     abort(); /* never called, but it makes the compiler happy */
 }
@@ -243,7 +276,7 @@ _gcry_log_debug( const char *fmt, ... )
     va_list arg_ptr ;
 
     va_start( arg_ptr, fmt ) ;
-    _gcry_logv( GCRY_LOG_DEBUG, fmt, arg_ptr );
+    _gcry_logv (GCRY_LOG_DEBUG, fmt, arg_ptr);
     va_end(arg_ptr);
 }
 
index 4756c00ea77026f0e9cffdcde74942bc8d9d9cca..43407616894e169fa7dbc7af040c055d0158f7cf 100644 (file)
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #include <config.h>
index 9e234eff77bee35a3a26960ba12d62d54b2a7afe..376728cd441747884026f5160760a3c2e564c55f 100644 (file)
--- a/src/mpi.h
+++ b/src/mpi.h
@@ -5,7 +5,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -15,8 +15,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * Note: This code is heavily based on the GNU MP Library.
  *      Actually it's the same code with only minor changes in the
@@ -117,7 +117,6 @@ void _gcry_mpi_immutable_failed (void);
 #define mpi_alloc_like(a)     _gcry_mpi_alloc_like((a))
 
 #define mpi_alloc_set_ui(a)   _gcry_mpi_alloc_set_ui ((a))
-#define mpi_m_check(a)        _gcry_mpi_m_check ((a))
 #define mpi_const(n)          _gcry_mpi_const ((n))
 #define mpi_swap_cond(a,b,sw)  _gcry_mpi_swap_cond ((a),(b),(sw))
 #define mpi_set_cond(w,u,set)  _gcry_mpi_set_cond ((w),(u),(set))
@@ -128,7 +127,6 @@ gcry_mpi_t _gcry_mpi_set_cond (gcry_mpi_t w, const gcry_mpi_t u,
                                unsigned long swap);
 gcry_mpi_t  _gcry_mpi_alloc_like( gcry_mpi_t a );
 gcry_mpi_t  _gcry_mpi_alloc_set_ui( unsigned long u);
-void _gcry_mpi_m_check( gcry_mpi_t a );
 void _gcry_mpi_swap( gcry_mpi_t a, gcry_mpi_t b);
 void _gcry_mpi_swap_cond (gcry_mpi_t a, gcry_mpi_t b, unsigned long swap);
 void _gcry_mpi_set_bit_cond (gcry_mpi_t a, unsigned int n, unsigned long set);
index ca413cf41620385a48626b0c0d8829ca21f13bb7..0903e0a4d397d811147129b36ebb64ccd2ea930f 100644 (file)
@@ -85,40 +85,26 @@ print_mpi (gcry_mpi_t a)
 
 
 static void
-do_add (int usemod)
+do_add (void)
 {
-  if (stackidx < (usemod?3:2))
+  if (stackidx < 2)
     {
       fputs ("stack underflow\n", stderr);
       return;
     }
-  if (usemod)
-    {
-      mpi_addm (stack[stackidx - 3], stack[stackidx - 3],
-                stack[stackidx - 2], stack[stackidx - 1]);
-      stackidx--;
-    }
-  else
-    mpi_add (stack[stackidx - 2], stack[stackidx - 2], stack[stackidx - 1]);
+  mpi_add (stack[stackidx - 2], stack[stackidx - 2], stack[stackidx - 1]);
   stackidx--;
 }
 
 static void
-do_sub (int usemod)
+do_sub (void)
 {
-  if (stackidx < (usemod?3:2))
+  if (stackidx < 2)
     {
       fputs ("stack underflow\n", stderr);
       return;
     }
-  if (usemod)
-    {
-      mpi_subm (stack[stackidx - 3], stack[stackidx - 3],
-                stack[stackidx - 2], stack[stackidx - 1]);
-      stackidx--;
-    }
-  else
-    mpi_sub (stack[stackidx - 2], stack[stackidx - 2], stack[stackidx - 1]);
+  mpi_sub (stack[stackidx - 2], stack[stackidx - 2], stack[stackidx - 1]);
   stackidx--;
 }
 
@@ -342,7 +328,6 @@ print_help (void)
          "r   reverse       [0] := [1], [1] := [0]    {0}\n"
          "b   # of bits     [0] := nbits([0])         {0}\n"
          "P   prime check   [0] := is_prime([0])?1:0  {0}\n"
-         "M   use mod for next '+' and '-'\n"
          "c   clear stack\n"
          "p   print top item\n"
          "f   print the stack\n"
@@ -363,7 +348,6 @@ main (int argc, char **argv)
   int state = 0;
   char strbuf[4096];
   int stridx = 0;
-  int usemod = 0;
 
   if (argc)
     {
@@ -476,8 +460,7 @@ main (int argc, char **argv)
                  else
                    {
                      ungetc (c, stdin);
-                     do_add (usemod);
-                      usemod = 0;
+                     do_add ();
                    }
                  break;
                 case '-':
@@ -497,8 +480,7 @@ main (int argc, char **argv)
                  else
                    {
                      ungetc (c, stdin);
-                     do_sub (usemod);
-                      usemod = 0;
+                     do_sub ();
                    }
                  break;
                case '*':
@@ -565,9 +547,6 @@ main (int argc, char **argv)
                 case 'P':
                   do_primecheck ();
                   break;
-                case 'M':
-                  usemod = 1;
-                  break;
                case 'c':
                  for (i = 0; i < stackidx; i++)
                     {
index b36c44f6de188ff005ca10800a4ba9fdf5a352d2..4e1d299103dae46cb7a75e4b9ea18b6dd4df65b4 100644 (file)
@@ -6,7 +6,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -289,48 +289,7 @@ print_warn (void)
 static void
 lock_pool_pages (void *p, size_t n)
 {
-#if defined(USE_CAPABILITIES) && defined(HAVE_MLOCK)
-  int err;
-
-  {
-    cap_t cap;
-
-    if (!no_priv_drop)
-      {
-        cap = cap_from_text ("cap_ipc_lock+ep");
-        cap_set_proc (cap);
-        cap_free (cap);
-      }
-    err = no_mlock? 0 : mlock (p, n);
-    if (err && errno)
-      err = errno;
-    if (!no_priv_drop)
-      {
-        cap = cap_from_text ("cap_ipc_lock+p");
-        cap_set_proc (cap);
-        cap_free(cap);
-      }
-  }
-
-  if (err)
-    {
-      if (err != EPERM
-#ifdef EAGAIN  /* BSD and also Linux may return EAGAIN */
-         && err != EAGAIN
-#endif
-#ifdef ENOSYS  /* Some SCOs return this (function not implemented) */
-         && err != ENOSYS
-#endif
-#ifdef ENOMEM  /* Linux might return this. */
-            && err != ENOMEM
-#endif
-         )
-       log_error ("can't lock memory: %s\n", strerror (err));
-      show_warning = 1;
-      not_locked = 1;
-    }
-
-#elif defined(HAVE_MLOCK)
+#if defined(HAVE_MLOCK)
   uid_t uid;
   int err;
 
@@ -344,18 +303,14 @@ lock_pool_pages (void *p, size_t n)
   if (uid)
     {
       errno = EPERM;
-      err = errno;
+      err = -1;
     }
   else
     {
       err = no_mlock? 0 : mlock (p, n);
-      if (err && errno)
-       err = errno;
     }
 #else /* !HAVE_BROKEN_MLOCK */
   err = no_mlock? 0 : mlock (p, n);
-  if (err && errno)
-    err = errno;
 #endif /* !HAVE_BROKEN_MLOCK */
 
   /* Test whether we are running setuid(0).  */
@@ -373,18 +328,18 @@ lock_pool_pages (void *p, size_t n)
 
   if (err)
     {
-      if (err != EPERM
+      if (errno != EPERM
 #ifdef EAGAIN  /* BSD and also Linux may return this. */
-         && err != EAGAIN
+         && errno != EAGAIN
 #endif
 #ifdef ENOSYS  /* Some SCOs return this (function not implemented). */
-         && err != ENOSYS
+         && errno != ENOSYS
 #endif
 #ifdef ENOMEM  /* Linux might return this. */
-            && err != ENOMEM
+            && errno != ENOMEM
 #endif
          )
-       log_error ("can't lock memory: %s\n", strerror (err));
+       log_error ("can't lock memory: %s\n", strerror (errno));
       show_warning = 1;
       not_locked = 1;
     }
@@ -401,12 +356,6 @@ lock_pool_pages (void *p, size_t n)
      * this whole Windows !@#$% and their user base are inherently insecure. */
   (void)p;
   (void)n;
-#elif defined (__riscos__)
-    /* No virtual memory on RISC OS, so no pages are swapped to disc,
-     * besides we don't have mmap, so we don't use it! ;-)
-     * But don't complain, as explained above.  */
-  (void)p;
-  (void)n;
 #else
   (void)p;
   (void)n;
@@ -615,7 +564,7 @@ _gcry_secmem_init (size_t n)
 
 
 gcry_err_code_t
-_gcry_secmem_module_init ()
+_gcry_secmem_module_init (void)
 {
   /* Not anymore needed.  */
   return 0;
@@ -766,7 +715,7 @@ _gcry_secmem_free_internal (void *a)
   /* This does not make much sense: probably this memory is held in the
    * cache. We do it anyway: */
 #define MB_WIPE_OUT(byte) \
-  wipememory2 (((char *) mb + BLOCK_HEAD_SIZE), (byte), size);
+  wipememory2 (((char *) mb + BLOCK_HEAD_SIZE), (byte), size)
 
   MB_WIPE_OUT (0xff);
   MB_WIPE_OUT (0xaa);
@@ -809,7 +758,7 @@ _gcry_secmem_realloc_internal (void *p, size_t newsize, int xhint)
   void *a;
 
   mb = (memblock_t *) (void *) ((char *) p
-                               - ((size_t) &((memblock_t *) 0)->aligned.c));
+                               - offsetof (memblock_t, aligned.c));
   size = mb->size;
   if (newsize < size)
     {
@@ -876,7 +825,7 @@ _gcry_private_is_secure (const void *p)
  *          there is no chance to get the secure memory cleaned.
  */
 void
-_gcry_secmem_term ()
+_gcry_secmem_term (void)
 {
   pooldesc_t *pool, *next;
 
index 8ad6ef1a38cff9104f16296081fd380426c7718f..3ed883546b47ad8c976fd87e9ca8ab12d81df28a 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifndef G10_SECMEM_H
index d15f1a79047e1f939d127c9ed9a658ee8f71e314..b15cb4865814e68b36d42e9c47b9a4b57a1f3c82 100644 (file)
@@ -6,7 +6,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -2697,3 +2697,27 @@ _gcry_sexp_extract_param (gcry_sexp_t sexp, const char *path,
   va_end (arg_ptr);
   return rc;
 }
+
+/* Convert STRING consisting of hex characters into its binary
+   representation and return it as an allocated buffer. The valid
+   length of the buffer is returned at R_LENGTH.  The string is
+   delimited by end of string.  The function returns NULL on
+   error.  */
+void *
+_gcry_hex2buffer (const char *string, size_t *r_length)
+{
+  const char *s;
+  unsigned char *buffer;
+  size_t length;
+
+  buffer = xmalloc (strlen(string)/2+1);
+  length = 0;
+  for (s=string; *s; s +=2 )
+    {
+      if (!hexdigitp (s) || !hexdigitp (s+1))
+        return NULL;           /* Invalid hex digits. */
+      ((unsigned char*)buffer)[length++] = xtoi_2 (s);
+    }
+  *r_length = length;
+  return buffer;
+}
index 04ce64fba14b2fd5d58be5050b80d6a159dffed5..e0317a8fc5f06e6dd7ed1094c090c9d646ee66ba 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
 
 \f
 
-#define MAGIC_NOR_BYTE 0x55
-#define MAGIC_SEC_BYTE 0xcc
-#define MAGIC_END_BYTE 0xaa
-
-#if SIZEOF_UNSIGNED_LONG == 8
-#define EXTRA_ALIGN 4
-#else
-#define EXTRA_ALIGN 0
-#endif
-
-
-static int use_m_guard = 0;
-
-/****************
- * Warning: Never use this function after any of the functions
- * here have been used.
- */
-void
-_gcry_private_enable_m_guard (void)
-{
-  use_m_guard = 1;
-}
-
-
 /*
  * Allocate memory of size n.
  * Return NULL if we are out of memory.
@@ -95,23 +71,7 @@ _gcry_private_malloc (size_t n)
                       an error to detect such coding errors.  */
     }
 
-  if (use_m_guard)
-    {
-      char *p;
-
-      if ( !(p = malloc (n + EXTRA_ALIGN+5)) )
-        return NULL;
-      ((byte*)p)[EXTRA_ALIGN+0] = n;
-      ((byte*)p)[EXTRA_ALIGN+1] = n >> 8 ;
-      ((byte*)p)[EXTRA_ALIGN+2] = n >> 16 ;
-      ((byte*)p)[EXTRA_ALIGN+3] = MAGIC_NOR_BYTE;
-      p[4+EXTRA_ALIGN+n] = MAGIC_END_BYTE;
-      return p+EXTRA_ALIGN+4;
-    }
-  else
-    {
-      return malloc( n );
-    }
+  return malloc( n );
 }
 
 
@@ -130,23 +90,7 @@ _gcry_private_malloc_secure (size_t n, int xhint)
                       error to detect such coding errors.  */
     }
 
-  if (use_m_guard)
-    {
-      char *p;
-
-      if (!(p = _gcry_secmem_malloc (n + EXTRA_ALIGN + 5, xhint)))
-        return NULL;
-      ((byte*)p)[EXTRA_ALIGN+0] = n;
-      ((byte*)p)[EXTRA_ALIGN+1] = n >> 8 ;
-      ((byte*)p)[EXTRA_ALIGN+2] = n >> 16 ;
-      ((byte*)p)[EXTRA_ALIGN+3] = MAGIC_SEC_BYTE;
-      p[4+EXTRA_ALIGN+n] = MAGIC_END_BYTE;
-      return p+EXTRA_ALIGN+4;
-    }
-  else
-    {
-      return _gcry_secmem_malloc (n, xhint);
-    }
+  return _gcry_secmem_malloc (n, xhint);
 }
 
 
@@ -158,33 +102,7 @@ _gcry_private_malloc_secure (size_t n, int xhint)
 void *
 _gcry_private_realloc (void *a, size_t n, int xhint)
 {
-  if (use_m_guard)
-    {
-      unsigned char *p = a;
-      char *b;
-      size_t len;
-
-      if (!a)
-        return _gcry_private_malloc(n);
-
-      _gcry_private_check_heap(p);
-      len  = p[-4];
-      len |= p[-3] << 8;
-      len |= p[-2] << 16;
-      if( len >= n ) /* We don't shrink for now. */
-        return a;
-      if (p[-1] == MAGIC_SEC_BYTE)
-        b = _gcry_private_malloc_secure (n, xhint);
-      else
-        b = _gcry_private_malloc(n);
-      if (!b)
-        return NULL;
-      memcpy (b, a, len);
-      memset (b+len, 0, n-len);
-      _gcry_private_free (p);
-      return b;
-    }
-  else if ( _gcry_private_is_secure(a) )
+  if ( _gcry_private_is_secure(a) )
     {
       return _gcry_secmem_realloc (a, n, xhint);
     }
@@ -195,28 +113,6 @@ _gcry_private_realloc (void *a, size_t n, int xhint)
 }
 
 
-void
-_gcry_private_check_heap (const void *a)
-{
-  if (use_m_guard)
-    {
-      const byte *p = a;
-      size_t len;
-
-      if (!p)
-        return;
-
-      if ( !(p[-1] == MAGIC_NOR_BYTE || p[-1] == MAGIC_SEC_BYTE) )
-        _gcry_log_fatal ("memory at %p corrupted (underflow=%02x)\n", p, p[-1]);
-      len  = p[-4];
-      len |= p[-3] << 8;
-      len |= p[-2] << 16;
-      if ( p[len] != MAGIC_END_BYTE )
-        _gcry_log_fatal ("memory at %p corrupted (overflow=%02x)\n", p, p[-1]);
-    }
-}
-
-
 /*
  * Free a memory block allocated by this or the secmem module
  */
@@ -228,15 +124,8 @@ _gcry_private_free (void *a)
 
   if (!p)
     return;
-  if (use_m_guard)
-    {
-      _gcry_private_check_heap (p);
-      freep = p - EXTRA_ALIGN - 4;
-    }
-  else
-    {
-      freep = p;
-    }
+
+  freep = p;
 
   if (!_gcry_private_is_secure (freep) ||
       !_gcry_secmem_free (freep))
index c52aab540deac1a920406c268379569fb81eadf9..12de9d4f834ae1a798606c445591a43dd345ecc5 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifndef G10_STDMEM_H
 #define G10_STDMEM_H 1
 
-void _gcry_private_enable_m_guard(void);
-
 void *_gcry_private_malloc (size_t n) _GCRY_GCC_ATTR_MALLOC;
 void *_gcry_private_malloc_secure (size_t n, int xhint) _GCRY_GCC_ATTR_MALLOC;
 void *_gcry_private_realloc (void *a, size_t n, int xhint);
-void _gcry_private_check_heap (const void *a);
 void _gcry_private_free (void *a);
 
 #endif /* G10_STDMEM_H */
index b4f28bc4f4bde4934ad27321f8161547825352bd..733c2a8c262a17ca09636bfd9d53d6ccf737e70a 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifndef GCRYPT_TYPES_H
 
 
 
-#ifndef HAVE_TYPE_BYTE
+#ifndef HAVE_BYTE
 # undef byte   /* In case there is a macro with that name.  */
 # if !(defined(_WIN32) && defined(cbNDRContext))
    /* Windows typedefs byte in the rpc headers.  Avoid warning about
       double definition.  */
    typedef unsigned char byte;
 # endif
-# define HAVE_TYPE_BYTE
+# define HAVE_BYTE
 #endif
 
-#ifndef HAVE_TYPE_USHORT
+#ifndef HAVE_USHORT
 # undef ushort  /* In case there is a macro with that name.  */
   typedef unsigned short ushort;
-# define HAVE_TYPE_USHORT
+# define HAVE_USHORT
 #endif
 
-#ifndef HAVE_TYPE_U16
+#ifndef HAVE_U16
 # undef u16    /* In case there is a macro with that name.  */
 # if SIZEOF_UNSIGNED_INT == 2
    typedef unsigned int   u16;
 # else
 #  error no typedef for u16
 # endif
-# define HAVE_TYPE_U16
+# define HAVE_U16
 #endif
 
-#ifndef HAVE_TYPE_U32
+#ifndef HAVE_U32
 # undef u32    /* In case there is a macro with that name.  */
 # if SIZEOF_UNSIGNED_INT == 4
    typedef unsigned int  u32;
@@ -91,7 +91,7 @@
 # else
 #  error no typedef for u32
 # endif
-# define HAVE_TYPE_U32
+# define HAVE_U32
 #endif
 
 /*
  * the dummy code in cipher/md.c is not available.  Examples are
  * Solaris and IRIX.
  */
-#ifndef HAVE_TYPE_U64
+#ifndef HAVE_U64
 # undef u64    /* In case there is a macro with that name.  */
 # if SIZEOF_UINT64_T == 8
    typedef uint64_t u64;
 #  define U64_C(c) (UINT64_C(c))
-#  define HAVE_TYPE_U64
+#  define HAVE_U64
 # elif SIZEOF_UNSIGNED_INT == 8
    typedef unsigned int u64;
 #  define U64_C(c) (c ## U)
-#  define HAVE_TYPE_U64
+#  define HAVE_U64
 # elif SIZEOF_UNSIGNED_LONG == 8
    typedef unsigned long u64;
 #  define U64_C(c) (c ## UL)
-#  define HAVE_TYPE_U64
+#  define HAVE_U64
 # elif SIZEOF_UNSIGNED_LONG_LONG == 8
    typedef unsigned long long u64;
 #  define U64_C(c) (c ## ULL)
-#  define HAVE_TYPE_U64
+#  define HAVE_U64
 # else
 #  error No way to declare a 64 bit integer type
 # endif
index 929f9ccc5ae76f13bc7f682ecf0ab0785a0c3947..f87d0d0597e59919e78799209068c603910f5518 100644 (file)
@@ -39,7 +39,7 @@ BEGIN
             VALUE "FileDescription", "Libgcrypt - The GNU Crypto Library\0"
             VALUE "FileVersion", "@LIBGCRYPT_LT_CURRENT@.@LIBGCRYPT_LT_AGE@.@LIBGCRYPT_LT_REVISION@.@BUILD_REVISION@\0"
             VALUE "InternalName", "libgcrypt\0"
-            VALUE "LegalCopyright", "Copyright © 2023 g10 Code GmbH\0"
+            VALUE "LegalCopyright", "Copyright © 2021 g10 Code GmbH\0"
             VALUE "LegalTrademarks", "\0"
             VALUE "OriginalFilename", "libgcrypt.dll\0"
             VALUE "PrivateBuild", "\0"
index 5c64618bb779526a95c1fcacc4dd15fa061feaf8..d2e46afdc8827975b22976e3de145d0c4a972722 100644 (file)
@@ -773,6 +773,22 @@ gcry_cipher_setctr (gcry_cipher_hd_t hd, const void *ctr, size_t ctrlen)
   return gcry_error (_gcry_cipher_setctr (hd, ctr, ctrlen));
 }
 
+gcry_error_t
+gcry_cipher_setup_geniv (gcry_cipher_hd_t hd, int method,
+                         const void *fixed_iv, size_t fixed_iv_len,
+                         const void *dyn_iv, size_t dyn_iv_len)
+{
+  return gcry_error (_gcry_cipher_setup_geniv (hd, method,
+                                               fixed_iv, fixed_iv_len,
+                                               dyn_iv, dyn_iv_len));
+}
+
+gcry_error_t
+gcry_cipher_geniv (gcry_cipher_hd_t hd, void *iv, size_t iv_len)
+{
+  return gcry_error (_gcry_cipher_geniv (hd, iv, iv_len));
+}
+
 gcry_error_t
 gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf, size_t abuflen)
 {
@@ -1062,7 +1078,7 @@ gcry_pk_hash_verify (gcry_sexp_t sigval, const char *data_tmpl, gcry_sexp_t pkey
 gcry_error_t
 gcry_pk_random_override_new (gcry_ctx_t *r_ctx, const unsigned char *p, size_t len)
 {
-  return gpg_error (_gcry_pk_random_override_new (r_ctx, p, len));
+  return gpg_error (_gcry_pk_single_data_push (r_ctx, p, len));
 }
 
 gcry_error_t
@@ -1177,7 +1193,7 @@ gpg_error_t
 gcry_ecc_mul_point (int curveid, unsigned char *result,
                     const unsigned char *scalar, const unsigned char *point)
 {
-  return _gcry_ecc_mul_point (curveid, result, scalar, point);
+  return gpg_error (_gcry_ecc_mul_point (curveid, result, scalar, point));
 }
 
 gcry_error_t
@@ -1251,7 +1267,9 @@ gcry_md_read (gcry_md_hd_t hd, int algo)
 gcry_error_t
 gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer, size_t length)
 {
-  return gpg_error (_gcry_md_extract(hd, algo, buffer, length));
+  if (!fips_is_operational ())
+    return gpg_error (fips_not_operational ());
+  return gpg_error (_gcry_md_extract (hd, algo, buffer, length));
 }
 
 void
@@ -1278,6 +1296,20 @@ gcry_md_hash_buffers (int algo, unsigned int flags, void *digest,
   return gpg_error (_gcry_md_hash_buffers (algo, flags, digest, iov, iovcnt));
 }
 
+gpg_error_t
+gcry_md_hash_buffers_ext (int algo, unsigned int flags, void *digest,
+                          int digestlen, const gcry_buffer_t *iov,
+                          int iovcnt)
+{
+  if (!fips_is_operational ())
+    {
+      (void)fips_not_operational ();
+      fips_signal_error ("called in non-operational state");
+    }
+  return gpg_error (_gcry_md_hash_buffers_extract (algo, flags, digest,
+                                                   digestlen, iov, iovcnt));
+}
+
 int
 gcry_md_get_algo (gcry_md_hd_t hd)
 {
@@ -1410,6 +1442,45 @@ gcry_kdf_close (gcry_kdf_hd_t h)
   _gcry_kdf_close (h);
 }
 
+gcry_error_t
+gcry_kem_keypair (int algo,
+                  void *pubkey, size_t pubkey_len,
+                  void *seckey, size_t seckey_len)
+
+{
+  return gpg_error (_gcry_kem_keypair (algo,
+                                       pubkey, pubkey_len,
+                                       seckey, seckey_len));
+}
+
+gcry_error_t
+gcry_kem_encap (int algo,
+                const void *pubkey, size_t pubkey_len,
+                void *ciphertext, size_t ciphertext_len,
+                void *shared, size_t shared_len,
+                const void *optional, size_t optional_len)
+{
+  return gpg_error (_gcry_kem_encap (algo,
+                                     pubkey, pubkey_len,
+                                     ciphertext, ciphertext_len,
+                                     shared, shared_len,
+                                     optional, optional_len));
+}
+
+gcry_error_t
+gcry_kem_decap (int algo,
+                const void *seckey, size_t seckey_len,
+                const void *ciphertext, size_t ciphertext_len,
+                void *shared, size_t shared_len,
+                const void *optional, size_t optional_len)
+{
+  return gpg_error (_gcry_kem_decap (algo,
+                                     seckey, seckey_len,
+                                     ciphertext, ciphertext_len,
+                                     shared, shared_len,
+                                     optional, optional_len));
+}
+
 void
 gcry_randomize (void *buffer, size_t length, enum gcry_random_level level)
 {
index 14bf624872d1f96e6c283878bd11775060c5ba96..75d73ea2d7a9e141491f0b43fd8d04aa5ca13b23 100644 (file)
@@ -104,6 +104,7 @@ MARK_VISIBLEX (gcry_md_get_algo)
 MARK_VISIBLEX (gcry_md_get_algo_dlen)
 MARK_VISIBLEX (gcry_md_hash_buffer)
 MARK_VISIBLEX (gcry_md_hash_buffers)
+MARK_VISIBLEX (gcry_md_hash_buffers_ext)
 MARK_VISIBLEX (gcry_md_info)
 MARK_VISIBLEX (gcry_md_is_enabled)
 MARK_VISIBLEX (gcry_md_is_secure)
@@ -122,6 +123,8 @@ MARK_VISIBLEX (gcry_cipher_close)
 MARK_VISIBLEX (gcry_cipher_setkey)
 MARK_VISIBLEX (gcry_cipher_setiv)
 MARK_VISIBLEX (gcry_cipher_setctr)
+MARK_VISIBLEX (gcry_cipher_setup_geniv)
+MARK_VISIBLEX (gcry_cipher_geniv)
 MARK_VISIBLEX (gcry_cipher_authenticate)
 MARK_VISIBLEX (gcry_cipher_checktag)
 MARK_VISIBLEX (gcry_cipher_gettag)
@@ -177,6 +180,10 @@ MARK_VISIBLEX (gcry_kdf_compute)
 MARK_VISIBLEX (gcry_kdf_final)
 MARK_VISIBLEX (gcry_kdf_close)
 
+MARK_VISIBLEX (gcry_kem_keypair)
+MARK_VISIBLEX (gcry_kem_encap)
+MARK_VISIBLEX (gcry_kem_decap)
+
 MARK_VISIBLEX (gcry_prime_check)
 MARK_VISIBLEX (gcry_prime_generate)
 MARK_VISIBLEX (gcry_prime_group_generator)
@@ -344,6 +351,8 @@ MARK_VISIBLEX (_gcry_mpi_get_const)
 #define gcry_cipher_setctr          _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_cipher_algo_info       _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_cipher_algo_name       _gcry_USE_THE_UNDERSCORED_FUNCTION
+#define gcry_cipher_setup_geniv     _gcry_USE_THE_UNDERSCORED_FUNCTION
+#define gcry_cipher_geniv           _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_cipher_authenticate    _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_cipher_checktag        _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_cipher_gettag          _gcry_USE_THE_UNDERSCORED_FUNCTION
@@ -388,6 +397,7 @@ MARK_VISIBLEX (_gcry_mpi_get_const)
 #define gcry_md_get_algo_dlen       _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_hash_buffer         _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_hash_buffers        _gcry_USE_THE_UNDERSCORED_FUNCTION
+#define gcry_md_hash_buffers_ext    _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_info                _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_is_enabled          _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_is_secure           _gcry_USE_THE_UNDERSCORED_FUNCTION
@@ -421,6 +431,10 @@ MARK_VISIBLEX (_gcry_mpi_get_const)
 #define gcry_kdf_final              _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_kdf_close              _gcry_USE_THE_UNDERSCORED_FUNCTION
 
+#define gcry_kem_keypair            _gcry_USE_THE_UNDERSCORED_FUNCTION
+#define gcry_kem_encap              _gcry_USE_THE_UNDERSCORED_FUNCTION
+#define gcry_kem_decap              _gcry_USE_THE_UNDERSCORED_FUNCTION
+
 #define gcry_prime_check            _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_prime_generate         _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_prime_group_generator  _gcry_USE_THE_UNDERSCORED_FUNCTION
index 302d923b234b5c7b8252024ec8a07c32171788c7..423bc1cde69c581f0f0597dafca469b87da2f709 100644 (file)
@@ -13,8 +13,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 
 ## Process this file with automake to produce Makefile.in
 
@@ -25,7 +25,7 @@ tests_bin = \
         version t-secmem mpitests t-sexp t-convert \
        t-mpi-bit t-mpi-point t-lock \
        prime basic keygen pubkey hmac hashtest t-kdf keygrip \
-       aeswrap random
+       aeswrap random t-kem t-mlkem
 
 if USE_RSA
 tests_bin += pkcs1v2 t-rsa-pss t-rsa-15 t-rsa-testparm
@@ -44,13 +44,14 @@ tests_bin_last = benchmark bench-slope
 
 tests_sh = basic-disable-all-hwf
 
-tests_sh_last = hashtest-256g
+tests_sh_last = hashtest-6g hashtest-256g
 
 TESTS = $(tests_bin) $(tests_sh) $(tests_bin_last) $(tests_sh_last)
 
 # Force sequential run of some tests.
 bench-slope.log:    benchmark.log
-hashtest-256g.log:  bench-slope.log
+hashtest-6g.log:    bench-slope.log
+hashtest-256g.log:  hashtest-6g.log
 
 
 TESTS_ENVIRONMENT = GCRYPT_IN_REGRESSION_TEST=1
@@ -60,7 +61,11 @@ TESTS_ENVIRONMENT = GCRYPT_IN_REGRESSION_TEST=1
 # a built header.
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
+if HAVE_W32_SYSTEM
+AM_LDFLAGS = -no-fast-install
+else
 AM_LDFLAGS = -no-install
+endif
 
 standard_ldadd = \
        ../src/libgcrypt.la \
@@ -76,8 +81,9 @@ CLEANFILES = testdrv-build
 EXTRA_DIST = README rsa-16k.key \
             pkcs1v2-oaep.h pkcs1v2-pss.h pkcs1v2-v15c.h pkcs1v2-v15s.h \
             t-ed25519.inp t-ed448.inp t-dsa.inp t-ecdsa.inp t-rsa-15.inp \
-            t-rsa-pss.inp stopwatch.h hashtest-256g.in sha3-224.h \
-            sha3-256.h sha3-384.h sha3-512.h blake2b.h blake2s.h \
+            t-rsa-pss.inp t-mlkem.inp \
+            stopwatch.h hashtest-6g.in hashtest-256g.in \
+            sha3-224.h sha3-256.h sha3-384.h sha3-512.h blake2b.h blake2s.h \
             basic-disable-all-hwf.in basic_all_hwfeature_combinations.sh
 
 LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) @LDADD_FOR_TESTS_KLUDGE@
index 112534773f17f38e3dc6bf7a5080291d0d05dc7d..27900c39a7994b0fc9f0b7e940efd6b1e883e541 100644 (file)
@@ -29,8 +29,8 @@
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+# License along with this program; if not, see <https://www.gnu.org/licenses/>.
+# SPDX-License-Identifier: LGPL-2.1-or-later
 
 
 VPATH = @srcdir@
@@ -121,15 +121,15 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
        $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
        $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
        $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
-       $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+       $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/acinclude.m4 \
+       $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
        $(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(noinst_HEADERS) \
        $(am__DIST_COMMON)
 mkinstalldirs = $(install_sh) -d
 CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES = hashtest-256g basic-disable-all-hwf
+CONFIG_CLEAN_FILES = hashtest-6g hashtest-256g basic-disable-all-hwf
 CONFIG_CLEAN_VPATH_FILES =
 @USE_RSA_TRUE@am__EXEEXT_1 = pkcs1v2$(EXEEXT) t-rsa-pss$(EXEEXT) \
 @USE_RSA_TRUE@ t-rsa-15$(EXEEXT) t-rsa-testparm$(EXEEXT)
@@ -143,8 +143,9 @@ am__EXEEXT_4 = version$(EXEEXT) t-secmem$(EXEEXT) mpitests$(EXEEXT) \
        t-mpi-point$(EXEEXT) t-lock$(EXEEXT) prime$(EXEEXT) \
        basic$(EXEEXT) keygen$(EXEEXT) pubkey$(EXEEXT) hmac$(EXEEXT) \
        hashtest$(EXEEXT) t-kdf$(EXEEXT) keygrip$(EXEEXT) \
-       aeswrap$(EXEEXT) random$(EXEEXT) $(am__EXEEXT_1) \
-       $(am__EXEEXT_2) $(am__EXEEXT_3)
+       aeswrap$(EXEEXT) random$(EXEEXT) t-kem$(EXEEXT) \
+       t-mlkem$(EXEEXT) $(am__EXEEXT_1) $(am__EXEEXT_2) \
+       $(am__EXEEXT_3)
 am__EXEEXT_5 = benchmark$(EXEEXT) bench-slope$(EXEEXT)
 PROGRAMS = $(noinst_PROGRAMS)
 aeswrap_SOURCES = aeswrap.c
@@ -267,12 +268,20 @@ t_kdf_OBJECTS = t_kdf-t-kdf.$(OBJEXT)
 t_kdf_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
        $(LIBTOOLFLAGS) --mode=link $(CCLD) $(t_kdf_CFLAGS) $(CFLAGS) \
        $(AM_LDFLAGS) $(LDFLAGS) -o $@
+t_kem_SOURCES = t-kem.c
+t_kem_OBJECTS = t-kem.$(OBJEXT)
+t_kem_LDADD = $(LDADD)
+t_kem_DEPENDENCIES = $(standard_ldadd) $(am__DEPENDENCIES_1)
 t_lock_SOURCES = t-lock.c
 t_lock_OBJECTS = t_lock-t-lock.$(OBJEXT)
 t_lock_DEPENDENCIES = $(standard_ldadd) $(am__DEPENDENCIES_1)
 t_lock_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
        $(LIBTOOLFLAGS) --mode=link $(CCLD) $(t_lock_CFLAGS) $(CFLAGS) \
        $(AM_LDFLAGS) $(LDFLAGS) -o $@
+t_mlkem_SOURCES = t-mlkem.c
+t_mlkem_OBJECTS = t-mlkem.$(OBJEXT)
+t_mlkem_LDADD = $(LDADD)
+t_mlkem_DEPENDENCIES = $(standard_ldadd) $(am__DEPENDENCIES_1)
 t_mpi_bit_SOURCES = t-mpi-bit.c
 t_mpi_bit_OBJECTS = t-mpi-bit.$(OBJEXT)
 t_mpi_bit_DEPENDENCIES = $(standard_ldadd)
@@ -341,7 +350,8 @@ am__depfiles_remade = ./$(DEPDIR)/aeswrap.Po ./$(DEPDIR)/basic.Po \
        ./$(DEPDIR)/rsacvt.Po ./$(DEPDIR)/t-convert.Po \
        ./$(DEPDIR)/t-cv25519.Po ./$(DEPDIR)/t-dsa.Po \
        ./$(DEPDIR)/t-ecdsa.Po ./$(DEPDIR)/t-ed25519.Po \
-       ./$(DEPDIR)/t-ed448.Po ./$(DEPDIR)/t-mpi-bit.Po \
+       ./$(DEPDIR)/t-ed448.Po ./$(DEPDIR)/t-kem.Po \
+       ./$(DEPDIR)/t-mlkem.Po ./$(DEPDIR)/t-mpi-bit.Po \
        ./$(DEPDIR)/t-mpi-point.Po ./$(DEPDIR)/t-rsa-15.Po \
        ./$(DEPDIR)/t-rsa-pss.Po ./$(DEPDIR)/t-rsa-testparm.Po \
        ./$(DEPDIR)/t-secmem.Po ./$(DEPDIR)/t-sexp.Po \
@@ -372,17 +382,17 @@ SOURCES = aeswrap.c basic.c bench-slope.c benchmark.c curves.c \
        hashtest.c hmac.c keygen.c keygrip.c mpitests.c pkbench.c \
        pkcs1v2.c prime.c pubkey.c random.c rsacvt.c t-convert.c \
        t-cv25519.c t-dsa.c t-ecdsa.c t-ed25519.c t-ed448.c t-kdf.c \
-       t-lock.c t-mpi-bit.c t-mpi-point.c t-rsa-15.c t-rsa-pss.c \
-       t-rsa-testparm.c t-secmem.c t-sexp.c t-x448.c testapi.c \
-       testdrv.c version.c
+       t-kem.c t-lock.c t-mlkem.c t-mpi-bit.c t-mpi-point.c \
+       t-rsa-15.c t-rsa-pss.c t-rsa-testparm.c t-secmem.c t-sexp.c \
+       t-x448.c testapi.c testdrv.c version.c
 DIST_SOURCES = aeswrap.c basic.c bench-slope.c benchmark.c curves.c \
        dsa-rfc6979.c fips186-dsa.c fipsdrv.c gchash.c genhashdata.c \
        hashtest.c hmac.c keygen.c keygrip.c mpitests.c pkbench.c \
        pkcs1v2.c prime.c pubkey.c random.c rsacvt.c t-convert.c \
        t-cv25519.c t-dsa.c t-ecdsa.c t-ed25519.c t-ed448.c t-kdf.c \
-       t-lock.c t-mpi-bit.c t-mpi-point.c t-rsa-15.c t-rsa-pss.c \
-       t-rsa-testparm.c t-secmem.c t-sexp.c t-x448.c testapi.c \
-       testdrv.c version.c
+       t-kem.c t-lock.c t-mlkem.c t-mpi-bit.c t-mpi-point.c \
+       t-rsa-15.c t-rsa-pss.c t-rsa-testparm.c t-secmem.c t-sexp.c \
+       t-x448.c testapi.c testdrv.c version.c
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -430,7 +440,8 @@ am__tty_colors = { \
 }
 am__DIST_COMMON = $(srcdir)/Makefile.in \
        $(srcdir)/basic-disable-all-hwf.in $(srcdir)/hashtest-256g.in \
-       $(top_srcdir)/build-aux/depcomp README
+       $(srcdir)/hashtest-6g.in $(top_srcdir)/build-aux/depcomp \
+       README
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
@@ -531,9 +542,6 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
-PTH_CFLAGS = @PTH_CFLAGS@
-PTH_CONFIG = @PTH_CONFIG@
-PTH_LIBS = @PTH_LIBS@
 RANLIB = @RANLIB@
 RC = @RC@
 READELF = @READELF@
@@ -607,18 +615,19 @@ top_srcdir = @top_srcdir@
 # the driver is only used for cross-compiling.
 tests_bin = version t-secmem mpitests t-sexp t-convert t-mpi-bit \
        t-mpi-point t-lock prime basic keygen pubkey hmac hashtest \
-       t-kdf keygrip aeswrap random $(am__append_1) $(am__append_2) \
-       $(am__append_3)
+       t-kdf keygrip aeswrap random t-kem t-mlkem $(am__append_1) \
+       $(am__append_2) $(am__append_3)
 tests_bin_last = benchmark bench-slope
 tests_sh = basic-disable-all-hwf
-tests_sh_last = hashtest-256g
+tests_sh_last = hashtest-6g hashtest-256g
 TESTS_ENVIRONMENT = GCRYPT_IN_REGRESSION_TEST=1
 
 # Need to include ../src in addition to top_srcdir because gcrypt.h is
 # a built header.
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
-AM_LDFLAGS = -no-install
+@HAVE_W32_SYSTEM_FALSE@AM_LDFLAGS = -no-install
+@HAVE_W32_SYSTEM_TRUE@AM_LDFLAGS = -no-fast-install
 standard_ldadd = \
        ../src/libgcrypt.la \
         ../compat/libcompat.la
@@ -628,8 +637,9 @@ CLEANFILES = testdrv-build
 EXTRA_DIST = README rsa-16k.key \
             pkcs1v2-oaep.h pkcs1v2-pss.h pkcs1v2-v15c.h pkcs1v2-v15s.h \
             t-ed25519.inp t-ed448.inp t-dsa.inp t-ecdsa.inp t-rsa-15.inp \
-            t-rsa-pss.inp stopwatch.h hashtest-256g.in sha3-224.h \
-            sha3-256.h sha3-384.h sha3-512.h blake2b.h blake2s.h \
+            t-rsa-pss.inp t-mlkem.inp \
+            stopwatch.h hashtest-6g.in hashtest-256g.in \
+            sha3-224.h sha3-256.h sha3-384.h sha3-512.h blake2b.h blake2s.h \
             basic-disable-all-hwf.in basic_all_hwfeature_combinations.sh
 
 LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) @LDADD_FOR_TESTS_KLUDGE@
@@ -684,6 +694,8 @@ $(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
 $(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
        cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
 $(am__aclocal_m4_deps):
+hashtest-6g: $(top_builddir)/config.status $(srcdir)/hashtest-6g.in
+       cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@
 hashtest-256g: $(top_builddir)/config.status $(srcdir)/hashtest-256g.in
        cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@
 basic-disable-all-hwf: $(top_builddir)/config.status $(srcdir)/basic-disable-all-hwf.in
@@ -810,10 +822,18 @@ t-kdf$(EXEEXT): $(t_kdf_OBJECTS) $(t_kdf_DEPENDENCIES) $(EXTRA_t_kdf_DEPENDENCIE
        @rm -f t-kdf$(EXEEXT)
        $(AM_V_CCLD)$(t_kdf_LINK) $(t_kdf_OBJECTS) $(t_kdf_LDADD) $(LIBS)
 
+t-kem$(EXEEXT): $(t_kem_OBJECTS) $(t_kem_DEPENDENCIES) $(EXTRA_t_kem_DEPENDENCIES) 
+       @rm -f t-kem$(EXEEXT)
+       $(AM_V_CCLD)$(LINK) $(t_kem_OBJECTS) $(t_kem_LDADD) $(LIBS)
+
 t-lock$(EXEEXT): $(t_lock_OBJECTS) $(t_lock_DEPENDENCIES) $(EXTRA_t_lock_DEPENDENCIES) 
        @rm -f t-lock$(EXEEXT)
        $(AM_V_CCLD)$(t_lock_LINK) $(t_lock_OBJECTS) $(t_lock_LDADD) $(LIBS)
 
+t-mlkem$(EXEEXT): $(t_mlkem_OBJECTS) $(t_mlkem_DEPENDENCIES) $(EXTRA_t_mlkem_DEPENDENCIES) 
+       @rm -f t-mlkem$(EXEEXT)
+       $(AM_V_CCLD)$(LINK) $(t_mlkem_OBJECTS) $(t_mlkem_LDADD) $(LIBS)
+
 t-mpi-bit$(EXEEXT): $(t_mpi_bit_OBJECTS) $(t_mpi_bit_DEPENDENCIES) $(EXTRA_t_mpi_bit_DEPENDENCIES) 
        @rm -f t-mpi-bit$(EXEEXT)
        $(AM_V_CCLD)$(LINK) $(t_mpi_bit_OBJECTS) $(t_mpi_bit_LDADD) $(LIBS)
@@ -891,6 +911,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-ecdsa.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-ed25519.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-ed448.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-kem.Po@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-mlkem.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-mpi-bit.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-mpi-point.Po@am__quote@ # am--include-marker
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t-rsa-15.Po@am__quote@ # am--include-marker
@@ -1212,6 +1234,8 @@ distclean: distclean-am
        -rm -f ./$(DEPDIR)/t-ecdsa.Po
        -rm -f ./$(DEPDIR)/t-ed25519.Po
        -rm -f ./$(DEPDIR)/t-ed448.Po
+       -rm -f ./$(DEPDIR)/t-kem.Po
+       -rm -f ./$(DEPDIR)/t-mlkem.Po
        -rm -f ./$(DEPDIR)/t-mpi-bit.Po
        -rm -f ./$(DEPDIR)/t-mpi-point.Po
        -rm -f ./$(DEPDIR)/t-rsa-15.Po
@@ -1297,6 +1321,8 @@ maintainer-clean: maintainer-clean-am
        -rm -f ./$(DEPDIR)/t-ecdsa.Po
        -rm -f ./$(DEPDIR)/t-ed25519.Po
        -rm -f ./$(DEPDIR)/t-ed448.Po
+       -rm -f ./$(DEPDIR)/t-kem.Po
+       -rm -f ./$(DEPDIR)/t-mlkem.Po
        -rm -f ./$(DEPDIR)/t-mpi-bit.Po
        -rm -f ./$(DEPDIR)/t-mpi-point.Po
        -rm -f ./$(DEPDIR)/t-rsa-15.Po
@@ -1349,7 +1375,8 @@ uninstall-am:
 
 # Force sequential run of some tests.
 bench-slope.log:    benchmark.log
-hashtest-256g.log:  bench-slope.log
+hashtest-6g.log:    bench-slope.log
+hashtest-256g.log:  hashtest-6g.log
 
 # Build a version of the test driver for the build platform.
 testdrv-build: testdrv.c
index e5ecad755c63c860ce1950d56afb2eb4ecd350a5..c94651710713e0acaeb3ad22f256017ab66c8f98 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
index febb6a6192eb262793250d9d9a939d6a147f7134..72c65b58404ba9cef76ecee6d7562e5b3c8f6a77 100644 (file)
 #include <string.h>
 #include <stdarg.h>
 #include <assert.h>
+#ifdef HAVE_STDINT_H
+# include <stdint.h> /* uintptr_t */
+#elif defined(HAVE_INTTYPES_H)
+# include <inttypes.h>
+#else
+/* In this case, uintptr_t is provided by config.h. */
+#endif
 
 #include "../src/gcrypt-int.h"
+#include "../src/gcrypt-testapi.h"
 
 #define PGM "basic"
 #include "t-common.h"
@@ -199,6 +207,22 @@ show_mac_not_available (int algo)
 }
 
 
+static void
+show_pk_not_available (int algo)
+{
+    static int list[100];
+    static int listlen;
+    int i;
+
+    for (i = 0; i < listlen; i++)
+      if (algo == list[i])
+        return; /* Note already printed. */
+    if (listlen < DIM (list))
+      list[listlen++] = algo;
+    show_note ("PK algorithm %d not available - skipping tests", algo);
+}
+
+
 
 static void
 progress_handler (void *cb_data, const char *what, int printchar,
@@ -219,11 +243,12 @@ progress_handler (void *cb_data, const char *what, int printchar,
 
 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
     (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
-     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(__SSE2__)
 # define CLUTTER_VECTOR_REGISTER_AMD64 1
 # define CLUTTER_VECTOR_REGISTER_COUNT 16
 #elif defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \
-      defined(HAVE_GCC_INLINE_ASM_SSSE3)
+      defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(__SSE2__)
 # define CLUTTER_VECTOR_REGISTER_I386 1
 # define CLUTTER_VECTOR_REGISTER_COUNT 8
 #elif defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
@@ -315,68 +340,41 @@ clutter_vector_registers(void)
   prepare_vector_data(data);
 
 #if defined(CLUTTER_VECTOR_REGISTER_AMD64)
-  asm volatile("movdqu %[data0], %%xmm0\n"
-              "movdqu %[data1], %%xmm1\n"
-              "movdqu %[data2], %%xmm2\n"
-              "movdqu %[data3], %%xmm3\n"
-              "movdqu %[data4], %%xmm4\n"
-              "movdqu %[data5], %%xmm5\n"
-              "movdqu %[data6], %%xmm6\n"
-              "movdqu %[data7], %%xmm7\n"
-              "movdqu %[data8], %%xmm8\n"
-              "movdqu %[data9], %%xmm9\n"
-              "movdqu %[data10], %%xmm10\n"
-              "movdqu %[data11], %%xmm11\n"
-              "movdqu %[data12], %%xmm12\n"
-              "movdqu %[data13], %%xmm13\n"
-              "movdqu %[data14], %%xmm14\n"
-              "movdqu %[data15], %%xmm15\n"
+  asm volatile("movdqu (0 * 16)(%[data]), %%xmm0\n"
+              "movdqu (1 * 16)(%[data]), %%xmm1\n"
+              "movdqu (2 * 16)(%[data]), %%xmm2\n"
+              "movdqu (3 * 16)(%[data]), %%xmm3\n"
+              "movdqu (4 * 16)(%[data]), %%xmm4\n"
+              "movdqu (5 * 16)(%[data]), %%xmm5\n"
+              "movdqu (6 * 16)(%[data]), %%xmm6\n"
+              "movdqu (7 * 16)(%[data]), %%xmm7\n"
+              "movdqu (8 * 16)(%[data]), %%xmm8\n"
+              "movdqu (9 * 16)(%[data]), %%xmm9\n"
+              "movdqu (10 * 16)(%[data]), %%xmm10\n"
+              "movdqu (11 * 16)(%[data]), %%xmm11\n"
+              "movdqu (12 * 16)(%[data]), %%xmm12\n"
+              "movdqu (13 * 16)(%[data]), %%xmm13\n"
+              "movdqu (14 * 16)(%[data]), %%xmm14\n"
+              "movdqu (15 * 16)(%[data]), %%xmm15\n"
              :
-             : [data0] "m" (*data[0]),
-               [data1] "m" (*data[1]),
-               [data2] "m" (*data[2]),
-               [data3] "m" (*data[3]),
-               [data4] "m" (*data[4]),
-               [data5] "m" (*data[5]),
-               [data6] "m" (*data[6]),
-               [data7] "m" (*data[7]),
-               [data8] "m" (*data[8]),
-               [data9] "m" (*data[9]),
-               [data10] "m" (*data[10]),
-               [data11] "m" (*data[11]),
-               [data12] "m" (*data[12]),
-               [data13] "m" (*data[13]),
-               [data14] "m" (*data[14]),
-               [data15] "m" (*data[15])
-             : "memory"
-#ifdef __SSE2__
-              ,"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-               "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
-               "xmm15"
-#endif
+             : [data] "r" (&data[0])
+             : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+               "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12",
+               "xmm13", "xmm14", "xmm15"
              );
 #elif defined(CLUTTER_VECTOR_REGISTER_I386)
-  asm volatile("movdqu %[data0], %%xmm0\n"
-              "movdqu %[data1], %%xmm1\n"
-              "movdqu %[data2], %%xmm2\n"
-              "movdqu %[data3], %%xmm3\n"
-              "movdqu %[data4], %%xmm4\n"
-              "movdqu %[data5], %%xmm5\n"
-              "movdqu %[data6], %%xmm6\n"
-              "movdqu %[data7], %%xmm7\n"
+  asm volatile("movdqu (0 * 16)(%[data]), %%xmm0\n"
+              "movdqu (1 * 16)(%[data]), %%xmm1\n"
+              "movdqu (2 * 16)(%[data]), %%xmm2\n"
+              "movdqu (3 * 16)(%[data]), %%xmm3\n"
+              "movdqu (4 * 16)(%[data]), %%xmm4\n"
+              "movdqu (5 * 16)(%[data]), %%xmm5\n"
+              "movdqu (6 * 16)(%[data]), %%xmm6\n"
+              "movdqu (7 * 16)(%[data]), %%xmm7\n"
              :
-             : [data0] "m" (*data[0]),
-               [data1] "m" (*data[1]),
-               [data2] "m" (*data[2]),
-               [data3] "m" (*data[3]),
-               [data4] "m" (*data[4]),
-               [data5] "m" (*data[5]),
-               [data6] "m" (*data[6]),
-               [data7] "m" (*data[7])
-             : "memory"
-#ifdef __SSE2__
-              ,"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+             : [data] "r" (&data[0])
+             : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+               "xmm6", "xmm7"
              );
 #elif defined(CLUTTER_VECTOR_REGISTER_AARCH64)
   asm volatile("mov x0, %[ptr]\n"
@@ -895,6 +893,306 @@ check_ecb_cipher (void)
        }
       },
 #endif /* USE_SM4 */
+#if USE_ARIA
+      { GCRY_CIPHER_ARIA128,
+       "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+       0, FLAG_NOFIPS,
+       { { "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+           16,
+           16,
+           "\xd7\x18\xfb\xd6\xab\x64\x4c\x73\x9d\xa9\x5f\x3b\xe6\x45\x17\x78" },
+         { }
+       }
+      },
+      { GCRY_CIPHER_ARIA128,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+       0, FLAG_NOFIPS,
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           16,
+           4 * 10 * 16,
+           "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e"
+           "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d"
+           "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf"
+           "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7"
+           "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16"
+           "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed"
+           "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8"
+           "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd"
+           "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca"
+           "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8"
+           "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e"
+           "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d"
+           "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf"
+           "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7"
+           "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16"
+           "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed"
+           "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8"
+           "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd"
+           "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca"
+           "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8"
+           "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e"
+           "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d"
+           "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf"
+           "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7"
+           "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16"
+           "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed"
+           "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8"
+           "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd"
+           "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca"
+           "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8"
+           "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e"
+           "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d"
+           "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf"
+           "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7"
+           "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16"
+           "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed"
+           "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8"
+           "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd"
+           "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca"
+           "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8" },
+         { }
+       }
+      },
+      { GCRY_CIPHER_ARIA192,
+       "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+        "\x10\x11\x12\x13\x14\x15\x16\x17",
+       0, FLAG_NOFIPS,
+       { { "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+           24,
+           16,
+           "\x26\x44\x9c\x18\x05\xdb\xe7\xaa\x25\xa4\x68\xce\x26\x3a\x9e\x79" },
+         { }
+       }
+      },
+      { GCRY_CIPHER_ARIA192,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff"
+        "\x00\x11\x22\x33\x44\x55\x66\x77",
+       0, FLAG_NOFIPS,
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           24,
+           4 * 10 * 16,
+           "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b"
+           "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a"
+           "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c"
+           "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed"
+           "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a"
+           "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0"
+           "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00"
+           "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49"
+           "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc"
+           "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0"
+           "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b"
+           "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a"
+           "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c"
+           "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed"
+           "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a"
+           "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0"
+           "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00"
+           "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49"
+           "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc"
+           "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0"
+           "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b"
+           "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a"
+           "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c"
+           "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed"
+           "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a"
+           "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0"
+           "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00"
+           "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49"
+           "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc"
+           "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0"
+           "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b"
+           "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a"
+           "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c"
+           "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed"
+           "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a"
+           "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0"
+           "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00"
+           "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49"
+           "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc"
+           "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0" },
+         { }
+       }
+      },
+      { GCRY_CIPHER_ARIA256,
+       "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+        "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
+       0, FLAG_NOFIPS,
+       { { "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+           32,
+           16,
+           "\xf9\x2b\xd7\xc7\x9f\xb7\x2e\x2f\x2b\x8f\x80\xc1\x97\x2d\x24\xfc" },
+         { }
+       }
+      },
+      { GCRY_CIPHER_ARIA256,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff"
+        "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+       0, FLAG_NOFIPS,
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd"
+           "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           32,
+           4 * 10 * 16,
+           "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d"
+           "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7"
+           "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42"
+           "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46"
+           "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e"
+           "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee"
+           "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59"
+           "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a"
+           "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7"
+           "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00"
+           "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d"
+           "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7"
+           "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42"
+           "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46"
+           "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e"
+           "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee"
+           "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59"
+           "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a"
+           "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7"
+           "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00"
+           "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d"
+           "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7"
+           "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42"
+           "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46"
+           "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e"
+           "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee"
+           "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59"
+           "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a"
+           "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7"
+           "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00"
+           "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d"
+           "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7"
+           "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42"
+           "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46"
+           "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e"
+           "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee"
+           "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59"
+           "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a"
+           "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7"
+           "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00" },
+         { }
+       }
+      },
+#endif /* USE_ARIA */
     };
   gcry_cipher_hd_t hde, hdd;
   unsigned char out[MAX_DATA_LEN];
@@ -911,7 +1209,7 @@ check_ecb_cipher (void)
 
       if ((err = gcry_cipher_test_algo (algo)))
         {
-          if (in_fips_mode && (tv[0].flags & FLAG_NOFIPS))
+          if (in_fips_mode && (tv[i].flags & FLAG_NOFIPS))
             {
               if (verbose)
                 fprintf (stderr, "    algorithm %d not available in fips mode\n",
@@ -2156,6 +2454,91 @@ check_ctr_cipher (void)
         }
       },
 #endif /* USE_SM4 */
+#if USE_ARIA
+      { GCRY_CIPHER_ARIA128, FLAG_NOFIPS,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+       "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           10 * 16,
+           "\xac\x5d\x7d\xe8\x05\xa0\xbf\x1c\x57\xc8\x54\x50\x1a\xf6\x0f\xa1"
+           "\x14\x97\xe2\xa3\x45\x19\xde\xa1\x56\x9e\x91\xe5\xb5\xcc\xae\x2f"
+           "\xf3\xbf\xa1\xbf\x97\x5f\x45\x71\xf4\x8b\xe1\x91\x61\x35\x46\xc3"
+           "\x91\x11\x63\xc0\x85\xf8\x71\xf0\xe7\xae\x5f\x2a\x08\x5b\x81\x85"
+           "\x1c\x2a\x3d\xdf\x20\xec\xb8\xfa\x51\x90\x1a\xec\x8e\xe4\xba\x32"
+           "\xa3\x5d\xab\x67\xbb\x72\xcd\x91\x40\xad\x18\x8a\x96\x7a\xc0\xfb"
+           "\xbd\xfa\x94\xea\x6c\xce\x47\xdc\xf8\x52\x5a\xb5\xa8\x14\xcf\xeb"
+           "\x2b\xb6\x0e\xe2\xb1\x26\xe2\xd9\xd8\x47\xc1\xa9\xe9\x6f\x90\x19"
+           "\xe3\xe6\xa7\xfe\x40\xd3\x82\x9a\xfb\x73\xdb\x1c\xc2\x45\x64\x6a"
+           "\xdd\xb6\x2d\x9b\x90\x7b\xaa\xaf\xbe\x46\xa7\x3d\xbc\x13\x1d\x3d" },
+         { "", 0, "" }
+        }
+      },
+      { GCRY_CIPHER_ARIA192, FLAG_NOFIPS,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff"
+       "\x00\x11\x22\x33\x44\x55\x66\x77",
+       "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           10 * 16,
+           "\x08\x62\x5c\xa8\xfe\x56\x9c\x19\xba\x7a\xf3\x76\x0a\x6e\xd1\xce"
+           "\xf4\xd1\x99\x26\x3e\x99\x9d\xde\x14\x08\x2d\xbb\xa7\x56\x0b\x79"
+           "\xa4\xc6\xb4\x56\xb8\x70\x7d\xce\x75\x1f\x98\x54\xf1\x88\x93\xdf"
+           "\xdb\x3f\x4e\x5a\xfa\x53\x97\x33\xe6\xf1\xe7\x0b\x98\xba\x37\x89"
+           "\x1f\x8f\x81\xe9\x5d\xf8\xef\xc2\x6c\x7c\xe0\x43\x50\x4c\xb1\x89"
+           "\x58\xb8\x65\xe4\xe3\x16\xcd\x2a\xa1\xc9\x7f\x31\xbf\x23\xdc\x04"
+           "\x6e\xf3\x26\xb9\x5a\x69\x2a\x19\x1b\xa0\xf2\xa4\x1c\x5f\xe9\xae"
+           "\x07\x0f\x23\x6f\xf7\x07\x8e\x70\x3b\x42\x66\x6c\xaa\xfb\xdd\x20"
+           "\xba\xd7\x4a\xc4\xc2\x0c\x0f\x46\xc7\xca\x24\xc1\x51\x71\x65\x75"
+           "\xc9\x47\xda\x16\xc9\x0c\xfe\x1b\xf2\x17\xa4\x1c\xfe\xbe\x75\x31" },
+         { "", 0, "" }
+        }
+      },
+      { GCRY_CIPHER_ARIA256, FLAG_NOFIPS,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff"
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+       "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           10 * 16,
+           "\x30\x02\x6c\x32\x96\x66\x14\x17\x21\x17\x8b\x99\xc0\xa1\xf1\xb2"
+           "\xf0\x69\x40\x25\x3f\x7b\x30\x89\xe2\xa3\x0e\xa8\x6a\xa3\xc8\x8f"
+           "\x59\x40\xf0\x5a\xd7\xee\x41\xd7\x13\x47\xbb\x72\x61\xe3\x48\xf1"
+           "\x83\x60\x47\x3f\xdf\x7d\x4e\x77\x23\xbf\xfb\x44\x11\xcc\x13\xf6"
+           "\xcd\xd8\x9f\x3b\xc7\xb9\xc7\x68\x14\x50\x22\xc7\xa7\x4f\x14\xd7"
+           "\xc3\x05\xcd\x01\x2a\x10\xf1\x60\x50\xc2\x3f\x1a\xe5\xc2\x3f\x45"
+           "\x99\x8d\x13\xfb\xaa\x04\x1e\x51\x61\x95\x77\xe0\x77\x27\x64\x89"
+           "\x6a\x5d\x45\x16\xd8\xff\xce\xb3\xbf\x7e\x05\xf6\x13\xed\xd9\xa6"
+           "\x0c\xdc\xed\xaf\xf9\xcf\xca\xf4\xe0\x0d\x44\x5a\x54\x33\x4f\x73"
+           "\xab\x2c\xad\x94\x4e\x51\xd2\x66\x54\x8e\x61\xc6\xeb\x0a\xa1\xcd" },
+         { "", 0, "" }
+        }
+      },
+#endif /* USE_ARIA */
       {        0, 0,
        "",
        "",
@@ -2692,6 +3075,60 @@ check_cfb_cipher (void)
        }
       },
 #endif /* USE_SM4 */
+#if USE_ARIA
+      { GCRY_CIPHER_ARIA128, FLAG_NOFIPS,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+       "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78\x87\x96\xa5\xb4\xc3\xd2\xe1\xf0",
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           10 * 16,
+           "\x37\x20\xe5\x3b\xa7\xd6\x15\x38\x34\x06\xb0\x9f\x0a\x05\xa2\x00"
+           "\xc0\x7c\x21\xe6\x37\x0f\x41\x3a\x5d\x13\x25\x00\xa6\x82\x85\x01"
+           "\x7c\x61\xb4\x34\xc7\xb7\xca\x96\x85\xa5\x10\x71\x86\x1e\x4d\x4b"
+           "\xb8\x73\xb5\x99\xb4\x79\xe2\xd5\x73\xdd\xde\xaf\xba\x89\xf8\x12"
+           "\xac\x6a\x9e\x44\xd5\x54\x07\x8e\xb3\xbe\x94\x83\x9d\xb4\xb3\x3d"
+           "\xa3\xf5\x9c\x06\x31\x23\xa7\xef\x6f\x20\xe1\x05\x79\xfa\x4f\xd2"
+           "\x39\x10\x0c\xa7\x3b\x52\xd4\xfc\xaf\xea\xde\xe7\x3f\x13\x9f\x78"
+           "\xf9\xb7\x61\x4c\x2b\x3b\x9d\xbe\x01\x0f\x87\xdb\x06\xa8\x9a\x94"
+           "\x35\xf7\x9c\xe8\x12\x14\x31\x37\x1f\x4e\x87\xb9\x84\xe0\x23\x0c"
+           "\x22\xa6\xda\xcb\x32\xfc\x42\xdc\xc6\xac\xce\xf3\x32\x85\xbf\x11" },
+       }
+      },
+      { GCRY_CIPHER_ARIA128, FLAG_NOFIPS | FLAG_CFB8,
+       "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
+       "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78\x87\x96\xa5\xb4\xc3\xd2\xe1\xf0",
+       { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb"
+           "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd"
+           "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb"
+           "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd"
+           "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb"
+           "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd"
+           "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb"
+           "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd"
+           "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb"
+           "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd",
+           10 * 16,
+           "\x37\x3c\x8f\x6a\x96\x55\x99\xec\x78\x5c\xc8\xf8\x14\x9f\x6c\x81"
+           "\xb6\x32\xcc\xb8\xe0\xc6\xeb\x6a\x97\x07\xae\x52\xc5\x92\x57\xa4"
+           "\x1f\x94\x70\x1c\x10\x96\x93\x31\x27\xa9\x01\x95\xed\x0c\x8e\x98"
+           "\x69\x05\x47\x57\x24\x23\xbb\x45\xc3\xd7\x0e\x4a\x18\xee\x56\xb9"
+           "\x67\xc1\x0e\x00\x0b\xa4\xdf\x5f\xba\x7c\x40\x41\x34\xa3\x43\xd8"
+           "\x37\x5d\x04\xb1\x51\xd1\x61\xef\x83\x41\x7f\xe1\x74\x84\x47\xd3"
+           "\x0a\x67\x23\xc4\x06\x73\x3d\xf7\xd1\x8a\xa3\x9a\x20\x75\x2d\x23"
+           "\x81\x94\x2e\x24\x48\x11\xbb\x97\xf7\x2e\xae\x44\x6b\x18\x15\xaa"
+           "\x69\x0c\xd1\xb1\xad\xcb\xd0\x07\xc0\x08\x8e\xcd\xc9\x1c\xb2\xe2"
+           "\xca\xf0\xe1\x1e\x72\x45\x98\x78\x13\x7e\xea\x64\xac\x62\xa9\xa1" },
+       }
+      },
+#endif /* USE_ARIA */
     };
   gcry_cipher_hd_t hde, hdd;
   unsigned char out[MAX_DATA_LEN];
@@ -5305,19 +5742,20 @@ check_gcm_siv_cipher (void)
   static const struct tv
   {
     int algo;
+    int flags;
     char key[MAX_DATA_LEN];
     char nonce[12];
     char ad[MAX_DATA_LEN];
     int adlen;
-    unsigned char plaintext[MAX_DATA_LEN];
+    unsigned char plaintext[MAX_DATA_LEN * 2];
     int inlen;
-    char out[MAX_DATA_LEN];
+    char out[MAX_DATA_LEN * 2];
     char tag[MAX_DATA_LEN];
   } tv[] =
     {
       /* Test vectors from RFC8452 */
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xee\x8e\x1e\xd9\xff\x25\x40\xae\x8f\x2b\xa9\xf5\x0b\xc2\xf2\x7c",
        "\x75\x2a\xba\xd3\xe0\xaf\xb5\xf4\x34\xdc\x43\x10",
        "example",
@@ -5328,7 +5766,7 @@ check_gcm_siv_cipher (void)
        "\x4f\xbc\xde\xb7\xe4\x79\x3f\x4a\x1d\x7e\x4f\xaa\x70\x10\x0a\xf1"
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5339,7 +5777,7 @@ check_gcm_siv_cipher (void)
        "\xdc\x20\xe2\xd8\x3f\x25\x70\x5b\xb4\x9e\x43\x9e\xca\x56\xde\x25"
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5350,7 +5788,7 @@ check_gcm_siv_cipher (void)
        "\xdc\x20\xe2\xd8\x3f\x25\x70\x5b\xb4\x9e\x43\x9e\xca\x56\xde\x25",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5361,7 +5799,7 @@ check_gcm_siv_cipher (void)
        "\x57\x87\x82\xff\xf6\x01\x3b\x81\x5b\x28\x7c\x22\x49\x3a\x36\x4c",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5372,7 +5810,7 @@ check_gcm_siv_cipher (void)
        "\xa4\x97\x8d\xb3\x57\x39\x1a\x0b\xc4\xfd\xec\x8b\x0d\x10\x66\x39",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5383,7 +5821,7 @@ check_gcm_siv_cipher (void)
        "\x30\x3a\xaf\x90\xf6\xfe\x21\x19\x9c\x60\x68\x57\x74\x37\xa0\xc4",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5396,7 +5834,7 @@ check_gcm_siv_cipher (void)
        "\x1a\x8e\x45\xdc\xd4\x57\x8c\x66\x7c\xd8\x68\x47\xbf\x61\x55\xff",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5411,7 +5849,7 @@ check_gcm_siv_cipher (void)
        "\x5e\x6e\x31\x1d\xbf\x39\x5d\x35\xb0\xfe\x39\xc2\x71\x43\x88\xf8",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "",
@@ -5428,7 +5866,7 @@ check_gcm_siv_cipher (void)
        "\x8a\x26\x3d\xd3\x17\xaa\x88\xd5\x6b\xdf\x39\x36\xdb\xa7\x5b\xb8",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01",
@@ -5439,7 +5877,7 @@ check_gcm_siv_cipher (void)
        "\x3b\x0a\x1a\x25\x60\x96\x9c\xdf\x79\x0d\x99\x75\x9a\xbd\x15\x08",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01",
@@ -5450,7 +5888,7 @@ check_gcm_siv_cipher (void)
        "\x08\x29\x9c\x51\x02\x74\x5a\xaa\x3a\x0c\x46\x9f\xad\x9e\x07\x5a",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01",
@@ -5461,7 +5899,7 @@ check_gcm_siv_cipher (void)
        "\x8f\x89\x36\xec\x03\x9e\x4e\x4b\xb9\x7e\xbd\x8c\x44\x57\x44\x1f",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01",
@@ -5474,7 +5912,7 @@ check_gcm_siv_cipher (void)
        "\xe6\xaf\x6a\x7f\x87\x28\x7d\xa0\x59\xa7\x16\x84\xed\x34\x98\xe1",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01",
@@ -5489,7 +5927,7 @@ check_gcm_siv_cipher (void)
        "\x6a\x8c\xc3\x86\x5f\x76\x89\x7c\x2e\x4b\x24\x5c\xf3\x1c\x51\xf2",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01",
@@ -5506,7 +5944,7 @@ check_gcm_siv_cipher (void)
        "\xcd\xc4\x6a\xe4\x75\x56\x3d\xe0\x37\x00\x1e\xf8\x4a\xe2\x17\x44",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5517,7 +5955,7 @@ check_gcm_siv_cipher (void)
        "\x07\xeb\x1f\x84\xfb\x28\xf8\xcb\x73\xde\x8e\x99\xe2\xf4\x8a\x14",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -5531,7 +5969,7 @@ check_gcm_siv_cipher (void)
        "\x24\xaf\xc9\x80\x5e\x97\x6f\x45\x1e\x6d\x87\xf6\xfe\x10\x65\x14",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -5545,7 +5983,7 @@ check_gcm_siv_cipher (void)
        "\xbf\xf9\xb2\xef\x00\xfb\x47\x92\x0c\xc7\x2a\x0c\x0f\x13\xb9\xfd",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xe6\x60\x21\xd5\xeb\x8e\x4f\x40\x66\xd4\xad\xb9\xc3\x35\x60\xe4",
        "\xf4\x6e\x44\xbb\x3d\xa0\x01\x5c\x94\xf7\x08\x87",
        "",
@@ -5556,7 +5994,7 @@ check_gcm_siv_cipher (void)
        "\xa4\x19\x4b\x79\x07\x1b\x01\xa8\x7d\x65\xf7\x06\xe3\x94\x95\x78",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x36\x86\x42\x00\xe0\xea\xf5\x28\x4d\x88\x4a\x0e\x77\xd3\x16\x46",
        "\xba\xe8\xe3\x7f\xc8\x34\x41\xb1\x60\x34\x56\x6b",
        "\x46\xbb\x91\xc3\xc5",
@@ -5567,7 +6005,7 @@ check_gcm_siv_cipher (void)
        "\x71\x1b\xd8\x5b\xc1\xe4\xd3\xe0\xa4\x62\xe0\x74\xee\xa4\x28\xa8",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xae\xdb\x64\xa6\xc5\x90\xbc\x84\xd1\xa5\xe2\x69\xe4\xb4\x78\x01",
        "\xaf\xc0\x57\x7e\x34\x69\x9b\x9e\x67\x1f\xdd\x4f",
        "\xfc\x88\x0c\x94\xa9\x51\x98\x87\x42\x96",
@@ -5578,7 +6016,7 @@ check_gcm_siv_cipher (void)
        "\xd6\xa9\xc4\x55\x45\xcf\xc1\x1f\x03\xad\x74\x3d\xba\x20\xf9\x66",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xd5\xcc\x1f\xd1\x61\x32\x0b\x69\x20\xce\x07\x78\x7f\x86\x74\x3b",
        "\x27\x5d\x1a\xb3\x2f\x6d\x1f\x04\x34\xd8\x84\x8c",
        "\x04\x67\x87\xf3\xea\x22\xc1\x27\xaa\xf1\x95\xd1\x89\x47\x28",
@@ -5589,7 +6027,7 @@ check_gcm_siv_cipher (void)
        "\x1d\x02\xfd\x0c\xd1\x74\xc8\x4f\xc5\xda\xe2\xf6\x0f\x52\xfd\x2b",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xb3\xfe\xd1\x47\x3c\x52\x8b\x84\x26\xa5\x82\x99\x59\x29\xa1\x49",
        "\x9e\x9a\xd8\x78\x0c\x8d\x63\xd0\xab\x41\x49\xc0",
        "\xc9\x88\x2e\x53\x86\xfd\x9f\x92\xec\x48\x9c\x8f\xde\x2b\xe2\xcf"
@@ -5601,7 +6039,7 @@ check_gcm_siv_cipher (void)
        "\xc1\xdc\x2f\x87\x1f\xb7\x56\x1d\xa1\x28\x6e\x65\x5e\x24\xb7\xb0",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x2d\x4e\xd8\x7d\xa4\x41\x02\x95\x2e\xf9\x4b\x02\xb8\x05\x24\x9b",
        "\xac\x80\xe6\xf6\x14\x55\xbf\xac\x83\x08\xa2\xd4",
        "\x29\x50\xa7\x0d\x5a\x1d\xb2\x31\x6f\xd5\x68\x37\x8d\xa1\x07\xb5"
@@ -5613,7 +6051,7 @@ check_gcm_siv_cipher (void)
        "\x83\xb3\x44\x9b\x9f\x39\x55\x2d\xe9\x9d\xc2\x14\xa1\x19\x0b\x0b",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xbd\xe3\xb2\xf2\x04\xd1\xe9\xf8\xb0\x6b\xc4\x7f\x97\x45\xb3\xd1",
        "\xae\x06\x55\x6f\xb6\xaa\x78\x90\xbe\xbc\x18\xfe",
        "\x18\x60\xf7\x62\xeb\xfb\xd0\x82\x84\xe4\x21\x70\x2d\xe0\xde\x18"
@@ -5627,7 +6065,7 @@ check_gcm_siv_cipher (void)
        "\x3e\x37\x70\x94\xf0\x47\x09\xf6\x4d\x7b\x98\x53\x10\xa4\xdb\x84",
       },
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\xf9\x01\xcf\xe8\xa6\x96\x15\xa9\x3f\xdf\x7a\x98\xca\xd4\x81\x79",
        "\x62\x45\x70\x9f\xb1\x88\x53\xf6\x8d\x83\x36\x40",
        "\x75\x76\xf7\x02\x8e\xc6\xeb\x5e\xa7\xe2\x98\x34\x2a\x94\xd4\xb2"
@@ -5642,7 +6080,7 @@ check_gcm_siv_cipher (void)
        "\x2d\x15\x50\x6c\x84\xa9\xed\xd6\x5e\x13\xe9\xd2\x4a\x2a\x6e\x70",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5654,7 +6092,7 @@ check_gcm_siv_cipher (void)
        "\x07\xf5\xf4\x16\x9b\xbf\x55\xa8\x40\x0c\xd4\x7e\xa6\xfd\x40\x0f",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5666,7 +6104,7 @@ check_gcm_siv_cipher (void)
        "\x84\x31\x22\x13\x0f\x73\x64\xb7\x61\xe0\xb9\x74\x27\xe3\xdf\x28",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5678,7 +6116,7 @@ check_gcm_siv_cipher (void)
        "\x8c\xa5\x0d\xa9\xae\x65\x59\xe4\x8f\xd1\x0f\x6e\x5c\x9c\xa1\x7e",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5690,7 +6128,7 @@ check_gcm_siv_cipher (void)
        "\xc9\xea\xc6\xfa\x70\x09\x42\x70\x2e\x90\x86\x23\x83\xc6\xc3\x66",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5704,7 +6142,7 @@ check_gcm_siv_cipher (void)
        "\xe8\x19\xe6\x3a\xbc\xd0\x20\xb0\x06\xa9\x76\x39\x76\x32\xeb\x5d",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5720,7 +6158,7 @@ check_gcm_siv_cipher (void)
        "\x79\x0b\xc9\x68\x80\xa9\x9b\xa8\x04\xbd\x12\xc0\xe6\xa2\x2c\xc4",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5738,7 +6176,7 @@ check_gcm_siv_cipher (void)
        "\x11\x28\x64\xc2\x69\xfc\x0d\x9d\x88\xc6\x1f\xa4\x7e\x39\xaa\x08",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5750,7 +6188,7 @@ check_gcm_siv_cipher (void)
        "\x91\x21\x3f\x26\x7e\x3b\x45\x2f\x02\xd0\x1a\xe3\x3e\x4e\xc8\x54",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5762,7 +6200,7 @@ check_gcm_siv_cipher (void)
        "\xc1\xa4\xa1\x9a\xe8\x00\x94\x1c\xcd\xc5\x7c\xc8\x41\x3c\x27\x7f",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5774,7 +6212,7 @@ check_gcm_siv_cipher (void)
        "\xb2\x92\xd2\x8f\xf6\x11\x89\xe8\xe4\x9f\x38\x75\xef\x91\xaf\xf7",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5788,7 +6226,7 @@ check_gcm_siv_cipher (void)
        "\xae\xa1\xba\xd1\x27\x02\xe1\x96\x56\x04\x37\x4a\xab\x96\xdb\xbc",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5804,7 +6242,7 @@ check_gcm_siv_cipher (void)
        "\x03\x33\x27\x42\xb2\x28\xc6\x47\x17\x36\x16\xcf\xd4\x4c\x54\xeb",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5822,7 +6260,7 @@ check_gcm_siv_cipher (void)
        "\x5b\xde\x02\x85\x03\x7c\x5d\xe8\x1e\x5b\x57\x0a\x04\x9b\x62\xa0",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5834,7 +6272,7 @@ check_gcm_siv_cipher (void)
        "\x18\x35\xe5\x17\x74\x1d\xfd\xdc\xcf\xa0\x7f\xa4\x66\x1b\x74\xcf",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5849,7 +6287,7 @@ check_gcm_siv_cipher (void)
        "\xb8\x79\xad\x97\x6d\x82\x42\xac\xc1\x88\xab\x59\xca\xbf\xe3\x07",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5864,7 +6302,7 @@ check_gcm_siv_cipher (void)
        "\xcf\xcd\xf5\x04\x21\x12\xaa\x29\x68\x5c\x91\x2f\xc2\x05\x65\x43",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\xe6\x60\x21\xd5\xeb\x8e\x4f\x40\x66\xd4\xad\xb9\xc3\x35\x60\xe4"
        "\xf4\x6e\x44\xbb\x3d\xa0\x01\x5c\x94\xf7\x08\x87\x36\x86\x42\x00",
        "\xe0\xea\xf5\x28\x4d\x88\x4a\x0e\x77\xd3\x16\x46",
@@ -5876,7 +6314,7 @@ check_gcm_siv_cipher (void)
        "\x16\x9f\xbb\x2f\xbf\x38\x9a\x99\x5f\x63\x90\xaf\x22\x22\x8a\x62",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\xba\xe8\xe3\x7f\xc8\x34\x41\xb1\x60\x34\x56\x6b\x7a\x80\x6c\x46"
        "\xbb\x91\xc3\xc5\xae\xdb\x64\xa6\xc5\x90\xbc\x84\xd1\xa5\xe2\x69",
        "\xe4\xb4\x78\x01\xaf\xc0\x57\x7e\x34\x69\x9b\x9e",
@@ -5888,7 +6326,7 @@ check_gcm_siv_cipher (void)
        "\x93\xda\x9b\xb8\x13\x33\xae\xe0\xc7\x85\xb2\x40\xd3\x19\x71\x9d",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x65\x45\xfc\x88\x0c\x94\xa9\x51\x98\x87\x42\x96\xd5\xcc\x1f\xd1"
        "\x61\x32\x0b\x69\x20\xce\x07\x78\x7f\x86\x74\x3b\x27\x5d\x1a\xb3",
        "\x2f\x6d\x1f\x04\x34\xd8\x84\x8c\x11\x77\x44\x1f",
@@ -5900,7 +6338,7 @@ check_gcm_siv_cipher (void)
        "\x6b\x62\xb8\x4d\xc4\x0c\x84\x63\x6a\x5e\xc1\x20\x20\xec\x8c\x2c",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\xd1\x89\x47\x28\xb3\xfe\xd1\x47\x3c\x52\x8b\x84\x26\xa5\x82\x99"
        "\x59\x29\xa1\x49\x9e\x9a\xd8\x78\x0c\x8d\x63\xd0\xab\x41\x49\xc0",
        "\x9f\x57\x2c\x61\x4b\x47\x45\x91\x44\x74\xe7\xc7",
@@ -5912,7 +6350,7 @@ check_gcm_siv_cipher (void)
        "\xc0\xfd\x3d\xc6\x62\x8d\xfe\x55\xeb\xb0\xb9\xfb\x22\x95\xc8\xc2",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\xa4\x41\x02\x95\x2e\xf9\x4b\x02\xb8\x05\x24\x9b\xac\x80\xe6\xf6"
        "\x14\x55\xbf\xac\x83\x08\xa2\xd4\x0d\x8c\x84\x51\x17\x80\x82\x35",
        "\x5c\x9e\x94\x0f\xea\x2f\x58\x29\x50\xa7\x0d\x5a",
@@ -5925,7 +6363,7 @@ check_gcm_siv_cipher (void)
        "\x40\x40\x99\xc2\x58\x7f\x64\x97\x9f\x21\x82\x67\x06\xd4\x97\xd5",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x97\x45\xb3\xd1\xae\x06\x55\x6f\xb6\xaa\x78\x90\xbe\xbc\x18\xfe"
        "\x6b\x3d\xb4\xda\x3d\x57\xaa\x94\x84\x2b\x98\x03\xa9\x6e\x07\xfb",
        "\x6d\xe7\x18\x60\xf7\x62\xeb\xfb\xd0\x82\x84\xe4",
@@ -5938,7 +6376,7 @@ check_gcm_siv_cipher (void)
        "\xb3\x08\x0d\x28\xf6\xeb\xb5\xd3\x64\x8c\xe9\x7b\xd5\xba\x67\xfd",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\xb1\x88\x53\xf6\x8d\x83\x36\x40\xe4\x2a\x3c\x02\xc2\x5b\x64\x86"
        "\x9e\x14\x6d\x7b\x23\x39\x87\xbd\xdf\xc2\x40\x87\x1d\x75\x76\xf7",
        "\x02\x8e\xc6\xeb\x5e\xa7\xe2\x98\x34\x2a\x94\xd4",
@@ -5953,7 +6391,7 @@ check_gcm_siv_cipher (void)
        "\x45\x4f\xc2\xa1\x54\xfe\xa9\x1f\x83\x63\xa3\x9f\xec\x7d\x0a\x49",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x3c\x53\x5d\xe1\x92\xea\xed\x38\x22\xa2\xfb\xbe\x2c\xa9\xdf\xc8"
        "\x82\x55\xe1\x4a\x66\x1b\x8a\xa8\x2c\xc5\x42\x36\x09\x3b\xbc\x23",
        "\x68\x80\x89\xe5\x55\x40\xdb\x18\x72\x50\x4e\x1c",
@@ -5969,7 +6407,7 @@ check_gcm_siv_cipher (void)
        "\x9d\x6c\x70\x29\x67\x5b\x89\xea\xf4\xba\x1d\xed\x1a\x28\x65\x94",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5983,7 +6421,7 @@ check_gcm_siv_cipher (void)
        "\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
       },
       {
-       GCRY_CIPHER_AES256,
+       GCRY_CIPHER_AES256, 0,
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -5998,7 +6436,7 @@ check_gcm_siv_cipher (void)
       },
       /* Large block testing */
       {
-       GCRY_CIPHER_AES128,
+       GCRY_CIPHER_AES128, 0,
        "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        "\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
        "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -6134,197 +6572,621 @@ check_gcm_siv_cipher (void)
        "\xcb\xec\x6a\x28\xa3\xf3\x4a\x6c\x0d\xb0\x79\x34\x13\x10\x64\xfc"
        "\xee\x12\x55\x82\x25\x25\x30\xb9\xa6\xf8\x3c\x81\x36\xcd\xef",
        "\xce\xc3\x13\x6c\x40\x2a\xcc\x51\xa1\xce\xb3\xed\xe8\xa6\x5b\x04",
-      }
-    };
-
-  gcry_cipher_hd_t hde, hdd;
-  unsigned char out[MAX_DATA_LEN];
-  unsigned char tag[16];
-  int i, keylen;
-  gcry_error_t err = 0;
-  size_t taglen2;
-
-  if (verbose)
-    fprintf (stderr, "  Starting GCM-SIV checks.\n");
-
-  for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++)
-    {
-      /* The AES algorithm is allowed in FIPS mode */
-      if ((err = gcry_cipher_test_algo (tv[i].algo)))
-        {
-          fail ("aes-gcm-siv, gcry_cipher_test_algo failed: %s\n", gpg_strerror (err));
-          continue;
-        }
-
-      if (verbose)
-       fprintf (stderr, "    checking GCM-SIV mode for %s [%i]\n",
-                gcry_cipher_algo_name (tv[i].algo),
-                tv[i].algo);
-      err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_GCM_SIV, 0);
-      if (!err)
-       err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_GCM_SIV, 0);
-      if (err)
-       {
-          fail ("aes-gcm-siv, gcry_cipher_open failed: %s\n", gpg_strerror (err));
-         return;
-       }
-
-      keylen = gcry_cipher_get_algo_keylen (tv[i].algo);
-      if (!keylen)
-        {
-          fail ("aes-gcm-siv, gcry_cipher_get_algo_keylen failed\n");
-          return;
-        }
-
-      err = gcry_cipher_setkey (hde, tv[i].key, keylen);
-      if (!err)
-       err = gcry_cipher_setkey (hdd, tv[i].key, keylen);
-      if (err)
-       {
-         fail ("aes-gcm-siv, gcry_cipher_setkey failed: %s\n",
-               gpg_strerror (err));
-         gcry_cipher_close (hde);
-         gcry_cipher_close (hdd);
-         return;
-       }
-
-      err = gcry_cipher_setiv (hde, tv[i].nonce, 12);
-      if (!err)
-       err = gcry_cipher_setiv (hdd, tv[i].nonce, 12);
-      if (err)
-       {
-         fail ("aes-gcm-siv, gcry_cipher_setiv failed: %s\n",
-               gpg_strerror (err));
-         gcry_cipher_close (hde);
-         gcry_cipher_close (hdd);
-         return;
-       }
-
-      if (tv[i].adlen >= 0)
-       {
-         err = gcry_cipher_authenticate (hde, tv[i].ad, tv[i].adlen);
-         if (!err)
-           err = gcry_cipher_authenticate (hdd, tv[i].ad, tv[i].adlen);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_authenticate failed: %s\n",
-                   gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
-       }
-
-      err = gcry_cipher_info (hde, GCRYCTL_GET_TAGLEN, NULL, &taglen2);
-      if (err)
-       {
-         fail ("cipher-siv, gcryctl_get_taglen failed (tv %d): %s\n",
-               i, gpg_strerror (err));
-         gcry_cipher_close (hde);
-         gcry_cipher_close (hdd);
-         return;
-       }
-      if (taglen2 != 16)
-       {
-         fail ("cipher-siv, gcryctl_get_taglen returned bad length"
-               " (tv %d): got=%zu want=%d\n",
-               i, taglen2, 16);
-         gcry_cipher_close (hde);
-         gcry_cipher_close (hdd);
-         return;
-       }
-
-      if (tv[i].inlen)
-       {
-         err = gcry_cipher_encrypt (hde, out, tv[i].inlen,
-                                    tv[i].plaintext, tv[i].inlen);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_encrypt (%d) failed: %s\n",
-                   i, gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
-
-         if (memcmp (tv[i].out, out, tv[i].inlen))
-           {
-             mismatch (tv[i].out, tv[i].inlen, out, tv[i].inlen);
-             fail ("aes-gcm-siv, encrypt mismatch entry %d\n", i);
-           }
-
-         err = gcry_cipher_gettag (hde, tag, taglen2);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_gettag(%d) failed: %s\n",
-                   i, gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
-
-         if (memcmp (tv[i].tag, tag, taglen2))
-           {
-             mismatch (tv[i].tag, taglen2, tag, taglen2);
-             fail ("aes-gcm-siv, tag mismatch entry %d\n", i);
-           }
-
-         err = gcry_cipher_set_decryption_tag (hdd, tag, taglen2);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_set_decryption_tag (%d) failed: %s\n",
-                   i, gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
+      },
+#if USE_SM4
+      {
+       GCRY_CIPHER_SM4, FLAG_NOFIPS,
+       "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+       "\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       "",
+       0,
+       "\x72\x94\x7b\x5d\x3c\x14\xc0\xa6\x27\x8d\x8d\xee\xbd\xe8\x8c\x6a"
+       "\x21\x34\xce\x64\x8f\x01\x01\xc6\xe4\x5d\xed\x2e\xb9\xec\xac\x53"
+       "\xf2\x07\xed\x60\xc8\xa2\x2f\x2e\x83\x0e\xf2\xbc\x42\x51\x24\x3b"
+       "\x41\x4f\x26\x84\xf0\x25\x69\x3f\x38\x29\xfb\xe9\xbb\x1a\x94\xd1"
+       "\x94\x0c\xce\xad\x8e\x66\xeb\xda\xc9\x1c\x72\x5a\x7f\x95\x4f\x9c"
+       "\x02\x27\x79\x8f\xe7\x51\x51\x3d\x1e\x2c\x4e\xcd\x07\xe5\xd1\xf0"
+       "\x6c\x95\x82\x37\x00\x50\x5e\xff\x82\xfb\x69\x0b\x4e\x7f\x10\x12"
+       "\x7d\x18\x7f\xa8\x88\x59\xfb\x55\x9b\x70\x36\xfc\xde\x75\xed\x77"
+       "\xf9\x09\x87\x29\x30\x7c\x81\x41\x12\xc2\xbd\xcd\x9f\x86\x98\x38"
+       "\x96\x44\x4c\xda\x2e\xbe\x7a\xfb\xdd\x4a\x4e\xa0\x84\x94\xd5\x76"
+       "\xa6\xae\x02\xcb\x1b\xd4\xd8\xcb\xa5\x24\x28\xe1\x3c\x1e\xdc\x3d"
+       "\x25\x50\xe7\xfb\x92\xad\xd9\x80\x33\xe0\xb2\x50\x07\xd4\x43\x40"
+       "\x41\x63\x98\x63\xa6\x1a\xfc\x56\x84\x3f\xf7\x4f\x31\xe7\xfe\xc5"
+       "\x73\x52\xfd\x6d\x9b\xbb\x9b\xf8\x19\xf8\xdc\x9f\x3a\x88\xa6\x7c"
+       "\xf3\x6b\xbe\xde\xda\x05\x2e\x79\x54\xb9\x3e\x59\x43\x0a\x1b\x16"
+       "\xcf\x94\x97\x71\x03\x74\x12\x37\xaf\xd4\x0a\x4b\x30\x16\x9b\x8b"
+       "\x9f\xae\x78\x46\x83\xde\x34\xc5\x31\x71\x67\x5e\xdb\x8d\x93\x71"
+       "\x90\x03\x72\x00\x9f\x4e\x1e\x7d\xf3\x3f\xf8\x31\xe7\xf6\xb4\x6d"
+       "\x8d\xdc\xa0\x85\x32\x7b\x32\x40\x8c\xa9\x90\x69\xac\x03\xdb\xd4"
+       "\xa5\x62\x9c\xfd\x78\xde\xc8\x4a\x18\x67\xa0\xee\x5e\x1e\xad\x1a"
+       "\x1c\xee\x78\xbd\xea\xdc\xc8\x34\xd1\x92\x20\xa7\x0d\x12\x90\x88"
+       "\x91\xe4\x6c\x3c\x06\x78\x13\x00\xdc\xc7\x3e\xd7\x91\xf7\xc1\xd6"
+       "\x5a\x99\x95\x23\xb5\xd8\x3d\x0f\x12\xaf\x25\xd8\xcf\xe8\x27\x7f"
+       "\xbc\x7c\xe2\xad\x34\x66\x7f\xfb\xf5\xa8\x11\xc1\xe6\x04\x37\x41"
+       "\xaf\x96\xb3\xb7\xee\x05\xf5\xd7\x7c\xc6\xfe\x2e\xa9\x07\x47\x08"
+       "\xa4\x50\x65\xc0\x2e\xd7\x27\xd8\x70\x8c\xf1\x12\x30\x4a\x82\xf6"
+       "\xb7\x68\xdb\x9d\x73\xc2\x82\x3d\x44\xda\xfb\xdd\x03\xc1\xdc\xfc"
+       "\x3f\x7f\x2e\xe2\xd3\x73\x24\xaf\xd1\x35\xa9\x4f\x3a\xad\x9d\x5c"
+       "\xd7\xc6\xa3\xb1\x11\xf1\xbb\xa0\x23\xe1\x22\x88\x5b\x10\xb3\xd6"
+       "\x01\x78\x5f\x9e\x4d\x96\x7b\xeb\x81\x6b\xce\x2d\xf5\x6a\xd1\xa8"
+       "\xb7\x56\xdd\xd0\x4b\xb0\xc9\x64\x7a\x2f\x63\xcb\xd6\x61\x84\x4b"
+       "\x9e\x4d\x0b\x2c\x99\xbc\xa2\x94\xf5\x07\x20\xe6\xe9\xc2\xd2\xa6"
+       "\x1c\x37\xd5\x88\x01\x71\xe2\x16\xcd\x10\x7a\x07\x8b\xf3\xb5\x49"
+       "\x75\xbe\x0b\xe1\xb2\x28\x15\x88\x2b\xb4\xee\x34\xfd\x67\x30\xd8"
+       "\xdc\x38\x90\x66\xb6\x51\x90\xb3\xdb\xee\x4e\x66\xc3\x05\xdf\xee"
+       "\x32\xac\x8b\xa2\x00\xcc\xff\xa2\x52\x19\x79\x7e\x6c\xc9\x68\xb2"
+       "\xab\xe4\x69\x11\xea\x00\xc9\x2b\x58\x77\x8b\x6c\x28\x0e\x40\x42"
+       "\xcc\xa7\xb2\x58\xed\x5e\x0b\x19\x49\xe5\x5e\xb1\xb5\x24\x61\x63"
+       "\x7d\x5b\x6a\x7d\x3d\xc1\x6e\x09\x64\x43\x66\x31\x3c\xb0\x26\x2e"
+       "\xc8\x27\xf6\x5a\x5f\x22\x94\x42\x62\x2a\xf6\x5a\x7d\xc2\x4a\x0d"
+       "\xd2\xad\xaa\x0e\xb2\xa4\x29\x1c\xb8\x3b\xaa\xc9\x1d\x1a\x30\xf8"
+       "\x0b\x35\xb2\x84\x75\xc3\x08\x0c\xe5\x36\xa9\xff\xfe\xb9\xc2\xb7"
+       "\x51\xab\x2d\x9d\x3e\x1c\x08\x8c\x6c\x64\xe1\xd9\x97\xf4\xfc\x4d"
+       "\x77\x6d\x0e\xce\x73\x0b\x7f\x57\x41\xed\xdf\x96\x11\xb3\xcc\x61"
+       "\xe8\x12\x31\x16\x72\x4c\x10\xd4\x52\x14\x4c\x83\xaa\x3c\x29\x6c"
+       "\x51\x40\x9a\x4d\x9b\xd0\xe6\x7f\xad\x31\x54\x88\x90\xe1\xa8\x0e"
+       "\xd8\xf4\x84\x11\xdb\x02\x41\xff\xb0\x8a\x92\x95\x97\xd6\x98\x8a"
+       "\xa0\x43\xda\x70\xbb\x17\xd0\x5a\x81\x3e\xf7\xcf\xc9\x33\xd9\x76"
+       "\x2f\x53\xa2\xac\xa0\x8a\x73\xe4\x0c\x81\xbe\x26\x01\x3f\x6d\x79"
+       "\x8a\x37\x59\x5b\x0a\x9a\x10\x6b\x04\x30\xed\xda\x11\x73\x73\xd9"
+       "\xa2\x9a\xf8\x8e\x67\x82\x5a\x8d\xc0\x52\xe8\x42\x89\xcd\x9c\xb1"
+       "\x5c\x3d\xd4\x75\x03\x71\x03\x3f\xdc\x6b\x79\xb4\x02\xb6\xac\xc4"
+       "\x11\x0f\x61\xc8\xf7\x5d\xc6\xbf\x48\x02\xa3\xdc\xa8\x37\x10\x85"
+       "\xb2\x8d\xbd\xb0\x79\x09\xb0\x5f\x30\x6c\x40\xba\x03\xbb\x22\xcc"
+       "\x80\xa1\xc3\x91\x88\x25\x92\xbe\xa6\xfa\x14\x77\x56\xb3\xc0\xb5"
+       "\x69\x8c\x6f\xed\x21\xaf\x0c\x79\x07\x64\xa2\xea\xeb\x47\x2c\x1e"
+       "\x7d\x6c\x12\xae\x75\xc4\xee\x12\x46\x72\x87\x65\x73\x51\xee\xf8"
+       "\x08\x63\x20\xa1\x61\xca\x73\x8f\xdf\xcb\x97\xf8\xfc\xb0\x56\xea"
+       "\x34\x9d\xce\xb8\x91\xb8\xfc\xec\x76\xd0\x71\xb7\x92\xc9\xb2\x28"
+       "\xee\x0b\x5d\x7c\x4a\xf6\x73\x4d\xc2\x5b\x5b\xae\x7b\xa6\x9c\xba"
+       "\x29\x7d\x7d\x3c\x29\x01\x04\x2d\xd1\x6c\x8d\x8d\xe5\xb4\x6b\xf9"
+       "\x2a\x83\xb8\x14\x00\x1c\x91\x72\x5e\x8f\x13\x56\x6d\x9b\x6d\x27"
+       "\xe8\x22\x55\x4b\x2f\x8a\x31\x16\x98\x03\x51\x73\xa7\x2e\x18\x81"
+       "\x51\x0a\x8f\x6d\x17\xd0\xea\x04\x1c\x11\xb9\x6b\x8e\xaa\x76",
+       1023,
+       "\x00\xf8\xa8\x64\x74\x1e\x9c\x18\x72\xa5\x55\x9e\x83\x8f\xde\xa5"
+       "\xd2\x34\x8e\x06\x25\xe8\x00\x15\xac\x4f\x26\x8d\x12\xe7\x3b\x7b"
+       "\xbb\xa7\x16\x54\x9b\xad\x82\x94\x0f\xdf\x3b\x3e\xfe\x42\xc0\x0f"
+       "\x23\x4d\x5b\x8e\xb5\x13\xf8\xb7\x40\x9b\xf6\x09\x1b\xc2\x9d\x2f"
+       "\xa4\x38\x6f\x19\x86\xd5\x6f\xac\x1f\xe4\x2c\x5c\x74\xc2\xdb\x7a"
+       "\x77\xac\xed\x83\xdb\xfe\x5f\x1c\x2a\x4f\xba\x00\xfc\x47\x8b\xe2"
+       "\x77\xb8\x38\x86\x7c\x21\x10\x64\xde\x37\x0f\x4c\x09\xcd\x6f\x0a"
+       "\x3f\x6d\xf4\xf1\x55\x6c\xe2\x29\x7f\xf8\xd6\x84\x31\xd5\x9c\x08"
+       "\x10\x94\x45\x7d\x62\x73\xfa\x28\x5e\x81\x90\x13\xb8\x0a\xd2\x4e"
+       "\xfd\x11\x99\x42\xd7\xb3\x90\x38\x26\x05\xda\xad\x11\x43\x84\x28"
+       "\x64\x0b\x50\xd9\x47\xa7\xad\x7c\xba\x51\xd2\x9c\xe4\xe9\xdf\x67"
+       "\x09\xe5\xb2\xfe\xb1\x60\xe9\x3f\x93\x6b\x4a\xe8\x71\x71\x9c\xdf"
+       "\xbe\x82\x59\x1c\x25\x8b\x72\xe8\x9d\x64\x4c\x21\x4b\x61\x11\x82"
+       "\x65\xb5\xf7\x80\x5c\xec\xee\x08\x7c\x35\x5a\x40\xb5\x64\xf6\xa2"
+       "\xa9\xda\x81\xff\x92\xf9\x49\x4f\x08\x24\xdc\x6a\x2f\x3f\xe6\xac"
+       "\x68\xf8\x5a\x10\xd4\x3b\xc7\x60\x0c\x69\x6c\x42\x99\xa3\x03\x2d"
+       "\x98\x64\x03\xe7\x4d\x07\x49\x91\x82\x1b\x34\x11\x9b\x16\xef\x2c"
+       "\x77\x10\xb4\xd7\xc5\xa7\xca\xbe\xb9\x71\xa0\x74\xb7\xfc\x06\xcd"
+       "\x82\x9f\xb3\xb0\xde\x49\xe2\x5a\x9c\xc6\x3b\x4b\xd7\x3e\x8b\xdf"
+       "\xd0\x27\xb8\x7a\xb5\x45\x05\xe5\xfa\xff\x6c\x9d\x6e\x9c\x0b\x4e"
+       "\xc3\x7d\xd1\x28\xd2\x30\x58\xc9\xa9\x7f\xa0\xd3\x65\xaf\x7b\x04"
+       "\x27\x39\xb3\x80\x6c\x68\x5c\x27\x60\xab\x98\x3b\xca\x16\xb4\x98"
+       "\x3c\xed\xf1\xd6\xf9\x43\x55\x51\xf8\xba\xdb\x96\xc2\xbb\xc4\x53"
+       "\x8e\x49\x0b\x8f\x82\x92\x75\x9c\x83\x7a\xf9\x7b\x0f\x30\x4f\x6d"
+       "\x8b\x6a\x05\xd9\x6e\x47\x88\x09\xfc\x56\x57\x91\x9a\xcd\xbb\xa9"
+       "\x39\x45\x20\x81\xd9\x23\x72\x1d\xfa\xea\x24\xb7\xeb\x2a\xcf\x19"
+       "\xcc\xcc\x63\xd6\xbb\x29\x5f\x9f\x71\x7c\x45\x15\x7b\x37\x12\x82"
+       "\x64\x41\xad\xe6\x20\xf1\x5d\xd0\x14\xff\x7b\x0c\x72\xe9\xc3\xf5"
+       "\x8a\xf2\xa3\x2e\x30\xdd\x32\xdc\x10\x9d\x9e\x05\xd8\x0d\xd8\x22"
+       "\xdd\xa6\x7f\x0d\xf5\x00\x3e\x7a\x92\xa6\x01\x3c\xc7\xdc\xf7\xae"
+       "\x73\x0c\xbf\xd4\x98\xfc\x30\xa5\xe8\xc1\x69\xb8\x57\xc9\x31\x4c"
+       "\x82\x1e\x3e\x17\x5f\x4d\x0c\x4d\x31\xbe\x21\x60\x79\x31\x52\x12"
+       "\x08\x09\x52\x8d\xf7\xbc\x73\x21\x95\x28\x09\x1f\x9b\xcd\x79\x42"
+       "\x61\x1f\x9f\x9e\x87\x53\x4c\x39\x50\x90\x74\xc4\xe1\xf7\x4f\x72"
+       "\xe6\x95\xf3\x38\xcb\x41\x3c\x26\x48\x00\x12\x0f\xbb\x3e\xd3\x17"
+       "\x7c\x03\xe1\x6e\x76\x58\xfc\x87\xa0\x99\x7f\x1e\x00\xea\x9e\x4e"
+       "\xef\x4c\x10\xee\xee\x79\xeb\x13\x8c\x19\x01\xd0\x2a\x74\x48\x99"
+       "\x66\x7e\x77\x1e\xa4\xee\x31\xae\xaf\x7b\x8f\x80\x06\x51\x5d\x7d"
+       "\x5d\x9f\x68\x1d\xea\xa8\x43\x99\xff\xac\x5d\x04\xb0\x30\x70\xf8"
+       "\x4a\xd3\xba\x6c\xd6\xb2\x01\x86\x8f\x4b\x2e\x6b\x5a\xd4\xc3\x74"
+       "\x1c\xb1\xe8\x4e\xbf\x7e\x18\xf3\x14\xe8\xf6\x05\xb5\xb6\x6c\xa7"
+       "\x94\xce\xba\xd2\x70\x3b\x49\x32\x80\xef\xaa\xdd\xa3\xfd\x49\x0d"
+       "\x0e\x24\x36\x69\x0a\x20\x7e\xbf\xfa\xca\x1b\xc9\xd9\xfd\x2b\x83"
+       "\x5d\xab\x3a\xa1\x2c\x43\xc7\xf1\xc4\x43\x37\x97\xa9\xd2\x39\x67"
+       "\x5d\xac\xdd\xf6\x0b\x6e\x99\x9a\x4b\x83\xaf\xba\x74\xbb\xf6\x67"
+       "\xc1\xf3\x38\x16\xc3\x56\x7f\x0d\x4e\x87\xbc\xd0\x85\xa0\x5d\x48"
+       "\x48\x44\x24\x79\x3d\x0d\xd3\x7a\x70\x38\xac\xd6\x3c\xe1\x6e\x2e"
+       "\xea\xb9\xee\x89\xea\xe2\x1d\xe9\xd1\xa5\x0f\x75\x46\xa8\x8d\x0d"
+       "\xf5\x72\x37\xc8\xe0\xaa\x48\x0f\x0e\xa4\x08\xce\x31\x74\x78\xdb"
+       "\x92\x30\x54\x70\x0c\x62\xe0\x62\x00\x90\xdd\x08\xf7\x3c\xa3\x1b"
+       "\x79\x78\x6a\xb6\xdb\xa3\x77\xd9\x3a\x38\xfc\x98\x29\x08\xba\x71"
+       "\x60\xa5\xf6\xcb\xc0\xe7\xe5\x35\x87\x97\xaf\x87\x81\x33\xe4\x1a"
+       "\x3c\x4b\x21\x7d\x7d\x84\x96\x52\x30\xac\xf2\x1b\x47\x28\xc0\x6f"
+       "\xf8\x6d\x9c\x2d\x69\x19\x49\x2e\x37\x1b\x89\x31\xa4\xb5\xbf\x60"
+       "\x9b\x32\x55\x83\x8f\x78\x50\x6b\xc5\x9c\xf6\x58\x8b\x0d\x93\xc0"
+       "\x30\x74\x98\x62\xec\xaa\x0e\x6e\xe7\x9b\x7c\x9b\x28\x97\x9e\xaf"
+       "\x38\xb8\x56\x4d\x78\xbe\x76\x69\xb5\xe0\x84\x2b\x1f\x11\x8e\xf7"
+       "\x18\x90\x4b\xfa\x82\x06\x57\xdd\xc7\xe3\x1d\xd6\x1f\x72\x12\x50"
+       "\x93\x20\x4c\xf7\x4b\x08\x1c\x28\x3f\x46\x47\xd0\x12\x40\xaa\xa9"
+       "\x38\x27\x04\x1e\x5f\x2c\xd0\x6b\xe8\xd0\xcd\xd9\x9d\xcc\x88\x67"
+       "\x8b\x5c\x5f\x80\xca\x54\xd8\x85\x26\x20\x31\xe8\xb8\xd9\xd4\xe9"
+       "\x40\x99\x11\x24\x86\x56\x82\xbe\x75\x5e\x53\x19\xf4\xfd\x38\x06"
+       "\x15\x9d\x58\x4c\x92\xb2\x09\xd1\x69\x03\x6f\xd2\x58\x9f\x85\x09"
+       "\x64\x15\x17\x55\x60\x71\xb4\xaf\xcd\xc8\x90\x25\xc8\xc8\x62",
+       "\xe2\x32\xda\x3a\x5a\x0e\x45\x1b\x8e\xf8\xbb\xe6\x60\x71\x81\xeb",
+      },
+#endif /* USE_SM4 */
+#if USE_CAMELLIA
+      {
+       GCRY_CIPHER_CAMELLIA128, FLAG_NOFIPS,
+       "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+       "\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+       "",
+       0,
+       "\x72\x94\x7b\x5d\x3c\x14\xc0\xa6\x27\x8d\x8d\xee\xbd\xe8\x8c\x6a"
+       "\x21\x34\xce\x64\x8f\x01\x01\xc6\xe4\x5d\xed\x2e\xb9\xec\xac\x53"
+       "\xf2\x07\xed\x60\xc8\xa2\x2f\x2e\x83\x0e\xf2\xbc\x42\x51\x24\x3b"
+       "\x41\x4f\x26\x84\xf0\x25\x69\x3f\x38\x29\xfb\xe9\xbb\x1a\x94\xd1"
+       "\x94\x0c\xce\xad\x8e\x66\xeb\xda\xc9\x1c\x72\x5a\x7f\x95\x4f\x9c"
+       "\x02\x27\x79\x8f\xe7\x51\x51\x3d\x1e\x2c\x4e\xcd\x07\xe5\xd1\xf0"
+       "\x6c\x95\x82\x37\x00\x50\x5e\xff\x82\xfb\x69\x0b\x4e\x7f\x10\x12"
+       "\x7d\x18\x7f\xa8\x88\x59\xfb\x55\x9b\x70\x36\xfc\xde\x75\xed\x77"
+       "\xf9\x09\x87\x29\x30\x7c\x81\x41\x12\xc2\xbd\xcd\x9f\x86\x98\x38"
+       "\x96\x44\x4c\xda\x2e\xbe\x7a\xfb\xdd\x4a\x4e\xa0\x84\x94\xd5\x76"
+       "\xa6\xae\x02\xcb\x1b\xd4\xd8\xcb\xa5\x24\x28\xe1\x3c\x1e\xdc\x3d"
+       "\x25\x50\xe7\xfb\x92\xad\xd9\x80\x33\xe0\xb2\x50\x07\xd4\x43\x40"
+       "\x41\x63\x98\x63\xa6\x1a\xfc\x56\x84\x3f\xf7\x4f\x31\xe7\xfe\xc5"
+       "\x73\x52\xfd\x6d\x9b\xbb\x9b\xf8\x19\xf8\xdc\x9f\x3a\x88\xa6\x7c"
+       "\xf3\x6b\xbe\xde\xda\x05\x2e\x79\x54\xb9\x3e\x59\x43\x0a\x1b\x16"
+       "\xcf\x94\x97\x71\x03\x74\x12\x37\xaf\xd4\x0a\x4b\x30\x16\x9b\x8b"
+       "\x9f\xae\x78\x46\x83\xde\x34\xc5\x31\x71\x67\x5e\xdb\x8d\x93\x71"
+       "\x90\x03\x72\x00\x9f\x4e\x1e\x7d\xf3\x3f\xf8\x31\xe7\xf6\xb4\x6d"
+       "\x8d\xdc\xa0\x85\x32\x7b\x32\x40\x8c\xa9\x90\x69\xac\x03\xdb\xd4"
+       "\xa5\x62\x9c\xfd\x78\xde\xc8\x4a\x18\x67\xa0\xee\x5e\x1e\xad\x1a"
+       "\x1c\xee\x78\xbd\xea\xdc\xc8\x34\xd1\x92\x20\xa7\x0d\x12\x90\x88"
+       "\x91\xe4\x6c\x3c\x06\x78\x13\x00\xdc\xc7\x3e\xd7\x91\xf7\xc1\xd6"
+       "\x5a\x99\x95\x23\xb5\xd8\x3d\x0f\x12\xaf\x25\xd8\xcf\xe8\x27\x7f"
+       "\xbc\x7c\xe2\xad\x34\x66\x7f\xfb\xf5\xa8\x11\xc1\xe6\x04\x37\x41"
+       "\xaf\x96\xb3\xb7\xee\x05\xf5\xd7\x7c\xc6\xfe\x2e\xa9\x07\x47\x08"
+       "\xa4\x50\x65\xc0\x2e\xd7\x27\xd8\x70\x8c\xf1\x12\x30\x4a\x82\xf6"
+       "\xb7\x68\xdb\x9d\x73\xc2\x82\x3d\x44\xda\xfb\xdd\x03\xc1\xdc\xfc"
+       "\x3f\x7f\x2e\xe2\xd3\x73\x24\xaf\xd1\x35\xa9\x4f\x3a\xad\x9d\x5c"
+       "\xd7\xc6\xa3\xb1\x11\xf1\xbb\xa0\x23\xe1\x22\x88\x5b\x10\xb3\xd6"
+       "\x01\x78\x5f\x9e\x4d\x96\x7b\xeb\x81\x6b\xce\x2d\xf5\x6a\xd1\xa8"
+       "\xb7\x56\xdd\xd0\x4b\xb0\xc9\x64\x7a\x2f\x63\xcb\xd6\x61\x84\x4b"
+       "\x9e\x4d\x0b\x2c\x99\xbc\xa2\x94\xf5\x07\x20\xe6\xe9\xc2\xd2\xa6"
+       "\x1c\x37\xd5\x88\x01\x71\xe2\x16\xcd\x10\x7a\x07\x8b\xf3\xb5\x49"
+       "\x75\xbe\x0b\xe1\xb2\x28\x15\x88\x2b\xb4\xee\x34\xfd\x67\x30\xd8"
+       "\xdc\x38\x90\x66\xb6\x51\x90\xb3\xdb\xee\x4e\x66\xc3\x05\xdf\xee"
+       "\x32\xac\x8b\xa2\x00\xcc\xff\xa2\x52\x19\x79\x7e\x6c\xc9\x68\xb2"
+       "\xab\xe4\x69\x11\xea\x00\xc9\x2b\x58\x77\x8b\x6c\x28\x0e\x40\x42"
+       "\xcc\xa7\xb2\x58\xed\x5e\x0b\x19\x49\xe5\x5e\xb1\xb5\x24\x61\x63"
+       "\x7d\x5b\x6a\x7d\x3d\xc1\x6e\x09\x64\x43\x66\x31\x3c\xb0\x26\x2e"
+       "\xc8\x27\xf6\x5a\x5f\x22\x94\x42\x62\x2a\xf6\x5a\x7d\xc2\x4a\x0d"
+       "\xd2\xad\xaa\x0e\xb2\xa4\x29\x1c\xb8\x3b\xaa\xc9\x1d\x1a\x30\xf8"
+       "\x0b\x35\xb2\x84\x75\xc3\x08\x0c\xe5\x36\xa9\xff\xfe\xb9\xc2\xb7"
+       "\x51\xab\x2d\x9d\x3e\x1c\x08\x8c\x6c\x64\xe1\xd9\x97\xf4\xfc\x4d"
+       "\x77\x6d\x0e\xce\x73\x0b\x7f\x57\x41\xed\xdf\x96\x11\xb3\xcc\x61"
+       "\xe8\x12\x31\x16\x72\x4c\x10\xd4\x52\x14\x4c\x83\xaa\x3c\x29\x6c"
+       "\x51\x40\x9a\x4d\x9b\xd0\xe6\x7f\xad\x31\x54\x88\x90\xe1\xa8\x0e"
+       "\xd8\xf4\x84\x11\xdb\x02\x41\xff\xb0\x8a\x92\x95\x97\xd6\x98\x8a"
+       "\xa0\x43\xda\x70\xbb\x17\xd0\x5a\x81\x3e\xf7\xcf\xc9\x33\xd9\x76"
+       "\x2f\x53\xa2\xac\xa0\x8a\x73\xe4\x0c\x81\xbe\x26\x01\x3f\x6d\x79"
+       "\x8a\x37\x59\x5b\x0a\x9a\x10\x6b\x04\x30\xed\xda\x11\x73\x73\xd9"
+       "\xa2\x9a\xf8\x8e\x67\x82\x5a\x8d\xc0\x52\xe8\x42\x89\xcd\x9c\xb1"
+       "\x5c\x3d\xd4\x75\x03\x71\x03\x3f\xdc\x6b\x79\xb4\x02\xb6\xac\xc4"
+       "\x11\x0f\x61\xc8\xf7\x5d\xc6\xbf\x48\x02\xa3\xdc\xa8\x37\x10\x85"
+       "\xb2\x8d\xbd\xb0\x79\x09\xb0\x5f\x30\x6c\x40\xba\x03\xbb\x22\xcc"
+       "\x80\xa1\xc3\x91\x88\x25\x92\xbe\xa6\xfa\x14\x77\x56\xb3\xc0\xb5"
+       "\x69\x8c\x6f\xed\x21\xaf\x0c\x79\x07\x64\xa2\xea\xeb\x47\x2c\x1e"
+       "\x7d\x6c\x12\xae\x75\xc4\xee\x12\x46\x72\x87\x65\x73\x51\xee\xf8"
+       "\x08\x63\x20\xa1\x61\xca\x73\x8f\xdf\xcb\x97\xf8\xfc\xb0\x56\xea"
+       "\x34\x9d\xce\xb8\x91\xb8\xfc\xec\x76\xd0\x71\xb7\x92\xc9\xb2\x28"
+       "\xee\x0b\x5d\x7c\x4a\xf6\x73\x4d\xc2\x5b\x5b\xae\x7b\xa6\x9c\xba"
+       "\x29\x7d\x7d\x3c\x29\x01\x04\x2d\xd1\x6c\x8d\x8d\xe5\xb4\x6b\xf9"
+       "\x2a\x83\xb8\x14\x00\x1c\x91\x72\x5e\x8f\x13\x56\x6d\x9b\x6d\x27"
+       "\xe8\x22\x55\x4b\x2f\x8a\x31\x16\x98\x03\x51\x73\xa7\x2e\x18\x81"
+       "\x00\xf8\xa8\x64\x74\x1e\x9c\x18\x72\xa5\x55\x9e\x83\x8f\xde\xa5"
+       "\xd2\x34\x8e\x06\x25\xe8\x00\x15\xac\x4f\x26\x8d\x12\xe7\x3b\x7b"
+       "\xbb\xa7\x16\x54\x9b\xad\x82\x94\x0f\xdf\x3b\x3e\xfe\x42\xc0\x0f"
+       "\x23\x4d\x5b\x8e\xb5\x13\xf8\xb7\x40\x9b\xf6\x09\x1b\xc2\x9d\x2f"
+       "\xa4\x38\x6f\x19\x86\xd5\x6f\xac\x1f\xe4\x2c\x5c\x74\xc2\xdb\x7a"
+       "\x77\xac\xed\x83\xdb\xfe\x5f\x1c\x2a\x4f\xba\x00\xfc\x47\x8b\xe2"
+       "\x77\xb8\x38\x86\x7c\x21\x10\x64\xde\x37\x0f\x4c\x09\xcd\x6f\x0a"
+       "\x3f\x6d\xf4\xf1\x55\x6c\xe2\x29\x7f\xf8\xd6\x84\x31\xd5\x9c\x08"
+       "\x10\x94\x45\x7d\x62\x73\xfa\x28\x5e\x81\x90\x13\xb8\x0a\xd2\x4e"
+       "\xfd\x11\x99\x42\xd7\xb3\x90\x38\x26\x05\xda\xad\x11\x43\x84\x28"
+       "\x64\x0b\x50\xd9\x47\xa7\xad\x7c\xba\x51\xd2\x9c\xe4\xe9\xdf\x67"
+       "\x09\xe5\xb2\xfe\xb1\x60\xe9\x3f\x93\x6b\x4a\xe8\x71\x71\x9c\xdf"
+       "\xbe\x82\x59\x1c\x25\x8b\x72\xe8\x9d\x64\x4c\x21\x4b\x61\x11\x82"
+       "\x65\xb5\xf7\x80\x5c\xec\xee\x08\x7c\x35\x5a\x40\xb5\x64\xf6\xa2"
+       "\xa9\xda\x81\xff\x92\xf9\x49\x4f\x08\x24\xdc\x6a\x2f\x3f\xe6\xac"
+       "\x68\xf8\x5a\x10\xd4\x3b\xc7\x60\x0c\x69\x6c\x42\x99\xa3\x03\x2d"
+       "\x98\x64\x03\xe7\x4d\x07\x49\x91\x82\x1b\x34\x11\x9b\x16\xef\x2c"
+       "\x77\x10\xb4\xd7\xc5\xa7\xca\xbe\xb9\x71\xa0\x74\xb7\xfc\x06\xcd"
+       "\x82\x9f\xb3\xb0\xde\x49\xe2\x5a\x9c\xc6\x3b\x4b\xd7\x3e\x8b\xdf"
+       "\xd0\x27\xb8\x7a\xb5\x45\x05\xe5\xfa\xff\x6c\x9d\x6e\x9c\x0b\x4e"
+       "\xc3\x7d\xd1\x28\xd2\x30\x58\xc9\xa9\x7f\xa0\xd3\x65\xaf\x7b\x04"
+       "\x27\x39\xb3\x80\x6c\x68\x5c\x27\x60\xab\x98\x3b\xca\x16\xb4\x98"
+       "\x3c\xed\xf1\xd6\xf9\x43\x55\x51\xf8\xba\xdb\x96\xc2\xbb\xc4\x53"
+       "\x8e\x49\x0b\x8f\x82\x92\x75\x9c\x83\x7a\xf9\x7b\x0f\x30\x4f\x6d"
+       "\x8b\x6a\x05\xd9\x6e\x47\x88\x09\xfc\x56\x57\x91\x9a\xcd\xbb\xa9"
+       "\x39\x45\x20\x81\xd9\x23\x72\x1d\xfa\xea\x24\xb7\xeb\x2a\xcf\x19"
+       "\xcc\xcc\x63\xd6\xbb\x29\x5f\x9f\x71\x7c\x45\x15\x7b\x37\x12\x82"
+       "\x64\x41\xad\xe6\x20\xf1\x5d\xd0\x14\xff\x7b\x0c\x72\xe9\xc3\xf5"
+       "\x8a\xf2\xa3\x2e\x30\xdd\x32\xdc\x10\x9d\x9e\x05\xd8\x0d\xd8\x22"
+       "\xdd\xa6\x7f\x0d\xf5\x00\x3e\x7a\x92\xa6\x01\x3c\xc7\xdc\xf7\xae"
+       "\x73\x0c\xbf\xd4\x98\xfc\x30\xa5\xe8\xc1\x69\xb8\x57\xc9\x31\x4c"
+       "\x82\x1e\x3e\x17\x5f\x4d\x0c\x4d\x31\xbe\x21\x60\x79\x31\x52\x12"
+       "\x08\x09\x52\x8d\xf7\xbc\x73\x21\x95\x28\x09\x1f\x9b\xcd\x79\x42"
+       "\x61\x1f\x9f\x9e\x87\x53\x4c\x39\x50\x90\x74\xc4\xe1\xf7\x4f\x72"
+       "\xe6\x95\xf3\x38\xcb\x41\x3c\x26\x48\x00\x12\x0f\xbb\x3e\xd3\x17"
+       "\x7c\x03\xe1\x6e\x76\x58\xfc\x87\xa0\x99\x7f\x1e\x00\xea\x9e\x4e"
+       "\xef\x4c\x10\xee\xee\x79\xeb\x13\x8c\x19\x01\xd0\x2a\x74\x48\x99"
+       "\x66\x7e\x77\x1e\xa4\xee\x31\xae\xaf\x7b\x8f\x80\x06\x51\x5d\x7d"
+       "\x5d\x9f\x68\x1d\xea\xa8\x43\x99\xff\xac\x5d\x04\xb0\x30\x70\xf8"
+       "\x4a\xd3\xba\x6c\xd6\xb2\x01\x86\x8f\x4b\x2e\x6b\x5a\xd4\xc3\x74"
+       "\x1c\xb1\xe8\x4e\xbf\x7e\x18\xf3\x14\xe8\xf6\x05\xb5\xb6\x6c\xa7"
+       "\x94\xce\xba\xd2\x70\x3b\x49\x32\x80\xef\xaa\xdd\xa3\xfd\x49\x0d"
+       "\x0e\x24\x36\x69\x0a\x20\x7e\xbf\xfa\xca\x1b\xc9\xd9\xfd\x2b\x83"
+       "\x5d\xab\x3a\xa1\x2c\x43\xc7\xf1\xc4\x43\x37\x97\xa9\xd2\x39\x67"
+       "\x5d\xac\xdd\xf6\x0b\x6e\x99\x9a\x4b\x83\xaf\xba\x74\xbb\xf6\x67"
+       "\xc1\xf3\x38\x16\xc3\x56\x7f\x0d\x4e\x87\xbc\xd0\x85\xa0\x5d\x48"
+       "\x48\x44\x24\x79\x3d\x0d\xd3\x7a\x70\x38\xac\xd6\x3c\xe1\x6e\x2e"
+       "\xea\xb9\xee\x89\xea\xe2\x1d\xe9\xd1\xa5\x0f\x75\x46\xa8\x8d\x0d"
+       "\xf5\x72\x37\xc8\xe0\xaa\x48\x0f\x0e\xa4\x08\xce\x31\x74\x78\xdb"
+       "\x92\x30\x54\x70\x0c\x62\xe0\x62\x00\x90\xdd\x08\xf7\x3c\xa3\x1b"
+       "\x79\x78\x6a\xb6\xdb\xa3\x77\xd9\x3a\x38\xfc\x98\x29\x08\xba\x71"
+       "\x60\xa5\xf6\xcb\xc0\xe7\xe5\x35\x87\x97\xaf\x87\x81\x33\xe4\x1a"
+       "\x3c\x4b\x21\x7d\x7d\x84\x96\x52\x30\xac\xf2\x1b\x47\x28\xc0\x6f"
+       "\xf8\x6d\x9c\x2d\x69\x19\x49\x2e\x37\x1b\x89\x31\xa4\xb5\xbf\x60"
+       "\x9b\x32\x55\x83\x8f\x78\x50\x6b\xc5\x9c\xf6\x58\x8b\x0d\x93\xc0"
+       "\x30\x74\x98\x62\xec\xaa\x0e\x6e\xe7\x9b\x7c\x9b\x28\x97\x9e\xaf"
+       "\x38\xb8\x56\x4d\x78\xbe\x76\x69\xb5\xe0\x84\x2b\x1f\x11\x8e\xf7"
+       "\x18\x90\x4b\xfa\x82\x06\x57\xdd\xc7\xe3\x1d\xd6\x1f\x72\x12\x50"
+       "\x93\x20\x4c\xf7\x4b\x08\x1c\x28\x3f\x46\x47\xd0\x12\x40\xaa\xa9"
+       "\x38\x27\x04\x1e\x5f\x2c\xd0\x6b\xe8\xd0\xcd\xd9\x9d\xcc\x88\x67"
+       "\x8b\x5c\x5f\x80\xca\x54\xd8\x85\x26\x20\x31\xe8\xb8\xd9\xd4\xe9"
+       "\x40\x99\x11\x24\x86\x56\x82\xbe\x75\x5e\x53\x19\xf4\xfd\x38\x06"
+       "\x15\x9d\x58\x4c\x92\xb2\x09\xd1\x69\x03\x6f\xd2\x58\x9f\x85\x09"
+       "\xe2\x32\xda\x3a\x5a\x0e\x45\x1b\x8e\xf8\xbb\xe6\x60\x71\x81\xeb"
+       "\x51\x0a\x8f\x6d\x17\xd0\xea\x04\x1c\x11\xb9\x6b\x8e\xaa\x76",
+       2047,
+       "\x66\x22\x20\xcc\x13\x5b\xe5\xac\x17\xf7\x54\xe4\xa3\xfd\x9c\xb6"
+       "\xbf\x4d\x20\x8f\x9a\x28\xc4\xd0\xe0\xa7\x8f\x36\xa1\xeb\x0d\xbf"
+       "\xc2\x79\x44\xd2\x42\xf6\xdb\x57\x34\xf3\x07\xac\x43\xdc\xa1\xc7"
+       "\x54\x9f\x0b\xfb\xc1\xb6\x12\x11\xb1\x67\xf1\x80\xef\x70\x0a\x9c"
+       "\x71\x34\xf1\x55\x1e\x9f\x2f\x0f\x93\x2a\x61\xb5\xf2\x01\x26\xfa"
+       "\xa2\xef\xe9\x4f\x00\xf5\xec\x08\x3d\x72\x80\x66\xfe\xde\x72\x9c"
+       "\xf4\x04\x24\xf3\x71\x3b\x07\xa3\x3d\xac\xcf\x59\x4d\xab\xec\x93"
+       "\xb2\x2d\x9a\x40\x66\xd3\xfb\x48\x66\xa5\x0f\xb7\xe6\x37\x50\x86"
+       "\x3d\xf1\x66\x8c\xae\x8d\xce\x1f\xca\xe4\x7e\x80\xb1\x15\x3b\x05"
+       "\x29\x71\xf9\x72\x68\x51\x2d\x5d\x94\xf7\x12\xd7\x24\x9e\x89\xd9"
+       "\x86\x8b\x4c\x1e\xf0\xdb\xec\x86\x1b\x9b\xb9\x84\x72\xce\x6a\x41"
+       "\x37\x1f\x86\x5c\x58\x75\x90\xeb\x2a\xac\xd5\xa2\x31\xc3\x99\xa9"
+       "\xb7\x62\x05\x8c\x2a\x16\x87\xa6\xb7\x9a\xe0\x11\xf9\x3d\xaf\x68"
+       "\x05\x96\x38\x87\x9d\x7c\x45\x22\xf4\xb6\xf1\x02\x03\xca\xbb\x76"
+       "\xd7\x9a\x7c\x55\x0b\x52\x25\x51\x02\x74\x62\x2c\xea\x76\x36\x8d"
+       "\x41\xac\x1b\x8c\xd8\x72\x71\xed\xa7\x93\xa7\x4b\x86\x4e\x0b\xf2"
+       "\xfa\x7d\x0e\xdb\xd9\x05\xc5\x94\x8c\x5f\x42\xc6\x16\xbc\x45\x66"
+       "\xa5\x81\xd9\xba\x14\x46\x38\xd0\x33\x15\x20\x75\xeb\x6a\xff\x27"
+       "\x9b\x91\x98\x5e\x9f\x8b\xd6\x5f\x86\xf6\x16\xba\xa7\x4d\x31\xf9"
+       "\x93\xd5\x85\x09\x5d\xbe\xf2\x41\x6a\xb3\x1a\xc8\x13\xce\xef\x63"
+       "\xc9\x31\x38\x0a\xa0\x22\xab\x4e\xb5\x30\xfe\xa0\x19\xf0\xa2\xa5"
+       "\x90\xc0\xef\xa4\xdd\x0f\xae\x78\x4e\x68\x95\x47\x20\xb9\x89\xbd"
+       "\x0c\x86\xb1\xe8\x1d\x73\x15\x60\xe0\x0c\xb9\x01\x70\x6b\xdf\xdb"
+       "\x6a\x40\xb7\x3d\xe9\x14\x10\xbe\x01\x9e\xe0\xc5\x57\x37\xe9\x81"
+       "\xc2\xe6\x0d\x4f\x82\x25\xe0\xa4\x85\x1f\x1d\xb6\x2e\x03\x22\x82"
+       "\x76\x02\x7c\x3c\x9a\xe1\xa6\xc4\x6b\x12\xbc\x5d\x8a\x94\xa0\x91"
+       "\xf8\x3b\x00\xce\x28\x07\x70\xe8\x5d\xe1\xf3\x0f\x11\xdf\x0a\xef"
+       "\x70\x98\x5b\x43\xe3\xbf\x0b\x0c\xf4\x95\xfd\x32\x99\xd1\x96\xee"
+       "\x1b\xe8\x5f\x20\xe6\x63\x84\x9b\xe0\xf2\x0f\xaa\xc0\x7b\x9c\x0e"
+       "\x8e\x2c\xec\x1b\x22\xa4\x2b\x84\xd9\x1c\x12\x5d\x21\x82\x6a\x6a"
+       "\x54\x65\x42\x36\x3c\x60\x42\x2b\xfa\x58\xac\xbd\x67\x20\xd6\x56"
+       "\x68\x9b\xfa\xc9\x96\x85\x8a\x67\x5c\x0c\x31\xbf\xba\xe8\xcb\x0d"
+       "\xd2\x5d\xd0\xec\x78\x2c\xa3\x13\xdb\x1c\x40\x41\x9f\xed\xea\xc3"
+       "\xc8\x8e\x5a\x52\x54\x75\xe0\xab\x42\x61\x70\x7c\x45\xdd\x02\xac"
+       "\xaf\x7b\x6a\x15\x11\xa0\xad\x48\xe1\x1c\x53\x24\xc7\xd3\x4d\x5c"
+       "\x2f\xc8\xa3\x72\xa5\x09\x45\xd1\x8e\xf8\xbc\x7a\xfd\xfd\x28\x5e"
+       "\x53\xdb\x1d\xe7\x08\x9c\xe8\x08\xc2\xe0\xd6\x21\x4d\x19\xcd\x44"
+       "\x77\xdf\xc8\x09\xb8\xbb\x2a\x88\xc5\xbe\x5a\x0b\x2c\x86\xd9\x9e"
+       "\xed\x85\x8d\x64\x98\x36\x29\x23\xdb\x6f\x8b\x02\x0d\x89\x86\xb0"
+       "\xee\xc9\x5b\xe3\x01\xab\xf3\x3c\xa4\xc1\x99\x27\xe0\xf9\xb1\xa9"
+       "\xc1\xb9\x9e\x8d\xc5\x06\xb8\xb6\xb8\x42\x50\x73\xef\x33\x71\x24"
+       "\x83\x4d\xc7\xe2\x71\xd9\x22\x9e\xad\x26\xc7\xbf\x00\xdb\x80\x34"
+       "\xb9\xf4\xc5\x59\xf8\x18\x66\x9a\x1c\x5f\x71\x22\x26\x1d\x9d\x84"
+       "\xfb\xe6\x5d\x3a\x5b\x6f\x21\x19\x17\x6a\x71\x28\xad\xd1\x67\x86"
+       "\x35\xec\x7b\x88\x3a\x64\xd2\xcb\x18\x2e\xa4\x06\x87\x7e\x5b\x5a"
+       "\x77\xe9\xb9\x68\xd2\xd3\x4b\x16\xaa\x5d\xed\xd4\xcc\x48\x9b\x55"
+       "\xd4\x02\x43\xa2\xc4\x3b\xe4\x67\xd3\x78\x42\x78\xbe\xa7\xb5\x07"
+       "\x8d\x6c\x6c\x96\xd8\x9c\x75\x91\xdb\xe7\x02\xb5\xe5\x00\xed\xf2"
+       "\xa4\x94\xeb\x02\xe8\xbc\x2c\xd8\x3b\xcc\x53\x17\xb2\xa6\x1c\xc0"
+       "\x7d\x4d\x5a\xf9\x52\xab\xb8\xba\xcf\x60\x8c\x7f\x5a\xb2\x51\x8a"
+       "\x7a\x87\xd2\xa2\xee\x78\x70\xe1\xfb\x28\x78\x24\xf9\x9a\x48\x2c"
+       "\x48\xfc\xb2\x28\xdc\xe4\x22\x94\x5a\xf3\xab\x6d\x57\x6e\x4b\x46"
+       "\x76\x5b\x84\xaf\x7c\xbf\x7c\x0b\x1d\x59\x65\x9e\x18\xbb\x26\xdb"
+       "\x52\x6c\x94\x9f\x52\x5f\xb6\x16\x93\x17\x37\x45\x38\x70\x73\x30"
+       "\x3c\x9c\x38\x9d\xb5\x5e\x6a\x53\x4b\xc0\xd1\xec\x40\xbe\x3f\x61"
+       "\x57\x12\x43\xc5\x4c\xe8\x76\xb5\xff\x39\x70\xc3\x2c\x9e\x33\xa0"
+       "\x45\x5d\xdd\xf4\xf1\x5c\xec\x6f\xd6\x22\x23\xa6\xa4\xf3\x55\x69"
+       "\x7e\x5f\xd8\x3d\xc3\xc8\x74\x83\xba\x36\xca\x3f\x94\xf9\x77\x2c"
+       "\x38\xe2\x87\x05\x08\x55\x7f\xa4\x43\x95\xeb\x75\x89\xee\xc2\x4e"
+       "\xf2\x04\xc2\xda\xd7\x05\xf1\xc6\xc0\x3c\x1c\x37\xae\x3f\x6e\x5c"
+       "\xd3\x85\xa9\x01\x70\x91\x55\xf0\x7f\xf2\xd5\x9c\x19\x8d\x21\xfd"
+       "\x01\xc1\xc6\x8a\x2a\x73\x34\x5d\x66\x24\x09\x66\x8f\xe7\x3b\x98"
+       "\xd5\x72\x69\xb9\xea\x8a\x16\xcf\x8b\xea\x4b\x6b\x65\x42\x42\x39"
+       "\xf1\xdb\xfa\x54\x69\xc9\xc0\xeb\x92\xd0\x4f\x10\xed\x69\xc4\xf7"
+       "\xf7\xa2\xcc\x94\xb3\x54\x56\x11\x17\xf5\xdc\xcc\x3a\xa7\x5b\x3f"
+       "\x5e\xfd\x2f\xc8\x5d\xe0\xa7\x35\xbc\xd2\xdd\xf7\x45\x89\xfb\xc9"
+       "\x28\xc2\x82\x19\x9c\x06\xda\xf7\x93\x64\xf0\x41\x41\xff\x00\x41"
+       "\x44\x1e\x9b\x01\x54\x9d\x37\x07\xab\x16\x91\x55\xf2\xf5\x5e\x28"
+       "\x5f\x40\x99\xb0\x09\x8c\xd8\xa9\xd3\xef\xff\x89\xba\xb4\xad\x09"
+       "\x98\x0f\x7c\xb7\x60\x6d\x60\x79\x4b\x9b\x28\x7c\x1a\x69\x7a\x23"
+       "\xe1\xed\xad\x0c\xf1\x61\xd2\xab\xf9\xa4\xe7\xd7\x3b\x5b\xfe\x28"
+       "\x7c\xa7\x92\x53\x90\xd7\x5b\xb6\x5d\x9f\x0f\xbe\xfb\xf9\x4f\xd0"
+       "\x4a\x23\x4a\x49\xd9\x29\xf1\x9c\xd7\xbc\x5c\x0e\xb4\x60\x2e\x95"
+       "\x6e\xe2\x24\x02\x8a\x80\x72\x55\xb1\xfa\xc2\x38\xdc\xa1\x4b\x6f"
+       "\xc7\xe1\xb3\xbb\x38\x9b\xdf\xf1\x49\xf1\x11\xbe\x40\xfd\x27\x9f"
+       "\x5a\x1d\x56\x3a\xc2\xa8\x76\xd7\xf4\x9a\x01\x1a\x9e\x40\x10\x79"
+       "\x7e\xa1\x31\xe8\xc8\x35\xcd\x9e\xa5\xa7\x29\x57\xf9\xd6\x1c\xc9"
+       "\x9e\x93\xb8\xfe\x0f\xd9\x8c\xdd\xcc\x77\x9f\xf8\x2c\x70\x36\x48"
+       "\x81\x75\xe4\x61\x01\x98\x9b\xea\x11\xf9\x47\xfb\x77\x1b\x9e\x16"
+       "\x12\x56\x72\x55\xfe\x64\x5f\xa4\xff\x16\x3d\x00\xbd\xa2\xe0\xd1"
+       "\xd4\x47\x36\x7b\x7e\x94\xd1\x22\x7f\xfb\xcb\x5c\x99\x01\x4e\xaf"
+       "\x82\x50\xf3\x2f\xb1\xcb\x12\x7a\x12\x4a\x5c\x62\x9b\x91\x43\xf2"
+       "\x73\xbe\xfd\x87\xfc\xd0\x59\x4c\xa4\xfb\x7f\x15\x55\x0d\x90\x83"
+       "\xd7\xf7\xe8\x20\xb4\x6a\xc7\xb9\xe7\x32\xc6\xda\xb3\x57\x15\x49"
+       "\x96\xf4\xbc\x03\xa6\x98\xe8\xbf\x3d\x61\x0b\x34\xe5\xad\xd6\xb8"
+       "\xd8\x1c\xc6\x1d\x39\x58\xb6\xef\xb2\xd0\x8b\xe5\x60\x9a\x90\x07"
+       "\x9e\x62\xcc\xf2\x5b\xe8\x20\xe5\x88\x57\xf0\x12\xc8\x66\x96\x27"
+       "\x1d\x9e\x34\x56\x2a\x62\x7e\x75\x94\x10\x93\x69\x50\x68\x6a\x48"
+       "\x8d\x02\xda\x4f\x9e\x82\xe2\x8c\xf1\xaf\x07\xe8\x01\x6d\x04\xce"
+       "\x3f\xc2\xbf\x01\x27\xe6\xd6\x73\xfe\x53\x00\xa2\x0e\x1b\xe0\x9f"
+       "\x4d\x3f\x69\x12\xd6\xc9\xf6\x1d\x5b\x50\xbc\x1f\x3d\x8e\xd4\x7f"
+       "\x57\xb9\x3f\xe4\x52\xe4\xae\xde\x54\xef\x09\xbe\xf8\xc8\x67\x0f"
+       "\xda\x1d\x1b\xe6\xf5\x7f\x61\x78\xac\xbb\xc8\xff\xe4\x42\xdd\xbb"
+       "\x44\x19\x94\xeb\xa1\x2a\xfe\x7d\xb9\x2d\xcb\xbe\x50\xff\x9a\xca"
+       "\x44\xf1\x75\xe3\xc6\x4d\x1d\x0c\x5f\xbd\x39\x25\xce\x43\xa6\xfb"
+       "\x9d\x0a\x55\x79\xf4\x6a\x4e\x36\xff\xda\x25\x63\xd4\xae\xb0\xa9"
+       "\x9b\x2f\x1d\x9a\x1e\xfb\x96\xd1\x85\x53\x54\xc4\x2b\x76\xf7\xe0"
+       "\xee\x1c\xcb\x03\xad\xd4\x49\x5c\xeb\x5e\xc3\x6c\x1e\xa9\x2f\x8a"
+       "\x20\xd9\x11\x8a\x94\xd7\x15\x42\xdf\x2c\x15\xa5\x69\xec\x6b\x9b"
+       "\x00\x71\x59\xfc\xbf\x7d\x3f\x77\xa3\xb8\x89\xc8\x66\x75\xc5\xb1"
+       "\xd9\x69\x79\xe4\x6a\x74\x01\xe3\x5a\x6b\x1e\xc6\xa5\xd0\x11\x29"
+       "\xa1\x5a\x1d\xf0\xe2\xf1\x35\x30\x65\xa6\x14\xf4\xcb\x57\xf2\xa1"
+       "\x0d\x92\xcb\x4c\x24\x50\x8d\xfe\x4f\xbf\x55\x9c\xa1\x54\x54\x9b"
+       "\x72\x45\xdc\x13\xea\x57\x8e\xab\xdf\xe6\xa4\x3f\x2f\x7b\xbe\x86"
+       "\x63\x04\x09\x99\xe6\x38\x19\x2b\x88\x50\xe3\x78\x64\x85\x56\x45"
+       "\x53\xc4\xce\xfb\xa0\xc3\xf8\x77\x87\xa7\xa8\x54\x57\xb7\x18\x2c"
+       "\x87\xb6\x87\xca\xe0\x45\x58\x06\xe1\x3e\xc3\x4a\x0c\xaa\xce\xca"
+       "\x25\xee\x53\x85\x2a\x37\x3d\x37\x29\x85\xda\x1d\x2c\x24\xb3\xd4"
+       "\x63\xbf\xe9\x34\xdd\x35\x01\x41\x2e\x27\x9b\x05\x44\x25\xd1\x5b"
+       "\xdf\x59\xcc\x26\xf9\xd4\xdf\x2e\x23\x71\xe6\xc4\x6e\x3b\xac\xe9"
+       "\x75\x27\x74\xe8\xd4\x0d\xc9\xb5\x8f\x58\x27\x25\xef\x9e\x66\x4b"
+       "\x69\x2e\xfb\x07\x39\x91\x4c\x9a\x00\xf6\x62\xd2\xfb\x15\x73\xb4"
+       "\xe1\x7a\x3b\xd1\x7d\x16\x74\xa5\x09\xa0\xc4\x99\x42\xea\x6d\x64"
+       "\xd6\x15\x18\xa6\x9a\x94\x94\x49\x8c\x5e\xe9\x5f\xc5\x40\x14\xa1"
+       "\xc9\xd8\xf2\x1f\x4a\x75\x3d\x14\xde\x3e\x8c\x89\xa0\xf2\xd4\xf3"
+       "\x3d\xb1\x89\xe4\x0a\x8a\x06\x33\x60\x73\x45\xc5\x3b\x99\x29\xbf"
+       "\xd2\x87\x44\x7b\x80\xf8\xe6\x31\x92\xf5\xd6\x44\xcb\xc9\x02\xd9"
+       "\xf9\x66\xc6\x6a\x40\x1f\x40\x1a\x31\x39\xd9\xcc\x4d\xa2\x5b\x6f"
+       "\x42\xc3\x4d\x53\x89\x66\xb3\x72\xb8\x3d\x6b\x21\xc7\xa4\xe1\x14"
+       "\x06\xd3\x3b\xf7\x7b\x1d\x5e\xd9\xb6\xb4\xc7\xeb\xc9\xea\x51\xe7"
+       "\x33\xaa\xdf\xf9\xe6\x5c\x66\x93\xa9\x4b\x03\x73\xdf\x6b\x3f\x3b"
+       "\xad\x27\xb6\xa4\x09\x30\x31\x06\x30\x17\x3a\xaf\xd2\xa6\x71\xd8"
+       "\x60\x8d\x16\xb5\x0d\xec\x1a\xf7\x42\x38\x32\x08\xf2\x32\xeb\x6a"
+       "\x12\xdc\x77\x42\x93\xe8\x29\x2a\x06\xe9\x2f\xbf\x4d\xfb\xb1\x59"
+       "\x31\xaf\xe7\xfa\x3e\xb1\x54\x33\xe2\xfa\xde\x99\x16\x2e\xeb\x35"
+       "\x52\x98\x12\x12\x7c\x69\xa1\x7f\x9a\xe1\xd8\x2d\x51\xba\xda\xb6"
+       "\x90\xf1\x8a\x57\xed\x86\x20\xfe\x79\x08\x57\xc5\x80\x15\x7b\xf5"
+       "\xfb\x92\x1d\xd1\x31\x8d\x9d\xa1\xc3\x46\x69\xfa\x71\x2c\x42",
+       "\xc6\x54\xf9\xf0\x22\x2c\xc3\xee\xdd\x13\x02\xb8\xe7\x5a\x2e\x7e"
+      }
+#endif /* USE_CAMELLIA */
+    };
+  gcry_cipher_hd_t hde, hdd;
+  unsigned char out[MAX_DATA_LEN * 2];
+  unsigned char tag[16];
+  int i, keylen;
+  gcry_error_t err = 0;
+  size_t taglen2;
 
-         err = gcry_cipher_decrypt (hdd, out, tv[i].inlen, NULL, 0);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_decrypt (%d) failed: %s\n",
-                   i, gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
+  if (verbose)
+    fprintf (stderr, "  Starting GCM-SIV checks.\n");
 
-         if (memcmp (tv[i].plaintext, out, tv[i].inlen))
-           fail ("aes-gcm-siv, decrypt mismatch entry %d\n", i);
+  for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++)
+    {
+      if ((err = gcry_cipher_test_algo (tv[i].algo)))
+        {
+          if (in_fips_mode && (tv[i].flags & FLAG_NOFIPS))
+            {
+              if (verbose)
+                fprintf (stderr, "    algorithm %d not available in fips mode\n",
+                         tv[i].algo);
+            }
+          else
+            fail ("algo %d GCM-SIV, gcry_cipher_test_algo unexpectedly failed: %s\n",
+                  tv[i].algo, gpg_strerror (err));
+          continue;
+        }
+      else if (in_fips_mode && (tv[i].flags & FLAG_NOFIPS))
+        {
+          fail ("algo %d GCM-SIV, gcry_cipher_test_algo did not fail as expected\n",
+                tv[i].algo);
+          continue;
+        }
 
-         err = gcry_cipher_checktag (hdd, tag, taglen2);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_checktag (%d) failed: %s\n",
-                   i, gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
+      if (verbose)
+       fprintf (stderr, "    checking GCM-SIV mode for %s [%i]\n",
+                gcry_cipher_algo_name (tv[i].algo),
+                tv[i].algo);
+      err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_GCM_SIV, 0);
+      if (!err)
+       err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_GCM_SIV, 0);
+      if (err)
+       {
+          fail ("algo %d GCM-SIV, gcry_cipher_open failed: %s\n", tv[i].algo,
+               gpg_strerror (err));
+         return;
        }
-      else
+
+      keylen = gcry_cipher_get_algo_keylen (tv[i].algo);
+      if (!keylen)
+        {
+          fail ("algo %d GCM-SIV, gcry_cipher_get_algo_keylen failed\n",
+               tv[i].algo);
+          return;
+        }
+
+      err = gcry_cipher_setkey (hde, tv[i].key, keylen);
+      if (!err)
+       err = gcry_cipher_setkey (hdd, tv[i].key, keylen);
+      if (err)
        {
-         err = gcry_cipher_gettag (hde, tag, taglen2);
-         if (err)
-           {
-             fail ("aes-gcm-siv, gcry_cipher_gettag(%d) failed: %s\n",
-                   i, gpg_strerror (err));
-             gcry_cipher_close (hde);
-             gcry_cipher_close (hdd);
-             return;
-           }
+         fail ("algo %d GCM-SIV, gcry_cipher_setkey failed: %s\n", tv[i].algo,
+               gpg_strerror (err));
+         gcry_cipher_close (hde);
+         gcry_cipher_close (hdd);
+         return;
+       }
 
-         if (memcmp (tv[i].tag, tag, taglen2))
-           {
-             mismatch (tv[i].tag, taglen2, tag, taglen2);
-             fail ("aes-gcm-siv, tag mismatch entry %d\n", i);
-           }
+      err = gcry_cipher_setiv (hde, tv[i].nonce, 12);
+      if (!err)
+       err = gcry_cipher_setiv (hdd, tv[i].nonce, 12);
+      if (err)
+       {
+         fail ("algo %d GCM-SIV, gcry_cipher_setiv failed: %s\n", tv[i].algo,
+               gpg_strerror (err));
+         gcry_cipher_close (hde);
+         gcry_cipher_close (hdd);
+         return;
+       }
 
-         err = gcry_cipher_checktag (hdd, tv[i].tag, taglen2);
+      if (tv[i].adlen >= 0)
+       {
+         err = gcry_cipher_authenticate (hde, tv[i].ad, tv[i].adlen);
+         if (!err)
+           err = gcry_cipher_authenticate (hdd, tv[i].ad, tv[i].adlen);
          if (err)
            {
-             fail ("aes-gcm-siv, gcry_cipher_checktag (%d) failed: %s\n",
-                   i, gpg_strerror (err));
+             fail ("algo %d GCM-SIV, gcry_cipher_authenticate failed: %s\n",
+                   tv[i].algo, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+       }
+
+      err = gcry_cipher_info (hde, GCRYCTL_GET_TAGLEN, NULL, &taglen2);
+      if (err)
+       {
+         fail ("cipher-siv, gcryctl_get_taglen failed (tv %d): %s\n",
+               i, gpg_strerror (err));
+         gcry_cipher_close (hde);
+         gcry_cipher_close (hdd);
+         return;
+       }
+      if (taglen2 != 16)
+       {
+         fail ("cipher-siv, gcryctl_get_taglen returned bad length"
+               " (tv %d): got=%zu want=%d\n",
+               i, taglen2, 16);
+         gcry_cipher_close (hde);
+         gcry_cipher_close (hdd);
+         return;
+       }
+
+      if (tv[i].inlen)
+       {
+         err = gcry_cipher_encrypt (hde, out, tv[i].inlen,
+                                    tv[i].plaintext, tv[i].inlen);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_encrypt (%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+
+         if (memcmp (tv[i].out, out, tv[i].inlen))
+           {
+             mismatch (tv[i].out, tv[i].inlen, out, tv[i].inlen);
+             fail ("algo %d GCM-SIV, encrypt mismatch entry %d\n",
+                   tv[i].algo, i);
+           }
+
+         err = gcry_cipher_gettag (hde, tag, taglen2);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_gettag(%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+
+         if (memcmp (tv[i].tag, tag, taglen2))
+           {
+             mismatch (tv[i].tag, taglen2, tag, taglen2);
+             fail ("algo %d GCM-SIV, tag mismatch entry %d\n", tv[i].algo, i);
+           }
+
+         err = gcry_cipher_set_decryption_tag (hdd, tag, taglen2);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_set_decryption_tag (%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+
+         err = gcry_cipher_decrypt (hdd, out, tv[i].inlen, NULL, 0);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_decrypt (%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+
+         if (memcmp (tv[i].plaintext, out, tv[i].inlen))
+           fail ("algo %d GCM-SIV, decrypt mismatch entry %d\n",
+                 tv[i].algo, i);
+
+         err = gcry_cipher_checktag (hdd, tag, taglen2);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_checktag (%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+       }
+      else
+       {
+         err = gcry_cipher_gettag (hde, tag, taglen2);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_gettag(%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
+             gcry_cipher_close (hde);
+             gcry_cipher_close (hdd);
+             return;
+           }
+
+         if (memcmp (tv[i].tag, tag, taglen2))
+           {
+             mismatch (tv[i].tag, taglen2, tag, taglen2);
+             fail ("algo %d GCM-SIV, tag mismatch entry %d\n", tv[i].algo, i);
+           }
+
+         err = gcry_cipher_checktag (hdd, tv[i].tag, taglen2);
+         if (err)
+           {
+             fail ("algo %d GCM-SIV, gcry_cipher_checktag (%d) failed: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
              gcry_cipher_close (hde);
              gcry_cipher_close (hdd);
              return;
@@ -6334,8 +7196,8 @@ check_gcm_siv_cipher (void)
          err = gcry_cipher_checktag (hdd, tag, taglen2);
          if (gpg_err_code (err) != GPG_ERR_CHECKSUM)
            {
-             fail ("aes-gcm-siv, gcry_cipher_checktag (%d) expected checksum fail: %s\n",
-                   i, gpg_strerror (err));
+             fail ("algo %d GCM-SIV, gcry_cipher_checktag (%d) expected checksum fail: %s\n",
+                   tv[i].algo, i, gpg_strerror (err));
              gcry_cipher_close (hde);
              gcry_cipher_close (hdd);
              return;
@@ -8893,6 +9755,14 @@ check_ocb_cipher (void)
   check_ocb_cipher_largebuf(GCRY_CIPHER_SM4, 16,
     "\x3c\x32\x54\x5d\xc5\x17\xa1\x16\x3f\x8e\xc7\x1d\x8d\x8b\x2d\xb0");
 #endif /* USE_SM4 */
+#if USE_ARIA
+  check_ocb_cipher_largebuf(GCRY_CIPHER_ARIA128, 16,
+    "\x6c\xcc\x69\x34\x3b\xa3\x55\xe5\xdc\xf6\x13\xe0\x5b\x08\x6a\xd9");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_ARIA192, 24,
+    "\x78\xcb\x2e\xa4\x76\xca\x4b\x01\xe8\x34\x44\x00\x9a\x99\x99\x01");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_ARIA256, 32,
+    "\x70\x92\x29\xf5\xbc\x73\xa8\x02\xcc\x80\xac\x0b\xd4\x86\x7f\x43");
+#endif /* USE_ARIA */
 
   /* Check that the AAD data is correctly buffered.  */
   check_ocb_cipher_splitaad ();
@@ -9421,6 +10291,8 @@ check_gost28147_cipher_basic (enum gcry_cipher_algos algo)
         gcry_cipher_close (hdd);
     }
 
+#else
+  (void) algo;
 #endif
 }
 
@@ -10759,256 +11631,697 @@ check_bulk_cipher_modes (void)
                  gcry_cipher_algo_name (tv[i].algo), tv[i].mode, gpg_strerror (err));
         }
 
-      err = gcry_cipher_open (&hde, tv[i].algo, tv[i].mode, 0);
-      if (!err)
-        err = gcry_cipher_open (&hdd, tv[i].algo, tv[i].mode, 0);
-      if (err)
-        {
-          fail ("gcry_cipher_open failed: %s\n", gpg_strerror (err));
-          goto leave;
-        }
+      err = gcry_cipher_open (&hde, tv[i].algo, tv[i].mode, 0);
+      if (!err)
+        err = gcry_cipher_open (&hdd, tv[i].algo, tv[i].mode, 0);
+      if (err)
+        {
+          fail ("gcry_cipher_open failed: %s\n", gpg_strerror (err));
+          goto leave;
+        }
+
+      keylen = gcry_cipher_get_algo_keylen(tv[i].algo);
+      if (!keylen)
+        {
+          fail ("gcry_cipher_get_algo_keylen failed\n");
+          goto leave;
+        }
+
+      clutter_vector_registers();
+      err = gcry_cipher_setkey (hde, tv[i].key, tv[i].keylen);
+      clutter_vector_registers();
+      if (!err)
+        err = gcry_cipher_setkey (hdd, tv[i].key, tv[i].keylen);
+      if (err)
+        {
+          fail ("gcry_cipher_setkey failed: %s\n", gpg_strerror (err));
+          goto leave;
+        }
+
+      blklen = gcry_cipher_get_algo_blklen(tv[i].algo);
+      if (!blklen)
+        {
+          fail ("gcry_cipher_get_algo_blklen failed\n");
+          goto leave;
+        }
+
+      clutter_vector_registers();
+      err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen);
+      clutter_vector_registers();
+      if (!err)
+        err = gcry_cipher_setiv (hdd, tv[i].iv,  tv[i].ivlen);
+      if (err)
+        {
+          fail ("gcry_cipher_setiv failed: %s\n", gpg_strerror (err));
+          goto leave;
+        }
+
+      /* Fill the buffer with our test pattern.  */
+      for (j=0; j < buflen; j++)
+        buffer[j] = ((j & 0xff) ^ ((j >> 8) & 0xff));
+
+      clutter_vector_registers();
+      err = gcry_cipher_encrypt (hde, outbuf, buflen, buffer, buflen);
+      if (err)
+        {
+          fail ("gcry_cipher_encrypt (algo %d, mode %d) failed: %s\n",
+                tv[i].algo, tv[i].mode, gpg_strerror (err));
+          goto leave;
+        }
+
+      gcry_md_hash_buffer (GCRY_MD_SHA1, hash, outbuf, buflen);
+#if 0
+      printf ("/*[%d]*/\n", i);
+      fputs ("      {", stdout);
+      for (j=0; j < 20; j++)
+        printf (" 0x%02x%c%s", hash[j], j==19? ' ':',', j == 9? "\n       ":"");
+      puts ("}");
+#endif
+
+      if (memcmp (hash, tv[i].t1_hash, 20))
+        fail ("encrypt mismatch (algo %d, mode %d)\n",
+              tv[i].algo, tv[i].mode);
+
+      clutter_vector_registers();
+      err = gcry_cipher_decrypt (hdd, outbuf, buflen, NULL, 0);
+      if (err)
+        {
+          fail ("gcry_cipher_decrypt (algo %d, mode %d) failed: %s\n",
+                tv[i].algo, tv[i].mode, gpg_strerror (err));
+          goto leave;
+        }
+
+      if (memcmp (buffer, outbuf, buflen))
+        fail ("decrypt mismatch (algo %d, mode %d)\n",
+              tv[i].algo, tv[i].mode);
+
+      gcry_cipher_close (hde); hde = NULL;
+      gcry_cipher_close (hdd); hdd = NULL;
+    }
+
+  if (verbose)
+    fprintf (stderr, "Completed bulk cipher checks.\n");
+ leave:
+  gcry_cipher_close (hde);
+  gcry_cipher_close (hdd);
+  gcry_free (buffer_base);
+  gcry_free (outbuf_base);
+}
+
+
+static unsigned int
+get_algo_mode_blklen (int algo, int mode)
+{
+  unsigned int blklen = gcry_cipher_get_algo_blklen(algo);
+
+  /* Some modes override blklen. */
+  switch (mode)
+    {
+    case GCRY_CIPHER_MODE_STREAM:
+    case GCRY_CIPHER_MODE_OFB:
+    case GCRY_CIPHER_MODE_CTR:
+    case GCRY_CIPHER_MODE_CFB:
+    case GCRY_CIPHER_MODE_CFB8:
+    case GCRY_CIPHER_MODE_CCM:
+    case GCRY_CIPHER_MODE_GCM:
+    case GCRY_CIPHER_MODE_EAX:
+    case GCRY_CIPHER_MODE_POLY1305:
+      return 1;
+    }
+
+  return blklen;
+}
+
+
+static unsigned int
+get_algo_mode_taglen (int algo, int mode)
+{
+  switch (mode)
+    {
+    case GCRY_CIPHER_MODE_CCM:
+    case GCRY_CIPHER_MODE_GCM:
+    case GCRY_CIPHER_MODE_POLY1305:
+      return 16;
+    case GCRY_CIPHER_MODE_EAX:
+      return gcry_cipher_get_algo_blklen(algo);
+    }
+
+  return 0;
+}
+
+
+static int
+check_one_cipher_core_reset (gcry_cipher_hd_t hd, int algo, int mode, int pass,
+                             int nplain)
+{
+  static const unsigned char iv[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  u64 ctl_params[3];
+  int err;
+
+  gcry_cipher_reset (hd);
+
+  if (mode == GCRY_CIPHER_MODE_OCB || mode == GCRY_CIPHER_MODE_CCM)
+    {
+      clutter_vector_registers();
+      err = gcry_cipher_setiv (hd, iv, sizeof(iv));
+      if (err)
+        {
+          fail ("pass %d, algo %d, mode %d, gcry_cipher_setiv failed: %s\n",
+                pass, algo, mode, gpg_strerror (err));
+          gcry_cipher_close (hd);
+          return -1;
+        }
+    }
+
+  if (mode == GCRY_CIPHER_MODE_CCM)
+    {
+      ctl_params[0] = nplain; /* encryptedlen */
+      ctl_params[1] = 0; /* aadlen */
+      ctl_params[2] = 16; /* authtaglen */
+      err = gcry_cipher_ctl (hd, GCRYCTL_SET_CCM_LENGTHS, ctl_params,
+                            sizeof(ctl_params));
+      if (err)
+        {
+          fail ("pass %d, algo %d, mode %d, gcry_cipher_ctl "
+                "GCRYCTL_SET_CCM_LENGTHS failed: %s\n",
+                pass, algo, mode, gpg_strerror (err));
+          gcry_cipher_close (hd);
+          return -1;
+        }
+    }
+
+  return 0;
+}
+
+/* The core of the cipher check.  In addition to the parameters passed
+   to check_one_cipher it also receives the KEY and the plain data.
+   PASS is printed with error messages.  The function returns 0 on
+   success.  */
+static int
+check_one_cipher_core (int algo, int mode, int flags,
+                       const char *key, size_t nkey,
+                       const unsigned char *plain, size_t nplain,
+                       int bufshift, int split_mode, int pass)
+{
+  gcry_cipher_hd_t hd;
+  unsigned char *in_buffer, *out_buffer;
+  unsigned char *enc_result;
+  unsigned char tag_result[16];
+  unsigned char tag[16];
+  unsigned char *in, *out;
+  int keylen;
+  gcry_error_t err = 0;
+  unsigned int blklen;
+  unsigned int piecelen;
+  unsigned int pos;
+  unsigned int taglen;
+
+  in_buffer = malloc (nplain + 1);
+  out_buffer = malloc (nplain + 1);
+  enc_result = malloc (nplain);
+  if (!in_buffer || !out_buffer || !enc_result)
+    {
+      fail ("pass %d, algo %d, mode %d, malloc failed\n",
+           pass, algo, mode);
+      goto err_out_free;
+    }
+
+  blklen = get_algo_mode_blklen(algo, mode);
+  taglen = get_algo_mode_taglen(algo, mode);
+
+  assert (nkey == 64);
+  assert (nplain > 0);
+  assert ((nplain % 16) == 0);
+  assert (blklen > 0);
+
+  if ((mode == GCRY_CIPHER_MODE_CBC && (flags & GCRY_CIPHER_CBC_CTS)) ||
+      mode == GCRY_CIPHER_MODE_XTS)
+    {
+      /* Input cannot be split in to multiple operations with CTS. */
+      blklen = nplain;
+    }
+
+  if (!bufshift)
+    {
+      in = in_buffer;
+      out = out_buffer;
+    }
+  else if (bufshift == 1)
+    {
+      in = in_buffer+1;
+      out = out_buffer;
+    }
+  else if (bufshift == 2)
+    {
+      in = in_buffer+1;
+      out = out_buffer+1;
+    }
+  else
+    {
+      in = in_buffer;
+      out = out_buffer+1;
+    }
+
+  keylen = gcry_cipher_get_algo_keylen (algo);
+  if (!keylen)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_get_algo_keylen failed\n",
+           pass, algo, mode);
+      goto err_out_free;
+    }
+
+  if (keylen < 40 / 8 || keylen > 32)
+    {
+      fail ("pass %d, algo %d, mode %d, keylength problem (%d)\n", pass, algo, mode, keylen);
+      goto err_out_free;
+    }
+
+  if (mode == GCRY_CIPHER_MODE_XTS)
+    {
+      keylen *= 2;
+    }
+
+  err = gcry_cipher_open (&hd, algo, mode, flags);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_open failed: %s\n",
+            pass, algo, mode, gpg_strerror (err));
+      goto err_out_free;
+    }
+
+  clutter_vector_registers();
+  err = gcry_cipher_setkey (hd, key, keylen);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_setkey failed: %s\n",
+           pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
+
+  clutter_vector_registers();
+  err = gcry_cipher_encrypt (hd, out, nplain, plain, nplain);
+  if (in_fips_mode && mode == GCRY_CIPHER_MODE_GCM)
+    {
+      if (!err)
+        fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt is expected to "
+              "fail in FIPS mode: %s\n", pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
+           pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  if (taglen > 0)
+    {
+      clutter_vector_registers();
+      err = gcry_cipher_gettag (hd, tag, taglen);
+      if (err)
+       {
+         fail ("pass %d, algo %d, mode %d, gcry_cipher_gettag failed: %s\n",
+               pass, algo, mode, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
+
+      memcpy(tag_result, tag, taglen);
+    }
+
+  memcpy (enc_result, out, nplain);
+
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
+
+  clutter_vector_registers();
+  err = gcry_cipher_decrypt (hd, in, nplain, out, nplain);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_decrypt failed: %s\n",
+           pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  if (taglen > 0)
+    {
+      clutter_vector_registers();
+      err = gcry_cipher_checktag (hd, tag_result, taglen);
+      if (err)
+       {
+         fail ("pass %d, algo %d, mode %d, gcry_cipher_checktag failed: %s\n",
+               pass, algo, mode, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
+    }
+
+  if (memcmp (plain, in, nplain))
+    fail ("pass %d, algo %d, mode %d, encrypt-decrypt mismatch\n",
+          pass, algo, mode);
+
+  /* Again, using in-place encryption.  */
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
+
+  memcpy (out, plain, nplain);
+  clutter_vector_registers();
+  err = gcry_cipher_encrypt (hd, out, nplain, NULL, 0);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_encrypt failed:"
+            " %s\n",
+           pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  if (taglen > 0)
+    {
+      err = gcry_cipher_gettag (hd, tag, taglen);
+      if (err)
+       {
+         fail ("pass %d, algo %d, mode %d, in-place, "
+               "gcry_cipher_gettag failed: %s\n",
+               pass, algo, mode, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
+
+      if (memcmp (tag_result, tag, taglen))
+       fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n",
+             pass, algo, mode);
+    }
+
+  if (memcmp (enc_result, out, nplain))
+    fail ("pass %d, algo %d, mode %d, in-place, encrypt mismatch\n",
+          pass, algo, mode);
+
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
+
+  clutter_vector_registers();
+  err = gcry_cipher_decrypt (hd, out, nplain, NULL, 0);
+  if (err)
+    {
+      fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_decrypt failed:"
+            " %s\n",
+           pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      goto err_out_free;
+    }
+
+  if (taglen > 0)
+    {
+      clutter_vector_registers();
+      err = gcry_cipher_checktag (hd, tag_result, taglen);
+      if (err)
+       {
+         fail ("pass %d, algo %d, mode %d, in-place, "
+               "gcry_cipher_checktag failed: %s\n",
+               pass, algo, mode, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
+    }
+
+  if (memcmp (plain, out, nplain))
+    fail ("pass %d, algo %d, mode %d, in-place, encrypt-decrypt mismatch\n",
+          pass, algo, mode);
+
+  /* Again, splitting encryption in multiple operations. */
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
 
-      keylen = gcry_cipher_get_algo_keylen(tv[i].algo);
-      if (!keylen)
-        {
-          fail ("gcry_cipher_get_algo_keylen failed\n");
-          goto leave;
-        }
+  piecelen = blklen;
+  pos = 0;
+  while (pos < nplain)
+    {
+      if (piecelen > nplain - pos)
+        piecelen = nplain - pos;
 
       clutter_vector_registers();
-      err = gcry_cipher_setkey (hde, tv[i].key, tv[i].keylen);
-      clutter_vector_registers();
-      if (!err)
-        err = gcry_cipher_setkey (hdd, tv[i].key, tv[i].keylen);
+      err = gcry_cipher_encrypt (hd, out + pos, piecelen, plain + pos,
+                                 piecelen);
       if (err)
         {
-          fail ("gcry_cipher_setkey failed: %s\n", gpg_strerror (err));
-          goto leave;
+          fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
+                "piecelen: %d), gcry_cipher_encrypt failed: %s\n",
+                pass, algo, mode, pos, piecelen, gpg_strerror (err));
+          gcry_cipher_close (hd);
+          goto err_out_free;
         }
 
-      blklen = gcry_cipher_get_algo_blklen(tv[i].algo);
-      if (!blklen)
-        {
-          fail ("gcry_cipher_get_algo_blklen failed\n");
-          goto leave;
-        }
+      pos += piecelen;
+      piecelen = split_mode == 1
+                  ? (piecelen + blklen)
+                  : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
+    }
 
+  if (taglen > 0)
+    {
       clutter_vector_registers();
-      err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen);
+      err = gcry_cipher_gettag (hd, tag, taglen);
+      if (err)
+       {
+         fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
+                "piecelen: %d), gcry_cipher_gettag failed: %s\n",
+               pass, algo, mode, pos, piecelen, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
+
+      if (memcmp (tag_result, tag, taglen))
+       fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n",
+             pass, algo, mode);
+    }
+
+  if (memcmp (enc_result, out, nplain))
+    fail ("pass %d, algo %d, mode %d, split-buffer, encrypt mismatch\n",
+          pass, algo, mode);
+
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
+
+  piecelen = blklen;
+  pos = 0;
+  while (pos < nplain)
+    {
+      if (piecelen > nplain - pos)
+        piecelen = nplain - pos;
+
       clutter_vector_registers();
-      if (!err)
-        err = gcry_cipher_setiv (hdd, tv[i].iv,  tv[i].ivlen);
+      err = gcry_cipher_decrypt (hd, in + pos, piecelen, out + pos, piecelen);
       if (err)
         {
-          fail ("gcry_cipher_setiv failed: %s\n", gpg_strerror (err));
-          goto leave;
+          fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
+                "piecelen: %d), gcry_cipher_decrypt failed: %s\n",
+                pass, algo, mode, pos, piecelen, gpg_strerror (err));
+          gcry_cipher_close (hd);
+          goto err_out_free;
         }
 
-      /* Fill the buffer with our test pattern.  */
-      for (j=0; j < buflen; j++)
-        buffer[j] = ((j & 0xff) ^ ((j >> 8) & 0xff));
+      pos += piecelen;
+      piecelen = split_mode == 1
+                  ? (piecelen + blklen)
+                  : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
+    }
 
+  if (taglen > 0)
+    {
       clutter_vector_registers();
-      err = gcry_cipher_encrypt (hde, outbuf, buflen, buffer, buflen);
+      err = gcry_cipher_checktag (hd, tag_result, taglen);
       if (err)
-        {
-          fail ("gcry_cipher_encrypt (algo %d, mode %d) failed: %s\n",
-                tv[i].algo, tv[i].mode, gpg_strerror (err));
-          goto leave;
-        }
+       {
+         fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
+                "piecelen: %d), gcry_cipher_checktag failed: %s\n",
+               pass, algo, mode, pos, piecelen, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
+    }
 
-      gcry_md_hash_buffer (GCRY_MD_SHA1, hash, outbuf, buflen);
-#if 0
-      printf ("/*[%d]*/\n", i);
-      fputs ("      {", stdout);
-      for (j=0; j < 20; j++)
-        printf (" 0x%02x%c%s", hash[j], j==19? ' ':',', j == 9? "\n       ":"");
-      puts ("}");
-#endif
+  if (memcmp (plain, in, nplain))
+    fail ("pass %d, algo %d, mode %d, split-buffer, encrypt-decrypt mismatch\n",
+          pass, algo, mode);
 
-      if (memcmp (hash, tv[i].t1_hash, 20))
-        fail ("encrypt mismatch (algo %d, mode %d)\n",
-              tv[i].algo, tv[i].mode);
+  /* Again, using in-place encryption and splitting encryption in multiple
+   * operations. */
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
+
+  piecelen = blklen;
+  pos = 0;
+  while (pos < nplain)
+    {
+      if (piecelen > nplain - pos)
+        piecelen = nplain - pos;
 
+      memcpy (out + pos, plain + pos, piecelen);
       clutter_vector_registers();
-      err = gcry_cipher_decrypt (hdd, outbuf, buflen, NULL, 0);
+      err = gcry_cipher_encrypt (hd, out + pos, piecelen, NULL, 0);
       if (err)
         {
-          fail ("gcry_cipher_decrypt (algo %d, mode %d) failed: %s\n",
-                tv[i].algo, tv[i].mode, gpg_strerror (err));
-          goto leave;
+          fail ("pass %d, algo %d, mode %d, in-place split-buffer (pos: %d, "
+                "piecelen: %d), gcry_cipher_encrypt failed: %s\n",
+                pass, algo, mode, pos, piecelen, gpg_strerror (err));
+          gcry_cipher_close (hd);
+          goto err_out_free;
         }
 
-      if (memcmp (buffer, outbuf, buflen))
-        fail ("decrypt mismatch (algo %d, mode %d)\n",
-              tv[i].algo, tv[i].mode);
-
-      gcry_cipher_close (hde); hde = NULL;
-      gcry_cipher_close (hdd); hdd = NULL;
+      pos += piecelen;
+      piecelen = split_mode == 1
+                  ? (piecelen + blklen)
+                  : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
     }
 
-  if (verbose)
-    fprintf (stderr, "Completed bulk cipher checks.\n");
- leave:
-  gcry_cipher_close (hde);
-  gcry_cipher_close (hdd);
-  gcry_free (buffer_base);
-  gcry_free (outbuf_base);
-}
-
+  if (memcmp (enc_result, out, nplain))
+    fail ("pass %d, algo %d, mode %d, in-place split-buffer, encrypt mismatch\n",
+          pass, algo, mode);
 
-static unsigned int
-get_algo_mode_blklen (int algo, int mode)
-{
-  unsigned int blklen = gcry_cipher_get_algo_blklen(algo);
+  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+    goto err_out_free;
 
-  /* Some modes override blklen. */
-  switch (mode)
+  piecelen = blklen;
+  pos = 0;
+  while (pos < nplain)
     {
-    case GCRY_CIPHER_MODE_STREAM:
-    case GCRY_CIPHER_MODE_OFB:
-    case GCRY_CIPHER_MODE_CTR:
-    case GCRY_CIPHER_MODE_CFB:
-    case GCRY_CIPHER_MODE_CFB8:
-    case GCRY_CIPHER_MODE_CCM:
-    case GCRY_CIPHER_MODE_GCM:
-    case GCRY_CIPHER_MODE_EAX:
-    case GCRY_CIPHER_MODE_POLY1305:
-      return 1;
+      if (piecelen > nplain - pos)
+        piecelen = nplain - pos;
+
+      clutter_vector_registers();
+      err = gcry_cipher_decrypt (hd, out + pos, piecelen, NULL, 0);
+      if (err)
+        {
+          fail ("pass %d, algo %d, mode %d, in-place split-buffer (pos: %d, "
+                "piecelen: %d), gcry_cipher_decrypt failed: %s\n",
+                pass, algo, mode, pos, piecelen, gpg_strerror (err));
+          gcry_cipher_close (hd);
+          goto err_out_free;
+        }
+
+      pos += piecelen;
+      piecelen = split_mode == 1
+                  ? (piecelen + blklen)
+                  : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
     }
 
-  return blklen;
-}
+  if (memcmp (plain, out, nplain))
+    fail ("pass %d, algo %d, mode %d, in-place split-buffer, encrypt-decrypt"
+          " mismatch\n", pass, algo, mode);
 
 
-static unsigned int
-get_algo_mode_taglen (int algo, int mode)
-{
-  switch (mode)
-    {
-    case GCRY_CIPHER_MODE_CCM:
-    case GCRY_CIPHER_MODE_GCM:
-    case GCRY_CIPHER_MODE_POLY1305:
-      return 16;
-    case GCRY_CIPHER_MODE_EAX:
-      return gcry_cipher_get_algo_blklen(algo);
-    }
+  gcry_cipher_close (hd);
 
+  free (enc_result);
+  free (out_buffer);
+  free (in_buffer);
   return 0;
+
+err_out_free:
+  free (enc_result);
+  free (out_buffer);
+  free (in_buffer);
+  return -1;
 }
 
 
+
 static int
-check_one_cipher_core_reset (gcry_cipher_hd_t hd, int algo, int mode, int pass,
-                             int nplain)
+check_one_cipher_ctr_reset (gcry_cipher_hd_t hd, int algo, int mode,
+                           u32 ctr_high_bits, int be_ctr,
+                           int pass)
 {
-  static const unsigned char iv[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-  u64 ctl_params[3];
+  unsigned char iv[16] = { 0 };
+  unsigned char swap;
+  unsigned int ivlen;
+  u32 ctr_low_bits;
   int err;
+  int i;
+
+  /* This should be largest parallel block processing count in any
+   * implementation negated. Currently for CTR this is 32 and, for
+   * ChaCha20, count is 8. */
+  ctr_low_bits = (mode == GCRY_CIPHER_MODE_CTR) ? -32 : -8;
 
   gcry_cipher_reset (hd);
 
-  if (mode == GCRY_CIPHER_MODE_OCB || mode == GCRY_CIPHER_MODE_CCM)
+  if (mode == GCRY_CIPHER_MODE_CTR)
+    ivlen = get_algo_mode_blklen(algo, GCRY_CIPHER_MODE_ECB);
+  else
+    ivlen = 16;
+
+  /* Little-endian fill. */
+  for (i = 0; i < 4; i++)
+    iv[i + 0] = (ctr_low_bits >> (i * 8)) & 0xff;
+  for (i = 0; i < 4; i++)
+    iv[i + 4] = (ctr_high_bits >> (i * 8)) & 0xff;
+
+  if (be_ctr)
     {
-      clutter_vector_registers();
-      err = gcry_cipher_setiv (hd, iv, sizeof(iv));
-      if (err)
-        {
-          fail ("pass %d, algo %d, mode %d, gcry_cipher_setiv failed: %s\n",
-                pass, algo, mode, gpg_strerror (err));
-          gcry_cipher_close (hd);
-          return -1;
-        }
+      /* Swap to big-endian. */
+      for (i = 0; i < ivlen / 2; i++)
+       {
+         swap = iv[i];
+         iv[i] = iv[ivlen - (i + 1)];
+         iv[ivlen - (i + 1)] = swap;
+       }
     }
 
-  if (mode == GCRY_CIPHER_MODE_CCM)
+  clutter_vector_registers();
+  if (mode == GCRY_CIPHER_MODE_CTR)
+    err = gcry_cipher_setctr (hd, iv, ivlen);
+  else
+    err = gcry_cipher_setiv (hd, iv, ivlen);
+
+  if (err)
     {
-      ctl_params[0] = nplain; /* encryptedlen */
-      ctl_params[1] = 0; /* aadlen */
-      ctl_params[2] = 16; /* authtaglen */
-      err = gcry_cipher_ctl (hd, GCRYCTL_SET_CCM_LENGTHS, ctl_params,
-                            sizeof(ctl_params));
-      if (err)
-        {
-          fail ("pass %d, algo %d, mode %d, gcry_cipher_ctl "
-                "GCRYCTL_SET_CCM_LENGTHS failed: %s\n",
-                pass, algo, mode, gpg_strerror (err));
-          gcry_cipher_close (hd);
-          return -1;
-        }
+      fail ("pass %d, algo %d, mode %d, gcry_cipher_setiv failed: %s\n",
+           pass, algo, mode, gpg_strerror (err));
+      gcry_cipher_close (hd);
+      return -1;
     }
 
   return 0;
 }
 
-/* The core of the cipher check.  In addition to the parameters passed
-   to check_one_cipher it also receives the KEY and the plain data.
-   PASS is printed with error messages.  The function returns 0 on
-   success.  */
 static int
-check_one_cipher_core (int algo, int mode, int flags,
-                       const char *key, size_t nkey,
-                       const unsigned char *plain, size_t nplain,
-                       int bufshift, int pass)
+check_one_cipher_ctr_overflow (int algo, int mode, int flags,
+                              const char *key, size_t nkey,
+                              const unsigned char *plain, size_t nplain,
+                              unsigned long ctr_high_bits, int be_ctr,
+                              int pass)
 {
   gcry_cipher_hd_t hd;
-  unsigned char *in_buffer, *out_buffer;
+  unsigned char *out;
   unsigned char *enc_result;
-  unsigned char tag_result[16];
-  unsigned char tag[16];
-  unsigned char *in, *out;
   int keylen;
   gcry_error_t err = 0;
+  unsigned int firstlen;
+  unsigned int leftlen;
   unsigned int blklen;
-  unsigned int piecelen;
   unsigned int pos;
-  unsigned int taglen;
+  unsigned int i;
 
-  in_buffer = malloc (nplain + 1);
-  out_buffer = malloc (nplain + 1);
+  out = malloc (nplain);
   enc_result = malloc (nplain);
-  if (!in_buffer || !out_buffer || !enc_result)
+  if (!out || !enc_result)
     {
       fail ("pass %d, algo %d, mode %d, malloc failed\n",
            pass, algo, mode);
       goto err_out_free;
     }
 
-  blklen = get_algo_mode_blklen(algo, mode);
-  taglen = get_algo_mode_taglen(algo, mode);
-
   assert (nkey == 64);
   assert (nplain > 0);
   assert ((nplain % 16) == 0);
-  assert (blklen > 0);
-
-  if ((mode == GCRY_CIPHER_MODE_CBC && (flags & GCRY_CIPHER_CBC_CTS)) ||
-      mode == GCRY_CIPHER_MODE_XTS)
-    {
-      /* Input cannot be split in to multiple operations with CTS. */
-      blklen = nplain;
-    }
-
-  if (!bufshift)
-    {
-      in = in_buffer;
-      out = out_buffer;
-    }
-  else if (bufshift == 1)
-    {
-      in = in_buffer+1;
-      out = out_buffer;
-    }
-  else if (bufshift == 2)
-    {
-      in = in_buffer+1;
-      out = out_buffer+1;
-    }
-  else
-    {
-      in = in_buffer;
-      out = out_buffer+1;
-    }
 
   keylen = gcry_cipher_get_algo_keylen (algo);
   if (!keylen)
@@ -11020,20 +12333,16 @@ check_one_cipher_core (int algo, int mode, int flags,
 
   if (keylen < 40 / 8 || keylen > 32)
     {
-      fail ("pass %d, algo %d, mode %d, keylength problem (%d)\n", pass, algo, mode, keylen);
+      fail ("pass %d, algo %d, mode %d, keylength problem (%d)\n",
+           pass, algo, mode, keylen);
       goto err_out_free;
     }
 
-  if (mode == GCRY_CIPHER_MODE_XTS)
-    {
-      keylen *= 2;
-    }
-
   err = gcry_cipher_open (&hd, algo, mode, flags);
   if (err)
     {
       fail ("pass %d, algo %d, mode %d, gcry_cipher_open failed: %s\n",
-            pass, algo, mode, gpg_strerror (err));
+           pass, algo, mode, gpg_strerror (err));
       goto err_out_free;
     }
 
@@ -11047,623 +12356,1027 @@ check_one_cipher_core (int algo, int mode, int flags,
       goto err_out_free;
     }
 
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
+  if (check_one_cipher_ctr_reset (hd, algo, mode, ctr_high_bits, be_ctr,
+                                 pass) < 0)
     goto err_out_free;
 
-  clutter_vector_registers();
-  err = gcry_cipher_encrypt (hd, out, nplain, plain, nplain);
-  if (in_fips_mode && mode == GCRY_CIPHER_MODE_GCM)
+  /* Non-bulk processing. */
+  for (i = 0; i < nplain; i += 16)
     {
-      if (!err)
-        fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt is expected to "
-              "fail in FIPS mode: %s\n", pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
-      goto err_out_free;
+      clutter_vector_registers();
+      err = gcry_cipher_encrypt (hd, out + i, 16, plain + i, 16);
+      if (err)
+       {
+         fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
+               pass, algo, mode, gpg_strerror (err));
+         gcry_cipher_close (hd);
+         goto err_out_free;
+       }
     }
-  if (err)
+
+  memcpy (enc_result, out, nplain);
+
+  /* Test with different bulk processing sizes. */
+  for (blklen = 2 * 16; blklen <= 32 * 16; blklen *= 2)
     {
-      fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
-      goto err_out_free;
+      /* Move bulk processing start offset, test at different spots to
+       * test bulk counter calculation throughly. */
+      for (firstlen = 16; firstlen < 8 * 64; firstlen += 16)
+       {
+         if (check_one_cipher_ctr_reset (hd, algo, mode, ctr_high_bits, be_ctr,
+                                         pass) < 0)
+           goto err_out_free;
+
+         clutter_vector_registers();
+         err = gcry_cipher_encrypt (hd, out, firstlen, plain, firstlen);
+         if (err)
+           {
+             fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt "
+                   "failed: %s\n", pass, algo, mode, gpg_strerror (err));
+             gcry_cipher_close (hd);
+             goto err_out_free;
+           }
+
+         leftlen = nplain - firstlen;
+         pos = firstlen;
+         while (leftlen)
+           {
+             unsigned int currlen = leftlen > blklen ? blklen : leftlen;
+
+             clutter_vector_registers();
+             err = gcry_cipher_encrypt (hd, out + pos, currlen, plain + pos,
+                                        currlen);
+             if (err)
+               {
+                 fail ("pass %d, algo %d, mode %d, block len %d, first len %d,"
+                       "gcry_cipher_encrypt failed: %s\n", pass, algo, mode,
+                       blklen, firstlen, gpg_strerror (err));
+                 gcry_cipher_close (hd);
+                 goto err_out_free;
+               }
+
+             pos += currlen;
+             leftlen -= currlen;
+           }
+
+         if (memcmp (enc_result, out, nplain))
+           fail ("pass %d, algo %d, mode %d, block len %d, first len %d, "
+                 "encrypt mismatch\n", pass, algo, mode, blklen, firstlen);
+       }
+    }
+
+  gcry_cipher_close (hd);
+
+  free (enc_result);
+  free (out);
+  return 0;
+
+err_out_free:
+  free (enc_result);
+  free (out);
+  return -1;
+}
+
+
+static void
+check_one_cipher (int algo, int mode, int flags)
+{
+  size_t medium_buffer_size = 2048 - 16;
+  size_t large_buffer_size = 64 * 1024 + 1024 - 16;
+  char key[64+1];
+  unsigned char *plain;
+  int bufshift, i;
+
+  plain = malloc (large_buffer_size + 1);
+  if (!plain)
+    {
+      fail ("pass %d, algo %d, mode %d, malloc failed\n", -1, algo, mode);
+      return;
+    }
+
+  for (bufshift = 0; bufshift < 4; bufshift++)
+    {
+      /* Pass 0: Standard test.  */
+      memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF_"
+                  "0123456789abcdef.,;/[]{}-=ABCDEF", 64);
+      memcpy (plain, "foobar42FOOBAR17", 16);
+      for (i = 16; i < medium_buffer_size; i += 16)
+        {
+          memcpy (&plain[i], &plain[i-16], 16);
+          if (!++plain[i+7])
+            plain[i+6]++;
+          if (!++plain[i+15])
+            plain[i+14]++;
+        }
+
+      if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
+                                medium_buffer_size, bufshift, 0,
+                                0+10*bufshift))
+        goto out;
+
+      /* Pass 1: Key not aligned.  */
+      memmove (key+1, key, 64);
+      if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain,
+                                medium_buffer_size, bufshift, 0,
+                                1+10*bufshift))
+        goto out;
+
+      /* Pass 2: Key not aligned and data not aligned.  */
+      memmove (plain+1, plain, medium_buffer_size);
+      if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1,
+                                medium_buffer_size, bufshift, 0,
+                                2+10*bufshift))
+        goto out;
+
+      /* Pass 3: Key aligned and data not aligned.  */
+      memmove (key, key+1, 64);
+      if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1,
+                                medium_buffer_size, bufshift, 0,
+                                3+10*bufshift))
+        goto out;
     }
 
-  if (taglen > 0)
+  /* Pass 5: Large buffer test.  */
+  memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF_"
+               "0123456789abcdef.,;/[]{}-=ABCDEF", 64);
+  memcpy (plain, "foobar42FOOBAR17", 16);
+  for (i = 16; i < large_buffer_size; i += 16)
     {
-      clutter_vector_registers();
-      err = gcry_cipher_gettag (hd, tag, taglen);
-      if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, gcry_cipher_gettag failed: %s\n",
-               pass, algo, mode, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
-
-      memcpy(tag_result, tag, taglen);
+      memcpy (&plain[i], &plain[i-16], 16);
+      if (!++plain[i+7])
+       plain[i+6]++;
+      if (!++plain[i+15])
+       plain[i+14]++;
     }
 
-  memcpy (enc_result, out, nplain);
+  if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
+                            large_buffer_size, bufshift, 0,
+                            50))
+    goto out;
 
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
+  if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
+                            large_buffer_size, bufshift, 1,
+                            51))
+    goto out;
 
-  clutter_vector_registers();
-  err = gcry_cipher_decrypt (hd, in, nplain, out, nplain);
-  if (err)
+  /* Pass 6: Counter overflow tests for ChaCha20 and CTR mode. */
+  if (mode == GCRY_CIPHER_MODE_STREAM && algo == GCRY_CIPHER_CHACHA20)
     {
-      fail ("pass %d, algo %d, mode %d, gcry_cipher_decrypt failed: %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
-      goto err_out_free;
+      /* 32bit overflow test (little-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+                                         medium_buffer_size, 0UL,
+                                         0, 60))
+       goto out;
+      /* 64bit overflow test (little-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+                                         medium_buffer_size, 0xffffffffUL,
+                                         0, 61))
+       goto out;
     }
-
-  if (taglen > 0)
+   else if (mode == GCRY_CIPHER_MODE_CTR)
     {
-      clutter_vector_registers();
-      err = gcry_cipher_checktag (hd, tag_result, taglen);
-      if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, gcry_cipher_checktag failed: %s\n",
-               pass, algo, mode, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
+      /* 32bit overflow test (big-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+                                         medium_buffer_size, 0UL,
+                                         1, 62))
+       goto out;
+      /* 64bit overflow test (big-endian counter) */
+      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
+                                         medium_buffer_size, 0xffffffffUL,
+                                         1, 63))
+       goto out;
     }
 
-  if (memcmp (plain, in, nplain))
-    fail ("pass %d, algo %d, mode %d, encrypt-decrypt mismatch\n",
-          pass, algo, mode);
+out:
+  free (plain);
+}
 
-  /* Again, using in-place encryption.  */
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
 
-  memcpy (out, plain, nplain);
-  clutter_vector_registers();
-  err = gcry_cipher_encrypt (hd, out, nplain, NULL, 0);
-  if (err)
-    {
-      fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_encrypt failed:"
-            " %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
-      goto err_out_free;
-    }
 
-  if (taglen > 0)
-    {
-      err = gcry_cipher_gettag (hd, tag, taglen);
-      if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, in-place, "
-               "gcry_cipher_gettag failed: %s\n",
-               pass, algo, mode, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
+static void buf_xor(void *vdst, const void *vsrc1, const void *vsrc2, size_t len)
+{
+  char *dst = vdst;
+  const char *src1 = vsrc1;
+  const char *src2 = vsrc2;
 
-      if (memcmp (tag_result, tag, taglen))
-       fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n",
-             pass, algo, mode);
+  while (len)
+    {
+      *(char *)dst = *(char *)src1 ^ *(char *)src2;
+      dst++;
+      src1++;
+      src2++;
+      len--;
     }
+}
 
-  if (memcmp (enc_result, out, nplain))
-    fail ("pass %d, algo %d, mode %d, in-place, encrypt mismatch\n",
-          pass, algo, mode);
-
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
+/* Run the tests for <block cipher>-CBC-<block size>, tests bulk CBC
+   decryption.  Returns NULL on success. */
+static int
+cipher_cbc_bulk_test (int cipher_algo)
+{
+  const int nblocks = 128 - 1;
+  int i, offs;
+  int blocksize;
+  const char *cipher;
+  gcry_cipher_hd_t hd_one;
+  gcry_cipher_hd_t hd_cbc;
+  gcry_error_t err = 0;
+  unsigned char *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+  unsigned char *getivbuf;
+  unsigned int memsize;
+  unsigned int keylen;
+
+  static const unsigned char key[32] = {
+      0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22,
+      0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22
+    };
 
-  clutter_vector_registers();
-  err = gcry_cipher_decrypt (hd, out, nplain, NULL, 0);
-  if (err)
+  if (gcry_cipher_test_algo (cipher_algo))
+    return -1;
+  blocksize = gcry_cipher_get_algo_blklen(cipher_algo);
+  if (blocksize < 8)
+    return -1;
+  cipher = gcry_cipher_algo_name (cipher_algo);
+  keylen = gcry_cipher_get_algo_keylen (cipher_algo);
+  if (keylen > sizeof(key))
     {
-      fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_decrypt failed:"
-            " %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
-      goto err_out_free;
+      fail ("%s-CBC-%d test failed (key too short)", cipher, blocksize * 8);
+      return -1;
     }
 
-  if (taglen > 0)
-    {
-      clutter_vector_registers();
-      err = gcry_cipher_checktag (hd, tag_result, taglen);
-      if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, in-place, "
-               "gcry_cipher_checktag failed: %s\n",
-               pass, algo, mode, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
-    }
+  memsize = (blocksize * 2) + (blocksize * nblocks * 3) + 16 + (blocksize + 1);
 
-  if (memcmp (plain, out, nplain))
-    fail ("pass %d, algo %d, mode %d, in-place, encrypt-decrypt mismatch\n",
-          pass, algo, mode);
+  mem = xcalloc (1, memsize);
+  if (!mem)
+    return -1;
 
-  /* Again, splitting encryption in multiple operations. */
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
+  offs = (16 - ((uintptr_t)mem & 15)) & 15;
+  iv = (void*)(mem + offs);
+  iv2 = iv + blocksize;
+  plaintext = iv2 + blocksize;
+  plaintext2 = plaintext + nblocks * blocksize;
+  ciphertext = plaintext2 + nblocks * blocksize;
+  getivbuf = ciphertext + nblocks * blocksize;
 
-  piecelen = blklen;
-  pos = 0;
-  while (pos < nplain)
+  err = gcry_cipher_open (&hd_one, cipher_algo, GCRY_CIPHER_MODE_ECB, 0);
+  if (err)
     {
-      if (piecelen > nplain - pos)
-        piecelen = nplain - pos;
-
-      clutter_vector_registers();
-      err = gcry_cipher_encrypt (hd, out + pos, piecelen, plain + pos,
-                                 piecelen);
-      if (err)
-        {
-          fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
-                "piecelen: %d), gcry_cipher_encrypt failed: %s\n",
-                pass, algo, mode, pos, piecelen, gpg_strerror (err));
-          gcry_cipher_close (hd);
-          goto err_out_free;
-        }
-
-      pos += piecelen;
-      piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (cipher open fail)", cipher, blocksize * 8);
+      return -1;
     }
-
-  if (taglen > 0)
+  err = gcry_cipher_open (&hd_cbc, cipher_algo, GCRY_CIPHER_MODE_CBC, 0);
+  if (err)
     {
-      clutter_vector_registers();
-      err = gcry_cipher_gettag (hd, tag, taglen);
-      if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
-                "piecelen: %d), gcry_cipher_gettag failed: %s\n",
-               pass, algo, mode, pos, piecelen, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
-
-      if (memcmp (tag_result, tag, taglen))
-       fail ("pass %d, algo %d, mode %d, in-place, tag mismatch\n",
-             pass, algo, mode);
+      gcry_cipher_close (hd_one);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (cipher open fail)", cipher, blocksize * 8);
+      return -1;
     }
 
-  if (memcmp (enc_result, out, nplain))
-    fail ("pass %d, algo %d, mode %d, split-buffer, encrypt mismatch\n",
-          pass, algo, mode);
-
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
-
-  piecelen = blklen;
-  pos = 0;
-  while (pos < nplain)
+  /* Initialize ctx */
+  if (gcry_cipher_setkey (hd_one, key, keylen) ||
+      gcry_cipher_setkey (hd_cbc, key, keylen))
     {
-      if (piecelen > nplain - pos)
-        piecelen = nplain - pos;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (setkey fail)", cipher, blocksize * 8);
+      return -1;
+    }
 
-      clutter_vector_registers();
-      err = gcry_cipher_decrypt (hd, in + pos, piecelen, out + pos, piecelen);
-      if (err)
-        {
-          fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
-                "piecelen: %d), gcry_cipher_decrypt failed: %s\n",
-                pass, algo, mode, pos, piecelen, gpg_strerror (err));
-          gcry_cipher_close (hd);
-          goto err_out_free;
-        }
+  /* Test single block code path */
+  memset (iv, 0x4e, blocksize);
+  memset (iv2, 0x4e, blocksize);
+  for (i = 0; i < blocksize; i++)
+    plaintext[i] = i;
 
-      pos += piecelen;
-      piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+  /* CBC manually.  */
+  buf_xor (ciphertext, iv, plaintext, blocksize);
+  err = gcry_cipher_encrypt (hd_one, ciphertext, blocksize,
+                             ciphertext, blocksize);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+      return -1;
     }
+  memcpy (iv, ciphertext, blocksize);
 
-  if (taglen > 0)
+  /* CBC decrypt.  */
+  err = gcry_cipher_setiv (hd_cbc, iv2, blocksize);
+  if (err)
     {
-      clutter_vector_registers();
-      err = gcry_cipher_checktag (hd, tag_result, taglen);
-      if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, split-buffer (pos: %d, "
-                "piecelen: %d), gcry_cipher_checktag failed: %s\n",
-               pass, algo, mode, pos, piecelen, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (setiv fail)", cipher, blocksize * 8);
+      return -1;
     }
-
-  if (memcmp (plain, in, nplain))
-    fail ("pass %d, algo %d, mode %d, split-buffer, encrypt-decrypt mismatch\n",
-          pass, algo, mode);
-
-  /* Again, using in-place encryption and splitting encryption in multiple
-   * operations. */
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
-
-  piecelen = blklen;
-  pos = 0;
-  while (pos < nplain)
+  err = gcry_cipher_decrypt (hd_cbc, plaintext2, blocksize * 1,
+                             ciphertext, blocksize * 1);
+  if (err)
     {
-      if (piecelen > nplain - pos)
-        piecelen = nplain - pos;
-
-      memcpy (out + pos, plain + pos, piecelen);
-      clutter_vector_registers();
-      err = gcry_cipher_encrypt (hd, out + pos, piecelen, NULL, 0);
-      if (err)
-        {
-          fail ("pass %d, algo %d, mode %d, in-place split-buffer (pos: %d, "
-                "piecelen: %d), gcry_cipher_encrypt failed: %s\n",
-                pass, algo, mode, pos, piecelen, gpg_strerror (err));
-          gcry_cipher_close (hd);
-          goto err_out_free;
-        }
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (CBC decrypt fail)", cipher, blocksize * 8);
+      return -1;
+    }
 
-      pos += piecelen;
-      piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+  if (memcmp (plaintext2, plaintext, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree (mem);
+      fail ("%s-CBC-%d test failed (plaintext mismatch)", cipher, blocksize * 8);
+      return -1;
     }
 
-  if (memcmp (enc_result, out, nplain))
-    fail ("pass %d, algo %d, mode %d, in-place split-buffer, encrypt mismatch\n",
-          pass, algo, mode);
+  err = gcry_cipher_ctl (hd_cbc, PRIV_CIPHERCTL_GET_INPUT_VECTOR, getivbuf,
+                        blocksize + 1);
+  if (err || getivbuf[0] != blocksize)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (CBC getiv fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp (getivbuf + 1, iv, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree (mem);
+      fail ("%s-CBC-%d test failed (IV mismatch)", cipher, blocksize * 8);
+      return -1;
+    }
 
-  if (check_one_cipher_core_reset (hd, algo, mode, pass, nplain) < 0)
-    goto err_out_free;
+  /* Test parallelized code paths */
+  memset (iv, 0x5f, blocksize);
+  memset (iv2, 0x5f, blocksize);
 
-  piecelen = blklen;
-  pos = 0;
-  while (pos < nplain)
-    {
-      if (piecelen > nplain - pos)
-        piecelen = nplain - pos;
+  for (i = 0; i < nblocks * blocksize; i++)
+    plaintext[i] = i;
 
-      clutter_vector_registers();
-      err = gcry_cipher_decrypt (hd, out + pos, piecelen, NULL, 0);
+  /* Create CBC ciphertext manually.  */
+  for (i = 0; i < nblocks * blocksize; i+=blocksize)
+    {
+      buf_xor (&ciphertext[i], iv, &plaintext[i], blocksize);
+      err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+                                 &ciphertext[i], blocksize);
       if (err)
         {
-          fail ("pass %d, algo %d, mode %d, in-place split-buffer (pos: %d, "
-                "piecelen: %d), gcry_cipher_decrypt failed: %s\n",
-                pass, algo, mode, pos, piecelen, gpg_strerror (err));
-          gcry_cipher_close (hd);
-          goto err_out_free;
+          gcry_cipher_close (hd_one);
+          gcry_cipher_close (hd_cbc);
+          xfree(mem);
+          fail ("%s-CBC-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+          return -1;
         }
-
-      pos += piecelen;
-      piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+      memcpy (iv, &ciphertext[i], blocksize);
     }
 
-  if (memcmp (plain, out, nplain))
-    fail ("pass %d, algo %d, mode %d, in-place split-buffer, encrypt-decrypt"
-          " mismatch\n", pass, algo, mode);
-
-
-  gcry_cipher_close (hd);
+  /* Decrypt using bulk CBC and compare result.  */
+  err = gcry_cipher_setiv (hd_cbc, iv2, blocksize);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (setiv fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  err = gcry_cipher_decrypt (hd_cbc, plaintext2, blocksize * nblocks,
+                             ciphertext, blocksize * nblocks);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (CBC decrypt fail)", cipher, blocksize * 8);
+      return -1;
+    }
 
-  free (enc_result);
-  free (out_buffer);
-  free (in_buffer);
-  return 0;
+  if (memcmp (plaintext2, plaintext, nblocks * blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree (mem);
+      fail ("%s-CBC-%d test failed (plaintext mismatch, parallel path)",
+            cipher, blocksize * 8);
+      return -1;
+    }
+  err = gcry_cipher_ctl (hd_cbc, PRIV_CIPHERCTL_GET_INPUT_VECTOR, getivbuf,
+                        blocksize + 1);
+  if (err || getivbuf[0] != blocksize)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree(mem);
+      fail ("%s-CBC-%d test failed (CBC getiv fail, parallel path)",
+           cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp (getivbuf + 1, iv, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cbc);
+      xfree (mem);
+      fail ("%s-CBC-%d test failed (IV mismatch, parallel path)",
+            cipher, blocksize * 8);
+      return -1;
+    }
 
-err_out_free:
-  free (enc_result);
-  free (out_buffer);
-  free (in_buffer);
+  gcry_cipher_close (hd_one);
+  gcry_cipher_close (hd_cbc);
+  xfree (mem);
   return -1;
 }
 
 
+static void
+buf_xor_2dst(void *vdst1, void *vdst2, const void *vsrc, size_t len)
+{
+  byte *dst1 = vdst1;
+  byte *dst2 = vdst2;
+  const byte *src = vsrc;
+
+  for (; len; len--)
+    *dst1++ = (*dst2++ ^= *src++);
+}
 
+/* Run the tests for <block cipher>-CFB-<block size>, tests bulk CFB
+   decryption.  Returns -1 on failure; callers ignore the return value. */
 static int
-check_one_cipher_ctr_reset (gcry_cipher_hd_t hd, int algo, int mode,
-                           u32 ctr_high_bits, int be_ctr,
-                           int pass)
+cipher_cfb_bulk_test (int cipher_algo)
 {
-  unsigned char iv[16] = { 0 };
-  unsigned char swap;
-  unsigned int ivlen;
-  u32 ctr_low_bits;
-  int err;
-  int i;
+  const int nblocks = 128 - 1;
+  int blocksize;
+  const char *cipher;
+  gcry_cipher_hd_t hd_one;
+  gcry_cipher_hd_t hd_cfb;
+  gcry_error_t err = 0;
+  int i, offs;
+  unsigned char *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+  unsigned char *getivbuf;
+  unsigned int memsize;
+  unsigned int keylen;
+
+  static const unsigned char key[32] = {
+      0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33,
+      0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
+    };
 
-  /* This should be largest parallel block processing count in any
-   * implementation negated. Currently for CTR this is 32 and, for
-   * ChaCha20, count is 8. */
-  ctr_low_bits = (mode == GCRY_CIPHER_MODE_CTR) ? -32 : -8;
+  if (gcry_cipher_test_algo (cipher_algo))
+    return -1;
+  blocksize = gcry_cipher_get_algo_blklen(cipher_algo);
+  if (blocksize < 8)
+    return -1;
+  cipher = gcry_cipher_algo_name (cipher_algo);
+  keylen = gcry_cipher_get_algo_keylen (cipher_algo);
+  if (keylen > sizeof(key))
+    {
+      fail ("%s-CFB-%d test failed (key too short)", cipher, blocksize * 8);
+      return -1;
+    }
 
-  gcry_cipher_reset (hd);
+  memsize = (blocksize * 2) + (blocksize * nblocks * 3) + 16 + (blocksize + 1);
 
-  if (mode == GCRY_CIPHER_MODE_CTR)
-    ivlen = get_algo_mode_blklen(algo, GCRY_CIPHER_MODE_ECB);
-  else
-    ivlen = 16;
+  mem = xcalloc (1, memsize);
+  if (!mem)
+    return -1;
 
-  /* Little-endian fill. */
-  for (i = 0; i < 4; i++)
-    iv[i + 0] = (ctr_low_bits >> (i * 8)) & 0xff;
-  for (i = 0; i < 4; i++)
-    iv[i + 4] = (ctr_high_bits >> (i * 8)) & 0xff;
+  offs = (16 - ((uintptr_t)mem & 15)) & 15;
+  iv = (void*)(mem + offs);
+  iv2 = iv + blocksize;
+  plaintext = iv2 + blocksize;
+  plaintext2 = plaintext + nblocks * blocksize;
+  ciphertext = plaintext2 + nblocks * blocksize;
+  getivbuf = ciphertext + nblocks * blocksize;
 
-  if (be_ctr)
+  err = gcry_cipher_open (&hd_one, cipher_algo, GCRY_CIPHER_MODE_ECB, 0);
+  if (err)
     {
-      /* Swap to big-endian. */
-      for (i = 0; i < ivlen / 2; i++)
-       {
-         swap = iv[i];
-         iv[i] = iv[ivlen - (i + 1)];
-         iv[ivlen - (i + 1)] = swap;
-       }
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (cipher open fail)", cipher, blocksize * 8);
+      return -1;
     }
-
-  clutter_vector_registers();
-  if (mode == GCRY_CIPHER_MODE_CTR)
-    err = gcry_cipher_setctr (hd, iv, ivlen);
-  else
-    err = gcry_cipher_setiv (hd, iv, ivlen);
-
+  err = gcry_cipher_open (&hd_cfb, cipher_algo, GCRY_CIPHER_MODE_CFB, 0);
   if (err)
     {
-      fail ("pass %d, algo %d, mode %d, gcry_cipher_setiv failed: %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
+      gcry_cipher_close (hd_one);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (cipher open fail)", cipher, blocksize * 8);
       return -1;
     }
 
-  return 0;
-}
-
-static int
-check_one_cipher_ctr_overflow (int algo, int mode, int flags,
-                              const char *key, size_t nkey,
-                              const unsigned char *plain, size_t nplain,
-                              unsigned long ctr_high_bits, int be_ctr,
-                              int pass)
-{
-  gcry_cipher_hd_t hd;
-  unsigned char *out;
-  unsigned char *enc_result;
-  int keylen;
-  gcry_error_t err = 0;
-  unsigned int firstlen;
-  unsigned int leftlen;
-  unsigned int blklen;
-  unsigned int pos;
-  unsigned int i;
-
-  out = malloc (nplain);
-  enc_result = malloc (nplain);
-  if (!out || !enc_result)
+  /* Initialize ctx */
+  if (gcry_cipher_setkey (hd_one, key, keylen) ||
+      gcry_cipher_setkey (hd_cfb, key, keylen))
     {
-      fail ("pass %d, algo %d, mode %d, malloc failed\n",
-           pass, algo, mode);
-      goto err_out_free;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (setkey fail)", cipher, blocksize * 8);
+      return -1;
     }
 
-  assert (nkey == 64);
-  assert (nplain > 0);
-  assert ((nplain % 16) == 0);
+  /* Test single block code path */
+  memset(iv, 0xd3, blocksize);
+  memset(iv2, 0xd3, blocksize);
+  for (i = 0; i < blocksize; i++)
+    plaintext[i] = i;
 
-  keylen = gcry_cipher_get_algo_keylen (algo);
-  if (!keylen)
+  /* CFB manually.  */
+  err = gcry_cipher_encrypt (hd_one, ciphertext, blocksize, iv, blocksize);
+  if (err)
     {
-      fail ("pass %d, algo %d, mode %d, gcry_cipher_get_algo_keylen failed\n",
-           pass, algo, mode);
-      goto err_out_free;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+      return -1;
     }
+  buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
 
-  if (keylen < 40 / 8 || keylen > 32)
+  /* CFB decrypt.  */
+  err = gcry_cipher_setiv (hd_cfb, iv2, blocksize);
+  if (err)
     {
-      fail ("pass %d, algo %d, mode %d, keylength problem (%d)\n",
-           pass, algo, mode, keylen);
-      goto err_out_free;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (setiv fail)", cipher, blocksize * 8);
+      return -1;
     }
-
-  err = gcry_cipher_open (&hd, algo, mode, flags);
+  err = gcry_cipher_decrypt (hd_cfb, plaintext2, blocksize * 1,
+                             ciphertext, blocksize * 1);
   if (err)
     {
-      fail ("pass %d, algo %d, mode %d, gcry_cipher_open failed: %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      goto err_out_free;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (CFB decrypt fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp(plaintext2, plaintext, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (plaintext mismatch)",
+            cipher, blocksize * 8);
+      return -1;
     }
 
-  clutter_vector_registers();
-  err = gcry_cipher_setkey (hd, key, keylen);
-  if (err)
+  err = gcry_cipher_ctl (hd_cfb, PRIV_CIPHERCTL_GET_INPUT_VECTOR, getivbuf,
+                        blocksize + 1);
+  if (err || getivbuf[0] != blocksize)
     {
-      fail ("pass %d, algo %d, mode %d, gcry_cipher_setkey failed: %s\n",
-           pass, algo, mode, gpg_strerror (err));
-      gcry_cipher_close (hd);
-      goto err_out_free;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (CFB getiv fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp(getivbuf + 1, iv, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (IV mismatch)",
+            cipher, blocksize * 8);
+      return -1;
     }
 
-  if (check_one_cipher_ctr_reset (hd, algo, mode, ctr_high_bits, be_ctr,
-                                 pass) < 0)
-    goto err_out_free;
+  /* Test parallelized code paths */
+  memset(iv, 0xe6, blocksize);
+  memset(iv2, 0xe6, blocksize);
 
-  /* Non-bulk processing. */
-  for (i = 0; i < nplain; i += 16)
+  for (i = 0; i < nblocks * blocksize; i++)
+    plaintext[i] = i;
+
+  /* Create CFB ciphertext manually.  */
+  for (i = 0; i < nblocks * blocksize; i+=blocksize)
     {
-      clutter_vector_registers();
-      err = gcry_cipher_encrypt (hd, out + i, 16, plain + i, 16);
+      err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+                                 iv, blocksize);
       if (err)
-       {
-         fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
-               pass, algo, mode, gpg_strerror (err));
-         gcry_cipher_close (hd);
-         goto err_out_free;
-       }
+        {
+          gcry_cipher_close (hd_one);
+          gcry_cipher_close (hd_cfb);
+          xfree(mem);
+          fail ("%s-CFB-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+          return -1;
+        }
+      buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
     }
 
-  memcpy (enc_result, out, nplain);
-
-  /* Test with different bulk processing sizes. */
-  for (blklen = 2 * 16; blklen <= 32 * 16; blklen *= 2)
+  /* Decrypt using bulk CFB and compare result.  */
+  err = gcry_cipher_setiv (hd_cfb, iv2, blocksize);
+  if (err)
     {
-      /* Move bulk processing start offset, test at different spots to
-       * test bulk counter calculation throughly. */
-      for (firstlen = 16; firstlen < 8 * 64; firstlen += 16)
-       {
-         if (check_one_cipher_ctr_reset (hd, algo, mode, ctr_high_bits, be_ctr,
-                                         pass) < 0)
-           goto err_out_free;
-
-         clutter_vector_registers();
-         err = gcry_cipher_encrypt (hd, out, firstlen, plain, firstlen);
-         if (err)
-           {
-             fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt "
-                   "failed: %s\n", pass, algo, mode, gpg_strerror (err));
-             gcry_cipher_close (hd);
-             goto err_out_free;
-           }
-
-         leftlen = nplain - firstlen;
-         pos = firstlen;
-         while (leftlen)
-           {
-             unsigned int currlen = leftlen > blklen ? blklen : leftlen;
-
-             clutter_vector_registers();
-             err = gcry_cipher_encrypt (hd, out + pos, currlen, plain + pos,
-                                        currlen);
-             if (err)
-               {
-                 fail ("pass %d, algo %d, mode %d, block len %d, first len %d,"
-                       "gcry_cipher_encrypt failed: %s\n", pass, algo, mode,
-                       blklen, firstlen, gpg_strerror (err));
-                 gcry_cipher_close (hd);
-                 goto err_out_free;
-               }
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (setiv fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  err = gcry_cipher_decrypt (hd_cfb, plaintext2, blocksize * nblocks,
+                             ciphertext, blocksize * nblocks);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (CFB decrypt fail)", cipher, blocksize * 8);
+      return -1;
+    }
 
-             pos += currlen;
-             leftlen -= currlen;
-           }
+  if (memcmp(plaintext2, plaintext, nblocks * blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (plaintext mismatch, parallel path)",
+            cipher, blocksize * 8);
+      return -1;
+    }
 
-         if (memcmp (enc_result, out, nplain))
-           fail ("pass %d, algo %d, mode %d, block len %d, first len %d, "
-                 "encrypt mismatch\n", pass, algo, mode, blklen, firstlen);
-       }
+  err = gcry_cipher_ctl (hd_cfb, PRIV_CIPHERCTL_GET_INPUT_VECTOR, getivbuf,
+                        blocksize + 1);
+  if (err || getivbuf[0] != blocksize)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (CFB getiv fail, parallel path)",
+           cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp(getivbuf + 1, iv, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_cfb);
+      xfree(mem);
+      fail ("%s-CFB-%d test failed (IV mismatch, parallel path)",
+            cipher, blocksize * 8);
+      return -1;
     }
 
-  gcry_cipher_close (hd);
-
-  free (enc_result);
-  free (out);
-  return 0;
-
-err_out_free:
-  free (enc_result);
-  free (out);
+  gcry_cipher_close (hd_one);
+  gcry_cipher_close (hd_cfb);
+  xfree(mem);
   return -1;
 }
 
 
-static void
-check_one_cipher (int algo, int mode, int flags)
+/* Run the tests for <block cipher>-CTR-<block size>, tests IV increment
+   of bulk CTR encryption.  Returns -1 on failure; return value is unused. */
+static int
+cipher_ctr_bulk_test (int cipher_algo)
 {
-  size_t medium_buffer_size = 2048 - 16;
-  size_t large_buffer_size = 64 * 1024 + 1024 - 16;
-  char key[64+1];
-  unsigned char *plain;
-  int bufshift, i;
+  const int nblocks = 128 - 1;
+  int blocksize;
+  const char *cipher;
+  gcry_cipher_hd_t hd_one;
+  gcry_cipher_hd_t hd_ctr;
+  gcry_error_t err = 0;
+  int i, j, offs, diff;
+  unsigned char *plaintext, *plaintext2, *ciphertext, *ciphertext2,
+                *iv, *iv2, *mem;
+  unsigned char *getctrbuf;
+  unsigned int memsize;
+  unsigned int keylen;
+
+  static const unsigned char key[32] = {
+      0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21,
+      0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+    };
 
-  plain = malloc (large_buffer_size + 1);
-  if (!plain)
+  if (gcry_cipher_test_algo (cipher_algo))
+    return -1;
+  blocksize = gcry_cipher_get_algo_blklen(cipher_algo);
+  if (blocksize < 8)
+    return -1;
+  cipher = gcry_cipher_algo_name (cipher_algo);
+  keylen = gcry_cipher_get_algo_keylen (cipher_algo);
+  if (keylen > sizeof(key))
     {
-      fail ("pass %d, algo %d, mode %d, malloc failed\n", -1, algo, mode);
-      return;
+      fail ("%s-CTR-%d test failed (key too short)", cipher, blocksize * 8);
+      return -1;
     }
 
-  for (bufshift = 0; bufshift < 4; bufshift++)
+  memsize = (blocksize * 2) + (blocksize * nblocks * 4) + 16 + (blocksize + 1);
+
+  mem = xcalloc (1, memsize);
+  if (!mem)
+    return -1;
+
+  offs = (16 - ((uintptr_t)mem & 15)) & 15;
+  iv = (void*)(mem + offs);
+  iv2 = iv + blocksize;
+  plaintext = iv2 + blocksize;
+  plaintext2 = plaintext + nblocks * blocksize;
+  ciphertext = plaintext2 + nblocks * blocksize;
+  ciphertext2 = ciphertext + nblocks * blocksize;
+  getctrbuf = ciphertext2 + nblocks * blocksize;
+
+  err = gcry_cipher_open (&hd_one, cipher_algo, GCRY_CIPHER_MODE_ECB, 0);
+  if (err)
     {
-      /* Pass 0: Standard test.  */
-      memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF_"
-                  "0123456789abcdef.,;/[]{}-=ABCDEF", 64);
-      memcpy (plain, "foobar42FOOBAR17", 16);
-      for (i = 16; i < medium_buffer_size; i += 16)
-        {
-          memcpy (&plain[i], &plain[i-16], 16);
-          if (!++plain[i+7])
-            plain[i+6]++;
-          if (!++plain[i+15])
-            plain[i+14]++;
-        }
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (cipher open fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  err = gcry_cipher_open (&hd_ctr, cipher_algo, GCRY_CIPHER_MODE_CTR, 0);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (cipher open fail)", cipher, blocksize * 8);
+      return -1;
+    }
 
-      if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
-                                medium_buffer_size, bufshift,
-                                0+10*bufshift))
-        goto out;
+  /* Initialize ctx */
+  if (gcry_cipher_setkey (hd_one, key, keylen) ||
+      gcry_cipher_setkey (hd_ctr, key, keylen))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (setkey fail)", cipher, blocksize * 8);
+      return -1;
+    }
 
-      /* Pass 1: Key not aligned.  */
-      memmove (key+1, key, 64);
-      if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain,
-                                medium_buffer_size, bufshift,
-                                1+10*bufshift))
-        goto out;
+  /* Test single block code path */
+  memset (iv, 0xff, blocksize);
+  for (i = 0; i < blocksize; i++)
+    plaintext[i] = i;
 
-      /* Pass 2: Key not aligned and data not aligned.  */
-      memmove (plain+1, plain, medium_buffer_size);
-      if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1,
-                                medium_buffer_size, bufshift,
-                                2+10*bufshift))
-        goto out;
+  /* CTR manually.  */
+  err = gcry_cipher_encrypt (hd_one, ciphertext, blocksize, iv, blocksize);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  for (i = 0; i < blocksize; i++)
+    ciphertext[i] ^= plaintext[i];
+  for (i = blocksize; i > 0; i--)
+    {
+      iv[i-1]++;
+      if (iv[i-1])
+        break;
+    }
 
-      /* Pass 3: Key aligned and data not aligned.  */
-      memmove (key, key+1, 64);
-      if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1,
-                                medium_buffer_size, bufshift,
-                                3+10*bufshift))
-        goto out;
+  memset (iv2, 0xff, blocksize);
+  err = gcry_cipher_setctr (hd_ctr, iv2, blocksize);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (setiv fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  err = gcry_cipher_encrypt (hd_ctr, plaintext2, blocksize * 1,
+                             ciphertext, blocksize * 1);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (CTR encrypt fail)", cipher, blocksize * 8);
+      return -1;
     }
 
-  /* Pass 5: Large buffer test.  */
-  memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF_"
-               "0123456789abcdef.,;/[]{}-=ABCDEF", 64);
-  memcpy (plain, "foobar42FOOBAR17", 16);
-  for (i = 16; i < large_buffer_size; i += 16)
+  if (memcmp (plaintext2, plaintext, blocksize))
     {
-      memcpy (&plain[i], &plain[i-16], 16);
-      if (!++plain[i+7])
-       plain[i+6]++;
-      if (!++plain[i+15])
-       plain[i+14]++;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (plaintext mismatch)",
+            cipher, blocksize * 8);
+      return -1;
     }
 
-  if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
-                            large_buffer_size, bufshift,
-                            50))
-    goto out;
+  err = gcry_cipher_ctl (hd_ctr, PRIV_CIPHERCTL_GET_COUNTER, getctrbuf,
+                        blocksize + 1);
+  if (err || getctrbuf[0] != blocksize)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (CTR getctr fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp(getctrbuf + 1, iv, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (IV mismatch)", cipher, blocksize * 8);
+      return -1;
+    }
 
-  /* Pass 6: Counter overflow tests for ChaCha20 and CTR mode. */
-  if (mode == GCRY_CIPHER_MODE_STREAM && algo == GCRY_CIPHER_CHACHA20)
+  /* Test bulk encryption with typical IV. */
+  memset(iv, 0x57, blocksize-4);
+  iv[blocksize-1] = 1;
+  iv[blocksize-2] = 0;
+  iv[blocksize-3] = 0;
+  iv[blocksize-4] = 0;
+  memset(iv2, 0x57, blocksize-4);
+  iv2[blocksize-1] = 1;
+  iv2[blocksize-2] = 0;
+  iv2[blocksize-3] = 0;
+  iv2[blocksize-4] = 0;
+
+  for (i = 0; i < blocksize * nblocks; i++)
+    plaintext2[i] = plaintext[i] = i;
+
+  /* Create CTR ciphertext manually.  */
+  for (i = 0; i < blocksize * nblocks; i+=blocksize)
     {
-      /* 32bit overflow test (little-endian counter) */
-      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
-                                         medium_buffer_size, 0UL,
-                                         0, 60))
-       goto out;
-      /* 64bit overflow test (little-endian counter) */
-      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
-                                         medium_buffer_size, 0xffffffffUL,
-                                         0, 61))
-       goto out;
+      err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+                                 iv, blocksize);
+      if (err)
+        {
+          gcry_cipher_close (hd_one);
+          gcry_cipher_close (hd_ctr);
+          xfree(mem);
+          fail ("%s-CTR-%d test failed (ECB encrypt fail)",
+                cipher, blocksize * 8);
+          return -1;
+        }
+      for (j = 0; j < blocksize; j++)
+        ciphertext[i+j] ^= plaintext[i+j];
+      for (j = blocksize; j > 0; j--)
+        {
+          iv[j-1]++;
+          if (iv[j-1])
+            break;
+        }
     }
-   else if (mode == GCRY_CIPHER_MODE_CTR)
+
+  err = gcry_cipher_setctr (hd_ctr, iv2, blocksize);
+  if (err)
     {
-      /* 32bit overflow test (big-endian counter) */
-      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
-                                         medium_buffer_size, 0UL,
-                                         1, 62))
-       goto out;
-      /* 64bit overflow test (big-endian counter) */
-      if (check_one_cipher_ctr_overflow (algo, mode, flags, key, 64, plain,
-                                         medium_buffer_size, 0xffffffffUL,
-                                         1, 63))
-       goto out;
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (setiv fail)", cipher, blocksize * 8);
+      return -1;
+    }
+  err = gcry_cipher_encrypt (hd_ctr, ciphertext2, blocksize * nblocks,
+                             plaintext2, blocksize * nblocks);
+  if (err)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (CTR encrypt fail)", cipher, blocksize * 8);
+      return -1;
     }
 
-out:
-  free (plain);
+  if (memcmp (ciphertext2, ciphertext, blocksize * nblocks))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (ciphertext mismatch, bulk)",
+            cipher, blocksize * 8);
+      return -1;
+    }
+
+  err = gcry_cipher_ctl (hd_ctr, PRIV_CIPHERCTL_GET_COUNTER, getctrbuf,
+                        blocksize + 1);
+  if (err || getctrbuf[0] != blocksize)
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (CTR getctr fail, bulk)",
+           cipher, blocksize * 8);
+      return -1;
+    }
+  if (memcmp(getctrbuf + 1, iv, blocksize))
+    {
+      gcry_cipher_close (hd_one);
+      gcry_cipher_close (hd_ctr);
+      xfree(mem);
+      fail ("%s-CTR-%d test failed (IV mismatch, bulk)", cipher, blocksize * 8);
+      return -1;
+    }
+
+  /* Test parallelized code paths (check counter overflow handling) */
+  for (diff = 0; diff < nblocks; diff++)
+    {
+      memset(iv, 0xff, blocksize);
+      iv[blocksize-1] -= diff;
+      iv[0] = iv[1] = 0;
+      iv[2] = 0x07;
+
+      for (i = 0; i < blocksize * nblocks; i++)
+       plaintext[i] = i;
+
+      /* Create CTR ciphertext manually.  */
+      for (i = 0; i < blocksize * nblocks; i+=blocksize)
+       {
+         err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+                                   iv, blocksize);
+         if (err)
+           {
+             gcry_cipher_close (hd_one);
+             gcry_cipher_close (hd_ctr);
+             xfree(mem);
+             fail ("%s-CTR-%d test failed (ECB encrypt fail)",
+                   cipher, blocksize * 8);
+             return -1;
+           }
+         for (j = 0; j < blocksize; j++)
+           ciphertext[i+j] ^= plaintext[i+j];
+         for (j = blocksize; j > 0; j--)
+           {
+             iv[j-1]++;
+             if (iv[j-1])
+               break;
+           }
+       }
+
+      /* Decrypt using bulk CTR and compare result.  */
+      memset(iv2, 0xff, blocksize);
+      iv2[blocksize-1] -= diff;
+      iv2[0] = iv2[1] = 0;
+      iv2[2] = 0x07;
+
+      err = gcry_cipher_setctr (hd_ctr, iv2, blocksize);
+      if (err)
+       {
+         gcry_cipher_close (hd_one);
+         gcry_cipher_close (hd_ctr);
+         xfree(mem);
+         fail ("%s-CTR-%d test failed (setiv fail)", cipher, blocksize * 8);
+         return -1;
+       }
+      err = gcry_cipher_decrypt (hd_ctr, plaintext2, blocksize * nblocks,
+                                ciphertext, blocksize * nblocks);
+      if (err)
+       {
+         gcry_cipher_close (hd_one);
+         gcry_cipher_close (hd_ctr);
+         xfree(mem);
+         fail ("%s-CTR-%d test failed (CTR decrypt fail)", cipher, blocksize * 8);
+         return -1;
+       }
+
+      if (memcmp (plaintext2, plaintext, blocksize * nblocks))
+       {
+         gcry_cipher_close (hd_one);
+         gcry_cipher_close (hd_ctr);
+         xfree(mem);
+         fail ("%s-CTR-%d test failed (plaintext mismatch, diff: %d)",
+               cipher, blocksize * 8, diff);
+         return -1;
+       }
+
+      err = gcry_cipher_ctl (hd_ctr, PRIV_CIPHERCTL_GET_COUNTER, getctrbuf,
+                            blocksize + 1);
+      if (err || getctrbuf[0] != blocksize)
+        {
+          gcry_cipher_close (hd_one);
+          gcry_cipher_close (hd_ctr);
+          xfree(mem);
+          fail ("%s-CTR-%d test failed (CTR getctr fail, diff: %d)",
+               cipher, blocksize * 8, diff);
+          return -1;
+        }
+      if (memcmp(getctrbuf + 1, iv, blocksize))
+       {
+         gcry_cipher_close (hd_one);
+         gcry_cipher_close (hd_ctr);
+         xfree(mem);
+         fail ("%s-CTR-%d test failed (IV mismatch, diff: %d)",
+               cipher, blocksize * 8, diff);
+         return -1;
+       }
+    }
+
+  gcry_cipher_close (hd_one);
+  gcry_cipher_close (hd_ctr);
+  xfree(mem);
+  return -1;
 }
 
 
@@ -11716,6 +13429,11 @@ check_ciphers (void)
 #endif
 #if USE_SM4
     GCRY_CIPHER_SM4,
+#endif
+#if USE_ARIA
+    GCRY_CIPHER_ARIA128,
+    GCRY_CIPHER_ARIA192,
+    GCRY_CIPHER_ARIA256,
 #endif
     0
   };
@@ -11780,6 +13498,13 @@ check_ciphers (void)
         check_one_cipher (algos[i], GCRY_CIPHER_MODE_OCB, 0);
       if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_XTS_BLOCK_LEN)
         check_one_cipher (algos[i], GCRY_CIPHER_MODE_XTS, 0);
+
+      if (gcry_cipher_get_algo_blklen (algos[i]) >= 8)
+        {
+          cipher_cbc_bulk_test (algos[i]);
+          cipher_cfb_bulk_test (algos[i]);
+          cipher_ctr_bulk_test (algos[i]);
+        }
     }
 
   for (i = 0; algos2[i]; i++)
@@ -11862,7 +13587,7 @@ fillbuf_count (char *buf, size_t buflen, unsigned char pos)
 
 static void
 check_one_md (int algo, const char *data, int len, const char *expect, int elen,
-             const char *key, int klen)
+             const char *key, int klen, const char *n, const char *s)
 {
   gcry_md_hd_t hd, hd2;
   unsigned char *p;
@@ -11879,18 +13604,17 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen,
     }
 
   mdlen = gcry_md_get_algo_dlen (algo);
-  if (mdlen < 1 || mdlen > 500)
+  if (elen != 0 && mdlen != elen
+      && (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256
+          || algo == GCRY_MD_CSHAKE128 || algo == GCRY_MD_CSHAKE256))
     {
-      if (mdlen == 0 && (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256))
-        {
-          xof = 1;
-        }
-      else
-        {
-         gcry_md_close (hd);
-          fail ("algo %d, gcry_md_get_algo_dlen failed: %d\n", algo, mdlen);
-          return;
-        }
+      xof = 1;
+    }
+  else if (mdlen < 1 || mdlen > 500)
+    {
+      gcry_md_close (hd);
+      fail ("algo %d, gcry_md_get_algo_dlen failed: %d\n", algo, mdlen);
+      return;
     }
 
   if (key && klen)
@@ -11968,6 +13692,23 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen,
       gcry_md_reset (hd);
    }
 
+  if (n && s)
+    {
+      struct gcry_cshake_customization custom;
+
+      custom.n = n;
+      custom.n_len = strlen (n);
+      custom.s = s;
+      custom.s_len = strlen (s);
+
+      err = gcry_md_ctl (hd, GCRYCTL_MD_CUSTOMIZE, &custom, sizeof (custom));
+      if (err)
+       {
+         fail ("algo %d, gcry_md_ctl failed: %s\n", algo, gpg_strerror (err));
+         return;
+       }
+    }
+
   if ((*data == '!' && !data[1]) || /* hash one million times a "a" */
       (*data == '?' && !data[1]))   /* hash million byte data-set with byte pattern 0x00,0x01,0x02,... */
     {
@@ -12217,10 +13958,11 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen,
 
 
 static void
-check_one_md_multi (int algo, const char *data, int len, const char *expect)
+check_one_md_multi (int algo, const char *data, int len, const char *expect,
+                   int elen, const char *n, const char *s)
 {
   gpg_error_t err;
-  gcry_buffer_t iov[3];
+  gcry_buffer_t iov[5];
   int iovcnt;
   char digest[64];
   int mdlen;
@@ -12229,14 +13971,16 @@ check_one_md_multi (int algo, const char *data, int len, const char *expect)
   mdlen = gcry_md_get_algo_dlen (algo);
   if (mdlen < 1 || mdlen > 64)
     {
-      if (mdlen == 0 && (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256))
-        return;
-
       fail ("check_one_md_multi: algo %d, gcry_md_get_algo_dlen failed: %d\n",
             algo, mdlen);
       return;
     }
 
+  if (elen != 0 && elen != mdlen
+      && (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256
+          || algo == GCRY_MD_CSHAKE128 || algo == GCRY_MD_CSHAKE256))
+    return;
+
   if (*data == '!' && !data[1])
     return;  /* We can't do that here.  */
   if (*data == '?' && !data[1])
@@ -12244,14 +13988,24 @@ check_one_md_multi (int algo, const char *data, int len, const char *expect)
 
   memset (iov, 0, sizeof iov);
 
-  iov[0].data = (void*)data;
+  iovcnt = 0;
+  if (n && s)
+    {
+      iov[iovcnt].data = (void *)n;
+      iov[iovcnt].len = strlen (n);
+      iovcnt++;
+      iov[iovcnt].data = (void *)s;
+      iov[iovcnt].len = strlen (s);
+      iovcnt++;
+    }
+  iov[iovcnt].data = (void*)data;
   if (len)
     {
-      iov[0].len = 1;
+      iov[iovcnt].len = 1;
       len--;
       data++;
     }
-  iovcnt = 1;
+  iovcnt++;
   if (len >= 4)
     {
       iov[iovcnt].data = (void*)data;
@@ -12378,6 +14132,8 @@ check_digests (void)
     int expectlen;
     const char *key;
     int keylen;
+    const char *n;
+    const char *s;
   } algos[] =
     {
       { GCRY_MD_MD2, "",
@@ -12402,14 +14158,14 @@ check_digests (void)
        "\xF9\x6B\x69\x7D\x7C\xB7\x93\x8D\x52\x5A\x2F\x31\xAA\xF1\x61\xD0" },
       { GCRY_MD_MD5,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\xc4\x1a\x5c\x0b\x44\x5f\xba\x1a\xda\xbc\xc0\x38\x0e\x0c\x9e\x33" },
+        "\xea\x83\x72\xa0\x63\xa1\x37\xf5\xde\xb1\xc8\x29\x5f\xe7\xb4\xed" },
       { GCRY_MD_MD5, "!",
         "\x77\x07\xd6\xae\x4e\x02\x7c\x70\xee\xa2\xa9\x35\xc2\x29\x6f\x21" },
       { GCRY_MD_MD5, "?",
@@ -12429,15 +14185,15 @@ check_digests (void)
        "\x79\xec\x97\x3b\x98\x4c\x94\x75\xaa\x8f" },
       { GCRY_MD_SHA1,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\xf5\xd9\xcb\x66\x91\xb4\x7a\x7c\x60\x35\xe2\x1c\x38\x26\x52\x13"
-       "\x8e\xd5\xe5\xdf" },
+        "\x45\x68\x60\x0b\xbb\xbf\xc7\x6a\xc3\x8f\xd6\x0b\x9f\x0f\x54\x23"
+        "\x7f\xb8\xa4\x78" },
       /* From RFC3874 */
       {        GCRY_MD_SHA224, "abc",
        "\x23\x09\x7d\x22\x34\x05\xd8\x22\x86\x42\xa4\x77\xbd\xa2\x55\xb3"
@@ -12454,15 +14210,15 @@ check_digests (void)
        "\x3b\xca\x70\x78\xf2\x44\xdf\x62\xab\x27\xb8\xda" },
       { GCRY_MD_SHA224,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x80\xf0\x60\x79\xb0\xe9\x65\xab\x8a\x76\xbf\x6e\x88\x64\x75\xe7"
-       "\xfd\xf0\xc2\x4c\xf6\xf2\xa6\x01\xed\x50\x71\x08" },
+        "\x68\x9d\x4e\x49\x07\xb9\x97\xc8\x29\x09\x35\xb7\xfe\xbe\x49\x84"
+        "\xf5\xb2\xc2\x05\x4f\xe3\xa0\x56\xbb\xd5\x9e\x7c" },
       {        GCRY_MD_SHA256, "abc",
        "\xba\x78\x16\xbf\x8f\x01\xcf\xea\x41\x41\x40\xde\x5d\xae\x22\x23"
        "\xb0\x03\x61\xa3\x96\x17\x7a\x9c\xb4\x10\xff\x61\xf2\x00\x15\xad" },
@@ -12478,31 +14234,31 @@ check_digests (void)
        "\xd2\x07\xf9\x3f\xc3\xdf\x04\xd7\x57\x2e\x63\x65\xaf\x69\xcd\x0d" },
       { GCRY_MD_SHA256,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\xb0\x18\x70\x67\xb8\xac\x68\x50\xec\x95\x43\x77\xb5\x44\x5b\x0f"
-       "\x2e\xbd\x40\xc9\xdc\x2a\x2c\x33\x8b\x53\xeb\x3e\x9e\x01\xd7\x02" },
+        "\x41\xc6\xe4\x76\x9d\xb0\x7f\xa5\xca\x31\x20\x8c\x5c\x6e\x72\xde"
+        "\x6f\x18\x41\x38\x66\x7c\x17\x5a\xc6\xbf\xcc\x0e\xfb\xd3\x0c\x71" },
       {        GCRY_MD_SHA384, "abc",
        "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07"
        "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed"
        "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7" },
       { GCRY_MD_SHA384,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\xe4\x6d\xb4\x28\x33\x77\x99\x49\x94\x0f\xcf\x87\xc2\x2f\x30\xd6"
-       "\x06\x24\x82\x9d\x80\x64\x8a\x07\xa1\x20\x8f\x5f\xf3\x85\xb3\xaa"
-       "\x39\xb8\x61\x00\xfc\x7f\x18\xc6\x82\x23\x4b\x45\xfa\xf1\xbc\x69" },
+        "\xb5\xa0\x42\x50\x0c\x30\x59\x8b\x65\x55\xbc\xa6\x77\x7b\x13\x84"
+        "\x3d\xf7\xbc\x59\x26\x5c\xba\x83\x29\x3a\x27\xe1\x40\x64\xed\xa1"
+        "\x5d\x75\xaa\xe5\x66\x24\x25\xe1\x67\x4a\x8c\x59\x79\x55\x11\x30" },
       { GCRY_MD_SHA384, "!",
         "\x9d\x0e\x18\x09\x71\x64\x74\xcb\x08\x6e\x83\x4e\x31\x0a\x4a\x1c"
         "\xed\x14\x9e\x9c\x00\xf2\x48\x52\x79\x72\xce\xc5\x70\x4c\x2a\x5b"
@@ -12518,17 +14274,17 @@ check_digests (void)
        "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F" },
       { GCRY_MD_SHA512,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x72\x8c\xde\xd8\xe4\xd7\xb6\xa5\x0f\xde\x6b\x4d\x33\xaf\x15\x19"
-       "\xdd\xec\x62\x0f\xf7\x1a\x1e\x10\x32\x05\x02\xa6\xb0\x1f\x70\x37"
-       "\xbc\xd7\x15\xed\x71\x6c\x78\x20\xc8\x54\x87\xd0\x66\x6a\x17\x83"
-       "\x05\x61\x92\xbe\xcc\x8f\x3b\xbf\x11\x72\x22\x69\x23\x5b\x48\x5c" },
+        "\x82\x3e\xde\x39\xfe\xc7\xec\x80\x20\xf2\xea\xf1\x18\x08\x87\x22"
+        "\x04\x14\xd6\xf6\xfa\xac\x9a\x64\xc8\xb2\xd3\xca\x10\x49\x78\x9f"
+        "\xcc\x1f\x68\xb3\xed\x12\x14\x46\x1b\x80\xdf\xa5\xa2\x5f\x69\x8b"
+        "\x95\xeb\xb8\x82\x68\x2d\x7e\xf9\xea\x08\x2b\x5e\x6f\x0e\x5b\x52" },
       { GCRY_MD_SHA512, "!",
         "\xe7\x18\x48\x3d\x0c\xe7\x69\x64\x4e\x2e\x42\xc7\xbc\x15\xb4\x63"
         "\x8e\x1f\x98\xb1\x3b\x20\x44\x28\x56\x32\xa8\x03\xaf\xa9\x73\xeb"
@@ -12663,15 +14419,15 @@ check_digests (void)
        "\x81\xb1\x23\xa8\x5f\xfa\x21\x59\x5f\x36" },
       { GCRY_MD_RMD160,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x06\x6d\x3c\x4e\xc9\xba\x89\x75\x16\x90\x96\x4e\xfd\x43\x07\xde"
-       "\x04\xca\x69\x6b" },
+        "\x8f\x27\x30\xf6\x1e\x08\x48\xde\xa3\xd2\x37\xdd\xaa\x4f\xb4\xa4"
+        "\xe4\xc6\xb6\x97" },
       { GCRY_MD_RMD160, "!",
         "\x52\x78\x32\x43\xc1\x69\x7b\xdb\xe1\x6d\x37\xf9\x7f\x68\xf0\x83"
         "\x25\xdc\x15\x28" },
@@ -12682,14 +14438,14 @@ check_digests (void)
       {        GCRY_MD_CRC32, "foo", "\x8c\x73\x65\x21" },
       { GCRY_MD_CRC32,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x4A\x53\x7D\x67" },
+        "\x83\xf6\x3f\x98" },
       { GCRY_MD_CRC32, "123456789", "\xcb\xf4\x39\x26" },
       { GCRY_MD_CRC32, "!", "\xdc\x25\xbf\xbc" },
       { GCRY_MD_CRC32, "?", "\x61\x82\x29\x1B" },
@@ -12784,15 +14540,15 @@ check_digests (void)
         "\xD3\x63\x02\xE9\xB3\xCE\xE0\xD2\xBC\x31\x4B\x41" },
       { GCRY_MD_TIGER1,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x60\xee\xdf\x95\x39\xc8\x44\x94\x64\xdc\xdf\x3d\x2e\x1c\xe5\x79"
-       "\x6a\x95\xbd\x30\x68\x8c\x7e\xb8" },
+        "\x19\x3e\x11\x06\x51\x9a\x77\xf2\xe6\xf8\x4a\x62\x60\x0e\xc0\x65"
+        "\xfa\x15\x9c\xe6\x5c\xb8\x8b\xde" },
       {        GCRY_MD_TIGER1, "?",
        "\x4b\xe2\x3f\x23\xf5\x34\xbe\xbf\x97\x42\x95\x80"
        "\x54\xe4\x6c\x12\x64\x85\x44\x0a\xa9\x49\x9b\x65" },
@@ -12859,17 +14615,17 @@ check_digests (void)
         "\x8C\x60\x08\xAD\x67\x7F\x77\x12\x69\x53\xB2\x26\xE4\xED\x8B\x01" },
       { GCRY_MD_WHIRLPOOL,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\xcd\x4a\xa4\xaf\xf6\x7f\xec\xce\xbb\x6c\xdf\x91\x96\xe1\xf3\xf6"
-       "\x78\xe2\x8e\x3a\x76\xcf\x06\xc7\xa1\x20\x7b\x81\x32\x60\xf7\x8e"
-       "\x68\x19\x62\x33\x4f\xe5\x0a\x24\xfb\x9e\x74\x03\x74\xe4\x61\x29"
-       "\x6f\xb3\x13\xe6\x7e\xc2\x88\x99\x9e\xfb\xe7\x9d\x11\x30\x89\xd2" },
+        "\x01\x3a\x9d\xba\x05\x60\xcd\xa7\x82\xbc\xc1\x51\x54\xeb\x61\x11"
+        "\xe3\x63\x8b\xf2\x92\x4f\xa4\xfc\xdb\xd3\x2d\x1f\x5f\xd0\x9f\x90"
+        "\x5d\xa1\x92\xa7\xaa\xe0\x31\x13\x23\x6c\x10\xf9\x13\xe1\x79\xd7"
+        "\xc3\x92\x65\xb6\x9a\x90\xc0\x10\x1a\x8f\xc4\x27\x71\xdb\x7b\x7d" },
       { GCRY_MD_GOSTR3411_94,
        "This is message, length=32 bytes",
        "\xB1\xC4\x66\xD3\x75\x19\xB8\x2E\x83\x19\x81\x9F\xF3\x25\x95\xE0"
@@ -12888,15 +14644,15 @@ check_digests (void)
        "\xA7\xDB\xAF\x0E\x7E\xA7\x4E\x9F\xA6\x02\x41\x3C\x90\xA1\x29\xFA" },
       { GCRY_MD_GOSTR3411_94,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x00\x0c\x85\xc8\x54\xd2\x9a\x6e\x47\x2e\xff\xa4\xa2\xe7\xd0\x2e"
-       "\x8a\xcc\x14\x53\xb4\x87\xc8\x5c\x95\x9a\x3e\x85\x8c\x7d\x6e\x0c" },
+        "\xa3\x6a\xcf\xb1\x63\x5e\xa7\x08\xf6\x63\x80\xe6\x1b\x82\xa4\xf5"
+        "\x9d\xf8\x6a\xf7\x61\xee\xd8\xfb\xc0\x5b\x0b\x7e\xe7\x87\x0c\x20" },
       { GCRY_MD_STRIBOG512,
         "012345678901234567890123456789012345678901234567890123456789012",
         "\x1b\x54\xd0\x1a\x4a\xf5\xb9\xd5\xcc\x3d\x86\xd6\x8d\x28\x54\x62"
@@ -12991,6 +14747,11 @@ check_digests (void)
        "\x43\xE4\x1B\x45\xA6\x53\xF2\xA5\xC4\x49\x2C\x1A\xDD\x54\x45\x12"
        "\xDD\xA2\x52\x98\x33\x46\x2B\x71\xA4\x1A\x45\xBE\x97\x29\x0B\x6F",
        0, 512, },
+      { GCRY_MD_SHAKE128,
+       "",
+       "\x7F\x9C\x2B\xA4\xE8\x8F\x82\x7D\x61\x60\x45\x50\x76\x05\x85\x3E"
+       "\xD7\x3B\x80\x93\xF6\xEF\xBC\x88\xEB\x1A\x6E\xAC\xFA\x66\xEF\x26",
+       0, 0, /* test md_read interface */ },
       { GCRY_MD_SHAKE128,
        "\x5A\xAB\x62\x75\x6D\x30\x7A\x66\x9D\x14\x6A\xBA\x98\x8D\x90\x74"
        "\xC5\xA1\x59\xB3\xDE\x85\x15\x1A\x81\x9B\x11\x7C\xA1\xFF\x65\x97"
@@ -13109,6 +14870,13 @@ check_digests (void)
        "\xAB\x0B\xAE\x31\x63\x39\x89\x43\x04\xE3\x58\x77\xB0\xC2\x8A\x9B"
        "\x1F\xD1\x66\xC7\x96\xB9\xCC\x25\x8A\x06\x4A\x8F\x57\xE2\x7F\x2A",
        0, 512, },
+      { GCRY_MD_SHAKE256,
+       "",
+       "\x46\xB9\xDD\x2B\x0B\xA8\x8D\x13\x23\x3B\x3F\xEB\x74\x3E\xEB\x24"
+       "\x3F\xCD\x52\xEA\x62\xB8\x1B\x82\xB5\x0C\x27\x64\x6E\xD5\x76\x2F"
+       "\xD7\x5D\xC4\xDD\xD8\xC0\xF2\x00\xCB\x05\x01\x9D\x67\xB5\x92\xF6"
+       "\xFC\x82\x1C\x49\x47\x9A\xB4\x86\x40\x29\x2E\xAC\xB3\xB7\xC4\xBE",
+       0, 0, /* test md_read interface */ },
       { GCRY_MD_SHAKE256,
        "\xB3\x2D\x95\xB0\xB9\xAA\xD2\xA8\x81\x6D\xE6\xD0\x6D\x1F\x86\x00"
        "\x85\x05\xBD\x8C\x14\x12\x4F\x6E\x9A\x16\x3B\x5A\x2A\xDE\x55\xF8"
@@ -13187,6 +14955,70 @@ check_digests (void)
        "\x1b\xeb\x65\x53\xf2\x81\xfa\x75\x69\x48\xc4\x38\x49\x4b\x19\xb4"
        "\xee\x69\xa5\x43\x6b\x22\x2b\xc9\x88\xed\xa4\xac\x60\x00\x24\xc9",
        0, 512, },
+      { GCRY_MD_CSHAKE128,
+       "\x00\x01\x02\x03",
+       "\xC1\xC3\x69\x25\xB6\x40\x9A\x04\xF1\xB5\x04\xFC\xBC\xA9\xD8\x2B"
+       "\x40\x17\x27\x7C\xB5\xED\x2B\x20\x65\xFC\x1D\x38\x14\xD5\xAA\xF5",
+       4,
+       32,
+       NULL, 0,
+       "",
+       "Email Signature" },
+      { GCRY_MD_CSHAKE128,
+       "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
+       "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
+       "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F"
+       "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F"
+       "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F"
+       "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F"
+       "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F"
+       "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F"
+       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
+       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
+       "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
+       "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
+       "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7",
+       "\xC5\x22\x1D\x50\xE4\xF8\x22\xD9\x6A\x2E\x88\x81\xA9\x61\x42\x0F"
+       "\x29\x4B\x7B\x24\xFE\x3D\x20\x94\xBA\xED\x2C\x65\x24\xCC\x16\x6B",
+       200,
+       32,
+       NULL, 0,
+       "",
+       "Email Signature" },
+      { GCRY_MD_CSHAKE256,
+       "\x00\x01\x02\x03",
+       "\xD0\x08\x82\x8E\x2B\x80\xAC\x9D\x22\x18\xFF\xEE\x1D\x07\x0C\x48"
+       "\xB8\xE4\xC8\x7B\xFF\x32\xC9\x69\x9D\x5B\x68\x96\xEE\xE0\xED\xD1"
+       "\x64\x02\x0E\x2B\xE0\x56\x08\x58\xD9\xC0\x0C\x03\x7E\x34\xA9\x69"
+       "\x37\xC5\x61\xA7\x4C\x41\x2B\xB4\xC7\x46\x46\x95\x27\x28\x1C\x8C",
+       4,
+       64,
+       NULL, 0,
+       "",
+       "Email Signature" },
+      { GCRY_MD_CSHAKE256,
+       "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
+       "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
+       "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F"
+       "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F"
+       "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F"
+       "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F"
+       "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F"
+       "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F"
+       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
+       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
+       "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
+       "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
+       "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7",
+       "\x07\xDC\x27\xB1\x1E\x51\xFB\xAC\x75\xBC\x7B\x3C\x1D\x98\x3E\x8B"
+       "\x4B\x85\xFB\x1D\xEF\xAF\x21\x89\x12\xAC\x86\x43\x02\x73\x09\x17"
+       "\x27\xF4\x2B\x17\xED\x1D\xF6\x3E\x8E\xC1\x18\xF0\x4B\x23\x63\x3C"
+       "\x1D\xFB\x15\x74\xC8\xFB\x55\xCB\x45\xDA\x8E\x25\xAF\xB0\x92\xBB",
+       200,
+       64,
+       NULL, 0,
+       "",
+       "Email Signature" },
       { GCRY_MD_BLAKE2B_512, "abc",
        "\xBA\x80\xA5\x3F\x98\x1C\x4D\x0D\x6A\x27\x97\xB6\x9F\x12\xF6\xE9"
        "\x4C\x21\x2F\x14\x68\x5A\xC4\xB7\x4B\x12\xBB\x6F\xDB\xFF\xA2\xD1"
@@ -13483,15 +15315,15 @@ check_digests (void)
        "\xbc\x5d\x5e\x94\xea\x08\x86\x3d\xfb\xe4\x00\x5a\xd9\xed\x79\x26" },
       { GCRY_MD_SM3,
        "Libgcrypt is free software; you can redistribute it and/or modif"
-       "y it under the terms of the GNU Lesser general Public License as"
+       "y it under the terms of the GNU Lesser General Public License as"
        " published by the Free Software Foundation; either version 2.1 o"
        "f the License, or (at your option) any later version.\nLibgcrypt"
        " is distributed in the hope that it will be useful, but WITHOUT "
        "ANY WARRANTY; without even the implied warranty of MERCHANTABILI"
        "TY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser Gene"
        "ral Public License for more details.",
-       "\x8b\x91\x3f\x0e\x85\xae\x43\x25\x6d\x28\x38\x6c\x09\x5c\xc7\x72"
-       "\xcc\x2e\x78\x89\x7e\x2e\x4e\x5a\x3d\xf6\x55\xfe\x87\xbe\xa6\xbc" },
+        "\xa5\xf5\x75\x8f\x35\x81\xd2\xf8\x50\x4f\x70\x4d\x13\x97\xec\xbb"
+        "\x4a\x36\x15\x82\xa4\x89\x70\x79\xf3\x22\xda\xb5\xab\x52\x6b\x8a" },
 #endif /* USE_SM3 */
 
       { GCRY_MD_GOSTR3411_CP,
@@ -13683,7 +15515,8 @@ check_digests (void)
                    algos[i].datalen > 0 ? algos[i].datalen
                                         : strlen (algos[i].data),
                    algos[i].expect, algos[i].expectlen,
-                   algos[i].key, algos[i].keylen);
+                   algos[i].key, algos[i].keylen,
+                   algos[i].n, algos[i].s);
 
       if (algos[i].key && algos[i].keylen)
        continue;
@@ -13691,7 +15524,8 @@ check_digests (void)
       check_one_md_multi (algos[i].md, algos[i].data,
                          algos[i].datalen > 0 ? algos[i].datalen
                                               : strlen (algos[i].data),
-                         algos[i].expect);
+                         algos[i].expect, algos[i].expectlen,
+                         algos[i].n, algos[i].s);
     }
 
   /* Check the Whirlpool bug emulation.  */
@@ -15607,40 +17441,35 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
        NULL,
        0,
        0,
-       0,
-       FLAG_NOFIPS },
+       0 },
       {        GCRY_PK_RSA,
         "(data\n (flags pkcs1)\n"
        " (value #11223344556677889900AA#))\n",
        "(flags pkcs1)",
        1,
        0,
-       0,
-       FLAG_NOFIPS },
+       0 },
       { GCRY_PK_RSA,
         "(data\n (flags oaep)\n"
        " (value #11223344556677889900AA#))\n",
        "(flags oaep)",
        1,
        0,
-       0,
-       FLAG_NOFIPS },
+       0 },
       { GCRY_PK_RSA,
         "(data\n (flags oaep)\n (hash-algo sha1)\n"
        " (value #11223344556677889900AA#))\n",
        "(flags oaep)(hash-algo sha1)",
        1,
        0,
-       0,
-       FLAG_NOFIPS },
+       0 },
       { GCRY_PK_RSA,
         "(data\n (flags oaep)\n (hash-algo sha1)\n (label \"test\")\n"
        " (value #11223344556677889900AA#))\n",
        "(flags oaep)(hash-algo sha1)(label \"test\")",
        1,
        0,
-       0,
-       FLAG_NOFIPS },
+       0 },
       { GCRY_PK_RSA,
         "(data\n (flags oaep)\n (hash-algo sha1)\n (label \"test\")\n"
        " (value #11223344556677889900AA#)\n"
@@ -15648,8 +17477,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
        "(flags oaep)(hash-algo sha1)(label \"test\")",
        1,
        0,
-       0,
-       FLAG_NOFIPS },
+       0 },
       {        0,
         "(data\n (flags )\n" " (value #11223344556677889900AA#))\n",
        NULL,
@@ -15695,7 +17523,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
        "(flags pkcs1)",
        1,
        0,
-       GPG_ERR_ENCODING_PROBLEM, FLAG_SPECIAL | FLAG_NOFIPS },
+       GPG_ERR_ENCODING_PROBLEM, FLAG_SPECIAL },
       {        0,
         "(data\n (flags pss)\n"
        " (value #11223344556677889900AA#))\n",
@@ -15722,8 +17550,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
        die ("converting data failed: %s\n", gpg_strerror (rc));
 
       rc = gcry_pk_encrypt (&ciph, data, pkey);
-      if (in_fips_mode && ((flags & FLAG_NOFIPS) ||
-                           (datas[dataidx].flags & FLAG_NOFIPS)))
+      if (in_fips_mode && (flags & FLAG_NOFIPS))
         {
           if (!rc)
             fail ("gcry_pk_encrypt did not fail as expected in FIPS mode\n");
@@ -15772,7 +17599,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
              ciph = list;
            }
          rc = gcry_pk_decrypt (&plain, ciph, skey);
-          if ((!rc || in_fips_mode) && (datas[dataidx].flags & FLAG_SPECIAL))
+          if (!rc && (datas[dataidx].flags & FLAG_SPECIAL))
             {
               /* It may happen that OAEP formatted data which is
                  decrypted as pkcs#1 data returns a valid pkcs#1
@@ -16328,9 +18155,9 @@ check_pubkey (void)
               }
             else
               {
-                fail ("gcry_pk_test_algo failed: %s\n", gpg_strerror (err));
-                continue;
+                show_pk_not_available (pubkeys[i].id);
               }
+            continue;
           }
         check_one_pubkey (i, pubkeys[i]);
       }
@@ -16339,7 +18166,10 @@ check_pubkey (void)
 
   if (verbose)
     fprintf (stderr, "Starting additional public key check.\n");
-  check_one_pubkey_new (i);
+  if (gcry_pk_test_algo (GCRY_PK_RSA) == 0)
+    check_one_pubkey_new (i);
+  else
+    show_pk_not_available (GCRY_PK_RSA);
   if (verbose)
     fprintf (stderr, "Completed additional public key check.\n");
 
@@ -16469,6 +18299,7 @@ main (int argc, char **argv)
         {
           check_ciphers ();
           check_cipher_modes ();
+          check_bulk_cipher_modes ();
         }
       else if (hash_only)
         {
index 29fbcff9a1f982300e28f661abcc6de52c599c06..4b14541bc3dff3cb8b917542bffe6f6a2bb0c244 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -467,7 +467,7 @@ slope_benchmark (struct bench_obj *obj)
     goto err_free;
   /* Get aligned buffer */
   buffer = real_buffer;
-  buffer += 128 - ((real_buffer - (unsigned char *) 0) & (128 - 1));
+  buffer += 128 - ((uintptr_t)real_buffer & (128 - 1));
   if (unaligned_mode)
     buffer += unaligned_mode; /* Make buffer unaligned */
 
@@ -515,6 +515,8 @@ err_free:
 
 /********************************************* CPU frequency auto-detection. */
 
+static volatile size_t vone = 1;
+
 static int
 auto_ghz_init (struct bench_obj *obj)
 {
@@ -535,6 +537,9 @@ auto_ghz_free (struct bench_obj *obj)
 static void
 auto_ghz_bench (struct bench_obj *obj, void *buf, size_t buflen)
 {
+  size_t one = vone;
+  size_t two = one + vone;
+
   (void)obj;
   (void)buf;
 
@@ -544,20 +549,23 @@ auto_ghz_bench (struct bench_obj *obj, void *buf, size_t buflen)
    * function will give cycles/iteration result 1024.0 on high-end CPUs.
    * With turbo, result will be less and can be used detect turbo-clock. */
 
+  /* Auto-ghz operation takes two CPU cycles to perform. Variables are
+   * generated through volatile object and therefore compiler is unable
+   * to optimize these operations to immediate values. */
 #ifdef HAVE_GCC_ASM_VOLATILE_MEMORY
   /* Auto-ghz operation takes two CPU cycles to perform. Memory barriers
    * are used to prevent compiler from optimizing this loop away. */
   #define AUTO_GHZ_OPERATION \
-       asm volatile ("":"+r"(buflen)::"memory"); \
-       buflen ^= 1; \
-       asm volatile ("":"+r"(buflen)::"memory"); \
-       buflen -= 2
+       asm volatile ("":"+r"(buflen),"+r"(one),"+r"(two)::"memory"); \
+       buflen ^= one; \
+       asm volatile ("":"+r"(buflen),"+r"(one),"+r"(two)::"memory"); \
+       buflen -= two
 #else
   /* TODO: Needs alternative way of preventing compiler optimizations.
    *       Mix of XOR and subtraction appears to do the trick for now. */
   #define AUTO_GHZ_OPERATION \
-       buflen ^= 1; \
-       buflen -= 2
+       buflen ^= one; \
+       buflen -= two
 #endif
 
 #define AUTO_GHZ_OPERATION_2 \
@@ -982,6 +990,35 @@ struct bench_cipher_mode
 };
 
 
+static void
+bench_set_cipher_key (gcry_cipher_hd_t hd, int keylen)
+{
+  char *key;
+  int err, i;
+
+  key = malloc (keylen);
+  if (!key)
+    {
+      fprintf (stderr, PGM ": couldn't allocate %d bytes\n", keylen);
+      gcry_cipher_close (hd);
+      exit (1);
+    }
+
+  for (i = 0; i < keylen; i++)
+    key[i] = 0x33 ^ (11 - i);
+
+  err = gcry_cipher_setkey (hd, key, keylen);
+  free (key);
+  if (err)
+    {
+      fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n",
+                gpg_strerror (err));
+      gcry_cipher_close (hd);
+      exit (1);
+    }
+}
+
+
 static int
 bench_encrypt_init (struct bench_obj *obj)
 {
@@ -1010,20 +1047,7 @@ bench_encrypt_init (struct bench_obj *obj)
 
   if (keylen)
     {
-      char key[keylen];
-      int i;
-
-      for (i = 0; i < keylen; i++)
-       key[i] = 0x33 ^ (11 - i);
-
-      err = gcry_cipher_setkey (hd, key, keylen);
-      if (err)
-       {
-         fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n",
-                  gpg_strerror (err));
-         gcry_cipher_close (hd);
-         exit (1);
-       }
+      bench_set_cipher_key (hd, keylen);
     }
   else
     {
@@ -1119,20 +1143,7 @@ bench_xts_encrypt_init (struct bench_obj *obj)
   keylen = gcry_cipher_get_algo_keylen (mode->algo) * 2;
   if (keylen)
     {
-      char key[keylen];
-      int i;
-
-      for (i = 0; i < keylen; i++)
-       key[i] = 0x33 ^ (11 - i);
-
-      err = gcry_cipher_setkey (hd, key, keylen);
-      if (err)
-       {
-         fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n",
-                  gpg_strerror (err));
-         gcry_cipher_close (hd);
-         exit (1);
-       }
+      bench_set_cipher_key (hd, keylen);
     }
   else
     {
@@ -1989,7 +2000,9 @@ hash_bench (char **argv, int argc)
   else
     {
       for (i = 1; i < 400; i++)
-       if (!gcry_md_test_algo (i))
+        if (i == GCRY_MD_CSHAKE128 || i == GCRY_MD_CSHAKE256)
+          ; /* Skip the bench. */
+        else if (!gcry_md_test_algo (i))
          _hash_bench (i);
     }
 
@@ -2060,6 +2073,8 @@ bench_mac_init (struct bench_obj *obj)
     case GCRY_MAC_POLY1305_TWOFISH:
     case GCRY_MAC_POLY1305_SERPENT:
     case GCRY_MAC_POLY1305_SEED:
+    case GCRY_MAC_POLY1305_SM4:
+    case GCRY_MAC_POLY1305_ARIA:
       gcry_mac_setiv (hd, key, 16);
       break;
     }
@@ -2282,6 +2297,9 @@ kdf_bench (char **argv, int argc)
        {
          for (j = 1; j < 400; j++)
            {
+              if (j == GCRY_MD_CSHAKE128 || j == GCRY_MD_CSHAKE256)
+                continue; /* Skip the bench. */
+
              if (gcry_md_test_algo (j))
                continue;
 
@@ -2296,7 +2314,9 @@ kdf_bench (char **argv, int argc)
   else
     {
       for (i = 1; i < 400; i++)
-       if (!gcry_md_test_algo (i))
+        if (i == GCRY_MD_CSHAKE128 || i == GCRY_MD_CSHAKE256)
+          ; /* Skip the bench. */
+       else if (!gcry_md_test_algo (i))
          kdf_bench_one (GCRY_KDF_PBKDF2, i);
     }
 
@@ -2367,16 +2387,16 @@ ecc_algo_fips_allowed (int algo)
       case ECC_ALGO_NIST_P256:
       case ECC_ALGO_NIST_P384:
       case ECC_ALGO_NIST_P521:
-       return 1;
-      case ECC_ALGO_SECP256K1:
-      case ECC_ALGO_BRAINP256R1:
       case ECC_ALGO_ED25519:
       case ECC_ALGO_ED448:
+        return 1;
+      case ECC_ALGO_SECP256K1:
+      case ECC_ALGO_BRAINP256R1:
       case ECC_ALGO_X25519:
       case ECC_ALGO_X448:
       case ECC_ALGO_NIST_P192:
       default:
-       return 0;
+        return 0;
     }
 }
 
@@ -2931,13 +2951,310 @@ ecc_bench (char **argv, int argc)
 #endif
 }
 
+/************************************************************ MPI benchmarks. */
+
+#define MPI_START_SIZE 64
+#define MPI_END_SIZE 1024
+#define MPI_STEP_SIZE 8
+#define MPI_NUM_STEPS (((MPI_END_SIZE - MPI_START_SIZE) / MPI_STEP_SIZE) + 1)
+
+enum bench_mpi_test
+{
+  MPI_TEST_ADD = 0,
+  MPI_TEST_SUB,
+  MPI_TEST_RSHIFT3,
+  MPI_TEST_LSHIFT3,
+  MPI_TEST_RSHIFT65,
+  MPI_TEST_LSHIFT65,
+  MPI_TEST_MUL4,
+  MPI_TEST_MUL8,
+  MPI_TEST_MUL16,
+  MPI_TEST_MUL32,
+  MPI_TEST_DIV4,
+  MPI_TEST_DIV8,
+  MPI_TEST_DIV16,
+  MPI_TEST_DIV32,
+  MPI_TEST_MOD4,
+  MPI_TEST_MOD8,
+  MPI_TEST_MOD16,
+  MPI_TEST_MOD32,
+  __MAX_MPI_TEST
+};
+
+static const char * const mpi_test_names[] =
+{
+  "add",
+  "sub",
+  "rshift3",
+  "lshift3",
+  "rshift65",
+  "lshift65",
+  "mul4",
+  "mul8",
+  "mul16",
+  "mul32",
+  "div4",
+  "div8",
+  "div16",
+  "div32",
+  "mod4",
+  "mod8",
+  "mod16",
+  "mod32",
+  NULL,
+};
+
+struct bench_mpi_mode
+{
+  const char *name;
+  struct bench_ops *ops;
+
+  enum bench_mpi_test test_id;
+};
+
+struct bench_mpi_hd
+{
+  gcry_mpi_t bytes[MPI_NUM_STEPS + 1];
+  gcry_mpi_t y;
+};
+
+static int
+bench_mpi_init (struct bench_obj *obj)
+{
+  struct bench_mpi_mode *mode = obj->priv;
+  struct bench_mpi_hd *hd;
+  int y_bytes;
+  int i, j;
+
+  (void)mode;
+
+  obj->min_bufsize = MPI_START_SIZE;
+  obj->max_bufsize = MPI_END_SIZE;
+  obj->step_size = MPI_STEP_SIZE;
+  obj->num_measure_repetitions = num_measurement_repetitions;
+
+  hd = calloc (1, sizeof(*hd));
+  if (!hd)
+    return -1;
+
+  /* Generate input MPIs for benchmark. */
+  for (i = MPI_START_SIZE, j = 0; j < DIM(hd->bytes); i += MPI_STEP_SIZE, j++)
+    {
+      hd->bytes[j] = gcry_mpi_new (i * 8);
+      gcry_mpi_randomize (hd->bytes[j], i * 8, GCRY_WEAK_RANDOM);
+      gcry_mpi_set_bit (hd->bytes[j], i * 8 - 1);
+    }
+
+  switch (mode->test_id)
+    {
+      case MPI_TEST_MUL4:
+      case MPI_TEST_DIV4:
+      case MPI_TEST_MOD4:
+       y_bytes = 4;
+       break;
+
+      case MPI_TEST_MUL8:
+      case MPI_TEST_DIV8:
+      case MPI_TEST_MOD8:
+       y_bytes = 8;
+       break;
+
+      case MPI_TEST_MUL16:
+      case MPI_TEST_DIV16:
+      case MPI_TEST_MOD16:
+       y_bytes = 16;
+       break;
+
+      case MPI_TEST_MUL32:
+      case MPI_TEST_DIV32:
+      case MPI_TEST_MOD32:
+       y_bytes = 32;
+       break;
+
+      default:
+       y_bytes = 0;
+       break;
+    }
+
+  hd->y = gcry_mpi_new (y_bytes * 8);
+  if (y_bytes)
+    {
+      gcry_mpi_randomize (hd->y, y_bytes * 8, GCRY_WEAK_RANDOM);
+      gcry_mpi_set_bit (hd->y, y_bytes * 8 - 1);
+    }
+
+  obj->hd = hd;
+  return 0;
+}
+
+static void
+bench_mpi_free (struct bench_obj *obj)
+{
+  struct bench_mpi_hd *hd = obj->hd;
+  int i;
+
+  gcry_mpi_release (hd->y);
+  for (i = DIM(hd->bytes) - 1; i >= 0; i--)
+    gcry_mpi_release (hd->bytes[i]);
+
+  free(hd);
+}
+
+static void
+bench_mpi_do_bench (struct bench_obj *obj, void *buf, size_t buflen)
+{
+  struct bench_mpi_hd *hd = obj->hd;
+  struct bench_mpi_mode *mode = obj->priv;
+  int bytes_idx = (buflen - MPI_START_SIZE) / MPI_STEP_SIZE;
+  gcry_mpi_t x;
+
+  (void)buf;
+
+  x = gcry_mpi_new (2 * (MPI_END_SIZE + 1) * 8);
+
+  switch (mode->test_id)
+    {
+      case MPI_TEST_ADD:
+       gcry_mpi_add (x, hd->bytes[bytes_idx], hd->bytes[bytes_idx]);
+       break;
+
+      case MPI_TEST_SUB:
+       gcry_mpi_sub (x, hd->bytes[bytes_idx + 1], hd->bytes[bytes_idx]);
+       break;
+
+      case MPI_TEST_RSHIFT3:
+       gcry_mpi_rshift (x, hd->bytes[bytes_idx], 3);
+       break;
+
+      case MPI_TEST_LSHIFT3:
+       gcry_mpi_lshift (x, hd->bytes[bytes_idx], 3);
+       break;
+
+      case MPI_TEST_RSHIFT65:
+       gcry_mpi_rshift (x, hd->bytes[bytes_idx], 65);
+       break;
+
+      case MPI_TEST_LSHIFT65:
+       gcry_mpi_lshift (x, hd->bytes[bytes_idx], 65);
+       break;
+
+      case MPI_TEST_MUL4:
+      case MPI_TEST_MUL8:
+      case MPI_TEST_MUL16:
+      case MPI_TEST_MUL32:
+       gcry_mpi_mul (x, hd->bytes[bytes_idx], hd->y);
+       break;
+
+      case MPI_TEST_DIV4:
+      case MPI_TEST_DIV8:
+      case MPI_TEST_DIV16:
+      case MPI_TEST_DIV32:
+       gcry_mpi_div (x, NULL, hd->bytes[bytes_idx], hd->y, 0);
+       break;
+
+      case MPI_TEST_MOD4:
+      case MPI_TEST_MOD8:
+      case MPI_TEST_MOD16:
+      case MPI_TEST_MOD32:
+       gcry_mpi_mod (x, hd->bytes[bytes_idx], hd->y);
+       break;
+
+      default:
+       break;
+    }
+
+  gcry_mpi_release (x);
+}
+
+static struct bench_ops mpi_ops = {
+  &bench_mpi_init,
+  &bench_mpi_free,
+  &bench_mpi_do_bench
+};
+
+
+static struct bench_mpi_mode mpi_modes[] = {
+  {"", &mpi_ops},
+  {0},
+};
+
+
+static void
+mpi_bench_one (int test_id, struct bench_mpi_mode *pmode)
+{
+  struct bench_mpi_mode mode = *pmode;
+  struct bench_obj obj = { 0 };
+  double result;
+
+  mode.test_id = test_id;
+
+  if (mode.name[0] == '\0')
+    bench_print_algo (-18, mpi_test_names[test_id]);
+  else
+    bench_print_algo (18, mode.name);
+
+  obj.ops = mode.ops;
+  obj.priv = &mode;
+
+  result = do_slope_benchmark (&obj);
+
+  bench_print_result (result);
+}
+
+static void
+_mpi_bench (int test_id)
+{
+  int i;
+
+  for (i = 0; mpi_modes[i].name; i++)
+    mpi_bench_one (test_id, &mpi_modes[i]);
+}
+
+static int
+mpi_match_test(const char *name)
+{
+  int i;
+
+  for (i = 0; i < __MAX_MPI_TEST; i++)
+    if (strcmp(name, mpi_test_names[i]) == 0)
+      return i;
+
+  return -1;
+}
+
+void
+mpi_bench (char **argv, int argc)
+{
+  int i, test_id;
+
+  bench_print_section ("mpi", "MPI");
+  bench_print_header (18, "");
+
+  if (argv && argc)
+    {
+      for (i = 0; i < argc; i++)
+       {
+         test_id = mpi_match_test (argv[i]);
+         if (test_id >= 0)
+           _mpi_bench (test_id);
+       }
+    }
+  else
+    {
+      for (i = 0; i < __MAX_MPI_TEST; i++)
+       _mpi_bench (i);
+    }
+
+  bench_print_footer (18);
+}
+
 /************************************************************** Main program. */
 
 void
 print_help (void)
 {
   static const char *help_lines[] = {
-    "usage: bench-slope [options] [hash|mac|cipher|kdf|ecc [algonames]]",
+    "usage: bench-slope [options] [hash|mac|cipher|kdf|ecc|mpi [algonames]]",
     "",
     " options:",
     "   --cpu-mhz <mhz>           Set CPU speed for calculating cycles",
@@ -2964,6 +3281,9 @@ warm_up_cpu (void)
 {
   struct nsec_time start, end;
 
+  if (in_regression_test)
+    return;
+
   get_nsec_time (&start);
   do
     {
@@ -3123,6 +3443,7 @@ main (int argc, char **argv)
       cipher_bench (NULL, 0);
       kdf_bench (NULL, 0);
       ecc_bench (NULL, 0);
+      mpi_bench (NULL, 0);
     }
   else if (!strcmp (*argv, "hash"))
     {
@@ -3164,6 +3485,14 @@ main (int argc, char **argv)
       warm_up_cpu ();
       ecc_bench ((argc == 0) ? NULL : argv, argc);
     }
+  else if (!strcmp (*argv, "mpi"))
+    {
+      argc--;
+      argv++;
+
+      warm_up_cpu ();
+      mpi_bench ((argc == 0) ? NULL : argv, argc);
+    }
   else
     {
       fprintf (stderr, PGM ": unknown argument: %s\n", *argv);
index a45fc5ff1433835bc79227fc24ae48f6c07ecb48..48025bbca54e30ad00807c7d361e8ff4956d812d 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -71,6 +71,7 @@ static int with_progress;
 static int single_char_progress;
 
 
+#if USE_DSA
 static const char sample_private_dsa_key_1024[] =
 "(private-key\n"
 "  (dsa\n"
@@ -256,8 +257,10 @@ static const char sample_public_dsa_key_3072[] =
        "3DB98C4297CB678046ED55C0DBE60BF7142C594603E4D705DC3D17270F9F086EC561"
        "2703D518D8D49FF0EBE6#)\n"
 "))\n";
+#endif /* USE_DSA */
 
 
+#if USE_ELGAMAL
 static const char sample_public_elg_key_1024[] =
 "(public-key"
 "  (elg"
@@ -392,6 +395,7 @@ static const char sample_private_elg_key_3072[] =
 "   (x #03A73F0389E470AAC831B039F8AA0C4EBD3A47DD083E32EEA08E4911236CD597C272"
        "9823D47A51C8535DA52FE6DAB3E8D1C20D#)"
 "  ))";
+#endif /* USE_ELGAMAL */
 
 
 #define BUG() do {fprintf ( stderr, "Ooops at %s:%d\n", __FILE__ , __LINE__ );\
@@ -485,6 +489,8 @@ md_bench ( const char *algoname )
       for (i=1; i < 400; i++)
         if (in_fips_mode && i == GCRY_MD_MD5)
           ; /* Don't use MD5 in fips mode.  */
+        else if (i == GCRY_MD_CSHAKE128 || i == GCRY_MD_CSHAKE256)
+          ; /* Skip. */
         else if ( !gcry_md_test_algo (i) )
           md_bench (gcry_md_algo_name (i));
       return;
@@ -648,7 +654,7 @@ mac_bench ( const char *algoname )
   for (i=0; i < bufsize; i++)
     buf[i] = i;
 
-  if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_SEED)
+  if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_ARIA)
     {
       static const char iv[16] = { 1, 2, 3, 4, };
       err = gcry_mac_setiv(hd, iv, sizeof(iv));
@@ -715,15 +721,16 @@ mac_bench ( const char *algoname )
 
 static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen)
 {
-  const int _L = 4;
-  const int noncelen = 15 - _L;
-  char nonce[noncelen];
+  const char _L[4];
+  char nonce[15 - sizeof(_L)];
   u64 params[3];
   gcry_error_t err = GPG_ERR_NO_ERROR;
 
-  memset (nonce, 0x33, noncelen);
+  (void)_L;
 
-  err = gcry_cipher_setiv (hd, nonce, noncelen);
+  memset (nonce, 0x33, sizeof(nonce));
+
+  err = gcry_cipher_setiv (hd, nonce, sizeof(nonce));
   if (err)
     {
       fprintf (stderr, "gcry_cipher_setiv failed: %s\n",
@@ -1136,6 +1143,7 @@ cipher_bench ( const char *algoname )
 static void
 rsa_bench (int iterations, int print_header, int no_blinding)
 {
+#if USE_RSA
   gpg_error_t err;
   int p_sizes[] = { 1024, 2048, 3072, 4096 };
   int testno;
@@ -1257,12 +1265,18 @@ rsa_bench (int iterations, int print_header, int no_blinding)
       gcry_sexp_release (sec_key);
       gcry_sexp_release (pub_key);
     }
+#else /* USE_RSA */
+  (void) iterations;
+  (void) print_header;
+  (void) no_blinding;
+#endif /* USE_RSA */
 }
 
 
 static void
 elg_bench (int iterations, int print_header)
 {
+#ifdef USE_ELGAMAL
   gpg_error_t err;
   gcry_sexp_t pub_key[3], sec_key[3];
   int p_sizes[3] = { 1024, 2048, 3072 };
@@ -1374,12 +1388,17 @@ elg_bench (int iterations, int print_header)
       gcry_sexp_release (sec_key[i]);
       gcry_sexp_release (pub_key[i]);
     }
+#else /* USE_ELGAMAL */
+  (void) iterations;
+  (void) print_header;
+#endif /* USE_ELGAMAL */
 }
 
 
 static void
 dsa_bench (int iterations, int print_header)
 {
+#ifdef USE_DSA
   gpg_error_t err;
   gcry_sexp_t pub_key[3], sec_key[3];
   int p_sizes[3] = { 1024, 2048, 3072 };
@@ -1485,6 +1504,10 @@ dsa_bench (int iterations, int print_header)
       gcry_sexp_release (sec_key[i]);
       gcry_sexp_release (pub_key[i]);
     }
+#else
+  (void) iterations;
+  (void) print_header;
+#endif /* USE_DSA */
 }
 
 
@@ -1517,10 +1540,9 @@ ecc_bench (int iterations, int print_header)
       is_ed448 = !strcmp (p_sizes[testno], "Ed448");
       is_gost = !strncmp (p_sizes[testno], "gost", 4);
 
-      /* Only P-{224,256,384,521} are allowed in fips mode */
+      /* Only P-{224,256,384,521} and EdDSA curves are allowed in fips mode */
       if (gcry_fips_mode_active()
-          && (is_ed25519 || is_ed448 || is_gost
-              || !strcmp (p_sizes[testno], "192")))
+          && (is_gost || !strcmp (p_sizes[testno], "192")))
          continue;
 
       if (is_ed25519)
@@ -1648,6 +1670,9 @@ ecc_bench (int iterations, int print_header)
       gcry_sexp_release (sec_key);
       gcry_sexp_release (pub_key);
     }
+#else
+  (void) iterations;
+  (void) print_header;
 #endif /*USE_ECC*/
 }
 
index 3c738171b54df0d153cf4d0caa3d49174cc2b5fe..3cd744229bc45c1ef8f6d75b193140215e8f96ee 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
@@ -144,18 +144,18 @@ check_get_params (void)
     int flags;
   } tv[] =
       {
-       { GCRY_PK_ECC, "Ed25519", TEST_NOFIPS },
-       { GCRY_PK_ECC, "1.3.6.1.4.1.11591.15.1", TEST_NOFIPS },
-       { GCRY_PK_ECC, "1.3.101.112", TEST_NOFIPS },
+       { GCRY_PK_ECC, "Ed25519" },
+       { GCRY_PK_ECC, "1.3.6.1.4.1.11591.15.1" },
+       { GCRY_PK_ECC, "1.3.101.112" },
 
        { GCRY_PK_ECC, "Curve25519", TEST_NOFIPS },
        { GCRY_PK_ECC, "1.3.6.1.4.1.3029.1.5.1", TEST_NOFIPS },
        { GCRY_PK_ECC, "1.3.101.110", TEST_NOFIPS },
        { GCRY_PK_ECC, "X25519", TEST_NOFIPS },
 
-       { GCRY_PK_ECC, "Ed448", TEST_NOFIPS },
+       { GCRY_PK_ECC, "Ed448" },
+       { GCRY_PK_ECC, "1.3.101.113" },
        { GCRY_PK_ECC, "X448", TEST_NOFIPS  },
-       { GCRY_PK_ECC, "1.3.101.113", TEST_NOFIPS },
        { GCRY_PK_ECC, "1.3.101.111", TEST_NOFIPS },
 
        { GCRY_PK_ECC, "NIST P-192", TEST_NOFIPS },
@@ -254,9 +254,9 @@ check_get_params (void)
        { GCRY_PK_ECC, "1.2.156.10197.1.301", TEST_NOFIPS },
 
        /* Check also the ECC algo mapping.  */
-       { GCRY_PK_ECDSA, "Ed25519", TEST_NOFIPS },
-       { GCRY_PK_EDDSA, "Ed25519", TEST_NOFIPS },
-       { GCRY_PK_ECDH,  "Ed25519", TEST_NOFIPS },
+       { GCRY_PK_ECDSA, "Ed25519" },
+       { GCRY_PK_EDDSA, "Ed25519" },
+       { GCRY_PK_ECDH,  "Ed25519" },
        { GCRY_PK_ECDSA, "Curve25519", TEST_NOFIPS },
        { GCRY_PK_EDDSA, "Curve25519", TEST_NOFIPS },
        { GCRY_PK_ECDH,  "Curve25519", TEST_NOFIPS },
index a52b86924b4447eaac368367d7449609bc5ef297..44b69897c60b3abdc36376b8efb6cb140694d686 100755 (executable)
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-algos="SHA1 SHA256 SHA512 SM3"
+algos="SHA1 SHA256 SHA512 SHA3-512 SM3"
 
 test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77
 echo "      now running 256 GiB tests for $algos - this takes looong"
diff --git a/tests/hashtest-6g.in b/tests/hashtest-6g.in
new file mode 100644 (file)
index 0000000..b3f3e2f
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+algos="SHA1 SHA256 SHA512 SHA3-512 SM3 BLAKE2S_256 BLAKE2B_512 CRC32 CRC24RFC2440"
+
+test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77
+echo "      now running 6 GiB tests for $algos - this can take long"
+exec ./hashtest@EXEEXT@ --hugeblock --gigs 6 $algos
index 4c9704f3fd7df396174e277acc5ddc24361cb161..05b06f36d0061d7dc322d474f4802bb44de17557 100644 (file)
@@ -34,6 +34,7 @@
 #define PGM "hashtest"
 #include "t-common.h"
 
+static int use_hugeblock;
 static int missing_test_vectors;
 
 static struct {
@@ -113,6 +114,169 @@ static struct {
   { GCRY_MD_SM3, 256, +64,
     "ed34869dbadd62e3bec1f511004d7bbfc9cafa965477cc48843b248293bbe867" },
 
+  { GCRY_MD_BLAKE2S_256, 256, -64,
+    "8a3d4f712275e8e8da70c76501cce364c75f8dd09748be58cf63c9ce38d62627" },
+  { GCRY_MD_BLAKE2S_256, 256, -1,
+    "0c01c9ad1e60e27dc889f2c9034a949ca8b9a9dc90dd99be64963af306d47b92" },
+  { GCRY_MD_BLAKE2S_256, 256, +0,
+    "f8c43d5c4bad93aca702c8c466987c5ac5e640a29b37dd9904252ff27b2348a0" },
+  { GCRY_MD_BLAKE2S_256, 256, +1,
+    "24c34b167b4eea1a7eb7d572ff3cf669a9856ea91bb112e9ef2ccd4b1aceccb4" },
+  { GCRY_MD_BLAKE2S_256, 256, +64,
+    "2f8d754f98e2d4ed7744389f89d0bdb9b770c9fa215b8badd3129ea1364af867" },
+
+  { GCRY_MD_BLAKE2B_512, 256, -64,
+    "36d32ae4deeacab4119401c52e2aec5545675bd2dce4f67871ddc73671a05f94"
+    "e8332c2a31f32f5601878606a571aa7b43029dac3ae71cf9ef141d05651dc4bf" },
+  { GCRY_MD_BLAKE2B_512, 256, -1,
+    "b5dc439f51664a6c9cbc87e2de98ce608ac4064a779e5140909d75d2120c9b2a"
+    "a1d4ae7be9c1ba97025be91ddcfbe42c791c3231cffbfa4b5368ba18f9590e1b" },
+  { GCRY_MD_BLAKE2B_512, 256, +0,
+    "c413d011ba9abbf118dd96bfc827f5fd94493d8350df9f7aff834faace5adba2"
+    "0c3037069dfb2c81718ffc7b418ce1c1320d334b6fe8cddfb5d2dd19eb530853" },
+  { GCRY_MD_BLAKE2B_512, 256, +1,
+    "b6dfb821f1c8167fb33995c29485010da56abd539c3d04ab9c222844301b8bba"
+    "6f57a48e45a748e40847084b93f26706aae82212550671c736becffcc6fb1496" },
+  { GCRY_MD_BLAKE2B_512, 256, +64,
+    "8c21316a4a02044e302d503d0fe669d905c40d9d80ecd5aafc8e30f1df06736f"
+    "51fdaf6002160bb8fe4e868eaad9623fc5ecdd728bcbfee4a19b386503710f48" },
+
+  { GCRY_MD_WHIRLPOOL, 256, -64,
+    "aabf62344c1aa82d2dc7605f339b3571d540f1f320f97e6a8c0229645ee61f1f"
+    "da796acde2f96caa1c56eb2c2f9a6029a6242ad690479def66feac44334cc3af" },
+  { GCRY_MD_WHIRLPOOL, 256, -1,
+    "9a35ec14aa9cefd40e04295d45d39f3111a98c2d76d90c54a7d2b8f2f5b9302b"
+    "79663eab6b6674625c3ae3e4b5dbb3b0a2f5b2f49a7a59cd1723e2b16a3efea2" },
+  { GCRY_MD_WHIRLPOOL, 256, +0,
+    "818ad31a5110b6217cc6ffa099d554aaadc9566bf5291e104a5d58b21d51ae4d"
+    "c216c6de888d1359066c584e24e6606f530a3fce80ef78aed8564de4a28801c8" },
+  { GCRY_MD_WHIRLPOOL, 256, +1,
+    "298805f5fc68488712427c1bcb27581d91aa04337c1c6b4657489ed3d239bb8b"
+    "c70ef654065d380ac1f5596aca5cb59e6da8044b5a067e32ea4cd94ca606f9f3" },
+  { GCRY_MD_WHIRLPOOL, 256, +64,
+    "7bd35c3bee621bc0fb8907904b3b84d6cf4fae4c22cc64fbc744c8c5c8de806d"
+    "0f11a27892d531dc907426597737762c83e3ddcdc62f50d16d130aaefaeec436" },
+
+  { GCRY_MD_SHA1, 6, -64,
+    "eeee82d952403313bd63d6d7c8e342df0a1eea77" },
+  { GCRY_MD_SHA1, 6, -1,
+    "8217b9f987d67db5880bcfff1d6763a6514d629f" },
+  { GCRY_MD_SHA1, 6, +0,
+    "2b38aa63c05668217e5331320a4aee0adad7fc3b" },
+  { GCRY_MD_SHA1, 6, +1,
+    "f3222de4d0704554cff0a537bc95b30f15daa94f" },
+  { GCRY_MD_SHA1, 6, +64,
+    "b3bdd8065bb92d8208d55d28fad2281c6fbf2601" },
+
+  { GCRY_MD_SHA256, 6, -64,
+    "a2d5add5be904b70d6ef9bcd5feb9c6cfc2be0799732a122d9eccb576ff5a922" },
+  { GCRY_MD_SHA256, 6, -1,
+    "88293b7e0e5a47fdef1148c6e510f95272770db6b5296958380209ba57db7a5d" },
+  { GCRY_MD_SHA256, 6, +0,
+    "ccee8e8dfc366eba67471e49c45057b0041be0d2206c6de1aa765ce07ecfc434" },
+  { GCRY_MD_SHA256, 6, +1,
+    "f4a89e92b38e0e61ee17079dc31411de06cfe1f77c83095ae1a2e7aa0205d94b" },
+  { GCRY_MD_SHA256, 6, +64,
+    "338708608c2356ed2927a85b08fe745223c6140243fb3a87f309e12b31b946a8" },
+
+  { GCRY_MD_SHA512, 6, -64,
+    "658f52850932633c00b2f1d65b874c540ab84e2c0fe84a8a6c35f8e90e6f6a9c"
+    "2f7e0ccca5064783562a42ad8f47eab48687aaf6998b04ee94441e82c14e834d" },
+  { GCRY_MD_SHA512, 6, -1,
+    "9ead6d66b46a3a72d77c7990874cfebc1575e5bfda6026430d76b3db6cc62d52"
+    "4ca0dd2674b9c24208b2e780d75542572eee8df6724acadcc23a03eed8f82f0a" },
+  { GCRY_MD_SHA512, 6, +0,
+    "03e4549eb28bd0fb1606c321f1498503b5e889bec8d799cf0688567c7f8ac0d9"
+    "a7ec4e84d1d729d6a359797656e286617c3ef82abb51991bb576aaf05f7b6573" },
+  { GCRY_MD_SHA512, 6, +1,
+    "ffe52f6385ccde6fa7d45845787d8f9993fdcb5833fb58b13c424a84e39ea50f"
+    "52d40e254fe667cb0104ffe3837dc8d0eee3c81721cb8eac10d5851dfb1f91db" },
+  { GCRY_MD_SHA512, 6, +64,
+    "4a19da3d5eaaa79ac1eaff5e4062f23ee56573411f8d302f7bf3c6da8779bd00"
+    "a936e9ad7f535597a49162ed308b0cced7724667f97a1bb24540152fcfe3ec95" },
+
+  { GCRY_MD_SHA3_512, 6, -64,
+    "a99f2913d3beb9b45273402e30daa4d25c7a5e9eb8cf6039996eb2292a45c04c"
+    "b9e3a1a187f71920626f465ed6cf7dc34047ec5578e05516374bb9c56683903a" },
+  { GCRY_MD_SHA3_512, 6, -1,
+    "fca50bde79c55e5fc4c9d97e66eb5cfacef7032395848731e645ca42f07f8d38"
+    "be1d593727c2a82b9a9bc058ebc9744971f867fa920cfa902023448243ac017b" },
+  { GCRY_MD_SHA3_512, 6, +0,
+    "c61bb345c0a553edaa89fd38114ac9799b6d307ba8e3cde53552ad4c77cfe4b7"
+    "2671d82c1519c8e7b23153a9268e2939239564fc7c2060608aa42955e938840d" },
+  { GCRY_MD_SHA3_512, 6, +1,
+    "502a83d8d1b977312806382a45c1cc9c0e7db437ca962e37eb181754d59db686"
+    "14d91df286d510411adf69f7c9befc1027bdc0c33a48a5dd6ae0957b9061e7ca" },
+  { GCRY_MD_SHA3_512, 6, +64,
+    "207bfb83ae788ddd4531188567f0892bbddbbc88d69bc196b2357bee3e668706"
+    "c27f832ecb50e9ae5b63e9f384bdc37373958d4a14f3825146d2f6b1a65d8e51" },
+
+  { GCRY_MD_SM3, 6, -64,
+    "41d96d19cef4c942b0f5f4cdc3e1afe440dc62c0bc103a2c0e9eee9e1733a74a" },
+  { GCRY_MD_SM3, 6, -1,
+    "b7689cc4ef6c7dc795b9e5e6998e5cc3dc1daec02bc1181cdbef8d6812b4957a" },
+  { GCRY_MD_SM3, 6, +0,
+    "c6eae4a82052423cf98017bde4dee8769947c66120a1a2ff79f0f0dc945a3272" },
+  { GCRY_MD_SM3, 6, +1,
+    "f6590f161fee11529585c7a9dfc725f8b81951e49b616844097a3dbdc9ffdbec" },
+  { GCRY_MD_SM3, 6, +64,
+    "f3277fa90c47afe5e4fc52374aadf8e96bc29c2b5a7a4ebf5d704245ada837ea" },
+
+  { GCRY_MD_BLAKE2S_256, 6, -64,
+    "0f3c17610777c34d40a0d11a93d5e5ed444ce16edefebabd0bc8e30392d5c2db" },
+  { GCRY_MD_BLAKE2S_256, 6, -1,
+    "92cbcf142c45de9d64da9791c51dce4e32b58f74d9f3d201b1ea74deac765f51" },
+  { GCRY_MD_BLAKE2S_256, 6, +0,
+    "b20702cb5a0bee2ab104f38eb513429589310a7edde81dd1f40043be7d16d0de" },
+  { GCRY_MD_BLAKE2S_256, 6, +1,
+    "bfc17dc74930989841da05aac08402bf0dcb4a597b17c52402a516ea7e541cdf" },
+  { GCRY_MD_BLAKE2S_256, 6, +64,
+    "d85588cdf5a00bec1327da02f22f1a10b68dd9d6b730f30a3aa65af3a51c1722" },
+
+  { GCRY_MD_BLAKE2B_512, 6, -64,
+    "30b6015f94524861b04b83f0455be10a993460e0f8f0fd755fc3d0270b0c7d00"
+    "039a6e01684ce0689ce4ef70932bd19a676acf4b4ea521c30337d2f445fc2055" },
+  { GCRY_MD_BLAKE2B_512, 6, -1,
+    "49abef820ad7fc5e6ed9b63acddce639a69dcd749b0798b140216649bc3b927c"
+    "637dbe1cb39a41bbafe7f8b675401ccdcf69a7fba227ae4cda5cd28b9ff36776" },
+  { GCRY_MD_BLAKE2B_512, 6, +0,
+    "4182a7307a89391b78af9dbc3ba1e8d643708abbed5919086aa6e2bc65ae9597"
+    "e40229450c86ac5d3117b006427dd0131f5ae4c1a1d64c81420d2731536c81d8" },
+  { GCRY_MD_BLAKE2B_512, 6, +1,
+    "33c0d9e65b1b18e9556134a08c1e725c19155bbf6ed4349d7d6d678f1827fef3"
+    "74b6e3381471f3d3fff7ffbcb9474ce9038143b99e25cd5f8afbb336313d4648" },
+  { GCRY_MD_BLAKE2B_512, 6, +64,
+    "d2d7f388611af78a2ea40b06f99993cff156afd25cbc47695bdb567d4d35b992"
+    "0ff8c325c359a2bdeddf54ececc671ac7b981031e90a7d63d6e0415ec4484282" },
+
+  { GCRY_MD_WHIRLPOOL, 6, -64,
+    "247707d1f9cf31b90ee68527144b1c20ad5ce96293bdccd1a81c8f40bc9df10c"
+    "e7441ac3b3097162d6fbf4d4b67b8fa09de451e2d920f16aad78c47ab00cb833" },
+  { GCRY_MD_WHIRLPOOL, 6, -1,
+    "af49e4a553bdbec1fdafc41713029e0fb1666894753c0ab3ecb280fc5af6eff8"
+    "253120745a229d7a8b5831711e4fd16ed0741258504d8a47e2b42aa2f1886968" },
+  { GCRY_MD_WHIRLPOOL, 6, +0,
+    "f269ffa424bc2aad2da654f01783fc9b2b431219f2b05784d718da0935e78792"
+    "9207b000ebbfb63dfdcc8adf8e5bd321d9616c1b8357430b9be6cb4640df8609" },
+  { GCRY_MD_WHIRLPOOL, 6, +1,
+    "52b77eb13129151b69b63c09abb655dc9cb046cafd4cbf7d4a82ae04b61ef9e6"
+    "531dde04cae7c5ab400ed8ee8da2e3f490d177289b2b3aa29b12b292954b902c" },
+  { GCRY_MD_WHIRLPOOL, 6, +64,
+    "60a950c92f3f08abbc81c41c86ce0463679ffd5ab420e988e15b210615b454ae"
+    "69607d14a1806fa44aacf8c926fbdcee998af46f56e0c642d3fb4ee54c8fb917" },
+
+  { GCRY_MD_CRC32, 6, -64, "20739052" },
+  { GCRY_MD_CRC32, 6, -1,  "971a5a74" },
+  { GCRY_MD_CRC32, 6, +0,  "bf48113c" },
+  { GCRY_MD_CRC32, 6, +1,  "c7678ad5" },
+  { GCRY_MD_CRC32, 6, +64, "1efa7255" },
+
+  { GCRY_MD_CRC24_RFC2440, 6, -64, "747e81" },
+  { GCRY_MD_CRC24_RFC2440, 6, -1,  "deb97d" },
+  { GCRY_MD_CRC24_RFC2440, 6, +0,  "7d5bea" },
+  { GCRY_MD_CRC24_RFC2440, 6, +1,  "acc351" },
+  { GCRY_MD_CRC24_RFC2440, 6, +64, "9d9032" },
+
   { 0 }
 };
 
@@ -251,25 +415,60 @@ run_longtest (int algo, int gigs)
   gcry_md_hd_t hd_post = NULL;
   gcry_md_hd_t hd_post2 = NULL;
   char pattern[1024];
-  int i, g;
+  char *hugepattern = NULL;
+  size_t hugesize;
+  size_t hugegigs;
+  int i, g, gppos, gptot;
   const unsigned char *digest;
   unsigned int digestlen;
 
   memset (pattern, 'a', sizeof pattern);
 
+  if (use_hugeblock)
+    {
+      hugegigs = 5;
+      if (sizeof(size_t) >= 8)
+        {
+          hugesize = hugegigs*1024*1024*1024;
+          hugepattern = malloc(hugesize);
+          if (hugepattern != NULL)
+            memset(hugepattern, 'a', hugesize);
+          else
+            show_note ("failed to allocate %zu GiB huge pattern block: %s",
+                       hugegigs, strerror(errno));
+        }
+      else
+        show_note ("cannot allocate %zu GiB huge pattern block on 32-bit system",
+                   hugegigs);
+    }
+  if (hugepattern == NULL)
+    {
+      hugegigs = 0;
+      hugesize = 0;
+    }
+
   err = gcry_md_open (&hd, algo, 0);
   if (err)
     {
       fail ("gcry_md_open failed for %s (%d): %s",
             gcry_md_algo_name (algo), algo, gpg_strerror (err));
+      free(hugepattern);
       return;
     }
 
   digestlen = gcry_md_get_algo_dlen (algo);
 
-
-  for (g=0; g < gigs; g++)
+  gppos = 0;
+  gptot = 0;
+  for (g=0; g < gigs; )
     {
+      if (gppos >= 16)
+        {
+          gptot += 16;
+          gppos -= 16;
+          show_note ("%d GiB so far hashed with %s", gptot,
+                     gcry_md_algo_name (algo));
+        }
       if (g == gigs - 1)
         {
           for (i = 0; i < 1024*1023; i++)
@@ -283,16 +482,24 @@ run_longtest (int algo, int gigs)
             die ("gcry_md_copy failed for %s (%d): %s",
                  gcry_md_algo_name (algo), algo, gpg_strerror (err));
           gcry_md_write (hd, pattern, sizeof pattern);
+          g++;
+          gppos++;
+        }
+      else if (hugepattern != NULL && gigs - g > hugegigs)
+        {
+          gcry_md_write (hd, hugepattern, hugesize);
+          g += hugegigs;
+          gppos += hugegigs;
         }
       else
         {
           for (i = 0; i < 1024*1024; i++)
             gcry_md_write (hd, pattern, sizeof pattern);
+          g++;
+          gppos++;
         }
-      if (g && !(g % 16))
-        show_note ("%d GiB so far hashed with %s", g, gcry_md_algo_name (algo));
     }
-  if (g >= 16)
+  if (g >= 16 && gppos)
     show_note ("%d GiB hashed with %s", g, gcry_md_algo_name (algo));
 
   err = gcry_md_copy (&hd_post, hd);
@@ -335,6 +542,8 @@ run_longtest (int algo, int gigs)
   gcry_md_close (hd_pre2);
   gcry_md_close (hd_post);
   gcry_md_close (hd_post2);
+
+  free(hugepattern);
 }
 
 
@@ -361,9 +570,12 @@ main (int argc, char **argv)
         {
           fputs ("usage: " PGM " [options] [algos]\n"
                  "Options:\n"
-                 "  --verbose       print timings etc.\n"
-                 "  --debug         flyswatter\n"
-                 "  --gigs N        Run a test on N GiB\n",
+                 "  --verbose                 print timings etc.\n"
+                 "  --debug                   flyswatter\n"
+                 "  --hugeblock               Use 5 GiB pattern block\n"
+                 "  --gigs N                  Run a test on N GiB\n"
+                 "  --disable-hwf <features>  Disable hardware acceleration feature(s)\n"
+                 "                            for benchmarking.\n",
                  stdout);
           exit (0);
         }
@@ -378,6 +590,11 @@ main (int argc, char **argv)
           debug++;
           argc--; argv++;
         }
+      else if (!strcmp (*argv, "--hugeblock"))
+        {
+          use_hugeblock = 1;
+          argc--; argv++;
+        }
       else if (!strcmp (*argv, "--gigs"))
         {
           argc--; argv++;
@@ -387,6 +604,21 @@ main (int argc, char **argv)
               argc--; argv++;
             }
         }
+      else if (!strcmp (*argv, "--disable-hwf"))
+        {
+          argc--;
+          argv++;
+          if (argc)
+            {
+              if (gcry_control (GCRYCTL_DISABLE_HWF, *argv, NULL))
+                fprintf (stderr,
+                        PGM
+                        ": unknown hardware feature `%s' - option ignored\n",
+                        *argv);
+              argc--;
+              argv++;
+            }
+        }
       else if (!strncmp (*argv, "--", 2))
         die ("unknown option '%s'", *argv);
     }
index 2b4c0f9f92567e2eba1e922f31f2e4d63fc8a0b9..9164d3a8836f8cea032eddb3abfac3f0c240ab6a 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
index 5b154c9493c70ee8bf72888fb7aa5e2e498fcbaa..b04cc7f5ea5ea2fa4153a994b14e55221d1a20aa 100644 (file)
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <ctype.h>
 #include <stdarg.h>
 #include "../src/gcrypt-int.h"
 
@@ -69,6 +70,7 @@ show_sexp (const char *prefix, gcry_sexp_t a)
 }
 
 
+#if USE_RSA
 static void
 show_mpi (const char *prefix, gcry_mpi_t a)
 {
@@ -131,11 +133,13 @@ check_generated_rsa_key (gcry_sexp_t key, unsigned long expected_e)
       gcry_sexp_release (skey);
     }
 }
+#endif /* USE_RSA */
 
 
 static void
 check_rsa_keys (void)
 {
+#if USE_RSA
   gcry_sexp_t keyparm, key;
   int rc;
 
@@ -249,12 +253,14 @@ check_rsa_keys (void)
   if (!rc)
     check_generated_rsa_key (key, 0); /* We don't expect a constant exponent. */
   gcry_sexp_release (key);
+#endif /* USE_RSA */
 }
 
 
 static void
 check_elg_keys (void)
 {
+#if USE_ELGAMAL
   gcry_sexp_t keyparm, key;
   int rc;
 
@@ -276,12 +282,14 @@ check_elg_keys (void)
   if (verbose > 1)
     show_sexp ("1024 bit Elgamal key:\n", key);
   gcry_sexp_release (key);
+#endif /* USE_ELGAMAL */
 }
 
 
 static void
 check_dsa_keys (void)
 {
+#if USE_DSA
   gcry_sexp_t keyparm, key;
   int rc;
   int i;
@@ -389,9 +397,11 @@ check_dsa_keys (void)
   if (verbose > 1)
     show_sexp ("2048 bit DSA key:\n", key);
   gcry_sexp_release (key);
+#endif /* USE_DSA */
 }
 
 
+#if USE_ECC
 static void
 check_generated_ecc_key (gcry_sexp_t key)
 {
@@ -425,11 +435,13 @@ check_generated_ecc_key (gcry_sexp_t key)
       fail ("gcry_pk_testkey failed on key pair: %s\n", gpg_strerror (rc));
   }
 }
+#endif /* USE_ECC */
 
 
 static void
 check_ecc_keys (void)
 {
+#if USE_ECC
   const char *curves[] = { "NIST P-521", "NIST P-384", "NIST P-256",
                            "Ed25519", NULL };
   int testno;
@@ -442,9 +454,6 @@ check_ecc_keys (void)
         info ("creating ECC key using curve %s\n", curves[testno]);
       if (!strcmp (curves[testno], "Ed25519"))
         {
-          /* Ed25519 isn't allowed in fips mode */
-          if (in_fips_mode)
-            continue;
           rc = gcry_sexp_build (&keyparm, NULL,
                                 "(genkey(ecc(curve %s)(flags param eddsa)))",
                                 curves[testno]);
@@ -476,15 +485,9 @@ check_ecc_keys (void)
     die ("error creating S-expression: %s\n", gpg_strerror (rc));
   rc = gcry_pk_genkey (&key, keyparm);
   gcry_sexp_release (keyparm);
-  if (rc && !in_fips_mode)
+  if (rc)
     die ("error generating ECC key using curve Ed25519 for ECDSA: %s\n",
          gpg_strerror (rc));
-  else if (!rc && in_fips_mode)
-    fail ("generating Ed25519 key must not work!");
-
-  if (verbose && rc && in_fips_mode)
-    info ("... correctly rejected key creation in FIPS mode (%s)\n",
-          gpg_strerror (rc));
 
   if (!rc)
     {
@@ -503,16 +506,11 @@ check_ecc_keys (void)
     die ("error creating S-expression: %s\n", gpg_strerror (rc));
   rc = gcry_pk_genkey (&key, keyparm);
   gcry_sexp_release (keyparm);
-  if (rc && !in_fips_mode)
+  if (rc)
     die ("error generating ECC key using curve Ed25519 for ECDSA"
          " (nocomp): %s\n",
          gpg_strerror (rc));
-  else if (!rc && in_fips_mode)
-    fail ("generating Ed25519 key must not work in FIPS mode!");
 
-  if (verbose && rc && in_fips_mode)
-    info ("... correctly rejected key creation in FIPS mode (%s)\n",
-          gpg_strerror (rc));
   gcry_sexp_release (key);
 
   if (verbose)
@@ -564,16 +562,10 @@ check_ecc_keys (void)
     die ("error creating S-expression: %s\n", gpg_strerror (rc));
   rc = gcry_pk_genkey (&key, keyparm);
   gcry_sexp_release (keyparm);
-  if (rc && !in_fips_mode)
+  if (rc)
     die ("error generating ECC key using curve Ed25519 for ECDSA"
          " (transient-key): %s\n",
          gpg_strerror (rc));
-  else if (!rc && in_fips_mode)
-    fail ("generating Ed25519 key must not work in FIPS mode!");
-
-  if (verbose && rc && in_fips_mode)
-    info ("... correctly rejected key creation in FIPS mode (%s)\n",
-          gpg_strerror (rc));
 
   if (!rc)
     {
@@ -593,16 +585,10 @@ check_ecc_keys (void)
     die ("error creating S-expression: %s\n", gpg_strerror (rc));
   rc = gcry_pk_genkey (&key, keyparm);
   gcry_sexp_release (keyparm);
-  if (rc && !in_fips_mode)
+  if (rc)
     die ("error generating ECC key using curve Ed25519 for ECDSA"
          " (transient-key no-keytest): %s\n",
          gpg_strerror (rc));
-  else if (!rc && in_fips_mode)
-    fail ("generating Ed25519 key must not work in FIPS mode!");
-
-  if (verbose && rc && in_fips_mode)
-    info ("... correctly rejected key creation in FIPS mode (%s)\n",
-          gpg_strerror (rc));
 
   if (!rc)
     {
@@ -611,6 +597,119 @@ check_ecc_keys (void)
       check_generated_ecc_key (key);
     }
   gcry_sexp_release (key);
+#endif /* USE_ECC */
+}
+
+
+static void
+check_generated_kem_key (gcry_sexp_t key, const char *algoname)
+{
+  gpg_error_t err;
+  gcry_sexp_t skey, pkey;
+  unsigned char keygrip_pk[20];
+  unsigned char keygrip_all[20];
+  int n, nbits;
+  const char *s;
+
+  pkey = gcry_sexp_find_token (key, "public-key", 0);
+  if (!pkey)
+    fail ("public part missing in return value\n");
+  else
+    {
+      if (!gcry_pk_get_keygrip (pkey, keygrip_pk))
+        fail ("gcry_pk_get_keyrip for pubkey failed\n");
+      gcry_sexp_release (pkey);
+    }
+
+  skey = gcry_sexp_find_token (key, "private-key", 0);
+  if (!skey)
+    fail ("private part missing in return value\n");
+  else
+    {
+      err = gcry_pk_testkey (skey);
+      if (gpg_err_code (err) == GPG_ERR_NOT_IMPLEMENTED)
+        info ("note: gcry_pk_testkey has not yet been implemented\n");
+      else if (err)
+        fail ("gcry_pk_testkey failed: %s\n", gpg_strerror (err));
+      gcry_sexp_release (skey);
+    }
+
+  /* Finally check that gcry_pk_testkey also works on the entire
+     S-expression.  */
+  err = gcry_pk_testkey (key);
+  if (gpg_err_code (err) == GPG_ERR_NOT_IMPLEMENTED)
+    info ("note: gcry_pk_testkey has not yet been implemented\n");
+  else if (err)
+    fail ("gcry_pk_testkey failed on keypair: %s\n", gpg_strerror (err));
+
+  if (!gcry_pk_get_keygrip (key, keygrip_all))
+    fail ("gcry_pk_get_keyrip for all failed\n");
+
+  if (memcmp (keygrip_pk, keygrip_all, 20))
+    fail ("keygrips do not match\n");
+
+  /* Simple hack to check nbits.  */
+  nbits = gcry_pk_get_nbits (key);
+  n = 0;
+  for (s=algoname; !isdigit (*s); s++)
+    ;
+  n = atoi (s);
+  if (n != nbits)
+    fail ("gcry_pk_get_nbits returned a wrong value n=%d nbits=%d\n", n, nbits);
+
+}
+
+
+#define TEST_NOFIPS         (1 << 0)
+
+static void
+check_kem_keys (void)
+{
+  const struct {
+    const char *algonames;
+    int flags;
+  } tv[] = {
+    { "sntrup761", TEST_NOFIPS },
+    { "kyber512", TEST_NOFIPS },
+    { "kyber768", TEST_NOFIPS },
+    { "kyber1024", TEST_NOFIPS },
+  };
+
+  int testno;
+  gcry_sexp_t keyparm, key;
+  int rc;
+
+  for (testno=0; testno < DIM (tv); testno++)
+    {
+      if (verbose)
+        info ("creating KEM key using algo %s\n", tv[testno].algonames);
+      rc = gcry_sexp_build (&keyparm, NULL, "(genkey(%s))",
+                            tv[testno].algonames);
+      if (rc)
+        die ("error creating S-expression: %s\n", gpg_strerror (rc));
+      rc = gcry_pk_genkey (&key, keyparm);
+      gcry_sexp_release (keyparm);
+      if (in_fips_mode && (tv[testno].flags & TEST_NOFIPS))
+        {
+          if (!rc)
+            die ("KEM: creating %s key should have failed in fips mode\n",
+                 tv[testno].algonames);
+          continue;
+        }
+      else
+        {
+          if (rc)
+            die ("error creating KEM key using algo %s: %s\n",
+                 tv[testno].algonames, gpg_strerror (rc));
+        }
+
+      if (verbose > 1)
+        show_sexp ("KEM key:\n", key);
+
+      check_generated_kem_key (key, tv[testno].algonames);
+
+      gcry_sexp_release (key);
+    }
 }
 
 
@@ -772,6 +871,7 @@ main (int argc, char **argv)
       check_elg_keys ();
       check_dsa_keys ();
       check_ecc_keys ();
+      check_kem_keys ();
       check_nonce ();
     }
   else
@@ -785,6 +885,8 @@ main (int argc, char **argv)
           check_dsa_keys ();
         else if (!strcmp (*argv, "ecc"))
           check_ecc_keys ();
+        else if (!strcmp (*argv, "kem"))
+          check_kem_keys ();
         else if (!strcmp (*argv, "nonce"))
           check_nonce ();
         else
index 49bd71bc0095a27948ec27fe66d48525c25b7455..cf6503503f46ddff992bd66e08d4a654fb50868c 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
@@ -207,8 +207,7 @@ static struct
       "     47BD24842905C049257673B3F5249524E0A41FAA17B25B818D0F97E625F1A1D0#)"
       "     ))",
       "\x0C\xCA\xB2\xFD\x48\x9A\x33\x40\x2C\xE8"
-      "\xE0\x4A\x1F\xB2\x45\xEA\x80\x3D\x0A\xF1",
-      1
+      "\xE0\x4A\x1F\xB2\x45\xEA\x80\x3D\x0A\xF1"
     },
     { /* Ed25519+EdDSA */
       GCRY_PK_ECC,
@@ -218,8 +217,7 @@ static struct
       " (q #773E72848C1FD5F9652B29E2E7AF79571A04990E96F2016BF4E0EC1890C2B7DB#)"
       " ))",
       "\x9D\xB6\xC6\x4A\x38\x83\x0F\x49\x60\x70"
-      "\x17\x89\x47\x55\x20\xBE\x8C\x82\x1F\x47",
-      1
+      "\x17\x89\x47\x55\x20\xBE\x8C\x82\x1F\x47"
     },
     { /* Ed25519+EdDSA (with compression prefix) */
       GCRY_PK_ECC,
@@ -230,8 +228,7 @@ static struct
       "     773E72848C1FD5F9652B29E2E7AF79571A04990E96F2016BF4E0EC1890C2B7DB#)"
       " ))",
       "\x9D\xB6\xC6\x4A\x38\x83\x0F\x49\x60\x70"
-      "\x17\x89\x47\x55\x20\xBE\x8C\x82\x1F\x47",
-      1
+      "\x17\x89\x47\x55\x20\xBE\x8C\x82\x1F\x47"
     },
     { /* Ed25519+EdDSA  (same but uncompressed)*/
       GCRY_PK_ECC,
@@ -243,8 +240,7 @@ static struct
       "     5bb7c29018ece0f46b01f2960e99041a5779afe7e2292b65f9d51f8c84723e77#)"
       " ))",
       "\x9D\xB6\xC6\x4A\x38\x83\x0F\x49\x60\x70"
-      "\x17\x89\x47\x55\x20\xBE\x8C\x82\x1F\x47",
-      1
+      "\x17\x89\x47\x55\x20\xBE\x8C\x82\x1F\x47"
     },
     { /* Cv25519 */
       GCRY_PK_ECC,
index 48ea18b267b3f877996f5db85cde1c376877a6ed..2ee08bd3ef78069082b9d36744927c7c38547416 100644 (file)
@@ -687,6 +687,58 @@ test_powm (void)
 }
 
 
+/* What we test here is that using the same mpi for divider and result
+   works.  */
+static int
+test_addm_subm_mulm (void)
+{
+  int i;
+
+  for (i = 0; i < 3; i++)
+    {
+      unsigned int expect;
+      const char *func;
+      gcry_mpi_t A;
+      gcry_mpi_t B;
+      gcry_mpi_t C;
+
+      A = gcry_mpi_set_ui (NULL, 2);
+      B = gcry_mpi_set_ui (NULL, 4);
+      C = gcry_mpi_set_ui (NULL, 7);
+
+      if (i == 0)
+       {
+         func = "mpi_addm";
+         expect = 6;
+         gcry_mpi_addm(C, A, B, C);
+       }
+      else if (i == 1)
+       {
+         func = "mpi_subm";
+         expect = 5;
+         gcry_mpi_subm(C, A, B, C);
+       }
+      else if (i == 2)
+       {
+         func = "mpi_mulm";
+         expect = 1;
+         gcry_mpi_mulm(C, A, B, C);
+       }
+
+      if (gcry_mpi_is_neg (C) || gcry_mpi_cmp_ui (C, expect))
+       {
+         die ("test_addm_subm_mulm failed for %s at %d\n", func, __LINE__);
+       }
+
+      gcry_mpi_release(A);
+      gcry_mpi_release(B);
+      gcry_mpi_release(C);
+    }
+
+  return 1;
+}
+
+
 int
 main (int argc, char* argv[])
 {
@@ -710,6 +762,7 @@ main (int argc, char* argv[])
   test_sub ();
   test_mul ();
   test_powm ();
+  test_addm_subm_mulm ();
 
   return !!error_count;
 }
index 2fd495d5fb79ffc7625f5e8fc4f2f775c74795a9..f26e779b1b0c8f964d92ae874ad66b9553c84266 100644 (file)
@@ -186,24 +186,11 @@ check_oaep (void)
           err = gcry_pk_encrypt (&ciph, plain, pub_key);
           if (err)
             {
-              if (in_fips_mode)
-                {
-                  gcry_sexp_release (plain);
-                  plain = NULL;
-                  continue;
-                }
               show_sexp ("plain:\n", ciph);
               fail ("gcry_pk_encrypt failed: %s\n", gpg_strerror (err));
             }
           else
             {
-              if (in_fips_mode)
-                {
-                  fail ("The OAEP encryption unexpectedly worked in FIPS mode\n");
-                  gcry_sexp_release (plain);
-                  plain = NULL;
-                  continue;
-                }
               if (extract_cmp_data (ciph, "a", tbl[tno].m[mno].encr,
                                     tbl[tno].m[mno].desc))
                 {
@@ -467,19 +454,7 @@ check_v15crypt (void)
           gcry_free (seed);
 
           err = gcry_pk_encrypt (&ciph, plain, pub_key);
-          if (in_fips_mode)
-            {
-              if (!err)
-                {
-                  fail ("gcry_pk_encrypt should have failed in FIPS mode:\n");
-                }
-              gcry_sexp_release (plain);
-              plain = NULL;
-              gcry_sexp_release (ciph);
-              ciph = NULL;
-              continue;
-            }
-          else if (err)
+          if (err)
             {
               show_sexp ("plain:\n", ciph);
               fail ("gcry_pk_encrypt failed: %s\n", gpg_strerror (err));
index 422649806d3af478042f3f4d6650b23099b4f0a4..a73eb05ae621dadd2e27a761ece0b6ba1a6b0531 100644 (file)
@@ -12,9 +12,9 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
-   USA.  */
+   along with this program; if not, see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: GPL-2-or-later
+*/
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
index 73e50f3d46fd185d0a1333de5e7fc629bf4ff49c..be150e03a3d1ef118fda14bef89b6865f5b64965 100644 (file)
@@ -31,6 +31,7 @@
 
 static int in_fips_mode;
 
+#if USE_RSA
 /* Sample RSA keys, taken from basic.c.  */
 
 static const char sample_private_key_1[] =
@@ -151,6 +152,7 @@ static const char sample_public_key_1[] =
 "  (e #010001#)\n"
 " )\n"
 ")\n";
+#endif /* USE_RSA */
 
 
 static void
@@ -169,6 +171,7 @@ show_sexp (const char *prefix, gcry_sexp_t a)
   gcry_free (buf);
 }
 
+#if USE_RSA
 /* from ../cipher/pubkey-util.c */
 static gpg_err_code_t
 _gcry_pk_util_get_nbits (gcry_sexp_t list, unsigned int *r_nbits)
@@ -196,6 +199,7 @@ _gcry_pk_util_get_nbits (gcry_sexp_t list, unsigned int *r_nbits)
   gcry_sexp_release (list);
   return 0;
 }
+#endif /* USE_RSA */
 
 /* Convert STRING consisting of hex characters into its binary
    representation and return it as an allocated buffer. The valid
@@ -250,6 +254,7 @@ extract_cmp_data (gcry_sexp_t sexp, const char *name, const char *expected)
 }
 
 
+#if USE_RSA || USE_ELGAMAL
 static void
 check_keys_crypt (gcry_sexp_t pkey, gcry_sexp_t skey,
                  gcry_sexp_t plain0, gpg_err_code_t decrypt_fail_code)
@@ -350,7 +355,9 @@ check_keys (gcry_sexp_t pkey, gcry_sexp_t skey, unsigned int nbits_data,
   check_keys_crypt (pkey, skey, plain, decrypt_fail_code);
   gcry_sexp_release (plain);
 }
+#endif /* USE_RSA || USE_ELGAMAL */
 
+#if USE_RSA
 static void
 get_keys_sample (gcry_sexp_t *pkey, gcry_sexp_t *skey, int secret_variant)
 {
@@ -450,8 +457,10 @@ get_keys_x931_new (gcry_sexp_t *pkey, gcry_sexp_t *skey)
   *pkey = pub_key;
   *skey = sec_key;
 }
+#endif /* USE_RSA */
 
 
+#if USE_ELGAMAL
 static void
 get_elg_key_new (gcry_sexp_t *pkey, gcry_sexp_t *skey, int fixed_x)
 {
@@ -495,8 +504,10 @@ get_elg_key_new (gcry_sexp_t *pkey, gcry_sexp_t *skey, int fixed_x)
   *pkey = pub_key;
   *skey = sec_key;
 }
+#endif /* USE_ELGAMAL */
 
 
+#if USE_DSA
 static void
 get_dsa_key_new (gcry_sexp_t *pkey, gcry_sexp_t *skey, int transient_key)
 {
@@ -729,6 +740,7 @@ get_dsa_key_fips186_with_seed_new (gcry_sexp_t *pkey, gcry_sexp_t *skey)
   *pkey = pub_key;
   *skey = sec_key;
 }
+#endif /* USE_ELGAMAL */
 
 
 static void
@@ -738,6 +750,12 @@ check_run (void)
   gcry_sexp_t pkey, skey;
   int variant;
 
+  (void) err;
+  (void) pkey;
+  (void) skey;
+  (void) variant;
+
+#if USE_RSA
   pkey = skey = NULL;
   for (variant=0; variant < 3; variant++)
     {
@@ -755,7 +773,9 @@ check_run (void)
       gcry_sexp_release (skey);
       pkey = skey = NULL;
     }
+#endif /* USE_RSA */
 
+#if USE_RSA
   if (verbose)
     fprintf (stderr, "Checking generated RSA key.\n");
   get_keys_new (&pkey, &skey);
@@ -772,7 +792,9 @@ check_run (void)
   gcry_sexp_release (pkey);
   gcry_sexp_release (skey);
   pkey = skey = NULL;
+#endif /* USE_RSA */
 
+#if USE_ELGAMAL
   if (verbose)
     fprintf (stderr, "Checking generated Elgamal key.\n");
   get_elg_key_new (&pkey, &skey, 0);
@@ -790,7 +812,9 @@ check_run (void)
   gcry_sexp_release (pkey);
   gcry_sexp_release (skey);
   pkey = skey = NULL;
+#endif /* USE_ELGAMAL */
 
+#if USE_DSA
   if (verbose)
     fprintf (stderr, "Generating DSA key.\n");
   get_dsa_key_new (&pkey, &skey, 0);
@@ -860,10 +884,12 @@ check_run (void)
   gcry_sexp_release (pkey);
   gcry_sexp_release (skey);
   pkey = skey = NULL;
+#endif /* USE_DSA */
 }
 
 
 
+#ifdef USE_RSA
 static gcry_mpi_t
 key_param_from_sexp (gcry_sexp_t sexp, const char *topname, const char *name)
 {
@@ -1091,9 +1117,11 @@ leave:
   gcry_sexp_release (pub_key);
   gcry_sexp_release (sec_key);
 }
+#endif /* USE_RSA */
 
 
 
+#if USE_ECC
 static void
 check_ecc_sample_key (void)
 {
@@ -1307,6 +1335,7 @@ check_ed25519ecdsa_sample_key (void)
   gcry_sexp_release (key);
   gcry_sexp_release (hash);
 }
+#endif /* USE_ECC */
 
 
 int
@@ -1337,12 +1366,15 @@ main (int argc, char **argv)
   for (i=0; i < 2; i++)
     check_run ();
 
+#ifdef USE_RSA
   for (i=0; i < 4; i++)
     check_x931_derived_key (i);
+#endif /* USE_RSA */
 
+#ifdef USE_ECC
   check_ecc_sample_key ();
-  if (!in_fips_mode)
-    check_ed25519ecdsa_sample_key ();
+  check_ed25519ecdsa_sample_key ();
+#endif /* USE_ECC */
 
   return !!error_count;
 }
index 2ffd528b2de539b88cc1bc82983a53b510e5ac08..e56223bad2d1fb3dc36132df3a85c85f20a4e7fb 100644 (file)
@@ -12,9 +12,9 @@
    General Public License for more details.
 
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
-   USA.  */
+   along with this program; if not, see <https://www.gnu.org/licenses/>.
+   SPDX-License-Identifier: GPL-2-or-later
+*/
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
index e356c39ab2792b2189421e37671c7c51ab909cc8..21ef4f62e04f309c1e6a66f562f9dbb9651f857a 100644 (file)
@@ -196,3 +196,156 @@ split_fields_colon (char *string, char **array, int arraysize)
   return n;
 }
 #endif /*NEED_EXTRA_TEST_SUPPORT*/
+
+#ifdef NEED_SHOW_NOTE
+static void
+show_note (const char *format, ...)
+{
+  va_list arg_ptr;
+
+  if (!verbose && getenv ("srcdir"))
+    fputs ("      ", stderr);  /* To align above "PASS: ".  */
+  else
+    fprintf (stderr, "%s: ", PGM);
+  va_start (arg_ptr, format);
+  vfprintf (stderr, format, arg_ptr);
+  if (*format && format[strlen(format)-1] != '\n')
+    putc ('\n', stderr);
+  va_end (arg_ptr);
+}
+#endif
+
+#ifdef NEED_SHOW_SEXP
+static void
+show_sexp (const char *prefix, gcry_sexp_t a)
+{
+  char *buf;
+  size_t size;
+
+  fprintf (stderr, "%s: ", PGM);
+  if (prefix)
+    fputs (prefix, stderr);
+  size = gcry_sexp_sprint (a, GCRYSEXP_FMT_ADVANCED, NULL, 0);
+  buf = xmalloc (size);
+
+  gcry_sexp_sprint (a, GCRYSEXP_FMT_ADVANCED, buf, size);
+  fprintf (stderr, "%.*s", (int)size, buf);
+  gcry_free (buf);
+}
+#endif
+
+#ifdef NEED_PREPEND_SRCDIR
+/* Prepend FNAME with the srcdir environment variable's value and
+ * return an allocated filename.  */
+static char *
+prepend_srcdir (const char *fname)
+{
+  static const char *srcdir;
+  char *result;
+
+  if (!srcdir && !(srcdir = getenv ("srcdir")))
+    srcdir = ".";
+
+  result = xmalloc (strlen (srcdir) + 1 + strlen (fname) + 1);
+  strcpy (result, srcdir);
+  strcat (result, "/");
+  strcat (result, fname);
+  return result;
+}
+#endif
+
+#ifdef NEED_READ_TEXTLINE
+/* Read next line but skip over empty and comment lines.  Caller must
+   xfree the result.  */
+static char *
+read_textline (FILE *fp, int *lineno)
+{
+  char line[8192];
+  char *p;
+
+  do
+    {
+      if (!fgets (line, sizeof line, fp))
+        {
+          if (feof (fp))
+            return NULL;
+          die ("error reading input line: %s\n", strerror (errno));
+        }
+      ++*lineno;
+      p = strchr (line, '\n');
+      if (!p)
+        die ("input line %d not terminated or too long\n", *lineno);
+      *p = 0;
+      for (p--;p > line && my_isascii (*p) && isspace (*p); p--)
+        *p = 0;
+    }
+  while (!*line || *line == '#');
+  /* if (debug) */
+  /*   info ("read line: '%s'\n", line); */
+  return xstrdup (line);
+}
+#endif
+
+#ifdef NEED_COPY_DATA
+/* Copy the data after the tag to BUFFER.  BUFFER will be allocated as
+   needed.  */
+static void
+copy_data (char **buffer, const char *line, int lineno)
+{
+  const char *s;
+
+  xfree (*buffer);
+  *buffer = NULL;
+
+  s = strchr (line, ':');
+  if (!s)
+    {
+      fail ("syntax error at input line %d", lineno);
+      return;
+    }
+  for (s++; my_isascii (*s) && isspace (*s); s++)
+    ;
+  *buffer = xstrdup (s);
+}
+#endif
+
+#ifdef NEED_HEX2BUFFER
+/* Convert STRING consisting of hex characters into its binary
+   representation and return it as an allocated buffer. The valid
+   length of the buffer is returned at R_LENGTH.  The string is
+   delimited by end of string.  The function returns NULL on
+   error.  */
+static unsigned char*
+hex2buffer (const char *string, size_t *r_length)
+{
+  const char *s;
+  unsigned char *buffer;
+  size_t length;
+
+  buffer = xmalloc (strlen(string)/2+1);
+  length = 0;
+  for (s=string; *s; s +=2 )
+    {
+      if (!hexdigitp (s) || !hexdigitp (s+1))
+        return NULL;           /* Invalid hex digits. */
+      buffer[length++] = xtoi_2 (s);
+    }
+  *r_length = length;
+  return buffer;
+}
+#endif
+
+#ifdef NEED_REVERSE_BUFFER
+static void
+reverse_buffer (unsigned char *buffer, unsigned int length)
+{
+  unsigned int tmp, i;
+
+  for (i=0; i < length/2; i++)
+    {
+      tmp = buffer[i];
+      buffer[i] = buffer[length-1-i];
+      buffer[length-1-i] = tmp;
+    }
+}
+#endif
index cc1297615d3c1f8dc1241471a3b601244edecf72..ce828b2730d2c1ad2935af3844f6471c87860f56 100644 (file)
@@ -37,7 +37,6 @@ static int sign_with_pk;
 static int no_verify;
 static int no_fips;
 static int custom_data_file;
-static int in_fips_mode;
 
 
 static void
@@ -193,10 +192,11 @@ one_test (int testno, const char *sk, const char *pk,
   void *buffer = NULL;
   void *buffer2 = NULL;
   size_t buflen, buflen2;
+  gcry_ctx_t ctx = NULL;
+  const char *data_tmpl;
   gcry_sexp_t s_tmp, s_tmp2;
   gcry_sexp_t s_sk = NULL;
   gcry_sexp_t s_pk = NULL;
-  gcry_sexp_t s_msg= NULL;
   gcry_sexp_t s_sig= NULL;
   unsigned char *sig_r = NULL;
   unsigned char *sig_s = NULL;
@@ -262,25 +262,19 @@ one_test (int testno, const char *sk, const char *pk,
             testno, "msg", "invalid hex string");
       goto leave;
     }
-  if ((err = gcry_sexp_build (&s_msg, NULL,
-                              "(data"
-                              " (flags eddsa)"
-                              " (hash-algo sha512)"
-                              " (value %b))",  (int)buflen, buffer)))
+  err = gcry_pk_input_data_push (&ctx, buffer, buflen);
+  if (err)
     {
-      fail ("error building s-exp for test %d, %s: %s",
-            testno, "msg", gpg_strerror (err));
+      fail ("error setting input data for test: %s",
+           gpg_strerror (err));
       goto leave;
     }
 
-  err = gcry_pk_sign (&s_sig, s_msg, s_sk);
-  if (in_fips_mode)
+  data_tmpl = "(data(value %b))";
+  err = gcry_pk_hash_sign (&s_sig, data_tmpl, s_sk, NULL, ctx);
+  if (err)
     {
-      if (!err)
-        fail ("gcry_pk_sign is not expected to work in FIPS mode for test %d",
-              testno);
-      if (verbose > 1)
-        info ("not executed in FIPS mode\n");
+      fail ("gcry_pk_hash_sign failed: %s", gpg_strerror (err));
       goto leave;
     }
   if (err)
@@ -335,16 +329,15 @@ one_test (int testno, const char *sk, const char *pk,
     }
 
   if (!no_verify)
-    if ((err = gcry_pk_verify (s_sig, s_msg, s_pk)))
+    if ((err = gcry_pk_hash_verify (s_sig, data_tmpl, s_pk, NULL, ctx)))
       fail ("gcry_pk_verify failed for test %d: %s",
             testno, gpg_strerror (err));
 
-
  leave:
+  gcry_ctx_release (ctx);
   gcry_sexp_release (s_sig);
   gcry_sexp_release (s_sk);
   gcry_sexp_release (s_pk);
-  gcry_sexp_release (s_msg);
   xfree (buffer);
   xfree (buffer2);
   xfree (sig_r);
@@ -503,9 +496,6 @@ main (int argc, char **argv)
   xgcry_control ((GCRYCTL_ENABLE_QUICK_RANDOM, 0));
   xgcry_control ((GCRYCTL_INITIALIZATION_FINISHED, 0));
 
-  if (gcry_fips_mode_active ())
-    in_fips_mode = 1;
-
   start_timer ();
   check_ed25519 (fname);
   stop_timer ();
index 3e3dd34a5976ca1cca20b6c7cd8d3b81aa86b8db..b68fd855b06bb70772909f1e3833355f2374adf6 100644 (file)
@@ -36,7 +36,6 @@
 static int sign_with_pk;
 static int no_verify;
 static int custom_data_file;
-static int in_fips_mode;
 
 
 static void
@@ -184,7 +183,7 @@ hexdowncase (char *string)
 
 static void
 one_test (int testno, int ph, const char *sk, const char *pk,
-          const char *msg, const char *ctx, const char *sig)
+          const char *msg, const char *ctx_str, const char *sig)
 {
   gpg_error_t err;
   int i;
@@ -192,10 +191,11 @@ one_test (int testno, int ph, const char *sk, const char *pk,
   void *buffer = NULL;
   void *buffer2 = NULL;
   size_t buflen, buflen2;
+  gcry_ctx_t ctx = NULL;
+  const char *data_tmpl;
   gcry_sexp_t s_tmp, s_tmp2;
   gcry_sexp_t s_sk = NULL;
   gcry_sexp_t s_pk = NULL;
-  gcry_sexp_t s_msg= NULL;
   gcry_sexp_t s_sig= NULL;
   unsigned char *sig_r = NULL;
   unsigned char *sig_s = NULL;
@@ -258,61 +258,46 @@ one_test (int testno, int ph, const char *sk, const char *pk,
             testno, "msg", "invalid hex string");
       goto leave;
     }
-  if (ctx)
+  err = gcry_pk_input_data_push (&ctx, buffer, buflen);
+  if (err)
+    {
+      fail ("error setting input data for test: %s",
+            gpg_strerror (err));
+      goto leave;
+    }
+
+  if (ctx_str)
     {
       xfree (buffer2);
-      if (!(buffer2 = hex2buffer (ctx, &buflen2)))
+      if (!(buffer2 = hex2buffer (ctx_str, &buflen2)))
         {
           fail ("error building s-exp for test %d, %s: %s",
                 testno, "ctx", "invalid hex string");
           goto leave;
         }
 
-      if ((err = gcry_sexp_build (&s_msg, NULL,
-                                  ph ?
-                                  "(data"
-                                  " (flags prehash)"
-                                  " (label %b)"
-                                  " (value %b))"
-                                  :
-                                  "(data"
-                                  " (label %b)"
-                                  " (value %b))",
-                                  (int)buflen2, buffer2,
-                                  (int)buflen, buffer)))
+      err = gcry_pk_input_data_push (&ctx, buffer2, buflen2);
+      if (err)
         {
-          fail ("error building s-exp for test %d, %s: %s",
-                testno, "msg", gpg_strerror (err));
+          fail ("error setting ctx for test: %s",
+                gpg_strerror (err));
           goto leave;
         }
+
+      if (ph)
+        data_tmpl = "(data(flags prehash)(label %b)(value %b))";
+      else
+        data_tmpl = "(data(label %b)(value %b))";
     }
   else
     {
-      if ((err = gcry_sexp_build (&s_msg, NULL,
-                                  ph ?
-                                  "(data"
-                                  " (flags prehash)"
-                                  " (value %b))"
-                                  :
-                                  "(data"
-                                  " (value %b))",  (int)buflen, buffer)))
-        {
-          fail ("error building s-exp for test %d, %s: %s",
-                testno, "msg", gpg_strerror (err));
-          goto leave;
-        }
+      if (ph)
+        data_tmpl = "(data(flags prehash)(value %b))";
+      else
+        data_tmpl = "(data(value %b))";
     }
 
-  err = gcry_pk_sign (&s_sig, s_msg, s_sk);
-  if (in_fips_mode)
-    {
-      if (!err)
-        fail ("gcry_pk_sign is not expected to work in FIPS mode for test %d",
-              testno);
-      if (verbose > 1)
-        info ("not executed in FIPS mode\n");
-      goto leave;
-    }
+  err = gcry_pk_hash_sign (&s_sig, data_tmpl, s_sk, NULL, ctx);
   if (err)
     fail ("gcry_pk_sign failed for test %d: %s", testno, gpg_strerror (err));
   if (debug)
@@ -365,16 +350,16 @@ one_test (int testno, int ph, const char *sk, const char *pk,
     }
 
   if (!no_verify)
-    if ((err = gcry_pk_verify (s_sig, s_msg, s_pk)))
+    if ((err = gcry_pk_hash_verify (s_sig, data_tmpl, s_pk, NULL, ctx)))
       fail ("gcry_pk_verify failed for test %d: %s",
             testno, gpg_strerror (err));
 
 
  leave:
+  gcry_ctx_release (ctx);
   gcry_sexp_release (s_sig);
   gcry_sexp_release (s_sk);
   gcry_sexp_release (s_pk);
-  gcry_sexp_release (s_msg);
   xfree (buffer);
   xfree (buffer2);
   xfree (sig_r);
@@ -532,9 +517,6 @@ main (int argc, char **argv)
   xgcry_control ((GCRYCTL_ENABLE_QUICK_RANDOM, 0));
   xgcry_control ((GCRYCTL_INITIALIZATION_FINISHED, 0));
 
-  if (gcry_fips_mode_active ())
-    in_fips_mode = 1;
-
   start_timer ();
   check_ed448 (fname);
   stop_timer ();
index 19c96451dc0f25b21328a3b1dfee676c0c769b0a..10f64a7c156dc16ff7cb9ac8366d34fa74361fac 100644 (file)
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
@@ -862,6 +862,8 @@ check_openpgp (void)
       /* MD5 isn't supported in fips mode */
       if (in_fips_mode && tv[tvidx].hashalgo == GCRY_MD_MD5)
         continue;
+      if (gcry_md_test_algo (tv[tvidx].hashalgo) != 0)
+        continue;
       if (verbose)
         fprintf (stderr, "checking S2K test vector %d\n", tvidx);
       assert (tv[tvidx].dklen <= sizeof outbuf);
@@ -1105,6 +1107,8 @@ check_pbkdf2 (void)
     {
       if (tv[tvidx].disabled)
         continue;
+      if (gcry_md_test_algo (tv[tvidx].hashalgo) != 0)
+        continue;
       if (verbose)
         fprintf (stderr, "checking PBKDF2 test vector %d algo %d\n", tvidx,
                  tv[tvidx].hashalgo);
@@ -1345,10 +1349,10 @@ static gcry_error_t
 my_kdf_derive (int parallel,
                int algo, int subalgo,
                const unsigned long *params, unsigned int paramslen,
-               const unsigned char *pass, size_t passlen,
-               const unsigned char *salt, size_t saltlen,
-               const unsigned char *key, size_t keylen,
-               const unsigned char *ad, size_t adlen,
+               const char *pass, size_t passlen,
+               const char *salt, size_t saltlen,
+               const char *key, size_t keylen,
+               const char *ad, size_t adlen,
                size_t outlen, unsigned char *out)
 {
   gcry_error_t err;
@@ -1409,95 +1413,458 @@ my_kdf_derive (int parallel,
   return err;
 }
 
-
 static void
 check_argon2 (void)
 {
   gcry_error_t err;
-  const unsigned long param[4] = { 32, 3, 32, 4 };
-  const unsigned char pass[32] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-  };
-  const unsigned char salt[16] = {
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  static struct {
+    int subalgo;
+    unsigned long param[4];
+    size_t passlen;
+    const char *pass;
+    size_t saltlen;
+    const char *salt;
+    size_t keylen;
+    const char *key;
+    size_t adlen;
+    const char *ad;
+    size_t dklen;
+    const char *dk;
+  } tv[] = {
+    {
+      GCRY_KDF_ARGON2D,
+      { 32, 3, 32, 4 },
+      32,
+      "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+      "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
+      16,
+      "\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02",
+      8,
+      "\x03\x03\x03\x03\x03\x03\x03\x03",
+      12,
+      "\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04",
+      32,
+      "\x51\x2b\x39\x1b\x6f\x11\x62\x97\x53\x71\xd3\x09\x19\x73\x42\x94"
+      "\xf8\x68\xe3\xbe\x39\x84\xf3\xc1\xa1\x3a\x4d\xb9\xfa\xbe\x4a\xcb"
+    },
+    {
+      GCRY_KDF_ARGON2I,
+      { 32, 3, 32, 4 },
+      32,
+      "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+      "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
+      16,
+      "\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02",
+      8,
+      "\x03\x03\x03\x03\x03\x03\x03\x03",
+      12,
+      "\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04",
+      32,
+      "\xc8\x14\xd9\xd1\xdc\x7f\x37\xaa\x13\xf0\xd7\x7f\x24\x94\xbd\xa1"
+      "\xc8\xde\x6b\x01\x6d\xd3\x88\xd2\x99\x52\xa4\xc4\x67\x2b\x6c\xe8"
+    },
+    {
+      GCRY_KDF_ARGON2ID,
+      { 32, 3, 32, 4 },
+      32,
+      "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
+      "\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
+      16,
+      "\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02",
+      8,
+      "\x03\x03\x03\x03\x03\x03\x03\x03",
+      12,
+      "\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04\x04",
+      32,
+      "\x0d\x64\x0d\xf5\x8d\x78\x76\x6c\x08\xc0\x37\xa3\x4a\x8b\x53\xc9"
+      "\xd0\x1e\xf0\x45\x2d\x75\xb6\x5e\xb5\x25\x20\xe9\x6b\x01\xe6\x59"
+    },
+    {
+      /* empty password */
+      GCRY_KDF_ARGON2I,
+      { 32, 3, 128, 1 },
+      0, NULL,
+      16,
+      "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+      0, NULL,
+      0, NULL,
+      32,
+      "\xbb\x1f\xf2\xb9\x9f\xd4\x4a\xd9\xdf\x7f\xb9\x54\x55\x9e\xb8\xeb"
+      "\xb5\x9d\xab\xce\x2e\x62\x9f\x9b\x89\x09\xfe\xde\x57\xcc\x63\x86"
+    },
+    {
+      /* empty password */
+      GCRY_KDF_ARGON2ID,
+      { 32, 3, 128, 1 },
+      0, NULL,
+      16,
+      "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+      0, NULL,
+      0, NULL,
+      32,
+      "\x09\x2f\x38\x35\xac\xb2\x43\x92\x93\xeb\xcd\xe8\x04\x16\x6a\x31"
+      "\xce\x14\xd4\x55\xdb\xd8\xf7\xe6\xb4\xf5\x9d\x64\x8e\xd0\x3a\xdb"
+    },
   };
-  const unsigned char key[8] = { 3, 3, 3, 3, 3, 3, 3, 3 };
-  const unsigned char ad[12] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
   unsigned char out[32];
-  unsigned char expected[3][32] = {
-    {  /* GCRY_KDF_ARGON2D */
-      0x51, 0x2b, 0x39, 0x1b, 0x6f, 0x11, 0x62, 0x97,
-      0x53, 0x71, 0xd3, 0x09, 0x19, 0x73, 0x42, 0x94,
-      0xf8, 0x68, 0xe3, 0xbe, 0x39, 0x84, 0xf3, 0xc1,
-      0xa1, 0x3a, 0x4d, 0xb9, 0xfa, 0xbe, 0x4a, 0xcb
-    },
-    { /* GCRY_KDF_ARGON2I */
-      0xc8, 0x14, 0xd9, 0xd1, 0xdc, 0x7f, 0x37, 0xaa,
-      0x13, 0xf0, 0xd7, 0x7f, 0x24, 0x94, 0xbd, 0xa1,
-      0xc8, 0xde, 0x6b, 0x01, 0x6d, 0xd3, 0x88, 0xd2,
-      0x99, 0x52, 0xa4, 0xc4, 0x67, 0x2b, 0x6c, 0xe8
-    },
-    { /* GCRY_KDF_ARGON2ID */
-      0x0d, 0x64, 0x0d, 0xf5, 0x8d, 0x78, 0x76, 0x6c,
-      0x08, 0xc0, 0x37, 0xa3, 0x4a, 0x8b, 0x53, 0xc9,
-      0xd0, 0x1e, 0xf0, 0x45, 0x2d, 0x75, 0xb6, 0x5e,
-      0xb5, 0x25, 0x20, 0xe9, 0x6b, 0x01, 0xe6, 0x59
-    }
-  };
   int i;
-  int subalgo = GCRY_KDF_ARGON2D;
-  int count = 0;
+  int count;
 
- again:
+  for (count = 0; count < DIM(tv); count++)
+    {
+      if (verbose)
+        fprintf (stderr, "checking ARGON2 test vector %d\n", count);
+
+      err = my_kdf_derive (0, GCRY_KDF_ARGON2,
+                           tv[count].subalgo, tv[count].param, 4,
+                           tv[count].pass, tv[count].passlen,
+                           tv[count].salt, tv[count].saltlen,
+                           tv[count].key, tv[count].keylen,
+                           tv[count].ad, tv[count].adlen,
+                           tv[count].dklen, out);
+      if (err)
+        fail ("argon2 test %d failed: %s\n", count*2+0, gpg_strerror (err));
+      else if (memcmp (out, tv[count].dk, tv[count].dklen))
+        {
+          fail ("argon2 test %d failed: mismatch\n", count*2+0);
+          fputs ("got:", stderr);
+          for (i=0; i < tv[count].dklen; i++)
+            fprintf (stderr, " %02x", out[i]);
+          putc ('\n', stderr);
+        }
 
-  if (verbose)
-    fprintf (stderr, "checking ARGON2 test vector %d\n", count);
+#ifdef HAVE_PTHREAD
+      err = my_kdf_derive (1, GCRY_KDF_ARGON2,
+                           tv[count].subalgo, tv[count].param, 4,
+                           tv[count].pass, tv[count].passlen,
+                           tv[count].salt, tv[count].saltlen,
+                           tv[count].key, tv[count].keylen,
+                           tv[count].ad, tv[count].adlen,
+                           tv[count].dklen, out);
+      if (err)
+        fail ("argon2 test %d failed: %s\n", count*2+1, gpg_strerror (err));
+      else if (memcmp (out, tv[count].dk, tv[count].dklen))
+        {
+          fail ("argon2 test %d failed: mismatch\n", count*2+1);
+          fputs ("got:", stderr);
+          for (i=0; i < tv[count].dklen; i++)
+            fprintf (stderr, " %02x", out[i]);
+          putc ('\n', stderr);
+        }
+#endif
+    }
+}
 
-  err = my_kdf_derive (0,
-                       GCRY_KDF_ARGON2, subalgo, param, 4,
-                       pass, 32, salt, 16, key, 8, ad, 12,
-                       32, out);
-  if (err)
-    fail ("argon2 test %d failed: %s\n", 0, gpg_strerror (err));
-  else if (memcmp (out, expected[count], 32))
-    {
-      fail ("argon2 test %d failed: mismatch\n", 0);
-      fputs ("got:", stderr);
-      for (i=0; i < 32; i++)
-        fprintf (stderr, " %02x", out[i]);
-      putc ('\n', stderr);
+
+static void
+check_balloon (void)
+{
+  gcry_error_t err;
+  /* Two test vectors generated by the research prototype implementation.
+     $ balloon abcdefghijklmno
+     t_cost         = 1
+     s_cost         = 1024
+     p_cost         = 1
+     passwd         = abcdefghijklmno
+     Time total      : 0.0527251
+     Hashes per sec  : 18.9663
+     Output          : $balloon$v=1$s=1024,t=1,p=1
+                       $FRzqOiIuPvuoy55vGfKzyse+2f28F7m9iFHCctnEBwg=
+                       $NxOGNPyTPZzKiJjgj7H6pJDLIgR05HI7VaxJpxEao5Q=
+     $ balloon -t 12 -s 4096 -p 4 Long_sentence_used_as_passphrase
+     t_cost         = 12
+     s_cost         = 4096
+     p_cost         = 4
+     passwd         = Long_sentence_used_as_passphrase
+     Time total      : 3.70399
+     Hashes per sec  : 0.269979
+     Output          : $balloon$v=1$s=4096,t=12,p=4
+                       $8Yor74EqTwBrrdaeYeSVx0VXVAgDrsILAnJWdVUy93s=
+                       $FaNb9ofeWEggzhW9BUSODgZH5/awzNz5Adoub48+BgQ=
+   */
+  static struct {
+    int subalgo;
+    unsigned long param[3];
+    size_t passlen;
+    const char *pass;
+    size_t saltlen;
+    const char *salt;
+    size_t dklen;
+    const char *dk;
+  } tv[] = {
+    {
+      GCRY_MD_SHA256,
+      { 1024, 1, 1 },
+      15,
+      "abcdefghijklmno",
+      32,
+      "\x15\x1c\xea\x3a\x22\x2e\x3e\xfb\xa8\xcb\x9e\x6f\x19\xf2\xb3\xca"
+      "\xc7\xbe\xd9\xfd\xbc\x17\xb9\xbd\x88\x51\xc2\x72\xd9\xc4\x07\x08",
+      32,
+      "\x37\x13\x86\x34\xfc\x93\x3d\x9c\xca\x88\x98\xe0\x8f\xb1\xfa\xa4"
+      "\x90\xcb\x22\x04\x74\xe4\x72\x3b\x55\xac\x49\xa7\x11\x1a\xa3\x94"
+    },
+    {
+      GCRY_MD_SHA256,
+      { 4096, 12, 4 },
+      32,
+      "Long_sentence_used_as_passphrase",
+      32,
+      "\xf1\x8a\x2b\xef\x81\x2a\x4f\x00\x6b\xad\xd6\x9e\x61\xe4\x95\xc7"
+      "\x45\x57\x54\x08\x03\xae\xc2\x0b\x02\x72\x56\x75\x55\x32\xf7\x7b",
+      32,
+      "\x15\xa3\x5b\xf6\x87\xde\x58\x48\x20\xce\x15\xbd\x05\x44\x8e\x0e"
+      "\x06\x47\xe7\xf6\xb0\xcc\xdc\xf9\x01\xda\x2e\x6f\x8f\x3e\x06\x04"
     }
+  };
+  unsigned char out[32];
+  int i;
+  int count;
+
+  for (count = 0; count < DIM(tv); count++)
+    {
+      if (verbose)
+        fprintf (stderr, "checking Balloon test vector %d\n", count);
+
+      err = my_kdf_derive (0, GCRY_KDF_BALLOON,
+                           tv[count].subalgo, tv[count].param, 3,
+                           tv[count].pass, tv[count].passlen,
+                           tv[count].salt, tv[count].saltlen,
+                           NULL, 0, NULL, 0, tv[count].dklen, out);
+      if (err)
+        fail ("balloon test %d failed: %s\n", count*2+0, gpg_strerror (err));
+      else if (memcmp (out, tv[count].dk, tv[count].dklen))
+        {
+          fail ("balloon test %d failed: mismatch\n", count*2+0);
+          fputs ("got:", stderr);
+          for (i=0; i < tv[count].dklen; i++)
+            fprintf (stderr, " %02x", out[i]);
+          putc ('\n', stderr);
+        }
 
 #ifdef HAVE_PTHREAD
-  err = my_kdf_derive (1,
-                       GCRY_KDF_ARGON2, subalgo, param, 4,
-                       pass, 32, salt, 16, key, 8, ad, 12,
-                       32, out);
-  if (err)
-    fail ("argon2 test %d failed: %s\n", 1, gpg_strerror (err));
-  else if (memcmp (out, expected[count], 32))
-    {
-      fail ("argon2 test %d failed: mismatch\n", 1);
-      fputs ("got:", stderr);
-      for (i=0; i < 32; i++)
-        fprintf (stderr, " %02x", out[i]);
-      putc ('\n', stderr);
-    }
+      err = my_kdf_derive (1, GCRY_KDF_BALLOON,
+                           tv[count].subalgo, tv[count].param, 3,
+                           tv[count].pass, tv[count].passlen,
+                           tv[count].salt, tv[count].saltlen,
+                           NULL, 0, NULL, 0, tv[count].dklen, out);
+      if (err)
+        fail ("balloon test %d failed: %s\n", count*2+1, gpg_strerror (err));
+      else if (memcmp (out, tv[count].dk, tv[count].dklen))
+        {
+          fail ("balloon test %d failed: mismatch\n", count*2+1);
+          fputs ("got:", stderr);
+          for (i=0; i < tv[count].dklen; i++)
+            fprintf (stderr, " %02x", out[i]);
+          putc ('\n', stderr);
+        }
 #endif
+    }
+}
 
-  /* Next algo */
-  if (subalgo == GCRY_KDF_ARGON2D)
-    subalgo = GCRY_KDF_ARGON2I;
-  else if (subalgo == GCRY_KDF_ARGON2I)
-    subalgo = GCRY_KDF_ARGON2ID;
 
-  count++;
-  if (count < 3)
-    goto again;
+static void
+check_onestep_kdf (void)
+{
+  gcry_error_t err;
+  static struct {
+    int algo;
+    int subalgo;
+    unsigned long param[1];
+    size_t inputlen;
+    const char *input;
+    size_t otherlen;
+    const char *other;
+    size_t keylen;
+    const char *key;
+    size_t dklen;
+    const char *dk;
+  } tv[] = {
+    {
+      GCRY_KDF_ONESTEP_KDF, GCRY_MD_SHA256,
+      { 38 },
+      16,
+      "\x3f\x89\x2b\xd8\xb8\x4d\xae\x64\xa7\x82\xa3\x5f\x6e\xaa\x8f\x00",
+      12,
+      "\xec\x3f\x1c\xd8\x73\xd2\x88\x58\xa5\x8c\xc3\x9e",
+      0, NULL,
+      38,
+      "\xa7\xc0\x66\x52\x98\x25\x25\x31\xe0\xdb\x37\x73\x7a\x37\x46\x51"
+      "\xb3\x68\x27\x5f\x20\x48\x28\x4d\x16\xa1\x66\xc6\xd8\xa9\x0a\x91"
+      "\xa4\x91\xc1\x6f\x49\x64"
+   },
+    {
+      GCRY_KDF_ONESTEP_KDF, GCRY_MD_SHA512,
+      { 68 },
+      16,
+      "\xe6\x5b\x19\x05\x87\x8b\x95\xf6\x8b\x55\x35\xbd\x3b\x2b\x10\x13",
+      12,
+      "\x83\x02\x21\xb1\x73\x0d\x91\x76\xf8\x07\xd4\x07",
+      0, NULL,
+      68,
+      "\xb8\xc4\x4b\xdf\x0b\x85\xa6\x4b\x6a\x51\xc1\x2a\x06\x71\x0e\x37"
+      "\x3d\x82\x9b\xb1\xfd\xa5\xb4\xe1\xa2\x07\x95\xc6\x19\x95\x94\xf6"
+      "\xfa\x65\x19\x8a\x72\x12\x57\xf7\xd5\x8c\xb2\xf6\xf6\xdb\x9b\xb5"
+      "\x69\x9f\x73\x86\x30\x45\x90\x90\x54\xb2\x38\x9e\x06\xec\x00\xfe"
+      "\x31\x8c\xab\xd9"
+    },
+    {
+      GCRY_KDF_ONESTEP_KDF_MAC, GCRY_MAC_HMAC_SHA256,
+      { 44 },
+      16,
+      "\x02\xb4\x0d\x33\xe3\xf6\x85\xae\xae\x67\x7a\xc3\x44\xee\xaf\x77",
+      12,
+      "\xc6\x7c\x38\x95\x80\x12\x8f\x18\xf6\xcf\x85\x92",
+      16,
+      "\x0a\xd5\x2c\x93\x57\xc8\x5e\x47\x81\x29\x6a\x36\xca\x72\x03\x9c",
+      44,
+      "\xbe\x32\xe7\xd3\x06\xd8\x91\x02\x8b\xe0\x88\xf2\x13\xf9\xf9\x47"
+      "\xc5\x04\x20\xd9\xb5\xa1\x2c\xa6\x98\x18\xdd\x99\x95\xde\xdd\x8e"
+      "\x61\x37\xc7\x10\x4d\x67\xf2\xca\x90\x91\x5d\xda"
+    },
+    {
+      GCRY_KDF_ONESTEP_KDF_MAC, GCRY_MAC_HMAC_SHA512,
+      { 56 },
+      16,
+      "\x8e\x5c\xd5\xf6\xae\x55\x8f\xfa\x04\xcd\xa2\xfa\xd9\x4d\xd6\x16",
+      12,
+      "\x4a\x43\x30\x18\xe5\x1c\x09\xbb\xd6\x13\x26\xbb",
+      16,
+      "\x6e\xd9\x3b\x6f\xe5\xb3\x50\x2b\xb4\x2b\x4c\x0f\xcb\x13\x36\x62",
+      56,
+      "\x29\x5d\xfb\xeb\x54\xec\x0f\xe2\x4e\xce\x32\xf5\xb8\x7c\x85\x3e"
+      "\x69\x9a\x62\xe3\x9d\x9c\x9e\xe6\xee\x78\xf8\xb9\xa0\xee\x50\xa3"
+      "\x6a\x82\xe6\x06\x2c\x95\xed\x53\xbc\x36\x67\x00\xe2\xd0\xe0\x93"
+      "\xbf\x75\x2e\xea\x42\x99\x47\x2e"
+    },
+  };
+  unsigned char out[68];
+  int i;
+  int count;
+
+  for (count = 0; count < DIM(tv); count++)
+    {
+      if (verbose)
+        fprintf (stderr, "checking OneStepKDF test vector %d\n", count);
+
+      err = my_kdf_derive (0, tv[count].algo, tv[count].subalgo,
+                           tv[count].param, 1,
+                           tv[count].input, tv[count].inputlen, NULL, 0,
+                           tv[count].key, tv[count].keylen,
+                           tv[count].other, tv[count].otherlen,
+                           tv[count].dklen, out);
+      if (err)
+        fail ("OneStepKDF test %d failed: %s\n", count, gpg_strerror (err));
+      else if (memcmp (out, tv[count].dk, tv[count].dklen))
+        {
+          fail ("OneStepKDF test %d failed: mismatch\n", count);
+          fputs ("got:", stderr);
+          for (i=0; i < tv[count].dklen; i++)
+            fprintf (stderr, " %02x", out[i]);
+          putc ('\n', stderr);
+        }
+    }
 }
 
 
+static void
+check_hkdf (void)
+{
+  gcry_error_t err;
+  static struct {
+    unsigned long param[1];
+    size_t inputlen;
+    const char *input;
+    size_t saltlen;
+    const char *salt;
+    size_t infolen;
+    const char *info;
+    size_t dklen;
+    const char *dk;
+  } tv[] = {
+    {
+      { 42 },
+      22,
+      "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+      "\x0b\x0b\x0b\x0b\x0b\x0b",
+      13,
+      "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
+      10,
+      "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
+      42,
+      "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a"
+      "\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf"
+      "\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65"
+    },
+    {
+      { 82 },
+      80,
+      "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+      "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+      "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+      "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+      "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
+      80,
+      "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+      "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+      "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+      "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+      "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
+      80,
+      "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+      "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+      "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+      "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+      "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
+      82,
+      "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34"
+      "\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c"
+      "\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09"
+      "\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71"
+      "\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f"
+      "\x1d\x87"
+    },
+    {
+      { 42 },
+      22,
+      "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+      "\x0b\x0b\x0b\x0b\x0b\x0b",
+      0, NULL,
+      0, NULL,
+      42,
+      "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31"
+      "\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d"
+      "\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8"
+    },
+  };
+  unsigned char out[82];
+  int i;
+  int count;
+
+  for (count = 0; count < DIM(tv); count++)
+    {
+      if (verbose)
+        fprintf (stderr, "checking HKDF test vector %d\n", count);
+
+      err = my_kdf_derive (0, GCRY_KDF_HKDF, GCRY_MAC_HMAC_SHA256,
+                           tv[count].param, 1,
+                           tv[count].input, tv[count].inputlen, NULL, 0,
+                           tv[count].salt, tv[count].saltlen,
+                           tv[count].info, tv[count].infolen,
+                           tv[count].dklen, out);
+      if (err)
+        fail ("HKDF test %d failed: %s\n", count, gpg_strerror (err));
+      else if (memcmp (out, tv[count].dk, tv[count].dklen))
+        {
+          fail ("HKDF test %d failed: mismatch\n", count);
+          fputs ("got:", stderr);
+          for (i=0; i < tv[count].dklen; i++)
+            fprintf (stderr, " %02x", out[i]);
+          putc ('\n', stderr);
+        }
+    }
+}
+
 static void
 check_fips_indicators (void)
 {
@@ -1511,7 +1878,11 @@ check_fips_indicators (void)
     GCRY_KDF_PBKDF1,
     GCRY_KDF_PBKDF2,
     GCRY_KDF_SCRYPT,
-    GCRY_KDF_ARGON2
+    GCRY_KDF_ARGON2  ,
+    GCRY_KDF_BALLOON ,
+    GCRY_KDF_ONESTEP_KDF,
+    GCRY_KDF_ONESTEP_KDF_MAC,
+    GCRY_KDF_HKDF,
   };
   size_t i, j;
 
@@ -1633,6 +2004,9 @@ main (int argc, char **argv)
       check_pbkdf2 ();
       check_scrypt ();
       check_argon2 ();
+      check_balloon ();
+      check_onestep_kdf ();
+      check_hkdf ();
       if (in_fips_mode)
         check_fips_indicators();
     }
diff --git a/tests/t-kem.c b/tests/t-kem.c
new file mode 100644 (file)
index 0000000..12fbb8c
--- /dev/null
@@ -0,0 +1,656 @@
+/* t-kem.c -  KEM regression tests
+ * Copyright (C) 2023 Simon Josefsson <simon@josefsson.org>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdint.h>
+
+#include "stopwatch.h"
+
+#define PGM "t-kem"
+#define NEED_SHOW_NOTE
+#include "t-common.h"
+#define N_TESTS 10
+
+static int in_fips_mode;
+
+static void
+test_kem_sntrup761 (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_SNTRUP761_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_SNTRUP761_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_SNTRUP761_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_SNTRUP761_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_SNTRUP761_SHARED_LEN];
+
+  err = gcry_kem_keypair (GCRY_KEM_SNTRUP761,
+                          pubkey, GCRY_KEM_SNTRUP761_PUBKEY_LEN,
+                          seckey, GCRY_KEM_SNTRUP761_SECKEY_LEN);
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_SNTRUP761,
+                        pubkey, GCRY_KEM_SNTRUP761_PUBKEY_LEN,
+                        ciphertext, GCRY_KEM_SNTRUP761_ENCAPS_LEN,
+                        key1, GCRY_KEM_SNTRUP761_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_enc %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_SNTRUP761,
+                        seckey, GCRY_KEM_SNTRUP761_SECKEY_LEN,
+                        ciphertext, GCRY_KEM_SNTRUP761_ENCAPS_LEN,
+                        key2, GCRY_KEM_SNTRUP761_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_dec %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_SNTRUP761_SHARED_LEN) != 0)
+    {
+      size_t i;
+
+      fail ("sntrup761 test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_SNTRUP761_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_SNTRUP761_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+
+static void
+test_kem_mceliece6688128f (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_CM6688128F_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_CM6688128F_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_CM6688128F_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_CM6688128F_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_CM6688128F_SHARED_LEN];
+
+  err = gcry_kem_keypair (GCRY_KEM_CM6688128F,
+                         pubkey, GCRY_KEM_CM6688128F_PUBKEY_LEN,
+                         seckey, GCRY_KEM_CM6688128F_SECKEY_LEN);
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_CM6688128F,
+                       pubkey, GCRY_KEM_CM6688128F_PUBKEY_LEN,
+                       ciphertext, GCRY_KEM_CM6688128F_ENCAPS_LEN,
+                       key1, GCRY_KEM_CM6688128F_SHARED_LEN,
+                       NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_enc %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_CM6688128F,
+                       seckey, GCRY_KEM_CM6688128F_SECKEY_LEN,
+                       ciphertext, GCRY_KEM_CM6688128F_ENCAPS_LEN,
+                       key2, GCRY_KEM_CM6688128F_SHARED_LEN,
+                       NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_dec %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_CM6688128F_SHARED_LEN) != 0)
+    {
+      size_t i;
+
+      fail ("cm6688128f test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_CM6688128F_SHARED_LEN; i++)
+       fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_CM6688128F_SHARED_LEN; i++)
+       fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+
+static void
+test_kem_mlkem512 (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_MLKEM512_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_MLKEM512_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_MLKEM512_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_MLKEM512_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_MLKEM512_SHARED_LEN];
+
+  err = gcry_kem_keypair (GCRY_KEM_MLKEM512,
+                          pubkey, GCRY_KEM_MLKEM512_PUBKEY_LEN,
+                          seckey, GCRY_KEM_MLKEM512_SECKEY_LEN);
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_MLKEM512,
+                        pubkey, GCRY_KEM_MLKEM512_PUBKEY_LEN,
+                        ciphertext, GCRY_KEM_MLKEM512_ENCAPS_LEN,
+                        key1, GCRY_KEM_MLKEM512_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_enc %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_MLKEM512,
+                        seckey, GCRY_KEM_MLKEM512_SECKEY_LEN,
+                        ciphertext, GCRY_KEM_MLKEM512_ENCAPS_LEN,
+                        key2, GCRY_KEM_MLKEM512_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_dec %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_MLKEM512_SHARED_LEN) != 0)
+    {
+      size_t i;
+
+      fail ("mlkem512 test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_MLKEM512_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_MLKEM512_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+static void
+test_kem_mlkem768 (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_MLKEM768_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_MLKEM768_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_MLKEM768_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_MLKEM768_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_MLKEM768_SHARED_LEN];
+
+  err = gcry_kem_keypair (GCRY_KEM_MLKEM768,
+                          pubkey, GCRY_KEM_MLKEM768_PUBKEY_LEN,
+                          seckey, GCRY_KEM_MLKEM768_SECKEY_LEN);
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_MLKEM768,
+                        pubkey, GCRY_KEM_MLKEM768_PUBKEY_LEN,
+                        ciphertext, GCRY_KEM_MLKEM768_ENCAPS_LEN,
+                        key1, GCRY_KEM_MLKEM768_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_enc %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_MLKEM768,
+                        seckey, GCRY_KEM_MLKEM768_SECKEY_LEN,
+                        ciphertext, GCRY_KEM_MLKEM768_ENCAPS_LEN,
+                        key2, GCRY_KEM_MLKEM768_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_dec %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_MLKEM768_SHARED_LEN) != 0)
+    {
+      size_t i;
+
+      fail ("mlkem768 test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_MLKEM768_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_MLKEM768_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+static void
+test_kem_mlkem1024 (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_MLKEM1024_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_MLKEM1024_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_MLKEM1024_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_MLKEM1024_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_MLKEM1024_SHARED_LEN];
+
+  err = gcry_kem_keypair (GCRY_KEM_MLKEM1024,
+                          pubkey, GCRY_KEM_MLKEM1024_PUBKEY_LEN,
+                          seckey, GCRY_KEM_MLKEM1024_SECKEY_LEN);
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_MLKEM1024,
+                        pubkey, GCRY_KEM_MLKEM1024_PUBKEY_LEN,
+                        ciphertext, GCRY_KEM_MLKEM1024_ENCAPS_LEN,
+                        key1, GCRY_KEM_MLKEM1024_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_enc %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_MLKEM1024,
+                        seckey, GCRY_KEM_MLKEM1024_SECKEY_LEN,
+                        ciphertext, GCRY_KEM_MLKEM1024_ENCAPS_LEN,
+                        key2, GCRY_KEM_MLKEM1024_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_dec %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_MLKEM1024_SHARED_LEN) != 0)
+    {
+      size_t i;
+
+      fail ("mlkem1024 test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_MLKEM1024_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_MLKEM1024_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+
+static void
+test_kem_raw_x25519 (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_ECC_X25519_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_ECC_X25519_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_ECC_X25519_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_RAW_X25519_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_RAW_X25519_SHARED_LEN];
+
+  err = gcry_kem_keypair (GCRY_KEM_RAW_X25519,
+                          pubkey, GCRY_KEM_ECC_X25519_PUBKEY_LEN,
+                          seckey, GCRY_KEM_ECC_X25519_SECKEY_LEN);
+  if (in_fips_mode)
+    {
+      if (!err)
+        fail ("gcry_kem_keypair is not expected to work in FIPS mode for test %d",
+              testno);
+      if (verbose > 1)
+        info ("not executed in FIPS mode\n");
+      return;
+    }
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_RAW_X25519,
+                        pubkey, GCRY_KEM_ECC_X25519_PUBKEY_LEN,
+                        ciphertext, GCRY_KEM_ECC_X25519_ENCAPS_LEN,
+                        key1, GCRY_KEM_RAW_X25519_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_encap %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_RAW_X25519,
+                        seckey, GCRY_KEM_ECC_X25519_SECKEY_LEN,
+                        ciphertext, GCRY_KEM_ECC_X25519_ENCAPS_LEN,
+                        key2, GCRY_KEM_RAW_X25519_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_decap %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_RAW_X25519_SHARED_LEN) != 0)
+    {
+      size_t i;
+
+      fail ("raw-x25519 test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_RAW_X25519_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_RAW_X25519_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+
+static void
+test_kem_dhkem_x25519 (int testno)
+{
+  gcry_error_t err;
+  uint8_t pubkey[GCRY_KEM_DHKEM25519_PUBKEY_LEN];
+  uint8_t seckey[GCRY_KEM_DHKEM25519_SECKEY_LEN];
+  uint8_t ciphertext[GCRY_KEM_DHKEM25519_ENCAPS_LEN];
+  uint8_t key1[GCRY_KEM_DHKEM25519_SHARED_LEN];
+  uint8_t key2[GCRY_KEM_DHKEM25519_SHARED_LEN];
+  /* Round trip: generate a keypair, encapsulate to PUBKEY, decapsulate
+     with SECKEY, then compare the two shared secrets.  */
+  err = gcry_kem_keypair (GCRY_KEM_DHKEM25519,
+                          pubkey, GCRY_KEM_DHKEM25519_PUBKEY_LEN,
+                          seckey, GCRY_KEM_DHKEM25519_SECKEY_LEN);
+  if (in_fips_mode)  /* Keypair generation is expected to fail in FIPS mode.  */
+    {
+      if (!err)
+        fail ("gcry_kem_keypair is not expected to work in FIPS mode for test %d",
+              testno);
+      if (verbose > 1)
+        info ("not executed in FIPS mode\n");
+      return;
+    }
+  if (err)
+    {
+      fail ("gcry_kem_keypair %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_encap (GCRY_KEM_DHKEM25519,
+                        pubkey, GCRY_KEM_DHKEM25519_PUBKEY_LEN,
+                        ciphertext, GCRY_KEM_DHKEM25519_ENCAPS_LEN,
+                        key1, GCRY_KEM_DHKEM25519_SHARED_LEN,
+                        NULL, 0);
+  if (err)
+    {
+      fail ("gcry_kem_encap %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  err = gcry_kem_decap (GCRY_KEM_DHKEM25519,
+                        seckey, GCRY_KEM_DHKEM25519_SECKEY_LEN,
+                        ciphertext, GCRY_KEM_DHKEM25519_ENCAPS_LEN,
+                        key2, GCRY_KEM_DHKEM25519_SHARED_LEN,
+                        pubkey, GCRY_KEM_DHKEM25519_PUBKEY_LEN);  /* Unlike the raw X25519 test, decap here passes the public key as the optional argument.  */
+  if (err)
+    {
+      fail ("gcry_kem_decap %d: %s", testno, gpg_strerror (err));
+      return;
+    }
+
+  if (memcmp (key1, key2, GCRY_KEM_DHKEM25519_SHARED_LEN) != 0)
+    {
+      size_t i;
+      /* Mismatch: dump both shared secrets for diagnosis.  */
+      fail ("dhkem-x25519 test %d failed: mismatch\n", testno);
+      fputs ("key1:", stderr);
+      for (i = 0; i < GCRY_KEM_DHKEM25519_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key1[i]);
+      putc ('\n', stderr);
+      fputs ("key2:", stderr);
+      for (i = 0; i < GCRY_KEM_DHKEM25519_SHARED_LEN; i++)
+        fprintf (stderr, " %02x", key2[i]);
+      putc ('\n', stderr);
+    }
+}
+
+
+#define SELECTED_ALGO_SNTRUP761  (1 << 0)
+#define SELECTED_ALGO_MLKEM512   (1 << 1)
+#define SELECTED_ALGO_MLKEM768   (1 << 2)
+#define SELECTED_ALGO_MLKEM1024  (1 << 3)
+#define SELECTED_ALGO_RAW_X25519 (1 << 4)
+#define SELECTED_ALGO_DHKEM25519 (1 << 5)
+#define SELECTED_ALGO_CM6688128F (1 << 6)
+
+static unsigned int selected_algo;
+
+static void
+check_kem (int n_loops)
+{
+  int ntests;
+  int testno;
+
+  info ("Checking KEM.\n");
+
+  ntests = 0;
+  testno = 0;
+  if ((selected_algo & SELECTED_ALGO_SNTRUP761))
+    {
+      for (; testno < ntests + n_loops; testno++)
+        test_kem_sntrup761 (testno);
+      ntests += n_loops;
+    }
+
+  if ((selected_algo & SELECTED_ALGO_CM6688128F))
+    {
+      for (; testno < ntests + n_loops; testno++)  /* Fix: was "testno < n_loops", which never ran when SNTRUP761 had already advanced TESTNO.  */
+        test_kem_mceliece6688128f (testno);
+      ntests += n_loops;
+    }
+
+  if ((selected_algo & SELECTED_ALGO_MLKEM512))
+    {
+      for (; testno < ntests + n_loops; testno++)
+        test_kem_mlkem512 (testno);
+      ntests += n_loops;
+    }
+
+  if ((selected_algo & SELECTED_ALGO_MLKEM768))
+    {
+      for (; testno < ntests + n_loops; testno++)
+        test_kem_mlkem768 (testno);
+      ntests += n_loops;
+    }
+
+  if ((selected_algo & SELECTED_ALGO_MLKEM1024))
+    {
+      for (; testno < ntests + n_loops; testno++)
+        test_kem_mlkem1024 (testno);
+      ntests += n_loops;
+    }
+
+  if ((selected_algo & SELECTED_ALGO_RAW_X25519))
+    {
+      for (; testno < ntests + n_loops; testno++)
+        test_kem_raw_x25519 (testno);
+      ntests += n_loops;
+    }
+
+  if ((selected_algo & SELECTED_ALGO_DHKEM25519))
+    {
+      for (; testno < ntests + n_loops; testno++)
+        test_kem_dhkem_x25519 (testno);
+      ntests += n_loops;
+    }
+
+  show_note ("%d tests done\n", ntests);
+}
+
+int
+main (int argc, char **argv)
+{
+  int last_argc = -1;
+  int n_loops = N_TESTS;
+
+
+  if (argc)
+    {
+      argc--;
+      argv++;
+    }
+
+  selected_algo = ~0;           /* Default is all algos.  */
+
+  while (argc && last_argc != argc)
+    {
+      last_argc = argc;
+      if (!strcmp (*argv, "--"))
+        {
+          argc--;
+          argv++;
+          break;
+        }
+      else if (!strcmp (*argv, "--help"))
+        {
+        usage:
+          fputs ("usage: " PGM " [options]\n"
+                 "Options:\n"
+                 "  --verbose       print timings etc.\n"
+                 "  --debug         flyswatter\n"
+                 "  --loops N       specify the loop count\n"
+                 "  --sntrup761     select SNTRUP761 algo\n"
+                 "  --cm6688128f    select CM6688128F algo\n"
+                 "  --mlkem512      select MLKEM512 algo\n"
+                 "  --mlkem768      select MLKEM768 algo\n"
+                 "  --mlkem1024     select MLKEM1024 algo\n"
+                 "  --raw-x25519    select RAW_X25519 algo\n" "  --dhkem25519    select DHKEM25519 algo\n",
+                 stdout);
+          exit (0);
+        }
+      else if (!strcmp (*argv, "--verbose"))
+        {
+          verbose++;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--debug"))
+        {
+          verbose += 2;
+          debug++;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--loops"))
+        {
+          argc--; argv++;
+          if (!argc)
+            goto usage;
+          n_loops = atoi (*argv);
+          argc--; argv++;
+        }
+      else if (!strcmp (*argv, "--sntrup761"))
+        {
+          selected_algo = SELECTED_ALGO_SNTRUP761;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--cm6688128f"))
+        {
+          selected_algo = SELECTED_ALGO_CM6688128F;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--mlkem512"))
+        {
+          selected_algo = SELECTED_ALGO_MLKEM512;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--mlkem768"))
+        {
+          selected_algo = SELECTED_ALGO_MLKEM768;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--mlkem1024"))
+        {
+          selected_algo = SELECTED_ALGO_MLKEM1024;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--raw-x25519"))
+        {
+          selected_algo = SELECTED_ALGO_RAW_X25519;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--dhkem25519"))
+        {
+          selected_algo = SELECTED_ALGO_DHKEM25519;
+          argc--;
+          argv++;
+        }
+      else if (!strncmp (*argv, "--", 2))
+        die ("unknown option '%s'", *argv);
+    }
+
+  xgcry_control ((GCRYCTL_DISABLE_SECMEM, 0));
+  if (!gcry_check_version (GCRYPT_VERSION))
+    die ("version mismatch\n");
+  if (debug)
+    xgcry_control ((GCRYCTL_SET_DEBUG_FLAGS, 1u, 0));
+  xgcry_control ((GCRYCTL_ENABLE_QUICK_RANDOM, 0));
+  xgcry_control ((GCRYCTL_INITIALIZATION_FINISHED, 0));
+
+  if (gcry_fips_mode_active ())
+    in_fips_mode = 1;
+
+  start_timer ();
+  check_kem (n_loops);
+  stop_timer ();
+
+  info ("All tests completed in %s.  Errors: %d\n",
+        elapsed_time (1), error_count);
+  return !!error_count;
+}
diff --git a/tests/t-mlkem.c b/tests/t-mlkem.c
new file mode 100644 (file)
index 0000000..a7b480d
--- /dev/null
@@ -0,0 +1,300 @@
+/* t-mlkem.c - Check the CRYSTALS-Kyber (ML-KEM) computation by Known Answers
+ * Copyright (C) 2024 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1+
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "stopwatch.h"
+
+#define PGM "t-mlkem"
+
+#define NEED_SHOW_NOTE
+#define NEED_PREPEND_SRCDIR
+#define NEED_READ_TEXTLINE
+#define NEED_COPY_DATA
+#define NEED_HEX2BUFFER
+#include "t-common.h"
+
+#define N_TESTS 30
+
+static int custom_data_file;
+
+/*
+ * The input line is like:
+ *
+ *      [Kyber-512]
+ *      [Kyber-768]
+ *      [Kyber-1024]
+ *
+ */
+static int
+parse_annotation (const char *line, int lineno)
+{
+  const char *s;
+
+  s = strchr (line, '-');       /* Locate the "-NNN" part of e.g. "[Kyber-768]".  */
+  if (!s)
+    {
+      fail ("syntax error at input line %d", lineno);
+      return 0;
+    }
+
+  switch (atoi (s+1))
+    {
+    case 512:
+      return GCRY_KEM_MLKEM512;
+      break;
+    case 768:
+    default:                    /* Unrecognized sizes silently map to ML-KEM-768.  */
+      return GCRY_KEM_MLKEM768;
+      break;
+    case 1024:
+      return GCRY_KEM_MLKEM1024;
+      break;
+    }
+}
+
+static void
+one_test (int testno, int algo,
+          const char *sk_str, const char *ct_str, const char *ss_str)
+{
+  gpg_error_t err;
+  unsigned char *sk, *ct, *ss;
+  size_t sk_len, ct_len, ss_len;
+  unsigned char ss_computed[GCRY_KEM_MLKEM1024_SHARED_LEN];  /* Sized by the MLKEM1024 constant; presumably >= the other variants' shared length — TODO confirm.  */
+
+  sk = ct = ss = 0;             /* So the cleanup at LEAVE is always safe.  */
+
+  if (verbose > 1)
+    info ("Running test %d\n", testno);
+  /* Decode the hex-encoded secret key, ciphertext and known shared secret.  */
+  if (!(sk = hex2buffer (sk_str, &sk_len)))
+    {
+      fail ("error preparing input for test %d, %s: %s",
+            testno, "sk", "invalid hex string");
+      goto leave;
+    }
+  if (!(ct = hex2buffer (ct_str, &ct_len)))
+    {
+      fail ("error preparing input for test %d, %s: %s",
+            testno, "ct", "invalid hex string");
+      goto leave;
+    }
+  if (!(ss = hex2buffer (ss_str, &ss_len)))
+    {
+      fail ("error preparing input for test %d, %s: %s",
+            testno, "ss", "invalid hex string");
+      goto leave;
+    }
+  /* Decapsulate and compare against the known answer.  */
+  err = gcry_kem_decap (algo, sk, sk_len, ct, ct_len,
+                        ss_computed, ss_len, NULL, 0);
+  if (err)
+    fail ("gcry_kem_decap failed for test %d: %s", testno, gpg_strerror (err));
+
+  if (memcmp (ss_computed, ss, ss_len) != 0)
+    {
+      size_t i;
+
+      fail ("test %d failed: mismatch\n", testno);
+      fputs ("ss_computed:", stderr);
+      for (i = 0; i < ss_len; i++)
+        fprintf (stderr, " %02x", ss_computed[i]);
+      putc ('\n', stderr);
+      fputs ("ss_knownans:", stderr);
+      for (i = 0; i < ss_len; i++)
+        fprintf (stderr, " %02x", ss[i]);
+      putc ('\n', stderr);
+    }
+
+ leave:
+  xfree (sk);
+  xfree (ct);
+  xfree (ss);
+}
+
+static void
+check_mlkem_kat (int algo, const char *fname)
+{
+  FILE *fp;
+  int lineno, ntests;
+  char *line;
+  int testno;
+  char *sk_str, *pk_str, *ct_str, *ss_str;
+
+  info ("Checking ML-KEM.\n");
+
+  fp = fopen (fname, "r");
+  if (!fp)
+    die ("error opening '%s': %s\n", fname, strerror (errno));
+
+  testno = 0;
+  sk_str = pk_str = ct_str = ss_str = NULL;
+  lineno = ntests = 0;
+  while ((line = read_textline (fp, &lineno)))
+    {
+      if (!strncmp (line, "[", 1))  /* An annotation like "[Kyber-768]" switches the algorithm.  */
+        algo = parse_annotation (line, lineno);
+      else if (!strncmp (line, "Public Key:", 11))
+        copy_data (&pk_str, line, lineno);
+      else if (!strncmp (line, "Secret Key:", 11))
+        copy_data (&sk_str, line, lineno);
+      else if (!strncmp (line, "Ciphertext:", 11))
+        copy_data (&ct_str, line, lineno);
+      else if (!strncmp (line, "Shared Secret A:", 16))
+        copy_data (&ss_str, line, lineno);
+      else if (!strncmp (line, "Shared Secret B:", 16))
+        ;                       /* Known tag; intentionally ignored.  */
+      else if (!strncmp (line, "Pseudorandom", 12))
+        ;                       /* Known tag; intentionally ignored.  */
+      else
+        fail ("unknown tag at input line %d", lineno);
+
+      xfree (line);
+      if (pk_str && sk_str && ct_str && ss_str)  /* One complete record collected.  */
+        {
+          testno++;
+          one_test (testno, algo, sk_str, ct_str, ss_str);
+          ntests++;
+          if (!(ntests % 256))  /* Progress output every 256 tests.  */
+            show_note ("%d of %d tests done\n", ntests, N_TESTS);
+          xfree (pk_str);  pk_str = NULL;
+          xfree (sk_str);  sk_str = NULL;
+          xfree (ct_str);  ct_str = NULL;
+          xfree (ss_str);  ss_str = NULL;
+        }
+    }
+  xfree (pk_str);
+  xfree (sk_str);
+  xfree (ct_str);
+  xfree (ss_str);
+
+  if (ntests != N_TESTS && !custom_data_file)  /* The bundled data file must contain exactly N_TESTS records.  */
+    fail ("did %d tests but expected %d", ntests, N_TESTS);
+  else if ((ntests % 256))
+    show_note ("%d tests done\n", ntests);
+
+  fclose (fp);
+}
+
+int
+main (int argc, char **argv)
+{
+  int last_argc = -1;
+  char *fname   = NULL;
+  int algo = 0;                 /* Overridden by [Kyber-NNN] annotations in the data file.  */
+
+  if (argc)
+    { argc--; argv++; }
+
+  while (argc && last_argc != argc)
+    {
+      last_argc = argc;
+      if (!strcmp (*argv, "--"))
+        {
+          argc--; argv++;
+          break;
+        }
+      else if (!strcmp (*argv, "--help"))
+        {
+          fputs ("usage: " PGM " [options]\n"
+                 "Options:\n"
+                 "  --verbose       print timings etc.\n"
+                 "  --debug         flyswatter\n"
+                 "  --data FNAME    take test data from file FNAME\n"
+                 "  --512           specify Kyber-512\n"
+                 "  --768           specify Kyber-768\n"
+                 "  --1024          specify Kyber-1024\n",
+                 stdout);
+          exit (0);
+        }
+      else if (!strcmp (*argv, "--verbose"))
+        {
+          verbose++;
+          argc--; argv++;
+        }
+      else if (!strcmp (*argv, "--debug"))
+        {
+          verbose += 2;
+          debug++;
+          argc--; argv++;
+        }
+      else if (!strcmp (*argv, "--512"))
+        {
+          algo = GCRY_KEM_MLKEM512;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--768"))
+        {
+          algo = GCRY_KEM_MLKEM768;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--1024"))
+        {
+          algo = GCRY_KEM_MLKEM1024;
+          argc--;
+          argv++;
+        }
+      else if (!strcmp (*argv, "--data"))
+        {
+          argc--; argv++;
+          if (argc)
+            {
+              xfree (fname);
+              fname = xstrdup (*argv);
+              argc--; argv++;
+            }
+        }
+      else if (!strncmp (*argv, "--", 2))
+        die ("unknown option '%s'", *argv);
+    }
+
+  if (!fname)
+    fname = prepend_srcdir ("t-mlkem.inp");
+  else
+    custom_data_file = 1;
+
+  xgcry_control ((GCRYCTL_DISABLE_SECMEM, 0));
+  if (!gcry_check_version (GCRYPT_VERSION))
+    die ("version mismatch\n");
+  if (debug)
+    xgcry_control ((GCRYCTL_SET_DEBUG_FLAGS, 1u , 0));
+  xgcry_control ((GCRYCTL_ENABLE_QUICK_RANDOM, 0));
+  xgcry_control ((GCRYCTL_INITIALIZATION_FINISHED, 0));
+
+  start_timer ();
+  check_mlkem_kat (algo, fname);
+  stop_timer ();
+
+  xfree (fname);
+
+  info ("All tests completed in %s.  Errors: %d\n",
+        elapsed_time (1), error_count);
+  return !!error_count;
+}
diff --git a/tests/t-mlkem.inp b/tests/t-mlkem.inp
new file mode 100644 (file)
index 0000000..18d2b28
--- /dev/null
@@ -0,0 +1,193 @@
+# mlkem.inp - 30 test data sets
+#
+# This has been extracted from the result files of tvecs512, tvecs768, and tvecs1024
+# of the reference implementation.  Ten test vectors are taken from each.
+#
+[Kyber-512]
+#
+Public Key: 5ab0a7bfc623d42b0665542682a7b4a3b7a383775b3b64ccae4070e165059691b7c161acd0904dfec10c80caae89ab920b4540c72589bd4209199243fbf1a0f2e7a09dfcbf80d9aea3f29d163c6d802b853f0687e1d23222c5b31b743a92fa00c44b8dde44c1cd3067c70a4376863e241640d8a31bc67a5ff586acaed0b1c60944af002e36e4b38fd32301f0ae3ec5cc0cf1aceb48be90324dd7113d9e24640f1595a651b29212161f2c909d76bb4dd48e0495a08f989a3fb3a2a5f13051547ef4089a4ff46906928ee70c066c3c382f174fef83818e081d7b69472e23c22875c269e800762c47cbd97092ea08a84099c5535b756790b9ec6818fb51ad273cf66b0a59e822b44051ba433d318980400036ad62c424a28c8d4c724e798254e121033b48bd2317c66a4750cacb3883754c15ad9edc64260bae288aa730aaabfa12087f9c0496ec6c59c92e54480b3fc119fb901326c67c66398ff8b945ce4335ac243912a716e6364f15127be1475f2af912176b79790b94a70a4dd0469723c1775490833554132e5a19843251388350ab77872d2357fd2b974b169e03fc9325b94100f28659c51d0a3410f96518628b87e2b5b3cd87ae48b6b0cca54e483046606a79e79a3e8a01233518ce61a72650833ab79267fc04cae1e71dbaf86201d7407de3732447335bea3c4b75709b307807dbc0c9c647357843f5a76724c395e6b69b643520a45c8a94f83497d6aa82604e9763863f9943e06a8bdbd97ab703a63f6c56f7c758e4284ea50b6cfdf611946c6d350009f2325005949fda49702412a70960151b38311ef01d97b43e924216035986434a7e1d16405a6132212a304ec91da7f0a3e40c1dea0cb46442cddc0361216a7daa533bce0796b9a9088adbaf61e79916e08388e6acaf030bc6dc77882a5e7e9a658f97c47c6a7a3686b943dc538310190eb4922168149b2092b94a825cda24b944996bb270b488579049a1fc845b65fb04520a334017837533acf1a0592234b8c6697923936afebb66dd75bae9eb8d1c58ac80e075765b5ec2336396591bfe94c8efe00a54130a37a3a5e5d81b4cfa93f56ee922c7d660937b5937c3074d62968f006d1211c60296685953e5de
+Secret Key: 3cbc538da4542cea83a2196a1d2c2f6edbced032b2f1c113d49001c28180a29390b86366d667991698ca0bc40e84f9b4d3c13b312985a570852a96b590fca08606b540872346b0beda3c1c78928df3a699578472dfd4325e48bdc3ea33d6f504a1f471543a9a7e8bbd3f768613bcadbbebbbf3113702c8916f7925464c7bd3f8c804c1ad0d237daf45cada733eb5e4ac9bf4c7cb3bc19372b3719acaf611740a10a239d5a821ba2d7b2a953fcc708e78a869a04812a391e1f2776fa8530e632ce8f772d5332f6b78318db81f4c822f3a0361bda2584d0535249b9376f09082d3084ab252c8907852028faf349631a9c38f8087c18bbafd05b9081b6465d33cbf13495e327923a53322277f9a0cc941952ccb8647325079150a6e636ab04ea127368b3bac7a5f0e701fce43bd52db5eb9485ca6e24d9a6caa39fb3eb069864e47154e39b41b77b3bcc39b09fa8aa21755503c760d44981c39ce57795c3ce69b4d805fd0362ef263064b1070a8fa4179915b87e53dd090c11935a966681692c457c24596f9318ef57b01919056a020b059b7bf3a608014d0139fb27b38199d980a9ca198925696bb549093deea5e56ca060c8958e913cd3257ca3891937b4a8c65e8216fb64910a34e46869902bb5969551a4b366ff4167de838ca03e3c1260729ccbb36fa34aec0e114b7e1b8cb824cb4599c00766bcd52b03ac1853363c5917847f3e58f114b7904cb65604c37854828df2937794c073a28c859b1bab27ab71db4719eda0c94b0b3a37ab574966558a4966794ae2724700687b06b7064fde8a6b8113105e8cdf5166e7b639cd799c2bf20bcc9143e816054c327a7ccf60dad295784229be8259b631c096dcc68d3c111243bce38c37b59084103685847963bc6416172ec67f3619b5429a9558aaa062387ec9aad9777ce6ed7acee99b611c79b7c7535cbe32b5d44023e853ed41c42b4f9cd8e8a6e7864b7c103b461330786dc17ac97880c80c688269ae6897cb039a6e613300f399808a61fdd05b87171722edb5d47db520f579ac1c6277af7339093ac620c053ac813d1c614fef0cb2cc6705ab0a7bfc623d42b0665542682a7b4a3b7a383775b3b64ccae4070e165059691b7c161acd0904dfec10c80caae89ab920b4540c72589bd4209199243fbf1a0f2e7a09dfcbf80d9aea3f29d163c6d802b853f0687e1d23222c5b31b743a92fa00c44b8dde44c1cd3067c70a4376863e241640d8a31bc67a5ff586acaed0b1c60944af002e36e4b38fd32301f0ae3ec5cc0cf1aceb48be90324dd7113d9e24640f1595a651b29212161f2c909d76bb4dd48e0495a08f989a3fb3a2a5f13051547ef4089a4ff46906928ee70c066c3c382f174fef83818e081d7b69472e23c22875c26
9e800762c47cbd97092ea08a84099c5535b756790b9ec6818fb51ad273cf66b0a59e822b44051ba433d318980400036ad62c424a28c8d4c724e798254e121033b48bd2317c66a4750cacb3883754c15ad9edc64260bae288aa730aaabfa12087f9c0496ec6c59c92e54480b3fc119fb901326c67c66398ff8b945ce4335ac243912a716e6364f15127be1475f2af912176b79790b94a70a4dd0469723c1775490833554132e5a19843251388350ab77872d2357fd2b974b169e03fc9325b94100f28659c51d0a3410f96518628b87e2b5b3cd87ae48b6b0cca54e483046606a79e79a3e8a01233518ce61a72650833ab79267fc04cae1e71dbaf86201d7407de3732447335bea3c4b75709b307807dbc0c9c647357843f5a76724c395e6b69b643520a45c8a94f83497d6aa82604e9763863f9943e06a8bdbd97ab703a63f6c56f7c758e4284ea50b6cfdf611946c6d350009f2325005949fda49702412a70960151b38311ef01d97b43e924216035986434a7e1d16405a6132212a304ec91da7f0a3e40c1dea0cb46442cddc0361216a7daa533bce0796b9a9088adbaf61e79916e08388e6acaf030bc6dc77882a5e7e9a658f97c47c6a7a3686b943dc538310190eb4922168149b2092b94a825cda24b944996bb270b488579049a1fc845b65fb04520a334017837533acf1a0592234b8c6697923936afebb66dd75bae9eb8d1c58ac80e075765b5ec2336396591bfe94c8efe00a54130a37a3a5e5d81b4cfa93f56ee922c7d660937b5937c3074d62968f006d1211c60296685953e5dec9587d0d63030d0c1c15e675ff63a2f9ebc156baf15a2da1645528be68c3ac2c3cb1eea988004b93103cfb0aeefd2a686e01fa4a58e8a3639ca8a1e3f9ae57e2
+Ciphertext: 1cd3de5b9fe7bff62ab66fab78ad7448a2dae4f6911c040fe5ccb1c6e864f5d82666c9b67ce9fd8f3f56c48f19eefda32b7d1af98330fe2497b1a926ac6be03e69921b2066452a0ec5b8b17117cc1bac8795c7b3331d3f110e13add9536b76a9d6bf6a0081ab204cd0bc849b7f3cd7b70d7773233bb858fc5fd58c611cba92c179e7cf48ff1d63de03ee41c0373c2de7f19e497d3b00a483a79f472a6f760285acfa54ef9453a4a26468df8628e5eb3f784b28b5c8abc02c0c2e0f98220194b4fad496b9722695ab64d19393ecb6c90f89f0fd17af44e95c9df3bbf85673b2bf8512f062b9a8e530cbfbce0f3900f8a3dbb525e9f967013150dbbbf9610215eb0ab6c2fd65b2b01a119540d9dff00b730760de76830e63cba11c30960b48bdf05fc86dcf8d2973a49a2f1667610d55a763d87d47921c2e50899da68223e19a2d7a22b33e6f75fd49c7a5861940a7742d05ed28ddc732aa69162e2b9695e4890938c4269fabd867c6e4e4cd4872de0a5565558257a63a336baccd7ead7d5037ac93be281c206e411d9955257045aaf2d16e126e9e5a4a0b30318dca84ebf13d01433dd3660bbcb18b2f3643eec2b37cef5c9ae4dc4027db8fe264bf7a171a44d74e14628c21ca6ddf04f36bf523377c923257ed53cd5e3e08dc0befc2959395bccd40e37a60dbd35b6465793f42d84525dfe0d2d5c53a503772ab3287556905240ba21e30b5001c64a9489ece64d52d1d86abfde0572561cb1a403403cc62df75b9b21442326bcb5748fd2ccc83bbcc06581547ac0030760c27ee1490a8ed09b84fa6ca1e3061ed47429fa87e5609b208dfedb9af50716b2051bbfef1ca3d3dc9324079b914e5bb80a1cefe5a7adbc05ecb856cc2ee4b880ec19740467c8512d1614d4c57140e36780a7389d1cc127451388eb50475852fc3ff9a28fed4235ec5429d0f8b0d42e915130158922f976b97344638950cee512bc4b0526b186e70bff6adf34e4e31f734a707dc727c6c8c6301f92f720fdcb4f79244d0f034df06ec252221fa36651850d00391a4bca3dc693387f7809280ecdea354802439e61a0f
+Shared Secret B: 9d275318d78eafae6578f921cfc9d39f75c7abfd4fef3e50a756bd6efe6fc890
+Shared Secret A: 9d275318d78eafae6578f921cfc9d39f75c7abfd4fef3e50a756bd6efe6fc890
+Pseudorandom shared Secret A: f2dbd8461cb2393973b91433b8b3f06757b0e2a6b879e98f9ff01fa1409ccde7
+Public Key: 07b0c3ff335c192caf0234706ad26ec047c90f861b76bbc8e0c9b8d8d61d22f837e27687e2d0923813ad50460e75932769628315a57c25858987ca48a54757acc03f9e46761391ce109c46e5a261456b5c29d443732a7276e07324474b9a3151bac934bdd052817396ae824387b9907e2b6a6b4b95ec76c97ad5a466375e55b84e19d76771b041f4b481a2997f80762f6a25c7449807725406a2f3225b0195cdb5cabfd72691395df1434744ca663e53a8016273f64b1752426ecba481ba52ce0dfa32df2c9c4b550d6b8bba83e08e4cab98ebc4b14d0a6df8b07bc8e68f9c48ca10f5877b92c9a761bb904bbf1d9c8776d7357caba1d627667b8b36d827ce4b68cd164251873c1d140c6a62d252a9346f24a0910a4795ea82aa10940021377d3f360b10ba9f490152f920c739698843f074441585f520611c5878bba5422e128c56c39f6b401f03897e06d18386b3c5de0019cae417db2a17067a7cef2b691ac619965766b5a7380cf35e97513b195bb16ca52808e78d71c2064dcc39691ac21a74abe58220e81bb01a747851e102ff90804115170fc326a8c3938245bfc74b6b32787eeb8c6f51537c66c3abc1a02041b87d0b172a830c58070b7e84041bc0f491e34ba2fff71c693c785fe84d5b58805bd28db7c48ffe35b460dbb5db703d06d7041603afe6764b104cc376f097bd2753fc53b6d0581ac7919da7f2bcff2194599b1debd021ae66a391950efed913bfbc597fc50c9892b61aa965809543d7ecbedac319712430b9e9b4138bc44962969f1243db341fca464a70f69436b5c16b3b96486279f361bf97a57546840c20ba11df20c3cec487555790d9129f8472638d2b6a40c24e0fa3bd79e07c6e5175198a8668f35cc9aabb18d4aeb3c6caaaa3aa33768143da1f3fb7a039249e3995ba114c25f1930059e1c5f76605b45a498fcb38b3b656466023c3166836c01df12c9463d7c97f054fa307b08758078db7ac742471c75668eefba9ac7344af30b02577348c0777029318df086d22600315a00e13ac086ac84bfce9c81616a8229b5baa549c4318b877594950684eeac639138df84ff639deb7414fdd3e2e458ec14849fd3477d14552a960249e9b9c9b0d
+Secret Key: 8ac28bd238b5bd576d7acbc66236355bf41526ac34aa035248d1c9aaf50672d096c85714e2a728c356c17b0cc6a352b31b0c2670e59a4a643789d94dfa501d759c238e446d4a1c99a5e15b3282affe63af974a711ff10265179a10d66172957a2f402896a43b28e5c529923eb2a0c435f65f72648767096f469c287aaa0656002cacfa3bb7d1666492233dfc8e7449338f733f0a321500d6c64e270fae05beb48253e64b915aeb590908a391bb867f459b870586ab733cb9d3590639b9c4d40cd0b92633986cab67813d7219c82a07cab566fcc57c31365434797322d916ca3098a93a2a1902625a582718b0c3c3c2c9b3515388db841251a50af30fcd6ca434e76a8d326fd69cc0529c780277b862c0128a50c27ebb2db00465ef0b4cf0ca9905f69ba06a45de9668c83a457e420561248d03b29e08c8be2a219b299c05c4e6935065bc13588f6d02b14cdac8766a7156e268ca153e6f701f79ab29f2b74ea12c0be33a1c7987070d55783156bc32c0089cac5874dc7a90d0602eb84583fb6b32a25360024d7e877ea142924b4a2f1954c532e33a53865227e95bddfa926b944a21e9706f143e55048e01597c8457c3e2b009336cc73b60b88dcb87312408afe2cd2fd99e98726973225f0a795b1ee19352bb483ae239e048248c986d0df04d23066ca4bba5b8949bdb3c68ada8a830241f95dcc4a339cd001859ee925735c29db100d0cd49703774667bc41e306b91cf647263092eb047b52303b23387a37da12600198767caba19001ebe1a8509f2c5039c3d79534638965a11546717f7754eb0081b714e4eda217de2cb6480186e742e2ff244d17610abf38df563bf80c303ee76bb1e5c9ecf22cf38754c4f2a8745316cfed176af2c061a253f026a1137eb9b1b63975d252717f01e0e9138ec220183544b5347ab9b810bcd0cbc3f27ac37e979617b939d05098592617b3787a626985f9211a7805117ca9d8b755448c49b377aa5e9972657b327655528421637b112b69aeb98685439fd19ae34640bd4691ab1996d851c53bd3bac43749a9a681cb4d17de525bfba0b917966b9cb646607b0c3ff335c192caf0234706ad26ec047c90f861b76bbc8e0c9b8d8d61d22f837e27687e2d0923813ad50460e75932769628315a57c25858987ca48a54757acc03f9e46761391ce109c46e5a261456b5c29d443732a7276e07324474b9a3151bac934bdd052817396ae824387b9907e2b6a6b4b95ec76c97ad5a466375e55b84e19d76771b041f4b481a2997f80762f6a25c7449807725406a2f3225b0195cdb5cabfd72691395df1434744ca663e53a8016273f64b1752426ecba481ba52ce0dfa32df2c9c4b550d6b8bba83e08e4cab98ebc4b14d0a6df8b07bc8e68f9c48ca1
0f5877b92c9a761bb904bbf1d9c8776d7357caba1d627667b8b36d827ce4b68cd164251873c1d140c6a62d252a9346f24a0910a4795ea82aa10940021377d3f360b10ba9f490152f920c739698843f074441585f520611c5878bba5422e128c56c39f6b401f03897e06d18386b3c5de0019cae417db2a17067a7cef2b691ac619965766b5a7380cf35e97513b195bb16ca52808e78d71c2064dcc39691ac21a74abe58220e81bb01a747851e102ff90804115170fc326a8c3938245bfc74b6b32787eeb8c6f51537c66c3abc1a02041b87d0b172a830c58070b7e84041bc0f491e34ba2fff71c693c785fe84d5b58805bd28db7c48ffe35b460dbb5db703d06d7041603afe6764b104cc376f097bd2753fc53b6d0581ac7919da7f2bcff2194599b1debd021ae66a391950efed913bfbc597fc50c9892b61aa965809543d7ecbedac319712430b9e9b4138bc44962969f1243db341fca464a70f69436b5c16b3b96486279f361bf97a57546840c20ba11df20c3cec487555790d9129f8472638d2b6a40c24e0fa3bd79e07c6e5175198a8668f35cc9aabb18d4aeb3c6caaaa3aa33768143da1f3fb7a039249e3995ba114c25f1930059e1c5f76605b45a498fcb38b3b656466023c3166836c01df12c9463d7c97f054fa307b08758078db7ac742471c75668eefba9ac7344af30b02577348c0777029318df086d22600315a00e13ac086ac84bfce9c81616a8229b5baa549c4318b877594950684eeac639138df84ff639deb7414fdd3e2e458ec14849fd3477d14552a960249e9b9c9b0df0d96adda9e271878c20f80e33f5c8102c599665f47d6af92ff7dd2a209d9fbada219e289331610369271867b145b2908293963cd677c9a1ae6ceb28289b254c
+Ciphertext: 9ba6f9690f25ddac7c0803bc543b3ad0dd6269687d81cb818645ff2621a7b187b7a2176bffe42a89138cb1ea9524eb393f80d43cafbc14148d3437f886a949cc7e7e995eca60d3584d64aeb25c36329f86ad17a14c1764960fbb8a2cd7954678ba0fdbb27df11ed4c850df0dc8dcca73e37c72b921485fb83d1fda9674764e23d0ff3388c56439785632ea678de870e5bb03d6b8902f0e9806011cde0bdd8f3f62b455c6908b4c13b6d37a0a74ec61f3f1a5229c800117d5f228818640e10b44b3fa6daedfdecd6c1c7b356a5ea746d755a26698872e39d1d1c360f83b711c418cabaaf1ef42c56bdee9315d9c671798f83b938bdebc7d261323bc894e6887f9980d5bf3ce7771a2daa932f0dccf1f115212ee882fd7ab0cfeb1f619cfbd820c2662b1b4d9d87ddd16f4e30abcdcadc4f0aca5d9f6ce941818b50adb767a5056268b76fb88ccd5a94dc693f7dbe885b0292080c45a096b7f048d062a26534ba0924ae24a380c96bcca9cbb761dcb4e3ab37520d9d4b9fc7d71ff7fb9cd4df70d27b99d5919aff6be85a38fbeaa653eabeb1ec97fe08c83278b732f34c7aa69d3bc216c3a719a2f5abe5f759e050ec0d6e9cb360fb43e0394554a5131febcc2f6c4ec9ddac0c317982d91ea5c850bd2f02f80d55ede08c03cf8b90c21491dc78fbf73f709232e2311dde34309ef224a910c135e741225f368b0128188d985f2d640d8a73d73035915d6a6e505d84cb3879db6d01172b0e4d1851f87a6f9a43ee3cbabd4b2f6e94aff186f65916d14317667448a76790424e91b761a0abe589bf353188d055368bfa8914987dee6aef126c24bbad11f572e0d6f53f097c0856dfa6fc02395765cf6960cc48a1c780652a54445d1ffafabe2462e389c61db75f0e66688b804c6a22ca6d82fc29423cb14a8c13164851cb3d4373c35acb545ea3a72e20ad451cb6b2539361d75beded88882e54917d41d81909afc57b3da7a0f53b069c68b1eefb114b5f36e12eed8406fd2fcdff82ed5c880d486391cca9443d99a1b621a5fdd14b4a4854f1be248dd6167194cf54bef3e965e221c3e8432b9f173
+Shared Secret B: d51c48e96fad6fd7e3d24cbff13666c93054590398ff28ee7d050e5fdb558be5
+Shared Secret A: d51c48e96fad6fd7e3d24cbff13666c93054590398ff28ee7d050e5fdb558be5
+Pseudorandom shared Secret A: 50276b4ec4ede469eb8b3a78753fb5b70883d989dbc8984292b298e12d8abd99
+Public Key: f3528da475218b09ad55ab3321d20e05638e50783d7878afa60042846b02778b5e4afccf8b9314ec636accc1213323aaef088947219e8f113874961712ca3d8d77b3861c369e4b38c32b89604024b64c3c4ada4eaf09ac64839df4ac87ce7ba7ca933ba4788db8328628da9987f6cd0e0369778b386a08ca57d8238aa9179b9674aab19a1e15c166a0b7d841630c1a42d0e633c595288fc50e27cc786038a9f9236db8b630bc85109fd981bb78b5fad9cffe61588aa62cb1ab04d133cf0d051ac06700adac0bece0a571944148b4c21d7585b2603f31314cf102a91b57c6df97a611e46e88827f86fb3bfbf57bdfeab74f1a44a560926b6b9a346b708be7005554c484165ecd24922e94a7ad0c3278b35c6eeba52146c30971cd7e580f7c45360b87378eb7ce3e450c888745d4928889f462a8707c0b3c679cf70610db347e801942218bb6b3b3451c67e243111889575b13a61b3a132fea84b69212ab428214c69c697b34dca2bb3d4c9989f01d7f99ab37a5c1e588b7c61a4fa385b942491c3f0067a166677fc375e0c89c3133ce364295178024ef59b547899a9c590397a0a46a8969f5896b3f4b1040c7be65a277c5990c79b396e6a031cdb924001a7bb6e3661b8ab1928a9fb2b257a1d3176bb99470922c01377c5d07004871267e5c620a6cab34f7c6aecb9a0c62bad9935d9cf261e24965b46a7e02fa08077a26d9fbbef41548d971657a4753458457b97b50248b881046359b75455ce839eef1448f158d2ab6150d047e1d035a71eb5cdefa88766b970ca67272c022ef749380b42282a3b902c9c185a7af574209f47c4b9ac8b62c32b7a5408a7e26a62782c7c14c48745c64092149bd96c70328be5ab433ad83861fb8cc843acce8248f9d80004c52cb182b5e2bd901518060d7753e536a23e7560b446c7752e19f2b4992bd468513b7013735295ca61f1803c8e8954f7c934a44d637c3e6a03905ace6a2285171806e8354dd605cb8489efe97ae66a206e1d5b81ef0977fa187128715b6b47cc8fb2d34bc85fb84cd158782a0d614b4e9077f87c9d34826e9436bff3893707451acf92250a2ee15d6a7684957367e644c0716cfd6b44e1afa5817a2250bea2247
+Secret Key: 55bc27c1464f4221c3984c6bfeaba9ba0b531a145bdc763e3e39b9e7c86783891cb9b663799556e8da0b6848acab219b747612391c8aaa906ff82a4e6741970f270d7cb131438627873758a231b0ff7b1d2e1b52e92300b7c9153f30bee76cba05daa462868400172c0a474b2752461cc6b660854404f1a63a699caf982983c14d89b9415a15969c83952e22704f78b0e226008fc73c2e769cbca80867a72590598dbccb4f02c88b9775af13c99970b8c6864292bb1548196a22b54510f5b3bf80f15f9880bd9a8c8839e964ee659db857102d4aa604c79ef81ac665a81adaa6b7c0266739770e6b0594c5401494126a4e68c7dba234e03060cce147aa191bab179cbcfa8bd0a322db9b698f71ae499b61b2a2696103127b18407a47689810b4ef224e8f147e195035664114c47bccc12b18cd5228d6473838f44bb404c187f8ca7e77b04ffa542a5ac6e90b198012b4adf037645b2cb2fc86bfd2adb86133a4cb56b15b613d788b86f6cb328646bd4b06ff1c7723122502b00dece96d79524394777ebf695f4955c859ac2ad77781c22a315fc76f53f4ae26339bd59b64759c5145c4b9e3a3cc961cb19cec5b37ecb7a631acb1972761555a46969fba0a7bd2489765020fe519aa124906aec7b6c2c89db5515fa7475cce595fe929cb4b815213e39970c6a1fbb79deb678aa3b3c7f3f5610b5a6344c98ae2f5732e2b23843a3629626a127b103113bb6f5bcca329a180a974a1d78d9c6c911046bc69515a9cc3640d88b5c205746ea5a3e03b4baf87466f6b709911ab6ca94d0d71b97365cb567b5c644a6257691435994f6a2a1e6ef90751eb01ad72ba2d355426951807447e00dbba66c7ba8bc44f8158ab4e2849178879160642b4f08d68d866a30606e0f6aaa31646f108769ed665d65a8704d613be4715ad1c4b0fd23960fa5d18f8737c70a32ef8bdc909840cc09c38cbb4edd21e70accf26008bb684345a76b32ea0ae0dda68f7a08720974f31d3ce8de593c4079f2e8949c18c1b9f6a38c4906d32a642dec9252b029e437a917e6ac2cd050160766a8dd770d3fc303e89732abb5ff3528da475218b09ad55ab3321d20e05638e50783d7878afa60042846b02778b5e4afccf8b9314ec636accc1213323aaef088947219e8f113874961712ca3d8d77b3861c369e4b38c32b89604024b64c3c4ada4eaf09ac64839df4ac87ce7ba7ca933ba4788db8328628da9987f6cd0e0369778b386a08ca57d8238aa9179b9674aab19a1e15c166a0b7d841630c1a42d0e633c595288fc50e27cc786038a9f9236db8b630bc85109fd981bb78b5fad9cffe61588aa62cb1ab04d133cf0d051ac06700adac0bece0a571944148b4c21d7585b2603f31314cf102a91b57c6df97a61
1e46e88827f86fb3bfbf57bdfeab74f1a44a560926b6b9a346b708be7005554c484165ecd24922e94a7ad0c3278b35c6eeba52146c30971cd7e580f7c45360b87378eb7ce3e450c888745d4928889f462a8707c0b3c679cf70610db347e801942218bb6b3b3451c67e243111889575b13a61b3a132fea84b69212ab428214c69c697b34dca2bb3d4c9989f01d7f99ab37a5c1e588b7c61a4fa385b942491c3f0067a166677fc375e0c89c3133ce364295178024ef59b547899a9c590397a0a46a8969f5896b3f4b1040c7be65a277c5990c79b396e6a031cdb924001a7bb6e3661b8ab1928a9fb2b257a1d3176bb99470922c01377c5d07004871267e5c620a6cab34f7c6aecb9a0c62bad9935d9cf261e24965b46a7e02fa08077a26d9fbbef41548d971657a4753458457b97b50248b881046359b75455ce839eef1448f158d2ab6150d047e1d035a71eb5cdefa88766b970ca67272c022ef749380b42282a3b902c9c185a7af574209f47c4b9ac8b62c32b7a5408a7e26a62782c7c14c48745c64092149bd96c70328be5ab433ad83861fb8cc843acce8248f9d80004c52cb182b5e2bd901518060d7753e536a23e7560b446c7752e19f2b4992bd468513b7013735295ca61f1803c8e8954f7c934a44d637c3e6a03905ace6a2285171806e8354dd605cb8489efe97ae66a206e1d5b81ef0977fa187128715b6b47cc8fb2d34bc85fb84cd158782a0d614b4e9077f87c9d34826e9436bff3893707451acf92250a2ee15d6a7684957367e644c0716cfd6b44e1afa5817a2250bea22471a8b515b87562592730ac59d79913874b8710733174ca8cbe9a8a3c2d5bc0d9887e1c43616d983a7ead5abbdd6a173b273bf206e4cb454a4610d1a995bfdf8c1
+Ciphertext: 869358fa975db4257bcf24cac750a2e81b49076f7a63630a06341b01e8564f066fa83b9960735ea8d39e0bf853e601058bfb080745e8571499e902cdad6545753d449720d5a6545249662038e60292fb6a87dc0e770502a69a162cc5c2a9d655355353f5778fad46b5e23f863357ce2b24e9ad74c22b31badf0462320509f41c53b263bc2d628aa765b15a70ce6c14cdb4c2a3e82a3afdb4417c3ad4337989fd7efac827d76df4194f2cbd4a1dfe20ae7029dbc99475f64baf6f4f5eac124f4377c15b03239cedb759f0e2b165ac4394ef3774866c7983bcd435a1c33202a1f55435c230706aa845e9071bebcd71ac7f63577c05306094d255bfdd2ac10e5a1b5dcdba585956ff324f2dfd1360eda2454c42c0ffe91a03c9f9142a4cc8a5cbdfd7aef6f5183377fd8250523368df364b74407828c14b9c46f0419c1b9b837008af2b2e65d969d96453787e1ac6c1f4daf519e9d07595038b09c1881b7a55f05e53c8974b1d2eddce03502a5d99e32e471a40f3f9d674add3466efbd22a9018036d9b7fc1a589de25eeba7f78f69750b4f46ffb312c6f7f9e3f8be75fab1e1dc419c91e3c376bc0a118af51898bafd330c79d67e1adbdc25b2b120c42a0b36e355223401851b8231a08b8047d44bffd84178cf975d273b8bc1a543a353fbae9a1039b90e2b4ae01910e076dc34bb83020ec474e04564dc963d87f23a8d2847d71a481613712039bebbcdd114935cdfff833ae3f33c0e740dfc72e3ff0b20ec011299b686648d750a6526020ee00372afa5d85e7e2db51b7fb7425527e744f792d8982c6449f55a6b3d5ab0cb45db555c54286346a8303cac759ac83d24a994c535ddb8095421f7ab3c29643ab53d4c3be45c6d915655ce757f535aae3416c7f3bb9da18d4fb3ae5afa1bdc55f550d8a208a73c1895ed6cbbc7596eab1933e74e9e8e2ab13721ca539b900c153bd4959d0b1351f5845f28255a67f3f35687bee19bbcafc02c43c4a61d6bd13b7841cca45d2c9f8607e03c034634d878a2fe57622a3756b3debcf44ccdf71656f574dd42e24bcd2787446408628e287708faec72f
+Shared Secret B: 37c620d468a01cd97cb912d27666c3ad76a9b35508ce06620b440d48a988f553
+Shared Secret A: 37c620d468a01cd97cb912d27666c3ad76a9b35508ce06620b440d48a988f553
+Pseudorandom shared Secret A: 753d9caf212c2d33142ead791f7f275615a7f392e819bda0983a35fe4f4f3d4c
+Public Key: 5c33985c59c54ee6cf65945581fc03811c19ee709b2059a376096aca4645c8193f361212a42231e27cb636d413d052cd5652cc5139657974bb307292eeebb2455c9d5bb896185c706d2355770c86b73b8b0935148b3b1504fb316f48108663b4a8806d24e58ae746cce8c50efc80693a8c81841ba20d4a6d99821e94082fd18727d234690b04669b5baa933c486946cc905624cd118c6f88743f6214a3d58589b0cc2c4605dd302035b515bbd4111459762105a5f381af5f932c1586b4246a9db7bc36f96897cea9b030773a84777c88521ea235b749a7cdac1c6fd0eb12285b8bef19b2bdc226756a694abc4da0f8be8f27bb1c65c68388c03023ca4a499d9a783b2a0a2cd0a181b4c2aab6eb87c7aba6244a67d8a3bdcc96066f44aba5eb1ea707558b9a65ff91cb7f443c59b721261a6aa3a8cf4b1828ecd4726c5b681b94b9b6490d1e4264571c3e6643b8c495291a492778b1bac54cb3b4eb8c47fcad99919b691936e7918fcd09847a95020f472ec8f4cd62435d2ba65d871b5513f6917dd788d3818c713bbad6848b645c8c746bb244c8cdacb880fb99aca34a7742f48aca6288c1bc464415c5d9a0636ce07de22182ec5392943129b5e169f6b1334ed76718e735b4970b1a7509934c46320474b3a434a658800a53cb056b1ddb4904e5d2177187c6a7a538bf73a0cf522f85590e8a102ffd74b479348a409834c413ca42a44a32ab6d4842655a6bad12120962b93a2460638b2a6e85b226e4f02eb77c903537a6359836a9f19d2ff4249aac8d2eaa0dd013654565a66b7bb260a2a39024ac99d11c074a23a1813629e1448e6a1875f9bdf46852b7d922f098cf162719c1f0b45ce092b26baffa7874e68c0b098043569b309a6b4af1d1bb423c7493275ca7802bd481c2f28abd58a7530b67c75316271ed5b0561585f3ab110354a1d8c5a8324a3b96a16645eb0831563baec37739e3a7bfc4773988801decc4f89c118886b5716a25a53a4d1e51188e51bdae8164b1133daac247738262f77c683b953859b78eb08c66386913b2d51a85f062dc9c5d1f990df2089e68f4b465f26ff3be96c9d691a4e553d62fad9a95296a2a8267c24390da18ac5dd78fb21487a9
+Secret Key: a4c7aaf1d070d31b432fe83bf4447690f19fde57683f2621a8666dc1f6530191b9b2a8ab02a326fd5aa3c0279f24a537cddccae6aa49f6f00d116011b6c6397d7c169a26b350c202caf64103141d70d9c7fccb2dcb9146d60800b16111e060cc61209ae8f69a1ceb26488506a177011f8735ea721eb01aae35a790a74b5cc3da254ba569c2c819345ac0c4c42ca557889091312d4360eeb7c72f75c282747d80659e2b5500f0dbadfe8b210a178d3750618e458392c61c0e6451d56104be01814331b295eacab08ccfb9f62bca7342632bb093ab49b4c8757bd6553e0a39eb23a0c793b7a55473e82b550781bf52102ed100ae5696c22b63013e63a60938c78ce81e24817dd0f8640fea10d4128e12a047cae6b75638cfe0877dce4abe8419c0eddb794873c02a9b0d485c702d4782e13085fb0a2726986dc55a65afd36148709e41e69cbdca6e4fd419d04c395d3b0ebcf3033dc15b013a72b9456ff698839bd576e7758e525b392e786faca34ed5802d7ba00e5c24b07ad71a158a68fc8c39da220891d5bee5d3c9dc32290ca3bc6ee690b9a5cfbbf08b6e12c16b38a36c67cd86f517a0e192c1d2018f03b48c3aa44100c81bf454f9e0ce965835b29753d8992b7e0a37556b0bcf18bdd3415ca8571818268b801cbf858bcfdf5a4c788b5fe8037fca2100db8084fe42cf999206ccd04dee385479a993258b9525685c7ad2411b5235516353a219acd821a236791db3d759f484b7783652e3c8b0e5ca7ccf014a2d410829b1cd67d01bd4eb35933a934d92cc72c20ba7fa688b666695898e6aa4b8aaf109cb26364ce397d7e6340f17c1d11119c8dc7f6dc20353991fd168c4624273ac84cbfd495f377958a9ec625a252595f02433c430e7719e9077c180230ef3111808fc0c9d9ca4c9fc4d01971981a9a355882db176bc67b08e917c9c75f4702e080b551b17c9f0b713593bc5650b5b87cb76bc9c85abc6c71a1f8e8a61145339aeb4bd14a6b828b5cd675b5b16c694cd0abc21bc31b6a47b6d0519fab55d19a691dea12162f0a16f1c793a45bd25f07e670ab801fa389743983023815c33985c59c54ee6cf65945581fc03811c19ee709b2059a376096aca4645c8193f361212a42231e27cb636d413d052cd5652cc5139657974bb307292eeebb2455c9d5bb896185c706d2355770c86b73b8b0935148b3b1504fb316f48108663b4a8806d24e58ae746cce8c50efc80693a8c81841ba20d4a6d99821e94082fd18727d234690b04669b5baa933c486946cc905624cd118c6f88743f6214a3d58589b0cc2c4605dd302035b515bbd4111459762105a5f381af5f932c1586b4246a9db7bc36f96897cea9b030773a84777c88521ea235b749a7cdac1c6fd0eb12285b8be
f19b2bdc226756a694abc4da0f8be8f27bb1c65c68388c03023ca4a499d9a783b2a0a2cd0a181b4c2aab6eb87c7aba6244a67d8a3bdcc96066f44aba5eb1ea707558b9a65ff91cb7f443c59b721261a6aa3a8cf4b1828ecd4726c5b681b94b9b6490d1e4264571c3e6643b8c495291a492778b1bac54cb3b4eb8c47fcad99919b691936e7918fcd09847a95020f472ec8f4cd62435d2ba65d871b5513f6917dd788d3818c713bbad6848b645c8c746bb244c8cdacb880fb99aca34a7742f48aca6288c1bc464415c5d9a0636ce07de22182ec5392943129b5e169f6b1334ed76718e735b4970b1a7509934c46320474b3a434a658800a53cb056b1ddb4904e5d2177187c6a7a538bf73a0cf522f85590e8a102ffd74b479348a409834c413ca42a44a32ab6d4842655a6bad12120962b93a2460638b2a6e85b226e4f02eb77c903537a6359836a9f19d2ff4249aac8d2eaa0dd013654565a66b7bb260a2a39024ac99d11c074a23a1813629e1448e6a1875f9bdf46852b7d922f098cf162719c1f0b45ce092b26baffa7874e68c0b098043569b309a6b4af1d1bb423c7493275ca7802bd481c2f28abd58a7530b67c75316271ed5b0561585f3ab110354a1d8c5a8324a3b96a16645eb0831563baec37739e3a7bfc4773988801decc4f89c118886b5716a25a53a4d1e51188e51bdae8164b1133daac247738262f77c683b953859b78eb08c66386913b2d51a85f062dc9c5d1f990df2089e68f4b465f26ff3be96c9d691a4e553d62fad9a95296a2a8267c24390da18ac5dd78fb21487a9365ac25e480d982b56546cc33730933edce582858de66243ef6e278b1813fb2218c022eb826812bc213adfa8837d2d76670708477afaaee146d024f7ffce8c2e
+Ciphertext: db77092145da219909e6ca5eaf8c1f9e5b2f8166d0f820023ff02fae9648b56a376faef481ecf1bff4be50792cc243714a83e9746c94b5e440b6f1805a84fa833b38d28a0086fcc6f4f5203fffb4b44b94e727f3c85dfeffebfd7a9968c411d8f7a280ba829a5560e925b26a97e39c6797a608eb8028f7af75a4266ca3c6bea91be3a32364ec907849458d0bbf77a271189a678254a50e54c791922065cf56d74e7963efbf09ce9f0a490e0b956940b58674b7acaea26f9d4baaefc7d9857b0e66ebf8b80616bd1637b861e44e03a3a690b3617a763f956a93eda06cc9b0e058ce5a7618afbf12141c2ad517c6ec4032ed3b61080299611f471316124ee9560315cbae0710525bd67c6ee462179b81ed9b8e142301b53eedde22486a7fb81a1d46b9e0e2ce0f37c040ec44317bf08ee152e5aebcbbe9707525eb58ec493734aadd449522ee93e312339a8ccd1588ebb7ce840d3fabb779b04d4da63f8cbfa741fc9f71e58d0e9fccd98eed43c4fa7e5ab7905a74860d82e1367ef6b8e5446814c14eb2cbd13d150f08c1afb9e7b354a015b9180d98db2e68272476747a38bb6e6e96f7af38607e0e57d9cde09bbb89f44298881a960388130aa34c955ccec4e7c40af6fdec7251f3187ef5a3f621d981a7ea313cb363c395becc7d8b86f47c61fc6ba6395f8475138d7b3230bcc68a425470601077d307968c328783aebeb7882ea1318483176fc921b78bdd6f8f815ba211210c6819a6cf110efa5729e8e53020afdf1c4954d606c481cc951d7bc5a10648bcdc9b432ea6a215bfee605a86b87a2ef1acf00affed2abad53e565ad7035d2fe98100e7ae7abbae1d3558e33bad9adefa63d5cf276510d9314c7189831999c44d90cda28ca44472911b645392a1eb4cc94d80d91173349ef8b84ea172d65c3ccb41d2c99b0a4e20580444072e7319cc25105c4f66f98a4aa85466fd1b16908f14678038cff6a9ee78b278a43e97b2f98380eb0f028dfcda3e56d37ccb519c2613fba35325c3239368eaa052a8fa310efd699e5f056a881ae1d4e782e1efe556254746e22202d46caa6707edc98b
+Shared Secret B: e1f442f2b8905add8582e4682c5061bf334ca777cb784b136628fe8085b18151
+Shared Secret A: e1f442f2b8905add8582e4682c5061bf334ca777cb784b136628fe8085b18151
+Pseudorandom shared Secret A: 3dee2b7285bf4e213b7c2a01777b5cc7906297e1c7b367aaacff623652fde771
+Public Key: 454565875416ead94e31f705df3426a8c04c0ee3b5b69210b9d762280475e29b0fb5990a17854afbfb868a976613f277a4dc879aa8402cfb13291c5b0d0017571860996305dd8222bd808f7cc3a47fc5c4b6616bfd72c3e1e3cca63562c0fc6ee39c179b68696ecb19a92093a17518b69c1b5f4b0a5f7ccb21c058b0805b0d032513261ed4e3a63c2c88f9c01eafc69fdaf4a1d8e14731b4a0296b5f659c17b1262b88f77aa8b58f5642bdb1fcbbd047761e35627f6754e8582dac271d3decad679887b0c802cf4820bf02965a5c90f39c67419a466cfc3622e2b1538b9e16b19831f5ca7e87976dc0adb7fc30729a7e7a936b8a50064c9b6164c4c0b2481efe995291fccccc898778810dc590647d7779bcb1c75a289f4e622700f4a5f4c6501a956752e44aed362694915e177402aac52e1679193b18642ae8b6ce5cbf090ab0f96877fe741d8d648937454ef76c7cac594a7493aca9143d8423ae31579e4cf93c0747870e697f2242c690f83b86b2bb0c7642ff44b0b7d96a2571a99924a1c9c043661759d3e0a644d5916cdb046e8637f3c66c060c67264c446f04a7accb059b90ba3229af243abf42fc3c45d80cd610b920224989b138724294bfcc0b1ecbc532775b23a1480423bc397a17ac263befd5635f464a9b98c64411acb95b7af0b0b1bc7bcc6fac4750c3586a99378a11a85c45c829462491929201333742b64721576ebee4832655cb4e35b163dab11a937f7fca3dab6a41caa20ff24b094e4b60366379a6fa7f82f93688ec4a6181b3f8369e34f8a0d3c1b23a056aa9f5854e03916d93abaa83353fd32b07b12c23497fc1b725cc38150753b85ba312eba7a7d07816a083bddac28f4afc110cf0bf4a535f49f659eb436ab7940101501bcdfb344c79700429c33c3c9f062b970d53802e983c9d016f8d553923e45e7152499716a9927b73ed0a91fa23925ab615f2c3b723222f72da6c426c0ed27c18f258b1b11c61ebebcc8c2612b8faa1e0548ffeb1b801f194dc056e3de978b92684ca410088887caf31a7c43251f6ec5d24451bb3410483e08fc266009340203fea70e993adbf1ae67af704f32c863c1c1d587ef81408bcb19c1bcd874dc5c1f1ce4c
+Secret Key: eed45cb047c53050871393abab373987503691acb3306b9e46b16643565a7d919b11e5c613ba85c17c2161c4cb3055cadbc05e38e245b145cd621c78aaa64a67c284b5c35fcc82b82552c36989b5b8ea6cf8891744339dd36aa15d216e2273a37fd6876102a369148a9058c3805603728895f7e8b8bf5b6c893bb39b99a8b8066364fbc089cb46a4dc020c26bc4fa58c0268c1ebeb2f63609335ec225b7192990377105103d7d128bee39530c5446d84123af43505948f79f188f9f6948dc51cf46a430c84793b9abc22e87984cacd46e6cb79ca5d2a60797e69b36ffc41248276633a7265bca531b130ad21523e5407ce76c98e0ab086897cc833b634b997e488332f8b493211a286457634a48df34789bcb75d42c099e0b0cb93f17daa991e1316c626c026a4c2664ae876e7252d5af2603121c1ff02b0c3a8691a55c0eda72718c5c9a55841222c96e6f4c1cd10cb51b27c9f3c470986cac7a09053b37df070a2dd44501304581e896b81fc0605b90d835562b9b919f8dc1d42037a7419b63e9b76bf135eeae91e6cb71b19a050b85673aa35ac1852843ca79a1a840f140060b5e99f7f8cbd7eaa763f050c8789700699cefa91c3d822bc06a938e8b588d90439c4eb64a4e754c6413ab9639d97ea9dfc18cdfb818399d75d7cd34d33cb7eb047b3936b723c07b67e3b90610c114a2957acfc3008713cbf1b264245b624a419e23607f1a276ec343354fa82ad030ce53567f9d09d1dd69832f5beaa083f011aa383820228d290e439aa8a14b860827fff8097ebe88221e0a029346bf8855f2ce89fa104107bf8259ebb3145d389f3f3556475b341d38494763aa4957abc4b80b2389319d7b9c9c76025e41aa7344e4df7587be136204ab65abc11c32a299c33486a0410427c19f9c1bc7e48a0b32b18e80b1723dc16be8806a485396b7b453690a5b147cf5bd89400f8b0a7354ad1440fe958a6fd815d98d5540e54c098474e88a7050b3cb89dc67df541729b4c11e6361a0fa4153f366ea6732ef5b1cb6229088ec368d6278a4af2bc117b64ef3a3c14781e95b4652cf67b7c5c232c3655454565875416ead94e31f705df3426a8c04c0ee3b5b69210b9d762280475e29b0fb5990a17854afbfb868a976613f277a4dc879aa8402cfb13291c5b0d0017571860996305dd8222bd808f7cc3a47fc5c4b6616bfd72c3e1e3cca63562c0fc6ee39c179b68696ecb19a92093a17518b69c1b5f4b0a5f7ccb21c058b0805b0d032513261ed4e3a63c2c88f9c01eafc69fdaf4a1d8e14731b4a0296b5f659c17b1262b88f77aa8b58f5642bdb1fcbbd047761e35627f6754e8582dac271d3decad679887b0c802cf4820bf02965a5c90f39c67419a466cfc3622e2b1538b9e16b1983
1f5ca7e87976dc0adb7fc30729a7e7a936b8a50064c9b6164c4c0b2481efe995291fccccc898778810dc590647d7779bcb1c75a289f4e622700f4a5f4c6501a956752e44aed362694915e177402aac52e1679193b18642ae8b6ce5cbf090ab0f96877fe741d8d648937454ef76c7cac594a7493aca9143d8423ae31579e4cf93c0747870e697f2242c690f83b86b2bb0c7642ff44b0b7d96a2571a99924a1c9c043661759d3e0a644d5916cdb046e8637f3c66c060c67264c446f04a7accb059b90ba3229af243abf42fc3c45d80cd610b920224989b138724294bfcc0b1ecbc532775b23a1480423bc397a17ac263befd5635f464a9b98c64411acb95b7af0b0b1bc7bcc6fac4750c3586a99378a11a85c45c829462491929201333742b64721576ebee4832655cb4e35b163dab11a937f7fca3dab6a41caa20ff24b094e4b60366379a6fa7f82f93688ec4a6181b3f8369e34f8a0d3c1b23a056aa9f5854e03916d93abaa83353fd32b07b12c23497fc1b725cc38150753b85ba312eba7a7d07816a083bddac28f4afc110cf0bf4a535f49f659eb436ab7940101501bcdfb344c79700429c33c3c9f062b970d53802e983c9d016f8d553923e45e7152499716a9927b73ed0a91fa23925ab615f2c3b723222f72da6c426c0ed27c18f258b1b11c61ebebcc8c2612b8faa1e0548ffeb1b801f194dc056e3de978b92684ca410088887caf31a7c43251f6ec5d24451bb3410483e08fc266009340203fea70e993adbf1ae67af704f32c863c1c1d587ef81408bcb19c1bcd874dc5c1f1ce4cdb392782a0a39579d12ec9b649f19774a53a038ce71f834d6b9b206c163e315677012f0da937e970dcc22b3ed70dbdf9941ef3063a231e0d09c2abfec7dd70d5
+Ciphertext: 3e682f3ccce26981538e7475ed44e770ebfd0a469f517b07ec10036b77f40f405d8b1fd95c55808149e12952b4bd715080947da2236cd025ef0b933d4b2a6eedc7333270e2462ed2a2715fa7fe7ddd457e6d61850f485c8f038bd880f5727296480782e44db92753393981a9fea25c4eeca616eeab21555d27607c243c35b13186bc3b34015790e6e36e47270b176008093717b517feda2828dcc06cff085abf025398dd072307706b7ca588186a734466676e5e386a29b5f178ea413ba9ca32a7bfb8d2a3179983beb6f1d0c9c3dfb43b169e309bdd46315384f2405ebfb50cc70cde4d7dcf460247794384396bfdff66f12c13809ec364c48969f707b569f7464f6042ba6f5da13bd8f2295b280fdf602a2f567a34fc9a34a2df5ef77d301927c8641e9dfa263f94291129af5599cdcb6f4d827f0573ddc86bd807c392517cb0ce0dda2e2d78d0af5b6caf0d02d43cd56e6f83a483a4afdd1259bbdfebcbee07a72b2d1ee295174b4b8884549ef37ef8a73a3f5e8f7edd51657cc5c94d9771377130921007fbdecd3664eb2675c61a06fc4bdbb5723537349dc5d2ff25a177a99a4f2bb8795080aeae430a9b8793dc5f63977f294ce3c65b0be200e9a1047e69204b133124ce288e496e551f3e0c233d13bd25ea279fc6bdd496bbb02882f5fc99ae68d23be99c79e6660a925ca2ef66fc7c754b7b44d6cd0cc6ce242af754be078b65ab70218c754b680b7ac5a409558898994c9b53ab325038582ff67559ce7278b6ac8e094a31f261ca8c1a482d1ec45b585e4582542b898367a26a0ec7c7d341196e38a55d91a51f7a7205398c3ba02fc988504ae986bd2d66fa37109185bc15b9404ba6746ac65ac289ab4d36c449249fe8b0d141669e98246c2f855d69d035d0668f94c68fcd96d713e77c30eb4dff19ad7f2f74a7a458b0cfe8a38e1ee22d884cc8e7a58f1db7f33db4249d92c06d9bc9737560310de9649bed2ff5013486ed5a24cfb65a3caa24c5eaa6c1ff7c514493470c5227e4810a48fbea86d63ebd8956a6dca7b9592bc0e1bfcca898c45ba118925220812bb3d3c14e6923
+Shared Secret B: 7f6cdf513bfc22b1abf87024ed2b2e793cb4f1486b92d7bce535fd45b4dbdefb
+Shared Secret A: 7f6cdf513bfc22b1abf87024ed2b2e793cb4f1486b92d7bce535fd45b4dbdefb
+Pseudorandom shared Secret A: f427b4df77f3191250c22d1c8f2a4da7ce090f2eea701f76a41ce3bb85d804a3
+Public Key: 5d169a5f808dd0c31a9a726abde1ace2a90d7d139e136b91f9e39bee865a77ec718d159bc035c20f425b22c17b26e2153e5b42e861215756797920979772784a669750f2745d1b059e8c66f7a966e953982e6c4d84172de04b3d0f4970e9c53cca65b86535864c90c1c0a786d975408ac307d2624d4fe2ac8d1790cdbc1fcac7a63cfb4b93409ad15c2cef460b2921868d8c61ee3a9a54c57e666a19da277b1d35353b6907aac148ab31386ad1278980b8aef23fd3e30de12b45175cc6eb6411108590b27c37c9cca7d6854c4667237ec370a38218e5858f7673376fa34bbd66030815b837b162a40c8b197775c0380aa85040bef42186265208975a2e488509401c658b9763e2303129c2355a52abaa7d06575b23938778b012546c4dcaa6774b8ba4edb770e202ad8dd5c5f5e4209702b33ff7b2a7f0755b091d02d277811b0901b68445590a046a241ff75cf73117929a46e8a3addf1c11b2157d9100422cc0b72073aff891430e9b292db976e205c7b259b0a13b76f2b7ceb733049a1a1405ccc899e49414394d3b57238e11c432dc879fa93a47b3c2355c8330ab1174c746c7db7f40761f06167ea0472a1ca06e82ec3b35d357b4b9338bd16d749b6248e2c9b5e6cae58c52e626439af8aff7e23d6a2672ad818bccc867d4c9c01814a5b08a6ae95c74399c0172c988547b3ce0921dcd9b02829412127264970baaf5981ca5818797181f9fd54a679c2b99c061ea73b2e61ca2be2762f92539a0444bb5d5b4896bc3a5d692e21623a86337ee90ceb0058169f9005884682666bffb633f7427ca85798f7fec8ebeb2c6cdb0459b7819c15c0382bbb4b673b3bf78b66bf0a60b8489e6367b2213999e527185408f8f95035e9cc32cb55e47d35d75a02e94c5ab99908cf153caf063bbbcc8308d8636fb03391760305788997fd26155d04a8a9809b064ae5d3305225376db053f48b919c28080cb77239b5b9452b40066b88e39016210552afde25d923139453b6aae6c62db177f6f1b0c294ac95ed402118733fb2c6df4a27863e20eb0803142f722d67b6808b6135899019903a940c6623e57909dc297170304b3c1ad32b5c209cf6802e03537177c46d038b88547d93a
+Secret Key: f19906144c2a0a666ed013cfc53006376abcbcf26399388adaa314b8cb55f55710de1033d3a9a6cb1723304cc6e7b96095fb63ed689247030d16909d02034c1c121b1b720a67f4496fda5723871068951b91e862fa4133119c3207a97724f4b029b00eb1aa8f17368706a7b71182cf74c187e96725f924cd9567bb41598d2738b73187b4b0e437b36537a9f1a68ddc95d5265194807f75b294ea535f32991c73fb1f2c5c2e92569b45d29996562ff9ec9d1b9082da69671958ce81e429a532aac9e727f8d435ce27b1e42753f6c4ca11ca6cc9ca12baaba48998a2f1d293bab53a2b21bbf38ba1cc5a0703d646ea5866b66c93333b6165ea50623b45bdc62939f6943a615a52a52f39790f92b3bb486c8c1ed63fca0760d857949bcc66236b4b09d36f2cb50c535c86168920592a1ec4393df974ba8d4320b4191b7981c5569a911f5b3dbc93bddb13a7358b970ad91aef673c90d56e90ebbeeb6132f5630e8c8ba1ec5a357c793dcabc698a457b17261e3abbba78644123a2047152304b03a1d9e755833549cd597dbcb175c26a111cd44e2e975c651b679e600e0e12113a5b7f1058a70f777599685c4e1529a4a55f5e60bcb9658dd56b86bef43a0014cf308aa0611b5eab6c39b070add8d85b9bfbc044431992611a3f2a15d677538e099234f886b032638507d0a7f86b9821779da268df67909d4738a6512c10308f2339128e26c96761a9d855c0a16c68fdacbf00f284a3276e65b4ae9ac0b7648b9930ab981c80072e737f2a7118e8c56ab7378eb7106f288119a644c09b462227264bcf9876d6b454f5a033e574745f482bd42b0ad3e0aac7c8bf1c07126446216bbc83e02180e8926b48cc08e6a3b1caa2786b952f655bae0dbb34f2706c428651a7a45ec32955597537a85955fe7679bee7300feca24ab40e6588b5dd5a157f540c31d284aad85328635578b0883efc1b7ad0a62dea5f4df192ad281095280192424ed89a129c003b7bea711c96b552f64403a963f36309a0c81509444eee6c4407a41940dc6e7f36c34cc44611c438d87510d2373a5a2ba9a12a017d3576c9a38b5d169a5f808dd0c31a9a726abde1ace2a90d7d139e136b91f9e39bee865a77ec718d159bc035c20f425b22c17b26e2153e5b42e861215756797920979772784a669750f2745d1b059e8c66f7a966e953982e6c4d84172de04b3d0f4970e9c53cca65b86535864c90c1c0a786d975408ac307d2624d4fe2ac8d1790cdbc1fcac7a63cfb4b93409ad15c2cef460b2921868d8c61ee3a9a54c57e666a19da277b1d35353b6907aac148ab31386ad1278980b8aef23fd3e30de12b45175cc6eb6411108590b27c37c9cca7d6854c4667237ec370a38218e5858f7673376fa34bbd66030
815b837b162a40c8b197775c0380aa85040bef42186265208975a2e488509401c658b9763e2303129c2355a52abaa7d06575b23938778b012546c4dcaa6774b8ba4edb770e202ad8dd5c5f5e4209702b33ff7b2a7f0755b091d02d277811b0901b68445590a046a241ff75cf73117929a46e8a3addf1c11b2157d9100422cc0b72073aff891430e9b292db976e205c7b259b0a13b76f2b7ceb733049a1a1405ccc899e49414394d3b57238e11c432dc879fa93a47b3c2355c8330ab1174c746c7db7f40761f06167ea0472a1ca06e82ec3b35d357b4b9338bd16d749b6248e2c9b5e6cae58c52e626439af8aff7e23d6a2672ad818bccc867d4c9c01814a5b08a6ae95c74399c0172c988547b3ce0921dcd9b02829412127264970baaf5981ca5818797181f9fd54a679c2b99c061ea73b2e61ca2be2762f92539a0444bb5d5b4896bc3a5d692e21623a86337ee90ceb0058169f9005884682666bffb633f7427ca85798f7fec8ebeb2c6cdb0459b7819c15c0382bbb4b673b3bf78b66bf0a60b8489e6367b2213999e527185408f8f95035e9cc32cb55e47d35d75a02e94c5ab99908cf153caf063bbbcc8308d8636fb03391760305788997fd26155d04a8a9809b064ae5d3305225376db053f48b919c28080cb77239b5b9452b40066b88e39016210552afde25d923139453b6aae6c62db177f6f1b0c294ac95ed402118733fb2c6df4a27863e20eb0803142f722d67b6808b6135899019903a940c6623e57909dc297170304b3c1ad32b5c209cf6802e03537177c46d038b88547d93a3cd1e410fbbea1f4989ce41f43dece2f5d4166eac421e870e46b4905e2b981bcb0f26cc365aa65dd03be98ee2b7694ab417522c26b7c4b558f66ebfd660713f6
+Ciphertext: 9e303d077f85f98042690ddd933bb9e93bb53f31e4fddd2752ea0452fc2d5364167a3bbdfcc8d8159dd7ba8499e61178a5e4c4d57e45060783fee40074ad1ab07e017cfaf3bfeb450b9e69fdab3337adcf63ee9d453b082fbbc3422313ba043ce385ec8ae2b3baf0c0af59de226746cd5ad9a101f8de7e77c325ed7e2f25755eb0a51b3d1b792df8502e170a8207ffc14c59e5c9759b9f1ab0ee05be5c34989ac49f0ce879223260dfb7228e0675cfb4f88b5b81bb17d441492abe6ddeb775b37d2a775e9150460f5633a959db783f1a41f7ff6d9b4a863ddff2fa909c4b724791e03e9b710c7106e3c5617a5a325501ac0c44bba534478d3671ec9c4b048af025490af4a610ccaa4cc0eb94e62463b06666e1882990c2eb95301cdc5fe3987334bdbba9c060001f060b5d5cb890864b68c1178d87ddd988367255d0c9975f6ee8ed426bffc89eeb88cb88c8a8afc9ada4adebccc077436b58f0fdeb25c8c8b1488531794e66dd89311898c4a42f9aed6d5f23a62c574850b7fb6f1296e4f4476f917d56af4b8c15a70d6761c53118856e7cdf73ca309af83adb1148d51b9ae03ae27ba9b1bceff74658d341d889113526aa952f8cfda8e5356a19d87270e43cc252f5d1058eb9e43111cb62eb9247d52e51671b6214f9df476c311cb1e36ff0dab75a53807c0c9b924ee8ac5b126eb4564562c2a24353fa552c022d6ed32861fd9d844a43fc4b1dcbf81edbb00ca6d46ce68e1fe480bfd562bfacd396a1bd0b306215e545021512be944b1b9d63d45fdf70c3c8252131d027c2f762a66ad985fb451604d93a032382eb987102359e6eb7e89fd04f1699ad3c72ffa957ae716018992bde3419384bc7654825c43fcdd493b4524990053e6c50a52f72a01af18601d8688ad145257e9afff64540fd129ab8539dcfc70d7b7a1d9cabcecb968a1a4562de3f76f717b830f37683fae9904f6a60c38d69d8d8b056beac1760551a9b2691b4c1982a5e9b467a6662ca3cf846d2ccc5dfd8bb3ca2188eeb467e004afc5284f5a07a68f48f8a4236387e3f4b1144ec4aff196e732a6d0d33047cd2b840
+Shared Secret B: 5c7c7dee5c6f58b7dae85524e9419b5c8e6c1aa013e3cc6ff6a54d7d2eda55c8
+Shared Secret A: 5c7c7dee5c6f58b7dae85524e9419b5c8e6c1aa013e3cc6ff6a54d7d2eda55c8
+Pseudorandom shared Secret A: 8fbb7dbec1cc16cdeccb8310bfdda432c943036f8c6074373ef901d8af4b06d4
+Public Key: 496214910a4c1ab31d490c98723cb95709345adc3cd7a4aee1a6758cfa8147ab2daa288bc6e3055ab54ff238540325bd0f27676344c7dc3247141bc9610b17c56106f5f7c6cd7903172b4473287ed4f027efa51556c14bcf627d4b461916599278e1bc67022d5ce36c40f8543197a48c556cb3c3a71e7c2e56ea7022758a8bf8403196a5595884119737fcc79ea38110185310522c4cca625a02a53ad55648c7cb07bd955faef6780df25ebbba6d406601e81b8ccafb930072437bb59e5fb2049393977f1c6e9b0a32b2e4472673287a3896b4d15d7020798a448a42fcc8e7302cf524a9f691190f0448b3db404aba19b1992e487c3b256771a9c15e7c026c84797611f461f8a19fd9a07faddc7297395d4633488290836424aaaa58c31b163ffd774e3e48bec8db62e18a7961515db8db3e8a467da2a51173606b7ce9b621046a0786422638adee747faa34b91718a396988ed936ccfba28ba91c5163b732cd6079649ba450884f2eb93f487327833b2329745cc56952dbf63ffa62cf4e937e854147fe9ac85a6673a1330b43d8a95903607bec917b82ad65f895a53588a6fc24b680ae5fc168c3005e0bd972df596949b607fc74443ce34c73ea63ad903ea2c300b5833cc7ba2a89027b7beca65691a823c5c922207525e75f8e28b55a76b12092cff44a1062a7a0b7015ead3121d897c1ced1a687c51983a5786fc0a913f57cbdb6be8e805c1fd86c0bb4b7a219921b503cb1483b3e303cca8b7ce28495ec9a7f544a1991b96f5efc7cdab968e42088d8c97fadec5435b5c5d01c06dd9193af7c24bbd77f246886edc505e3776eede5bee9dc79f409076a5cac8dc1507e649114d2328f521ffc4aa594fb49012b3ef2a6a466328eeaa855bf30a14f351931f13039516394c4b36a28622b0bc382a9cfa8fb1fdd2214e9e77b4f6657ea76cdad6c220bd7266e179c3246a86e0179812334f9253c7e314482bc9ac0a64d82dc8c6fe6a679f52045907408e960e5d29fc536b206d99aa6c99396564ced5659bd0c5ade413a6d742aa12a9da62c1577854a105a39507b0af21b011575057182038b70e2f150189b54bc766293d2da5ae7da0f55414fd6d8227aa56b223351155f
+Secret Key: e1cacbff555404389550c150db495ab8e602f1a11bd7005e4c8262de03c2e6704eb3702215c2a45baa1095038bff730b00f4935d3a3a8fc7cc8825635aa20d87f9c1858093bf08c0e1618251571610c51507c19aa6832e70688c65ea2159a15226e92519e00b2f023933c6bf29cb616b723c554ac613365675735946b65b6dc04d29469e67a892c04437e149802514abe45cbc1610b2200134ca666728b7bb7a17b959d64a3fec7087a21ca92c5e0e13188bfaa7b787af26968bd5b7395c670ffc41cd12f452437a63bbf85e3361704a1a50060b917a3c04a01676a0e3ae0a438370e4ca6b00483c63840f3bacc53727a60c4ad056097af56393c66403c927d24aa39e1c4910260015e4a1d238c35dc0848c5b8deaf953b7815686ca346fe60476fa45aa538dba1b2663c2062415c06c5a02b2aa6eda703e27d161bab2a7bd760922a3a8cfab93ee8153c9f4548439c590a0a834597f5ed7809cb030acbb26983568d3325766eb6d9a71372098406b5077c94734a6b0b4ae6ca09db09e3881181ed8616b24343c202f0816216e134cc3773229db12c0ba956cd062c4b8650fd82a626990b0583d84a86357bc4ff89cac3eea57658055c96a659f1a163a2ac32346a61b0c61008dabd32b1698e800489b3baeb7844b6474f73b51c45c3f90593e031531b3a0245090964df04aa074c32fd4a56c380b21fa7e60276935fb574401a4f5e02aa15c28cccb6a2a80576c0c6fd5760ad8fa509106a6bd556e14ac7e486b69a25b02a1d410a0e873762ac9ccb35b1de490de12cca2e2a8f3aa4f57aa7333a92858c6413f310f01ac10fce202adaa3cb9c0012218ba548515ed852438c80273b11c099771b2b33ca6a2aa0c0b2a52059254f85310a4b130117d7acba5e715128f4742f0a606ee628fc27395f6464a7b142c6bb1ae4551324bc203ca6154aa96b02f1b3984222f10e1361d203135f87a50f16c9b5bca56b2bc0c8328f6615810c92ec54cb64422ae6de334c548495e64cfbf77741863cc2629344dd560b101a26850ba002210d2e64a58294b41bb3c8d7c048135457660936f639618126f496214910a4c1ab31d490c98723cb95709345adc3cd7a4aee1a6758cfa8147ab2daa288bc6e3055ab54ff238540325bd0f27676344c7dc3247141bc9610b17c56106f5f7c6cd7903172b4473287ed4f027efa51556c14bcf627d4b461916599278e1bc67022d5ce36c40f8543197a48c556cb3c3a71e7c2e56ea7022758a8bf8403196a5595884119737fcc79ea38110185310522c4cca625a02a53ad55648c7cb07bd955faef6780df25ebbba6d406601e81b8ccafb930072437bb59e5fb2049393977f1c6e9b0a32b2e4472673287a3896b4d15d7020798a448a42fcc8e7302cf
524a9f691190f0448b3db404aba19b1992e487c3b256771a9c15e7c026c84797611f461f8a19fd9a07faddc7297395d4633488290836424aaaa58c31b163ffd774e3e48bec8db62e18a7961515db8db3e8a467da2a51173606b7ce9b621046a0786422638adee747faa34b91718a396988ed936ccfba28ba91c5163b732cd6079649ba450884f2eb93f487327833b2329745cc56952dbf63ffa62cf4e937e854147fe9ac85a6673a1330b43d8a95903607bec917b82ad65f895a53588a6fc24b680ae5fc168c3005e0bd972df596949b607fc74443ce34c73ea63ad903ea2c300b5833cc7ba2a89027b7beca65691a823c5c922207525e75f8e28b55a76b12092cff44a1062a7a0b7015ead3121d897c1ced1a687c51983a5786fc0a913f57cbdb6be8e805c1fd86c0bb4b7a219921b503cb1483b3e303cca8b7ce28495ec9a7f544a1991b96f5efc7cdab968e42088d8c97fadec5435b5c5d01c06dd9193af7c24bbd77f246886edc505e3776eede5bee9dc79f409076a5cac8dc1507e649114d2328f521ffc4aa594fb49012b3ef2a6a466328eeaa855bf30a14f351931f13039516394c4b36a28622b0bc382a9cfa8fb1fdd2214e9e77b4f6657ea76cdad6c220bd7266e179c3246a86e0179812334f9253c7e314482bc9ac0a64d82dc8c6fe6a679f52045907408e960e5d29fc536b206d99aa6c99396564ced5659bd0c5ade413a6d742aa12a9da62c1577854a105a39507b0af21b011575057182038b70e2f150189b54bc766293d2da5ae7da0f55414fd6d8227aa56b223351155fcf0b1795707d86980c5af131f900be5e001a203a3d3cdb01147ae66634f744f4e259d98c355d79a617d33094c565315fcb98fe2171009621d085febc8ca7f72b
+Ciphertext: 16363a512d4c7e7d0c65c39871f565cc5efcbc3f596227477478466e70aa815fa943b9be75b820ce4545b277a45b53a4c1778d9a19feb235b52b8332f29b69942adaddf9ef48c5e9c62e0131a6cd4e5ffecc083786ad9705777ed4e79e80d322994738c2ae6b8a896f676b3f3d1099f06ddd1223de78489b6a2830007d6a52c95856896e544927b8f51c8c09208253caa3bf097bde0c87564860a856e6b69174c5e3c5b9edb359e406de1e979b1b82fa587074ac4caf21795bac05845f81d513a5716452281ed166885089d4019d05e26fc438d3f4c8837d3ef0e3fc1753c48bcda2a7728c7a83d9d897f1eb1c77299b0ed38079e6a6db1d4a310719a14348dc374bd6b9fb4cea5e5141b666a310165695dcadce27e8f0e4b5add0551ff752b4d82f871ad27227d754f0b6db92697234d643dd83ecedf437e50d7efd4c24e7c8b47f191991634f6c0da9762badd9449e9104e20a7583b546d49a46d24e0bc134ba7527f2011fb0df3cef10a09379ad1f30ab2bcfc34ecd833e71c9b61be2b576b82edf7f626435bd707124ea339dec4734b22ee7d719441f8d3e2bb048f4d8b0f1e0d60368188047af6d2a85a28036046fe2a42739d563bc8e7a6d1a80276b5267deef8fa270abc5b68165349d1d697a4e1669b0f14943b24a6348dee2695ae22a1dde34c35b43f8c74100d7cc86b78922d1ac6e3c77bdbd1e1fcfb72e3416cd4b4538f50362499bbe96d6c8637b81141fa12b2ffd7533d755c42515e71e80c8004b654f50f3a8e5a916f1d19476b6a877f2c08073d4b6564f1f736b98452a4f87823e01b5a373e0e6514ccf652a7884a431ffecdb841d827f6b5b2d4b910475f17132e161c5cc6a3693d25559f004f44bfe0fedb7ebcf1f4822de864cb0294398a8976facb09622bfd26b5cd4fd0405fa5e0e620af0aecbdef0e4f135cb9e3ef9bd78a0f4c7d4f0aff6ec532e37b796422118a103ced0f93d1bfef2a70768510da736d58011b64904b4372c66828b807635832fb60608b7cdb2e79c750834b938c6d13c3be86fd1f807e39eaa5cf1a00f2d16dfd4d5b9455aff3e4658dfb3dd
+Shared Secret B: cdf87ef35ea62de73166cda405efe170a3fc319b1a99401ec5f32b9cd9b6cb6b
+Shared Secret A: cdf87ef35ea62de73166cda405efe170a3fc319b1a99401ec5f32b9cd9b6cb6b
+Pseudorandom shared Secret A: 09f7731a309e61fddbec82c1e23385a5cc8ed8030cd9f74a2496d9fd82aeda9c
+Public Key: 85bc45051aab867175fa37b3efbb45293a0e0a5215aa2424d2e64fa787908537aaf9ab254bd89f5d92341c7b81d744b40df30fcf884a62261a948257a819c013d71237b799fa670d43b9b20fc72c694b2bb8d7675e8395a0110eaf5b38364873e9a4c1d30756d0a22f280bce983012f30206fa41c1e5b64c863a2ef88b1bb7105ea141327f15849df6427197930aa8b6b0b30bba2748da7708efb20d116b4c72780e50629f26b63f30879bba811b0e1046994a7f3e61677a1c1fd1f4a2b6701183d1b7fd21bb478a1e745b3c51858d75dc787f4455e4e62b76fa524c9a9aaee6c2499c2720630195d09aff2b5d297098a8b584694189e3fb4c5959cb84a765e0f844ae85510a1994847c509cb7b9e366b4c75aca07d89852d807b5c00cc6a2a94cc566cc9a3cbee32d89dbb0df0c9634dc356c282e69432778462d877a3cc72c2f320b8f9f54b116e3b5e29a6fb4e14c46389993964f1784bea4c85063e42baab55d7a74c999da99eca8987ce14b2ccc78e4b11e1b584cb5075c52669146fab493321d025393cfa6a49fd57d30ccaf264145e3d9015348149248abf32b859ccba971d273183791ecd26b922a6728bcc980b63c4aeac02a886e9316564eac023e94c3f3516f4c82af0d25acb2b41aeaa40eb6ec17d152b6dd77bec0937dcaa5c455f15aa3e3822c317c306763c7d209c5183d58d85246e83aca0705b1b6564d3047278bcb0b985105502dd810785f658dcdf374a61b77836a001dea3fba828c7507b0e5b76ee28302cde0605642be0e546bb2a10606fa222e9aa668095421430b9e439543a7a83900488079a959dc9c9cd3aa5c205cd46c467f967acfd9b3163672939a543757937c5ccd1ac664f4b79770b0a06d5b30808a4c4632c24f4423046b9c88113e85575be69987f27ccd22ba32cdbc8948338a64fc5e3c5227dff4766665b25fe66276122d6e950c66c9635b9414b4092cf23220677160d16b9b3ec58f4328ad6d9a8e309ab89602601e4545bebb506580025d91b20c699853a88eaba0bebe596acb714e7b2c2ea982991a950f351a75f2455836f7516835b9ef1766471758e0769216d0bddb4e3361dbb9e2257ccb2ab9c0bc0d01f5e6cc903abf86
+Secret Key: 32c7c983e9a556bb542f3627eaf84a8897695d6cb6acfc001e248ca06152f75b2f72faae28755d5b8126628c4be13bb368f9b8be7a352754c7eb7c1b039c8ae48c851a18a96097b6f18099e3db4a19bbb396b00db223841d4a6623f328633467061362980cca0b3165acb89766765091d32b45f6993b98a8b34a7ab9fb2bafe0bf0eac2a06a17c727b5395bbc41c341699066a72db49f7cb8d60db1848b9c69fecb62ce84150ca15a26b2c529c858c66c66b35018847b57602675a5acfeca38c58cc34a36a3e0474aab95137d95141086a49fc4aaa61bb67f2287e8531ae523184af4966ce4b1cd890038427c1f1c6ca6c33a689e2c084f12447009f78c571a247708baa9e77c188d052a3010a3202152688d89776c4a53ba681ed35a0d97133ceecb226a7b6967655c5dc744df6cc656935a65313d34997458a17b3a37b96f31eb124295968bde64a57a383474b0a593a43b705307bdd20bce869394160c59321b123778502633c7b442f6850bca0308b2de6a80b110c36322cd20189c1917792e9b0565795f6da552559715f8442769b234f32cf911c66116271a7c93a5ca49a56794ac5971870ec1ede66a4b49ab93e01b21a8165967376ca1b74b3c8623958ce6c89b7e9ab11aa6932869ca004b402af8c181c731d2bb1664ff85f510b3939717da2f350885c7b9ba59511891f4987a8399a83b371417e247c08587ec3b80b76197dbb342d25b6cafae585a0a621f9d5be542394fd9acb8555340540cf4e90082034b38cc73bafe65298506310bbce92f6c6468b6d55729ab2850cf5e4ba21db2fe694b40085197a49730c7ab4113238acf0257a55c612f8ced96028d70469fbac3f1e7618730b247171abc39232e485abfe02cd5271b41056176230073c659c8b489e5bd151dc2a69d5578c739c6f4232132494bf189b7a39b89125310320b1b445e3612ed89d3c4cb91cd08a9ab2bb9b66cf9a6ba427923686c8c86f0518700919d16c9615ba9ec6163884c74d2237cc939808b2772084279be55c29a0a073be6184e1ccb3fbdb1cddc466a10b20a1c8ad0034b9695462dd0b372d745a85bc45051aab867175fa37b3efbb45293a0e0a5215aa2424d2e64fa787908537aaf9ab254bd89f5d92341c7b81d744b40df30fcf884a62261a948257a819c013d71237b799fa670d43b9b20fc72c694b2bb8d7675e8395a0110eaf5b38364873e9a4c1d30756d0a22f280bce983012f30206fa41c1e5b64c863a2ef88b1bb7105ea141327f15849df6427197930aa8b6b0b30bba2748da7708efb20d116b4c72780e50629f26b63f30879bba811b0e1046994a7f3e61677a1c1fd1f4a2b6701183d1b7fd21bb478a1e745b3c51858d75dc787f4455e4e62b76fa524c9a9aaee6c24
99c2720630195d09aff2b5d297098a8b584694189e3fb4c5959cb84a765e0f844ae85510a1994847c509cb7b9e366b4c75aca07d89852d807b5c00cc6a2a94cc566cc9a3cbee32d89dbb0df0c9634dc356c282e69432778462d877a3cc72c2f320b8f9f54b116e3b5e29a6fb4e14c46389993964f1784bea4c85063e42baab55d7a74c999da99eca8987ce14b2ccc78e4b11e1b584cb5075c52669146fab493321d025393cfa6a49fd57d30ccaf264145e3d9015348149248abf32b859ccba971d273183791ecd26b922a6728bcc980b63c4aeac02a886e9316564eac023e94c3f3516f4c82af0d25acb2b41aeaa40eb6ec17d152b6dd77bec0937dcaa5c455f15aa3e3822c317c306763c7d209c5183d58d85246e83aca0705b1b6564d3047278bcb0b985105502dd810785f658dcdf374a61b77836a001dea3fba828c7507b0e5b76ee28302cde0605642be0e546bb2a10606fa222e9aa668095421430b9e439543a7a83900488079a959dc9c9cd3aa5c205cd46c467f967acfd9b3163672939a543757937c5ccd1ac664f4b79770b0a06d5b30808a4c4632c24f4423046b9c88113e85575be69987f27ccd22ba32cdbc8948338a64fc5e3c5227dff4766665b25fe66276122d6e950c66c9635b9414b4092cf23220677160d16b9b3ec58f4328ad6d9a8e309ab89602601e4545bebb506580025d91b20c699853a88eaba0bebe596acb714e7b2c2ea982991a950f351a75f2455836f7516835b9ef1766471758e0769216d0bddb4e3361dbb9e2257ccb2ab9c0bc0d01f5e6cc903abf862fc1dbb20b8d544bd24efe207b64a3ecc89dc65ca0047e76e409b24ff2079418b23e49444957aed4a91fe7c694807b97072fa9ca9ebcd2a7979e395f4bfed326
+Ciphertext: 7b4b834a220d92c5b4cbab69931944ff04a632b4ec7c415ccae03dda32c6fa28ab7a43ab25a28ca0b928d9eccfa34731be726d83b7380b8bb9c55ed56496e36d194d530baff0984231677b5d6048f675f98ff2df225d2e4ab3a2a5dc30c5ee99c465ef0e07938f843b0bdc2357473488deae24efcb4e7faa59696fdefbe8d5b716e1fa62b4787c36ad3cfbc655dacc592a82cdf2ef4139630b4e7e086e5cbc63e37ee44e61e05b71ea277311bf701d8887ffcf2008614d734b840930727799eae79076b2b7e1a18cf6122c235c32905a2902a167524fc0fc80805e196b4008c4ac69bc666f345a2a1eb4228d2d8a297267d34518a9141ca4ea6fe6d2cfd12e7cbad1e7ccb95c1bca679128c04e2ea15be173b96eaffa6af22a0d25eab3757d0029c8dd19a17c3d5e503eaff24c8fec80197ba5757198fbce5b15dc90e1476a595a8649738a2a593bde4af7735f3fccd3e4dd072075fb3f7107a53037490b271e56494121d2269a6e34cdeea3f7efe5111b706e25284068367a645f4d7c3731b2af34df0dd778ef833d9d11b938aeb6f630977bebf8f82b50d9425cf2f5042a08475ef574794f04f07b141495b643d96e5eaecf0555d69794c18b6364bca2a8e8248f70b67788b984b37df01b3ba805653d685d38f67e9d0959d31009f60ff978d76df9b34b086e9d2eaa72d8c520e79ed2fcdff017b0bbe2f3b8a5ba883cbad99009c1e0f1b7c6788d8d2ff7c2aaa381c14fc92695b9b92059182493e8a2afbb8241423a96d7e09ad648019ac68dee71b6b5cdfaef02b9e085de8d30356ef1074d8a8b0ccb91c7950e34113c37c8980816fdfad8f685263ab90300ada91817fc23159a4b783197294f09b51f825f7369cdf4c169f977c714fe4dba5f97d9c8ff924ba80a6ff27bdc38443c9c725d5005c9c70a28f47e77631333db5e36f078353f9e77b5f347ea905c3f7f6a3cebe22fc336de0c85821f0774b0e2171fd60f48e8c4c48772cf547584d4eba290a27a3cf509a2bc596d05446b038bcb6a67d027adc0ce61b9a6c557a2882deb8151dea53873c6c5cc94f371aae28ae6bfaf4f99
+Shared Secret B: 5211eff26a6ab9363092fd0013fb3697af95da08c38002c4757a76ed9373f922
+Shared Secret A: 5211eff26a6ab9363092fd0013fb3697af95da08c38002c4757a76ed9373f922
+Pseudorandom shared Secret A: f7647fd5311e7af7a080e0e44c5a40b693e498a56812c075d53be016a4b54c9a
+Public Key: f0ba5604d56aab0b79d9249aac764ca121781dc7a0b5347254c8a3475cb3fe8c366361413d0a61ee9429605743d51953a40a40464c6487201f365068aca831e3812010a5adf55565357c7b3bc3655ab86aad8bcd1543b6236513c32259b7e634c6b104ebe8bc0524a0eeebba16e687496154d1d3c8b90a45adb7ae610c0f21b04919430a85884f62c774b4fb77af618e41b0a77ea22edf09923714c30cd5ba1c010a5dc8b0a0d01bd6333fa95a6f1540c01bd15776f355aeda0a21067b591b35d5d743defb2f181a22a5d54b3117b20d148a8c005d6db97fe41a9eb08188f8b554a9d1c7510a12e3c0788027388af380221a4654f04e45b29bd2a03571b28d7dd366e812b98e8210e0567c1561105a511c4967367dd8b325a1764b797317ec00eb635b43997b76e57fe17a1e7663198ee182faa07bd93715b8f00507eb374c155eb1559be9128b68b6b2b6076ee539a4ae4742aa096200f7265f848cf35881baa0553090b60b2a31f46413cdc9458fc83801b45202fbb4830089c8697c54f08308512682f4406786396a6026ce8252863c0fcc9a39906c6e21e51b602890a795996d668d12500f12b056173a5329966ef1a9407a905407681f3d30a44b7a70c2c57e332a6baa39cb8d243159091c7e3667c6d548f5cc08cc7c753dcb347c5c33a63251b907ae5cc3245d034fa16c5e1796782cf04d5dc4a43e02c99b8a935fe81db5102e358065ef41619704b7b74b7fbf60a19bdc68053a09e76914d54658c0642d304b82658b4c8936bbef7b067609b78d256388a6117e60783f702635676901e342445977fa1cc7d6a000cabb44246634edf93cd653b7911bcd93071a4b2813c142294011710a1a925215334481562e9322b9b175859465ef18bc69c58c45ea1fe6b3cdb7573721116c985c50b54aa1272850c3900e7952943486cd39410e58616e95570b06c72bb7a55fc1539c73851abd220549914f65b6b676b315384842750640999477ded7cd03a4b5436190cdc2b1ea44616a61b82a221f62b1a14f88b327e5439bca002308842a127532d0724a6c8f26a7b2c6901d6e7462bd784be67e64af7c8ec3f959bea0592517a0e77d4fcafd290e150af3031114d768581a
+Secret Key: 05193adee774f5a68696db45d26751c8c291e5635ef5fc1a763b8d484716a1c512dfb4b6c0196a835126c0d7bcf63197a8f7212ad6860158b998a1b5a274876c06779ebb7a5ad670b28592549103ab1bb4fe5946d57cbf0c188e46db86f1456e5f5a019f59b860221188e0c9a31708df9a7e04019c23b028a921c065d577547cb7614727d561a8ee3c553716c8b4326c72bb3054091d21c20957c0992447655a6b4a48765346f74293d9cfb4d5c08126b46aeb743114056cf1a8d09c2965c389026655ff058c52d9b96edcabbec05465b919a10a0b02b440fad848626a8d543a2874762406fab1b6b3394fb02d308a78abf6c3d0e455a61ca4196422090c1b933642c7c002098c3562e62bc8cb4b5e972485a69e95a03a9738895b98155a902c05940d92d577a6e05c837b91b9a031093899553696b97ca9fdb1cd9cc981d6887b474623be2316acf453b8480043a01f548b1236f7183b506f1378bc573c068b74b4ce66bed0dc263ba44c0e10c081d9030fca5049b8491e8a028cc85adc0ccaf12bacdb905420686bd51809b37928ea21a99dbb302008cd4c3c9f224320c9908ff54a262ec9308c4aae0b0a566b5b71f5ab00e90b4df47835466226a5e93be0414b57a46ca8c301b3e37334c182f5d29deb1131f5bca98745815a981d2085ad717152860b33ab454ae7559c01697ecf653c98a02d8ae3b7b2620580a0a8f12a13de058169e7cee53b64db1217b9550a641777dcfa4c9be0a4cb4b43d9f54da7272ce2c69b6649c1c3288b2c9747c4b87a2705187ea584b8147851dc6906794560bc8d59f48b19e098ebaccd300c704a8805ef44187d28707b72812b9aabecba0ed6e5c207b8c20300496d1362c187cb6f0a44a984a93eb8a732eca9c7e9582aa87bd6c40f3848808323b4ed4c5b489327ff56709073321cc06c2d184f6054cec23b53fb797472d9432ba8550f505d706ac891136126d63ba1893e961873ceb6563c23b1066aaa9b11bcbfd978b3196ded2a4d7ab49675fa5e1a3338d910015064b833ecac6908cbd792720ad7cceed2a380f35d6af791f45b1f548646e7c70ef0ba5604d56aab0b79d9249aac764ca121781dc7a0b5347254c8a3475cb3fe8c366361413d0a61ee9429605743d51953a40a40464c6487201f365068aca831e3812010a5adf55565357c7b3bc3655ab86aad8bcd1543b6236513c32259b7e634c6b104ebe8bc0524a0eeebba16e687496154d1d3c8b90a45adb7ae610c0f21b04919430a85884f62c774b4fb77af618e41b0a77ea22edf09923714c30cd5ba1c010a5dc8b0a0d01bd6333fa95a6f1540c01bd15776f355aeda0a21067b591b35d5d743defb2f181a22a5d54b3117b20d148a8c005d6db97fe41a9eb08188f8b554a
9d1c7510a12e3c0788027388af380221a4654f04e45b29bd2a03571b28d7dd366e812b98e8210e0567c1561105a511c4967367dd8b325a1764b797317ec00eb635b43997b76e57fe17a1e7663198ee182faa07bd93715b8f00507eb374c155eb1559be9128b68b6b2b6076ee539a4ae4742aa096200f7265f848cf35881baa0553090b60b2a31f46413cdc9458fc83801b45202fbb4830089c8697c54f08308512682f4406786396a6026ce8252863c0fcc9a39906c6e21e51b602890a795996d668d12500f12b056173a5329966ef1a9407a905407681f3d30a44b7a70c2c57e332a6baa39cb8d243159091c7e3667c6d548f5cc08cc7c753dcb347c5c33a63251b907ae5cc3245d034fa16c5e1796782cf04d5dc4a43e02c99b8a935fe81db5102e358065ef41619704b7b74b7fbf60a19bdc68053a09e76914d54658c0642d304b82658b4c8936bbef7b067609b78d256388a6117e60783f702635676901e342445977fa1cc7d6a000cabb44246634edf93cd653b7911bcd93071a4b2813c142294011710a1a925215334481562e9322b9b175859465ef18bc69c58c45ea1fe6b3cdb7573721116c985c50b54aa1272850c3900e7952943486cd39410e58616e95570b06c72bb7a55fc1539c73851abd220549914f65b6b676b315384842750640999477ded7cd03a4b5436190cdc2b1ea44616a61b82a221f62b1a14f88b327e5439bca002308842a127532d0724a6c8f26a7b2c6901d6e7462bd784be67e64af7c8ec3f959bea0592517a0e77d4fcafd290e150af3031114d768581ac1ac0e896559e36b0cee6ad0d647606e3419bc4b9ab09bc88ae0b3b378f37be470c1bfc6cf1c8312b6a74ad242466d09c84c5769719e04dab40bfc9e883a5645
+Ciphertext: 41732ccec9b0543380f01299066ab5f3b063832a8a286b50547fb38f0f9bdc1915cc5ed9e94497635c3acee3e6f994f943b7af722bb407134268fffb90055cf629abf167ea787b4d4516d033de0625faf57030d0faa20058859ead3959a9fecdb8477e3af2e55c56849087d2c181202285b6bc9af97433e81cb249e8b4f0989ac8c65089b9148f78ac61df2864d5c6e4b0baceb005cc679e7edd9933f0a99f6f1ae7416cb96f3886ee7b21bbf4386fc4ed9ad8fbae6baee4cb4b25aa2020d03a4b504f0b7f76504fe7a9e59292b58634d3c17f95b4ac19094118da1aa1dc37fd5462d1a180ceea1bd941790d6d9e337a5f0129a54b85380aeca3a762b818681deb929d05c3865f9ae9afbd476ad32a60c3fe94c546c25681fccad88ae0605c1b30d965ea62a6804bbb857a9b819dc5ba57eba5ce4625dfb8507f59d7aaf7b5f36e1c628fd5f3079bf4892fa91030609043618b3eb8f73ba52aacd85bd5c3e0d79f55f7e2a398821bd7d0d764b5a4cb7c5228c99fda133b01a752a42171780e3a76ede5bcdbeaaf51124dc83fd97abb70b54aebdbcfcb87283f8aaf22eaf943957487a4c7fafd81e025545d497d0ed8a6847a2df5ca2af785d1a89b3a01e2bbc52605554cea6dfd32cb9e3afbe79dbad320303827a71d7d1a161749b60fb4a420f7de4788f758eb212315c45ddb35adab52ce19c445a074d5191897b0b2f6c1393e80a298083616914a0bd12633932c66dad38db7a7e26c015131dca3c16dcd7ed7ed07679214feabbecf39a169360aaa8305cfcf7dd6ff63342e97fc5fc20ced10e477744694fa04425618eebaacb3aecd484789f54c6c46b80e8ab27097a47e979be90fdb8ce2989bf8a2043caf38af5394255c47c553ce0d5eb17d0c47a60bd917acf3ea487625247b34cea09f3b8cc82cee5cfd36cd38444800f0dcee2efae1d3ff623917a783e273bb434aa532d1233122442ff78c4c05d7859e5c4eecb7cbda0e97d25253152b34b2ba97c5cad3e0a3c0ec62be1bd0ce328c82fec2759a197dfe3103d7b3c73fd4971b49c59205fe8b92d0a675979adc82ebd8b98997db
+Shared Secret B: c28e6327a80c6dd54db501b3e7305eb7c56246c38bd053b5c073b473c7b616b1
+Shared Secret A: c28e6327a80c6dd54db501b3e7305eb7c56246c38bd053b5c073b473c7b616b1
+Pseudorandom shared Secret A: 5512fe2cb5ac7e5ba355ddec6048ebddab808e7bfc0d49dd04ef6be6153e7a87
+Public Key: 3e890a55bccd601b6eda85430503279dc2b7317920936c4835522309fcc418e33a54c79a239b1ccd69cc3c21c1636a0209e9bfde8316505aa79c0cbe577891c81624019b7f89828c91abb2235724ba9635f03a4a3365759b72bf9ef80f110660ed9294ad6a8e6345c4dd9a8b04b0c589352bf5ca6320e31b08a9b2ddd934447a0eec3650bc177d0e857cb6b98fd7ac2021b1c17b04be6d153ee1c68c5ca029a12199302a9c9dd2c3a8342ef6e3133432098936682c162c6a356159c7834c41724cd57579cca76241c47d996bd2f48338f95bbb462d3c9520a0514429873a92999d0ef61b4fb92149d85f9d107fe7ecadc5fa47c231c83c1150de8a84e14b67fb79ba731b93d3b035c23134b072010e851727a856ef619e8512542bb24176648a0e60ab821a74942c0981fc149cb8005ffb8050fa9f62269a8aab76d2f697e938b6c85a37259b3f20b66545070d6a9a8801634f50e26e20e89d0d44a3d4dccf546aa5257295c41723550204ded10025673919d1ae3e210860eac41842cef6cb88565222f4560cc98c8dddd59ef53a4de11303d7eba9238b47242228578c3c71fabbdf19604c4a2ea204982051b4dc7650f33a16bd04a9ce7390dff41eabf25f00106455627d6a38ccb3598b8655bb1e2641273b086ad06303e5a4daa1a6d8a28cc35a291512c470d261f805a150a748d9422f59139957ea107ae1305596135418158376ac55e003efc73403b2241e7b64b69239a6f07cf001c15f6c036b7323b75623ad9b5b20566d696010f6340cba8233187b7c39822685085cdd62a7c6444a0e78137f5a2bc3ec38902091b0cb8d9d8194e1b172c5f26fafc723c2e5b465a561a8163b8be3916a469a7c34cf7f446d00b1483f9913e5a122d1701e916492b357cbfd74942ff64805b4c3037cae0fd87888ea4dbfd75bd80591f4c11e103a43b8e58001d8adaf6674fa953cd786213bb62c91272e58060841e7869fbcb42b0946f8a24b66190cd089041a8b4ef7b21d644651e4fcae4dea5db4d87cc404b8d9107ee1428089b2aa2e0c64daec4523925fa1ea48477cb0ebb68b37c5564f7892417f8ab20d52ec4f2fb643b9b61857862eb2c0fabd693ca1917fa5f4394e4db6
+Secret Key: aeb714182c074cec8a694c241e09c95240601b4594a7cc251efcb1711320f46aabe8bc99a3212b88b7782d6a2091495164825a7568ba9a23595cf17bea84273eb87d17c9618280676a1b4987890eaa7c4ad58437b5c77205bb9bf3dc34649965d2d32ef80287695797aec40ff1058ea862257119aee108459f500ea5a287b1e032110a880bd820dd8544ad574a1e798034b229e28695741651eb5a087bf69ecb65475d6b6b7d5917e9e5bdfe352a30006303435c356caa0cbbb00c857aefd65582818062556e75d8b3063c58a1a98eb4fb5fcd00056cd554061226151573928a79939292dba775c3fc771b72b2d666225c694967757fa674434692c226faab72a2a1112789c6e730370033f3db9f012a6beedc4330e0253782c37ccc60b389084e251560d5c37db0754ffa057f9cbb25f22c9946c71ad67dba1921c6b70414e04f38a1bad9a155a8052d6aa9c333a55bfcb8a98892428c577926772177d1591d48476f93a2d7e80db6443f200846cb07010228689b1a1e2fb7b4f7ecba89b22f9d698c21e10babc1b0fb6310d676930c430bc19cc28877466d953d22baa87662677e918beb5396a43380b1b6a92a27652255000535567798bb0ec9a21338559dc16963237b976897be2c18fda6a26df98fae7a967df7928b2bce98605bf8832af2d39c45901e351c7b0c8bb89f0cc1a8c95d55b01ff00958ae97ce43a31bd872b3241787350744f80a9fac1cc644455df851364bd2c9f0e10f3c213d86aabc13c859bc5b9b77d27c10f3315f628231cc8c971b75f26835b39576212b84fb508271dc4546d9ca5e3258a2a40f00ddcd968701f01c61ac21353d27cc1642a900107d99f64b57307c8921b45b80af235cb1cb989c19a27e8c81678fe070e2c60ef70569fd607c737182a80646e0ec685f150d4a3cbed97c2d7929721aba0947eb966a5641b31949d3ca9edd731a5414c87f6c4f2e061d991b1837c663c938736843081b75a2994c7f15667ef5d74f2b975866f823a1d2b1d9a76e3a3a18406cbef2728d7a8caacb53b4a4f72e3cd613dc0c9214d17d66c166e359908d435a7253353e890a55bccd601b6eda85430503279dc2b7317920936c4835522309fcc418e33a54c79a239b1ccd69cc3c21c1636a0209e9bfde8316505aa79c0cbe577891c81624019b7f89828c91abb2235724ba9635f03a4a3365759b72bf9ef80f110660ed9294ad6a8e6345c4dd9a8b04b0c589352bf5ca6320e31b08a9b2ddd934447a0eec3650bc177d0e857cb6b98fd7ac2021b1c17b04be6d153ee1c68c5ca029a12199302a9c9dd2c3a8342ef6e3133432098936682c162c6a356159c7834c41724cd57579cca76241c47d996bd2f48338f95bbb462d3c9520a0514429873a92999d0
ef61b4fb92149d85f9d107fe7ecadc5fa47c231c83c1150de8a84e14b67fb79ba731b93d3b035c23134b072010e851727a856ef619e8512542bb24176648a0e60ab821a74942c0981fc149cb8005ffb8050fa9f62269a8aab76d2f697e938b6c85a37259b3f20b66545070d6a9a8801634f50e26e20e89d0d44a3d4dccf546aa5257295c41723550204ded10025673919d1ae3e210860eac41842cef6cb88565222f4560cc98c8dddd59ef53a4de11303d7eba9238b47242228578c3c71fabbdf19604c4a2ea204982051b4dc7650f33a16bd04a9ce7390dff41eabf25f00106455627d6a38ccb3598b8655bb1e2641273b086ad06303e5a4daa1a6d8a28cc35a291512c470d261f805a150a748d9422f59139957ea107ae1305596135418158376ac55e003efc73403b2241e7b64b69239a6f07cf001c15f6c036b7323b75623ad9b5b20566d696010f6340cba8233187b7c39822685085cdd62a7c6444a0e78137f5a2bc3ec38902091b0cb8d9d8194e1b172c5f26fafc723c2e5b465a561a8163b8be3916a469a7c34cf7f446d00b1483f9913e5a122d1701e916492b357cbfd74942ff64805b4c3037cae0fd87888ea4dbfd75bd80591f4c11e103a43b8e58001d8adaf6674fa953cd786213bb62c91272e58060841e7869fbcb42b0946f8a24b66190cd089041a8b4ef7b21d644651e4fcae4dea5db4d87cc404b8d9107ee1428089b2aa2e0c64daec4523925fa1ea48477cb0ebb68b37c5564f7892417f8ab20d52ec4f2fb643b9b61857862eb2c0fabd693ca1917fa5f4394e4db65afc084b0151886363346044de0bab2955abb1e4b735306a67c471ba272108e00f5c9fda1a33552db907c11a770e2074c9f262fa7affb6803c422e21b0924f74
+Ciphertext: d5a0154926f9ae783fd14b7a262377fa023f0984cf75c27f436a5cb13a2465bd3b4afe8b8f7472226fe8c09ceef6a9eb662ebc99329ffe23e5122f1da47c379e4989e720e3660f4fe6baae85fe585147c6ae3d40daaa144d06b395b86828dc4236c730c6769a3de6d00562bdfa4c3d3cbe1f59beb5687c1edd43e6d1e17802f60a3a1e375a7ce48b7edd396f8490ef6656a7714e7b3f4571b03c3b5c8a23981d2042574254a3ed881b8f818854ba640bf9c72608dd2269b31f1779f86754c5aaa242b403f509cc248308116b783601352b2cc0592fe2fe97228f46ecea8f0a4e624531a02093230b6700d92527fe7d30b949b5687962948a5dd1f01a6ef56947c994b69fd2ea7fd5440c307430465ddf90b80ace8c44ca028c258261d4c11039d5b1c1df36dcd9141f78d9ddb5d6e20406daf8de8d42f0a96e233200c68566bcdc21372cde6e1c2af92098391adcba1d6075b43b76f6f088af8dd2090e86d76a40659e68ec532113eef3a7faaecac5044be98536832cadd67a1a5022aa57c970fa4ea403e4f036f621ff4f56ee4e8cca1edbee7ba4e3c3a38c8ea4cdb0735644229d611a78dd4e9a12e265786256c6725acc55557148d7e373f2f6af244da4988aaa3570b21bf027089d6572287f7aab8335acc9b16af35fbe64c1b8bf5334e8613566a0ca537ae3c23358901e468c32a0c74ff651222d810b44e739d69b8aecc04e914b19c3e0eb1ce42eb459c264e8c9a1f19346099ad59003c9509b66009fce5e07eef12c129bfdabf71bf838f9cb426c4c06ac5db1324f26b9d488329786d5aa0e22a020592660333c1fa31d0b5b7af3bdf1f728879c076f1e66b18fb64713468c0a60bbb732b79d04a9b13590cb277380b6acfd0e3986ddc9e0194324beeb344d3789657081d1041125d35682c2a304c4550249f8f9192d5f2c57ea65eb4b3d4a5aed853ecea3585d47bfc782671fb363eaa9b659d7bf19ba88ee1daaad8a86e9b807a26534b9b1a23c3742abce4067736432ad2a9bf64bce1f1ac0c28bb9bc7c1e68f191efa2f1f69bb5276af5d8f4fe4cbd19e024c77a577ea7aa0746
+Shared Secret B: c203e931d3f6c14512c90b7113c52e2196feaa2cc641239b9ab7f01ff6efec82
+Shared Secret A: c203e931d3f6c14512c90b7113c52e2196feaa2cc641239b9ab7f01ff6efec82
+Pseudorandom shared Secret A: 0e97c8c15b6fda02ee5a789bea3dde9237ac81332a8f671117c2762d7aaa30c9
+#
+[Kyber-768]
+#
+Public Key: 1bc331b659a61a04883d0c5ebbc0772754a4c33b6a90e52e0678ce06a0453ba8a188b15a496bae6a24177b636d12fbb088f2cd9504ac200231473031a31a5c62e46288fb3edb858b21bc0ea59a212fd1c6dba09e920712d068a2be7abcf4f2a3533443ee1780dd419681a960cd90af5fcaab8c1552ef25572f157a2bbb934a18a5c57a761b54a45d774ac6bc593583a1bcfc4dcd0cca87ab9cff463dc5e80ebbb501d18c8b39e324dbd07ca06cbf75ba33297abcc7aabdd5b308401ba387f533f3927b51e91380f5a59b119e354835ab182db62c76d6d85fa63241743a52012aac281222bc0037e2c493b4777a99cb5929aba155a006bc9b461c365fa3583fac5414b403af9135079b33a10df8819cb462f067253f92b3c45a7fb1c1478d4091e39010ba44071019010daa15c0f43d14641a8fa3a94cfaa2a877ae8113bbf8221ee13223376494fb128b825952d5105ae4157dd6d70f71d5bd48f34d469976629bce6c12931c88ca0882965e27538f272b19796b251226075b131b38564f90159583cd9c4c3c098c8f06a267b262b8731b9e962976c41152a76c30b502d0425635357b43cd3a3ecef5bc9910bb89ca9e91ba75e8121d53c2329b5222df12560d242724523ff60b6ead310d99954d483b91383a726a937f1b60b474b22ea5b81954580339d81c9f47bab44a3fe0c833a7dba1f5b33a5a2a459812645c6537c2317163d71b7bd7a4a5459a28a1c28659aad9a1ca9a99a363062d453355108445a673438e77624e73757c1a84d031cf0fb24b1187aafbe6738e9abaf5b42b004b1fa0d96426d3c5324235dd871e7a89364d335ebb6718ad098154208b143b2b43eb9e5fd8816c5225d494b40809b2459903c6486a1db9ac3414945e1867b5869c2f88cf9edc0a216681804578d34923e5a353babba923db907725b384e74e66987292e007e05c6766f267f839b7617c55e28b0fa2121da2d037d6830af9d869e1fb52b0cb645fe221a79b2a46e41980d34671ccc58d8756054b2cca7b13715a05f3925355cca838ab8d2425255f61135727167ad6bcb0632ebf86384b950ad21088c292b4a4fcc0e59c42d3f77fac85cd9f5cb049b3a29505a984c4c6ac98ca3d0a8f30d2b1bd9815b94b27051b40ffc3455a668b9e141428611b280c1b8f2b55f6eb04e10c68f1340ef1582115f10ee2b785b7ebb0ec3a0c61670cf48107b594cd6e238e0d68961b47983b87879771519d2b7c21681cd494b420f03d004bb06eeb54f9c080c2f2aff6759074d5b3a3b11c73f1af6dc874eeec254d5409fceaa90ff66d90b6930a540fd1d9be1844af1d861ff96a611a414a6c61a78fb2a78e74383ab05ebc73855a818a627242d523a3e2a35ab4285b4a2564f76772aaf8cdc9f87c6
5f1b4b5819905fb4f9ea59166fbbdb201c5eefc0df7418ca211b5b079a511b8b94429847b537fbed82d57632d63e815d8212d8a280d43328604a6c4d2c1887e7ab061f120a0168db2f4735369b193780f0aeb381ff2653f3b46e206afe77a7e814c7716a1b166727dd2a0b9a7d8aeace425da63977f8103457c9f438a2676c10e3a9c630b855873288ee560ca05c37cc7329e9e502cfac918b9420544445d4cfa93f56ee922c7d660937b5937c3074d62968f006d1211c60296685953e5de
+Secret Key: 24c59d1c7603e7b74bc7aa1bc2cb3a214b3cfaebb63bd85b65408427c498ba394371bb271f92a3b506b81d54a95a7c0ddfbaa1519553d6f3cd5a601b7db6b0e91a5149468f1f68ad26478bf3c6670e093ac4c49e7a90ba46595de94c50e04129a811a841b39534a87f0ae7b1116553e20c9a566b9b8ff7c7e728b8b201893403a4f252a55230874c256b897834cda349807b25cbd75a30867bfb80328200017f1cb70b56cc546b65d3dc9cdb45107cf10dba349619043ac35c0b9546309a239039813ed5c40f353a5e8e42193564496112bda56cb38c081df252ae9c2c7e441a062e92a7c8da7a240c9952d86b5f1bb6a53b38a5ac0a54a84b43f12da1d0525655684a12090b60b28b0c628db092015547d1070af5d6192e639636615d03c654bb90008ca15b784119f6178a00d7bef4a54a274ac922e55c61a3a8840aa258639484a3bce2e43b6c969b11275631daa129a61ea0e2939f0877e1a110c8a44b24c54fbb07a958db9feeca1eb52b086c87bf43a9b02a5b2c4762117c3a99ae4c4e2eaa7a33b9a714737215c10317514f6c4299ef92acd64c4858e85ce737a801890022d7381f3540230c0c8ef50a848a28b09ba0bf8b50619c905751601d7629767449c9c0b2bae321f438a77f412a55e45ecab4b39053c6561801c639be6495be8fa144ef6029af663407ca9181946de5f3aec7236343ab3bc5a38a09c01b412baf0afb23f9e9b8f2b40810f2ce4ffbcdbfd87972323e98065160bcba34b3afd6c25b664745fca99a9ea75cef019d768485ec23336d9b39e4d05d8d587b30633d4f69ade5753a39680235e44f27995da96798f3a85e184a9fad19320829629f4140417bb7dbf5851ab79258134146d088452774991a087a1c2beaea89f218087ba774ae253b494c27750b1de04b44d953c5e47ab10f65205ee212f9c30391e5299553954916873a0b41164543e801c0b099cb44f48995675823c10b40f4bbac9177a558ca0c30765c2aabfd6a4da54c8413e33902d63f064330f0464982429de2604cd03b4de84a9f821a5470423a40a964dcc41863363d77b02c3127304f942ee71c98c643a427533ef300104948b825277953aaabfd855588f75a77d199a213ad348116e9e539f6d37068a551c710548b7a2c7ee95f9cd9b3483332673cc44bcb18a778a49455c768e0b340f81102ac6b76b064057151ef101ae143787f548553558df8035a3ce00c9c43cda43142cca39034b09a7e6089867b4c64980a69ecab2e6818724c35cb909d5d45bc6a349c71b306567664adc0cc8ef698049b4b4b432dd0f69fac07580f77c4f79b22bb90cb97b341880716853431694c9120f6724ad58d57127fced999ff6229a5d4c3c240129cc812acc73698f949d8e73661f2528
262bfccfa5cdf5a2104649806e295ea161217083365aa26cee6ae2f1356e8e1c5cefcc85703447ef1160a1b4a0e8c017b173802c66c88ab70d39a6c96c1569d5a86245a7eeb087d682219080768745b44bf244f65b567b2658dbae6962ba52b322118e214cfadd7cf3502582dc9cafba952a9637ad3600710259778d99d23f8235da90791604b4f0a4f7640680f59b633d93dfb84282ba54c674b115684a41bc331b659a61a04883d0c5ebbc0772754a4c33b6a90e52e0678ce06a0453ba8a188b15a496bae6a24177b636d12fbb088f2cd9504ac200231473031a31a5c62e46288fb3edb858b21bc0ea59a212fd1c6dba09e920712d068a2be7abcf4f2a3533443ee1780dd419681a960cd90af5fcaab8c1552ef25572f157a2bbb934a18a5c57a761b54a45d774ac6bc593583a1bcfc4dcd0cca87ab9cff463dc5e80ebbb501d18c8b39e324dbd07ca06cbf75ba33297abcc7aabdd5b308401ba387f533f3927b51e91380f5a59b119e354835ab182db62c76d6d85fa63241743a52012aac281222bc0037e2c493b4777a99cb5929aba155a006bc9b461c365fa3583fac5414b403af9135079b33a10df8819cb462f067253f92b3c45a7fb1c1478d4091e39010ba44071019010daa15c0f43d14641a8fa3a94cfaa2a877ae8113bbf8221ee13223376494fb128b825952d5105ae4157dd6d70f71d5bd48f34d469976629bce6c12931c88ca0882965e27538f272b19796b251226075b131b38564f90159583cd9c4c3c098c8f06a267b262b8731b9e962976c41152a76c30b502d0425635357b43cd3a3ecef5bc9910bb89ca9e91ba75e8121d53c2329b5222df12560d242724523ff60b6ead310d99954d483b91383a726a937f1b60b474b22ea5b81954580339d81c9f47bab44a3fe0c833a7dba1f5b33a5a2a459812645c6537c2317163d71b7bd7a4a5459a28a1c28659aad9a1ca9a99a363062d453355108445a673438e77624e73757c1a84d031cf0fb24b1187aafbe6738e9abaf5b42b004b1fa0d96426d3c5324235dd871e7a89364d335ebb6718ad098154208b143b2b43eb9e5fd8816c5225d494b40809b2459903c6486a1db9ac3414945e1867b5869c2f88cf9edc0a216681804578d34923e5a353babba923db907725b384e74e66987292e007e05c6766f267f839b7617c55e28b0fa2121da2d037d6830af9d869e1fb52b0cb645fe221a79b2a46e41980d34671ccc58d8756054b2cca7b13715a05f3925355cca838ab8d2425255f61135727167ad6bcb0632ebf86384b950ad21088c292b4a4fcc0e59c42d3f77fac85cd9f5cb049b3a29505a984c4c6ac98ca3d0a8f30d2b1bd9815b94b27051b40ffc3455a668b9e141428611b280c1b8f2b55f6eb04e10c68f1340ef15
82115f10ee2b785b7ebb0ec3a0c61670cf48107b594cd6e238e0d68961b47983b87879771519d2b7c21681cd494b420f03d004bb06eeb54f9c080c2f2aff6759074d5b3a3b11c73f1af6dc874eeec254d5409fceaa90ff66d90b6930a540fd1d9be1844af1d861ff96a611a414a6c61a78fb2a78e74383ab05ebc73855a818a627242d523a3e2a35ab4285b4a2564f76772aaf8cdc9f87c65f1b4b5819905fb4f9ea59166fbbdb201c5eefc0df7418ca211b5b079a511b8b94429847b537fbed82d57632d63e815d8212d8a280d43328604a6c4d2c1887e7ab061f120a0168db2f4735369b193780f0aeb381ff2653f3b46e206afe77a7e814c7716a1b166727dd2a0b9a7d8aeace425da63977f8103457c9f438a2676c10e3a9c630b855873288ee560ca05c37cc7329e9e502cfac918b9420544445d4cfa93f56ee922c7d660937b5937c3074d62968f006d1211c60296685953e5def3804c2dad5c36180137c1df12f31385b670fde5cfe76447f6c4b5b50083553c3cb1eea988004b93103cfb0aeefd2a686e01fa4a58e8a3639ca8a1e3f9ae57e2
+Ciphertext: 0315a52971584a19d748fb3841dbfae8ead9d2a46133f6a87e5ae2e529328c2edf0f9966f8652b15c906f6f1f07ca200931131dd3947ee7c28a485febb8cc3da2ad38d39577852af309dfc34e51ad3059746e1bab53785f7d3ef2929cc647e9cfe4be630a16614b8129ecea012f5e34ec74b43c262ecb95bb59efaa02e76c7542e44e8fcb4b7741ecbd1a80c042ade44fd98c48a6594529316d245ec429051baf7229071ce15eb8ca9f5b1552bd0c7f6a687aca0827322815f54ff0db0705273e34f07a2c161231d06ac50dafdafe70780c90df7943cf3ca574309ea3ceeff424f9b88ae21ea19dfea3c185367cfaefdcda1a57b15ed52dbcb52e343b2cc4b5e989a12af3d8bd2609f244a5e996013c089acb974b97c1b79dbd433aa02fe4db56ed791e529dc7d9655483c17f2cc7a3db9257279c148fef775bb4508fc4b9d133e4277fdc062d5158a10cc95a710576f6ded5fc0f8343aac95317d075d3651b43029fb037ea0b292d83be152842657794ff60da14be8b3aaf7ae027db9306780eef9d5990dfd8eed14a13ae6b9cc29891e6ce218fab68138f0b4d46eab0d1e889f340dfd5326766585ab0073f6118b299d8e5ce51209768553b4e066da6f6bdb46ab4e70d5944da4f7514c63d087fc6cea2f3fbbd12decba3b99b1b63595e1bcb4948121066c21b27ef3c439955690e53ecaad1e91e7e60eda0723649fbc58ee4da58498e72b410d902a32a71de94cf7b8f1cf4b64917ec69f63b80c4bee1c9dd8cb8f085bc93ccb7f1693a76dc75ac5920e8d0ed32d69cfea3fb52d8b0dfd469d193eeedb96d002b8d56a29cf3cd8e84d363117b12d484b14f99be3656aeff033bd64872f3af6888826d62bbbbc388c5f68ee7c3ffe40b23c62316e0ff350e929890997c507d75e499d50734c220816ec0adc40378673537a4f0b660d796edf0d498a69b1a06b9ebf5cd2c1c9ac3c365901923198ed71721824ee8b1ea3991709fa44c9606d34ed7a298f9cad015815b944546f0ee1ef0cb7596cf5ecd9aad04d74b2f8aee6c7db90ae7b53b5c6bb1691a18a97a9ded5ae17e92d0fbbbdbb06203220acd35f1eefb413ff19bf47bca430c1ea72332fc157b4c5c978ee5c9018f9a4cf1cf763054c3fadf22f5e2d1d7832216c43172fbb23b76a06c92b9c9d17ad3b77fe1bc31b826f8ecd676038624102b7786b6afac24f0af05dc66f43abea1c1b98422131570670e933342dbf7023faa30fa89f58f7c4b6a024156311452fbbd2280a42fcf2600a5d97b99eaceb9249f86346d66e5def734ef65e6a565a4f21e43880e59981f39a495984f90ae4d05a4d917b64bf229e9a5173906f8100b240609989a186c168dcb7e6398a4f8624fd9b94ed7de8150c72d92eb7d3e4609112d
6b6a37b12c75a9739566b1e6a8739ae1ca0083ea8c511fc3f8c08c61cb8816cc1c69d452102e78ef60f54a50f5338a4ac9707e74b8b6cab93c5554d9519dc14c1c9d47b74d32e778a484be3ba60793e84fccd3280a02af7673f92a8d84dba
+Shared Secret B: aac51a36f6e5e1871b916a1a1f4396615c298a9f738fb3ea2d7cff299f965bbe
+Shared Secret A: aac51a36f6e5e1871b916a1a1f4396615c298a9f738fb3ea2d7cff299f965bbe
+Pseudorandom shared Secret A: b877da792d89f28049b590121601202d2bc8f5f1af8382bf4f3941050dd5172b
+Public Key: 4bf08ff68a039e252425135d4db95959355eb91b0a3d39a32f9126bec86d8fabc1e212bd3ac91a37950ac5d43686f73ec2ab920486658e0903dd74c606a9b0702027aac2426e77c238d4574edaacd7449fedc550a0964e2aa0a73e5a27d1d20a88482aff96885c059f2a27170c1274d8cb89b10ab21c11be8ac629733a8f1146887a1191a90c86f46a7f2a457089d66b1185090022aa4753ad95728ecfa23ced955a383150d784b13b94c8aa00ba1da86abe760fe6497d67926d13741542b0122c4a70c9851cc2590c4bd1481dd4ba490a1f80210a61a58e3382425e1520d5e3070aa39498076b21db7223d31592624b42aa9908ebadec3a6c85dbaa3a4290bf23b00659b429fb467b97c839d297b8ebc39eb893a20421c29bba03f096cb2bc5126449bb055f9eb69d35829680152c8651cd49babed9e040e9490fe5a4298316024cf99148a17de8bb78ba3060a87c4dc77955e3270f88d72e1bc3b98aa76714460706092ce30c642c4bbe43c0c7b6c8aaac5035f04432eb21a8ce2c50ca4870a2052c1543801b9a26754c51dbe5ac3daa234fbbaa705ba0f903c2e6f17a3066bd36174168f5c58fa675c122cff66a2e6a643fd4b98e8643363eea5b937354e504703beaada1e669a527a12665696498c42da54b4ae6732d083d9fa70b0c279e003b83e850c7885a1bdb42318369bceb0189ebaaba303515de16571f4c65463369f92a1e86d0935709a29742505189866a204d108a15853b484f817033eb793171535ee8bf9f661decc0c84a15a6925c4c65f19ed9448a95e116cb656f80fc731b1778b90084e43acbe1c81981169d41713b22d090f0c35d62534a83796d49d627f4733f26d240c22314850baebd762e23c47cdb148ff3d081fbfac78999ca0fd632c715a147c08799f80a8dc61509442826dc412e317ff72298d7f203b3d57c53435bbc30c473e0646f1294b6e6182163334eb6cde81150b29060c7d042378902a198a2bc3239d103970208958bb664cd48a6405b122006481675626fe3406f2c339d347ccae79d3eb226cfd50804c556d6f1c7aca8798d26be8465a7f3d54d87526c79da325dc9ae7a09758e970863868a3c7693c4da54e84c9685281208617f31c08b03486af61cbe362029449a119179448af2721851c242c0b32b6b179f1b5978f07921b0afae9ac9a2214250f9951ed646c2563eec038e52d5b4633652ea4768b3584619193a8eb60e4f6527eff12358108f8265b7c7e362ef85cf3c4b3e338446e178c078c99854939c768320bb3b63be29c5f173924b739d9bb2238298ab08f3cb90251d8f97b435592e492c81fcc31449d0c571d6420ae78f59b1481b37867b604285c415dda151fba92a53c762805a906d7379b728042a5c50d516526
3c66c6dc3440f14c157e75985ea56df9456b98b2bf494453152404b898f87a013799ca2824b7e5ae499bd6423f2d7726b89be5a4cc6af36839aea06c780a294ca072e447beeaccf81f49bf531c6273c081a3066b8e35246d72f1a98cf07622fcaec6ce7c37029445b7d9233682065ae28ce4902a9458403990406a3fb1c3cd5acbc93344ed47e54621f6a545777d88312bb7a7161611cd832e967830c24202228efdfc6b89938eef9046b8ad3a0b810e2acea081333a71459576def292af7
+Secret Key: e648438856b5ca416eeea82f6dbc35fe089f67dc7fb25680edc039fe6128138a0d16a44195e58fe33cb4d7dca9bcd57ace1933d2189bb8e96d500bb501725a3bc27bf936b747da516c056a150acdad1c6b31fc85da357d048a014ae8c6f79736ace203ad4a0fdbd743ed1797523623eb9206d60ab57ab579cfe9662cb43b27c29b4abc098ae385cb115867f71794299eb3096ff71bc38c661e2ec3c37596ca45237d35ccb35b822324010aac324e0290b25d868b2d295ed8311fc0a29a8dcc08bf545ac05c0a8185b32e9313d7f71018a07cffec186560544c56ce51c373b3dcc43e5c9115059928777e5347c987e939b98436b3ba6953258e534259099395fb14b462d672235c168fc72a1a7a01a53b29eeb67cfbda3fa664b696f52686d8b9d4bc6e179702be512484b02beda86952a0cc88db1e4967c0b45a3dadf590e3e19606ec09f04c8499c6adfe24ccf820138744555b9c0319daab49c00c1e997c27352a81904543183701fc26536720bd729db32a803da53f2ac1c01b552eed146a74f7adccca159a8c88e09cc885a8a7ab030fb8e011ee0703966b6321eb24af8087798317586b41142a429a64b4f5a8a38b00765d62caeab1b96bb68c53025d44989b00f8a3c8ea1262d79289fa436d8726f5d66a525700541ccbeba6180dd84b96f11ea2c799b06ca79ec155a53b3c647b19042cb297601ca352c1c4e605f6bc8252301199f184e80a7f1d92114e7474be8081a3c82343b194b3f99992e20ad7d991501c47fcd50e28ca8e41c807ebd743b5e50bc83924b0307d4c618fb48944f8f0916dfb9381d696d0377bb5b64d14a6c181e98413978cc35878ff107fe2cc28735accda51ceac82b28bd70601c05125512b87b6a0b63a009d490b18d1bf17c31c18b8058cc709826b99ca3241ac550917ec92bd37012c9b7ffe909d3c85606666112ea9af9685878cec344d3a8fcfba01d8475cbac0a688a9230f50a087bb9e79c321d1c12a481a9fff8a99fe042016112c33765a721978a369147c7b4ab6255c2577043b379ff15bb5fc148a5069c33d3017a47a80bd97c2146c5c51282b36189ba9d1883d186a02c0652216728ca4285b1b908272161b391ef3cb9323540ec6e96ba048339faa982a67c66bec8aa9273f86042a48fbbcbb3c372ebc97c3b957cf6366bb70a5a747a5e694a463bbcb11f4a6de5c602c45a4b887220e26b5d8665eb0276f4c2545361158728824acba86a78c5980245568533e3803a552e89e59b341c284440074b171c76072db74563042da3c26529a40a3a3025791384ce87d14638b635a3b45218c872635da6a74acfa0d65fc0a5b0917a35131be894bb23330403b2ed3ecc00dc2c8212531fec60a8515502057234a0bbe3e9023a3f8346
d006474c8b70c73aaaf28528247352ad2c578a6647f7a2af390a02f2117aa466acc290bed8632adb96f7220989e5815a6b3815cfc565c2448fa59b1eef0c3c9fc7fe123703a4c66fb1894af36458a570436568fb1205bb2dbc428bc5c07078f4b2b7bdaabcd2c8617dab043ef28b1f1c64695166acab93042f737139a7b1d91bd4d54b5617298ac769bc2f32059f96542fa77ff134babcb78843ba0b999404bf08ff68a039e252425135d4db95959355eb91b0a3d39a32f9126bec86d8fabc1e212bd3ac91a37950ac5d43686f73ec2ab920486658e0903dd74c606a9b0702027aac2426e77c238d4574edaacd7449fedc550a0964e2aa0a73e5a27d1d20a88482aff96885c059f2a27170c1274d8cb89b10ab21c11be8ac629733a8f1146887a1191a90c86f46a7f2a457089d66b1185090022aa4753ad95728ecfa23ced955a383150d784b13b94c8aa00ba1da86abe760fe6497d67926d13741542b0122c4a70c9851cc2590c4bd1481dd4ba490a1f80210a61a58e3382425e1520d5e3070aa39498076b21db7223d31592624b42aa9908ebadec3a6c85dbaa3a4290bf23b00659b429fb467b97c839d297b8ebc39eb893a20421c29bba03f096cb2bc5126449bb055f9eb69d35829680152c8651cd49babed9e040e9490fe5a4298316024cf99148a17de8bb78ba3060a87c4dc77955e3270f88d72e1bc3b98aa76714460706092ce30c642c4bbe43c0c7b6c8aaac5035f04432eb21a8ce2c50ca4870a2052c1543801b9a26754c51dbe5ac3daa234fbbaa705ba0f903c2e6f17a3066bd36174168f5c58fa675c122cff66a2e6a643fd4b98e8643363eea5b937354e504703beaada1e669a527a12665696498c42da54b4ae6732d083d9fa70b0c279e003b83e850c7885a1bdb42318369bceb0189ebaaba303515de16571f4c65463369f92a1e86d0935709a29742505189866a204d108a15853b484f817033eb793171535ee8bf9f661decc0c84a15a6925c4c65f19ed9448a95e116cb656f80fc731b1778b90084e43acbe1c81981169d41713b22d090f0c35d62534a83796d49d627f4733f26d240c22314850baebd762e23c47cdb148ff3d081fbfac78999ca0fd632c715a147c08799f80a8dc61509442826dc412e317ff72298d7f203b3d57c53435bbc30c473e0646f1294b6e6182163334eb6cde81150b29060c7d042378902a198a2bc3239d103970208958bb664cd48a6405b122006481675626fe3406f2c339d347ccae79d3eb226cfd50804c556d6f1c7aca8798d26be8465a7f3d54d87526c79da325dc9ae7a09758e970863868a3c7693c4da54e84c9685281208617f31c08b03486af61cbe362029449a119179448af2721851c242c0b32b6b179f1b5978f07921b0afae9
ac9a2214250f9951ed646c2563eec038e52d5b4633652ea4768b3584619193a8eb60e4f6527eff12358108f8265b7c7e362ef85cf3c4b3e338446e178c078c99854939c768320bb3b63be29c5f173924b739d9bb2238298ab08f3cb90251d8f97b435592e492c81fcc31449d0c571d6420ae78f59b1481b37867b604285c415dda151fba92a53c762805a906d7379b728042a5c50d5165263c66c6dc3440f14c157e75985ea56df9456b98b2bf494453152404b898f87a013799ca2824b7e5ae499bd6423f2d7726b89be5a4cc6af36839aea06c780a294ca072e447beeaccf81f49bf531c6273c081a3066b8e35246d72f1a98cf07622fcaec6ce7c37029445b7d9233682065ae28ce4902a9458403990406a3fb1c3cd5acbc93344ed47e54621f6a545777d88312bb7a7161611cd832e967830c24202228efdfc6b89938eef9046b8ad3a0b810e2acea081333a71459576def292af7f21972333efed4f5f4a43c81b7c7f0f0cb5c1fe816dcc2ce7256d36380c09276a467dc73403c987d8fda5f6e568291dda40d7681a07eb9f80d8765c734bded79
+Ciphertext: 461e2508a365e2f742f984d5c46736b6412e7c10b688443ac6f8a435cb02255148ca8f54fdf9998043af5e34fd9076ad71dfa8294bf33ddb710e2fc8691d767d54ad4bc2f07fd26d6b4b44d59b5ef770e012ed9772c51bdc78e98436982e9d74e14ab09a93a3883e0b0303b7aa3a75add145aef90385b3ab66be1d9d1fad72e29a168ec00af9a850198ec799e09495b7f00a18cd703a9d88b2a4fb7d74ad62b884f172af3f3960ae9da36e90ee9cf4784891eb887dabf5d9c02599906f9b2c7eacc19093b4d373408fa6d6fe24007ef6414d6e6043537732dcd3796e6002133f9154902b41f6b46efd51e91f83ce26bd411522a446ed659d1e94a36310d4f50327011e1fa60207157b50aa42ed58bf997e9e50886c7b15eee9025aa765da759dfe52803d94950245c4c12dbac75e112832070b185f3d7910974de03e309639951cb2c6ef3759e9234c75c9f5b7853e269c27ae32070510d8eda80833decbaacc4d0d6e1894e50616fa3319f3a07d15ec2c1d71e8f1a1e81e7f92e1d052972ad6a87bc12908a9b9be510a7679febff882dfbaa4195b72f85b139d48c709e442fb74b995c51538ae7abb409020ffd04d45d7b84c84e1496621ea9ce7917f002ac1e292d87768d269cb4139144f4b8644edbe657f3987fbe39ebd1978b1452f38a90c287a74e5dd8d1768a1bedfdbc81d5856e45178fd8bdf9fe081867f31071ddd7deb1fe1cc74e41f50ad40e26d73a1be3778c505a1c5bd15d3a66927906db3245f5ba15cee49ef2b6dab7f0c57e274478b37d516dfddbaca1c936096c3c755124f81673015833c66d645d4414e0f8ce87e18788f3cf47e6e84f9bbb22855f15627a99240ca47333e3545973aade6c405ecdaaea1a634d982304c6118bb9bc852b629e1456857cf4bb904d95d311434d4dcfe05802e66640fefd23cb6a48c3253a1c15327e48d5d125928a1ec7f4da079c39f8259ad62495850542518c03d5d0a13a0b51a1ef7de895eaa9270efcc24c65201288de56972323c26a62448ffd96b16b8b55ef6522664f236a61f334e4bbf422070d5744dad5c7dd858e5dc26c49ebeff1c1ebf28cbf4931d88945b9e5dd4747c662ca24a760fa322b18bf213d2087c5fac037f503aa8bffdac4dbfcb3366103dba5cf2eb512fbe561e11456a59188c2eaf556c95af6bebc84e7bcce06edf80bdd19e514caa1452bd16077011683b5c5517813a539b540f80dde09bdf10bd4700706a2c3f3603b78ba94e6800905b3b4c201188bb3fcee6b5fcb643c81e37954a24649a631c842d8ddf18dcd97ae6162b87fa0be85902107d5af1dde864b63f1d73b96a60b83e98fa63001d0fe67ed13095c2d1d9e9d1410b895f957c7ab1979b74c854973978fe4c7970d6d008c9707
8e73ea97335d12a52cb3961f4cdbec46e25d53a9973c16ae0e56427c57d08084cd9df33ab6cd0342f78f47c21efaa1791d63bc098d5c54f301bd485d350e87274474c437ee66b53d2ec74c60bb9a4338a8d4fd6a3b3c656c2a868cc48057b
+Shared Secret B: d57b4011a977cd46198beeacf2d7c1194904cd7381b8e1fb79b092d313b313a2
+Shared Secret A: d57b4011a977cd46198beeacf2d7c1194904cd7381b8e1fb79b092d313b313a2
+Pseudorandom shared Secret A: 8f14195b9966457b825e32d53dd0d6762f18f957e2104b7a57c375492305dca9
+Public Key: f04543f98a47f62b191e263508a930db0c3629877e6ef7056fd40ea9b86292072b123c4cbb4046c935387e6986f6187f768467358a5845a198b6e3123f50b883b49819f96f6b23593ab647cd574d84a53fafd1320c81780d531d9cba102f1382f2f278386c29d8c17089828b420cc902c40eca788e3bf0662be6b891297b56a6815bf90132b5ce26a883ab111be6db577b4466b630173a0771e628804164caada20543e14c8e07be13181c93167e4a3112bcc11bc0c504d86b521e1cbe85d444fe466423949c8dc6a6d8e01d2991cd70ab2eb119303e44b7433557f39ab29f3219853b4ed9ca167b922444761ab9c381a73ace1478cf17f54136d99ebfb27e521c83b9485d2165cd41174943e47de76957e6a21e733a770d22711a57400d2a53264ac6164574bc584d33f7a3ec8b631026a816b911e159596b0934e4812465a41e713974eda8acb3432b9414aad7b0ce2d531f6244a94a80315de0662763a3e0d3442cd0701bd3643ae2795e9827e28305f7080873baab029576d329cfd9eaa7be8695a31978ff7890d0c70b4937107b54b51b32bf30818c888086013785ab84ace89c519cc419fb751e0950ca7cd9437586bc6bd39ee6892c58fc022935a112b20a3009431f87b2149c56977c32c4f960fc2b91937811964454736905b881904adc49f5b74fef29013011a646ec6427298b6b5acf5648211eec6643e93b411cc60b33396338831f2a52bf687a56e0cb02d6c84363aa7a632fc13ac102f405516226ad256ac36a4c41b7b12f16cac519c7a1d72610203b3034931c13203672b24209a658b23c176933ac02420485ba573348d90235b8902cd660b88fc5278371af08241c8e743b9d81c9f893b01d1ab38d207f92a51f07f10bec901c0104a87a775555570cba2984a31cb0dc734b82bc27439352eed560d4da0239189d684843025552742a1c2249513c40b9ed165769644c36958fe0006748f87978c83c5b5c92193c2734162f89046c80a552f567a58c533bda0700722312c7d12f0636afe75baa16db7be5a594fa893130eb39c4a69dbfe40c958b0ba389cf577c1cf1220a31c182077a839480a6cb7850c7917788daaefa604c851ccb3adca4489b705cf635704892731a8422e1397ad4597b04bbe8125c2d90b5ff9149759a5d4df244fcda98330442ed26afec358946e59b7a378f31261b36cbcb7c728dfeb21f0e10195736753ca81ec644a9a28188c0281631e397e63c5ab0742607546175f057b6039d0fb13be4bb23847ab97f7792bf74627e19038f36b06054c34ec23e22166be56a8efd6b5e9923aa331737bfd484fac424c6c419203ca4400c0003376605d44b60368f4e526599c45866c64945b59869465f84d1014e9a8070bb53feea83280bb9b
fb006e6e4bc30c22ff3246366d639cbe3ac4cba543d134a0e5513f7d9a106852c584542f5804d75cb1ae483cc4fd223caa52d1e84a778c09a90cb9cb7310a6551c32d0aa32119905a0ca878769e7e46c82b53cf31606dcfcca8fbda86eb9a1067712e76723774d23b4c54c9aac3ba02fb996c6a322f085db03c2145625f1d306fa7934747eb0d5bf2271ac637f1a63682070fe5c3a0e949a13a9bc75ac94d829dc43da9c3e3e1dfbdbe4d73da0982be73dd7a6ffcbd0625393795bb07ac06
+Secret Key: a8262a8f917663e605fdc7ab2e028239796b40b8c54290709ea970b7b2b0fbb02c4e109287542649750c74196fc2dca223430c5c31c0de3b6ed4757114362200ab72fa48cd87771f5593197bda5e19d9b17ab198a4533bbfc99363f09b94e7587d55b228a833305456231c8b84e9071112452a0a462c20152e91a2fe2bc737307145e38f44da90b4ca543f809d9db687989a9b0d9ccc5a66294b34c17f4505849920dc16c8bb8a4889b01c9608a99d7832f52c7fdd6422a2596f85444280f1c5e4970543c0b72f08c0cc7444621626de8aa92ecba05e0a8b49364cb0d69ab0c10c72f3372e97c2395c6ad9332c93355cbe617a3226a54288a197d3c38dc4b0cad0c6763705a573a4fa687fbb01bfedf2b85837808f819314079e5842209a818b90e731afeb9b9152b92848a1b6483aff848727c47eb55225f324936aa97a415bca06176e7e489ae72631346a1533dc48b938019fda5ad9346f02f937359b243bcacff841084b5342e3408fcb63009672cb58e319bcf8832dc98404150e11ec39f815c726cc7c7b15aff8e67ad8f75df78aa913643e5b1970d06763e6bbc991c03e59941cb2923259e9a9f24b55c1e51ea2405d593070fc49186b4ca343220340f99c8ce350db663c3b41b2431973d2941ab42535513b7058457285513826e14b7d7135ad487837223c15ea7f86cb49b3031654662b7e6ab4595c4c204077c256cf2ba8795a96a40b11bf7eca482a7b34257c072006a667c83808accc7ebabc2afa48e437736550cde02ccfdbf5050f26910f8b231a709da67982395a926724104336b41c262be843bd9c764d9af6a7e042a60d541fe4a1124b57c85559345885121a1c60cf44983a8757b9a21389a25c38c5c9da5806085a3f2eaabb0f52103ce46214e2aecc579972573569191cef2ba797e154b0876060baaff400beb3d245ffe1b73668a87a167409e754eed8b9d39223d956765212947bc77502539f1a2bb195d4a50f003ab2628cb6cc299bec58c515927e7c10a09312a97b59b2c1915c7713fb970b45566f69b806de278ce2f45c39706d84f8b81ea6ce46bc946346b3be6599fde5bb15b8b2173267f5d785ae362bec9b74cb53b888ab3de409127ac9ae4f523186c9c97ba24fd8935965e22c245b68257b16d1711aacd818e9b09a5279ced4b19e41c48b18549e71c676585c04b1d805c65a25d118c9f7330838c083582984aa7670ea5020eeeb44ddd07cc0a428e49739fc30b8b686b2ed9b948b9b1df8f65f2f951741b266deb9c0272297d1f88b88ba6fdf7a8845d324a549467e15520e6497dd16168da37c11f710da408d94e9bc46c1ab4c4890fce070a6d3a99ec82c8b02970a759f9cb89e9046234398376dc82cdf30a5d42a1775f53eec4526e
f27626aa872225495e27c8aa82a9500ed6a172718a6448087553684566917c6ae919a4a56308fdf859c800429676a974997494754c2437bc8b0ba5521b546501a8d64602b16cabf1f9843bdf20dc12cc5c7e7cd61f27154362cdcb46777313067a16a4fdba8dbb461a3023ad384cf01e9245d352aa1100260a8886126a3dec82763f20fae784d43da5b8df34ec8345039d4229b7248ed99120db2a4369839f04543f98a47f62b191e263508a930db0c3629877e6ef7056fd40ea9b86292072b123c4cbb4046c935387e6986f6187f768467358a5845a198b6e3123f50b883b49819f96f6b23593ab647cd574d84a53fafd1320c81780d531d9cba102f1382f2f278386c29d8c17089828b420cc902c40eca788e3bf0662be6b891297b56a6815bf90132b5ce26a883ab111be6db577b4466b630173a0771e628804164caada20543e14c8e07be13181c93167e4a3112bcc11bc0c504d86b521e1cbe85d444fe466423949c8dc6a6d8e01d2991cd70ab2eb119303e44b7433557f39ab29f3219853b4ed9ca167b922444761ab9c381a73ace1478cf17f54136d99ebfb27e521c83b9485d2165cd41174943e47de76957e6a21e733a770d22711a57400d2a53264ac6164574bc584d33f7a3ec8b631026a816b911e159596b0934e4812465a41e713974eda8acb3432b9414aad7b0ce2d531f6244a94a80315de0662763a3e0d3442cd0701bd3643ae2795e9827e28305f7080873baab029576d329cfd9eaa7be8695a31978ff7890d0c70b4937107b54b51b32bf30818c888086013785ab84ace89c519cc419fb751e0950ca7cd9437586bc6bd39ee6892c58fc022935a112b20a3009431f87b2149c56977c32c4f960fc2b91937811964454736905b881904adc49f5b74fef29013011a646ec6427298b6b5acf5648211eec6643e93b411cc60b33396338831f2a52bf687a56e0cb02d6c84363aa7a632fc13ac102f405516226ad256ac36a4c41b7b12f16cac519c7a1d72610203b3034931c13203672b24209a658b23c176933ac02420485ba573348d90235b8902cd660b88fc5278371af08241c8e743b9d81c9f893b01d1ab38d207f92a51f07f10bec901c0104a87a775555570cba2984a31cb0dc734b82bc27439352eed560d4da0239189d684843025552742a1c2249513c40b9ed165769644c36958fe0006748f87978c83c5b5c92193c2734162f89046c80a552f567a58c533bda0700722312c7d12f0636afe75baa16db7be5a594fa893130eb39c4a69dbfe40c958b0ba389cf577c1cf1220a31c182077a839480a6cb7850c7917788daaefa604c851ccb3adca4489b705cf635704892731a8422e1397ad4597b04bbe8125c2d90b5ff9149759a5d4df244fcda98330442ed26afec3
58946e59b7a378f31261b36cbcb7c728dfeb21f0e10195736753ca81ec644a9a28188c0281631e397e63c5ab0742607546175f057b6039d0fb13be4bb23847ab97f7792bf74627e19038f36b06054c34ec23e22166be56a8efd6b5e9923aa331737bfd484fac424c6c419203ca4400c0003376605d44b60368f4e526599c45866c64945b59869465f84d1014e9a8070bb53feea83280bb9bfb006e6e4bc30c22ff3246366d639cbe3ac4cba543d134a0e5513f7d9a106852c584542f5804d75cb1ae483cc4fd223caa52d1e84a778c09a90cb9cb7310a6551c32d0aa32119905a0ca878769e7e46c82b53cf31606dcfcca8fbda86eb9a1067712e76723774d23b4c54c9aac3ba02fb996c6a322f085db03c2145625f1d306fa7934747eb0d5bf2271ac637f1a63682070fe5c3a0e949a13a9bc75ac94d829dc43da9c3e3e1dfbdbe4d73da0982be73dd7a6ffcbd0625393795bb07ac069fb3b6227925de96a8cd233a9e6be6dd03acbcb7aa47a2ad6354325ded8fc51b6ba7881736f2fc4bde18ec531d0b7b75d22a460533a65ac12de5f1c4b57fbc0c
+Ciphertext: 58c0dd47495143da27462e62784abee27193cec99f7fd32f2e8499d133c03713025d2400437891ebe6a0620e74bf4b1bdaa609d52e003020a561854a27c7f20151faa6832c7adaa33add53b33160792d316ff944683ea05ff9d9555a1fef0eb38727b96fe7df7c248e9e42099458840c4b6800832253601888665ee3db65ccd32ca3f1b775ca8d7a3512ee0b728b472695f10bbbe38b585ddf7b6eb5e3e16448e91d19b7488314e5337771bcd32d60fd2d64061b2b42ddbfdbe4e51f557d10ca77233f2c9b19dde3838a38a150cdec9a08c0cae8e2fb7d6a6abc09a8fb3f909a7f2d01a5460f9a0706e832050d9954efbfa99f8a2d8880961065ebc285bb9c93e2ffccabdda399deaac482f7cdb9629df2317932cc1a10557ac270c5d407eeabd2f1a9199b6714c7298f0fd3008c8a39025d210558193bd6ca3cbb7dcc13cb1ab8687f043cbe41c2b10db1a439d75f9c1edf3e7d47cf9cb09a55462996e0f501f4304f29b7a056518b9500e0ce205404c404d06b2eb077c73fe19c59da41ec80e44acfcec13c847065987fec9f6f5b7120dfbd46644690b356585b87f288e9010164db58e6f2e7fa2a1ec0e471ace90248c13ee96308c5c9db31167614ffd9915fa2ac7159c5e73ba8bb86151f853e168178f9be2c4921b04845bdedbf956fae2e9c7587d0d877355f85c6831724ce093fa785144c1a044d65ee0ea1547902480e8c81c2766283c425aecdf7b619e59c78240c9a5efaa16ebf4ce9f62f5392ecf44e19ab91f83fdbe2653c26e30f8760773baa644d744115886d83fbe80f0a31e9e8cac23618fc21e9f5edac8c6801520e2de3a87949c0c0328a895fdb89ce7c7f7003e1d68593cba423b482716270f24ed2218e05f71ca2346ba60e0488e5ae94a48e8f071419008ed19c4d95638e74a39728139986a687a23c9926279c79280198a73b4d17b50ffc94f0db8016d03fb17daba42632f067a1e3cf89c66b2033da0e067bac8b60f829629cecfbbb340cdd7c357fbea6b6ddc3ce25bad6179469c1acf0f90e8795eb76d657a23291214d1d4c4787b09ec26d37f079bd61358e5cb6df187663ba2688d3d6f5ba5ff12e8bfec155654020f2b5505ff2a7cc81026508ed855300a7e0b1a1f471f23c463921674722013d6b2f77e3f022d59056dbf4746ff673cbc6d8e67a352cd80074f9c4198b651778d1dc183a9aa3dd443f5344237da01667f76f8c2c4e96732028b3798453107755f1e1508738379b398a3368a5742a8e5877c31ce2f666c3d15ffbe79a46c104db89ec7772c6e2c058d7ad60fce190769c436ccc396904e01a4d1b3044353a535a0a10534cbc48caff56ec4a0a88d1e721760beb33958baec89b6b3db649a37c5603eae49b74bea5f03c1ebf478
3e3af76ed51d9a7764722af2ac383f1d7c27c25f24c75a4ccff509eebeca69430e030c75854b0e47ce24f4fec5df627f6ba8f8a54924acb9f335780dd9203da33086016e04f44318119bf3fdf487729ce8ea8272063b2a1813b8c4ca1946c
+Shared Secret B: dd79e7fd27c2696b5e65c35fb26042cde4e2088b81786c85966c4e5e2d0a0fc0
+Shared Secret A: dd79e7fd27c2696b5e65c35fb26042cde4e2088b81786c85966c4e5e2d0a0fc0
+Pseudorandom shared Secret A: a7969a1b54a56b6a3575d971b5354ac1a7cc9256f7166b71645f3874bd0467f1
+Public Key: e16c291fc8aadfe33f0a96ac81c2ae06a2c1ff199ae7153a3fe311b4d44094413c59ba2dcb103ceda9a8663291f78a46d56b7cbbaa57ee5abdbb434b5629448ba121e7f9a4384c5193fb13a888303d2b2a6b6878fab77c2b074fab60899d4a83fc74ba187327e87b52ad32410ca4844fb2cda09bb7abc5a627134322eabe077a7b5c81cc0dd5276eb583b4771824a3262359249b151b06356c6cbc2f8aab1c10a0930b41a8a90244525b051db4675aa343e1202632c6a9f3d4c836d068aa1b73f9a15f0eacc6efc8c5853c4637d7b31ce3c2aa1419b7da55d10cb65b4cc0918298f3233710c16f5cbc53f398a60dd27dc5dc6e3d815c94c9460158cbed7c537a8a1d3f08b12831b8f8f016c366c9bb73ab8154b39e6093c69c4baf144386c89ce67606028710abc81a2c03a1cd8aced356049be33e80ab8ab5031e10d6a3a46b223c475f045c0361fb36e689ad0651bde9e8857023bc95f86b3fb7c238346411c0534b6177ceb0c23ec7911590425e0a837179406d464655836b83c785654231a02b4fc8d85bde6ca8e6203cc68196f7d8a46927832348836af8484a3634622929f4f4878013c59b976799f4cd8ac6668c9a6a0924b3e07139958b8833b5b227c2ce3bc29572a95d09393458f72822c65a9d969b55db86c39437f5830bb6970b2ea2628e6c9d9cd8030e458f2fd4941a67c0dc562332a8bbc7230d21e09efc503c1888b999c285d17b1f5f4026725340b8a77f1f12890079cdcb661851751dd8a4c4699a67b025c3d610852439818a25578e676f81c17a7f5112067808b4e597648847352c09db6c0cd1c8203ef28c43eb281f100ea243c97715b24659a6039ab6126ab51e787504182b003c4585b579c4579da5717a72a9ca13c375e8f3cb56d3467fb44f676bb7e66143010cc77f645304eb70b572cb6841c4f1786e526460a6a1856f18ab15aa9f8eb6aa55eac80861bc3ddb0bcd102e24c363b6a8582b0bc8c8e8a4d53954b3b10eac6c7cdf54c246c13763403b8d4ccd9396982b6cbafdd60d2681b7d1eace3031a7d665776b129b71d93b24359d341043cc465499827dfa74cf973bbef333703b60b0d1b19a43281b29b8624be61939cb4b8347280ddc045f008a6c97ca1ed2334740191182108a461d70d284908c497d2350fe4c2d2da54445844867e93f9dc96765d3a55307b12e725b78754e0aa1c7c2eb2a06bc0b7900c7d7143948869b51f259a4144db4b43c3d12047cd53132fa946db7a73ed6912b018160d088526105be98ad8410b9ba31cf2232cc779a4961429524c080077c9c971a8d4f766c92f16065e2342bf906a861156fd9194053ac2ba55604e9c1f94a95e971c94f14a50ad00830abc8412bcda9a94f8dc4725f538f8f8b8ddf37cbe
e67cd17ac3521e860428b15bc3c099d0494c170ade4b22b29c6c99a529c865458e544b31d747963000b854b55a4ebb8902879d5ec175d229e60a85505305d0f7bb729a510bc110882365f7e1a7b4b25811591c63737aa0c241af3f56fd7b98314044839f11cfa4c014b5562739883e4f1c0f4fc2d6f1c02202a07e2e3b2079b8b9c9468d17b405aa07bab4978baa3a712e583b7d9c58b4c2e11385e110acb55d985f95328b0854e7955e53f1afb531a84766e886dd1bebddb3f54240d6416
+Secret Key: 1f10ab3432acc43516235432a10b42e2423acba73d8a2775bb4468cb78c346aa5f76400990187e4ce887167266b047b200552bbff96c5882421a83845e810c94620f5f994624f563b6b4594ed6a3e3b4333d82207ec181df0a71f4c254459677a1878cdad4331b985052489bb49534e4719a82373cd5f462b272300a3b335020c5ad9616d3c1bf97fb47ed1bb10966bbdb6705902262803a00c460a2e7719ed0e55caed60b56ccaf43f139a1754f7d99bbe74a965441a9b01a1c8fbc2855bb8304793f2e904ff9461f3fbc7c090cc4bf4475c5202ad4e50d743c498d46891b382949a56ef2a185d09031408801de81c3750b8ecb4237b45933679bb9d2b046b50b79a8f43d00c3854335b99bba312ed46514654cba7773b1817c89f38be8fc5ed3e4b912f08c7ae51d2919533e66a504e7ab6d28c6a7168ad2f254be49131b5ccd65046ef810b01cc3c78440b976194b28f7453d74ce21f967f20bb4c5f94d70990110e1c223b41320fc6473042c26a63121744d61919666b89a22674027511f1355139a469398d36f27d5be2359b60b6c19fa32022000172cc08361b0a49184ce38d9b15d6431fb423fc2b951299c221d7a1a1842c72a6794a30396d1a84ad9e56000a76d7ba62fa069b9e5d62bdf807639329c4a3c81abeb6a33673e5d376fae791e5e32c784faba416603f4d152b6b8b6a85c9668fcab6af7574e251922c976c25b0af2c381c358a1d42b3841309bc434851da1b60e05479e8b28ed350412888ad547c8fe1798da076564c8aa40713b51d999e5b46663b8aa4729111d0758e8e6c752c0b6eae3953a83ca087721d7a621c28bb077a6124958986fd171fd0b11bec01bea1170c609be66b0b1aeb65651f338038c9cc85b9e0c4414b4572a6ba3114376b8d182452bda9905d9a3b9886bcb08bae6c5be0d81359a19717fc019330217dca185ea46c9473433ee2633c518bba3ca737b56c8b8358915b10059067345a60d9b21657984bdc1887cea9c02628694de11262ab10c1dab9b4be177486c2bfc291d588cb2e35b2469104b53bba5b5e50808b9a7c6e296041124c7b3131c09c3774202052576faeb6c41a2aeb6b638472b91367421d26b59e8ba769ce71e8d030eae997b01d6308fc47781025c688abc4cbbcb32cb957ec1c056a243df0bcbd8ec5bef12b1f3005f0297bba330405140ba6ccb49a597bd65192eabd6481a85c903559c5e99af5ae963dc30808cf073cf03b8ec96915d7c9360a031401861e47a1bfd76a6cbb79c84aa3e7e088e814b06bcec3779e4286d4150579cb8e5839c77f61b4ce4c50ebcbc0a280b75962f3de363a0441afd719559069c8d4b8c6afa8751a2bc83d476824b23ac827bc6080f8a83591cc9b397713c0dc14c60b0249
ee69b0e0c2c2cf843e31022e8632b21408956dc02163c1240da478a8582f7371bf08b2edeb68d5db425b901a1da306fedecc1a3c57bcfe117f76cb06d33132ccabbb04ab2f35837e8f284b0b6be0feab766013134ab0babb19ceb67a4918790877b81ff8a90059b963b6012b8d6ba63056700110ddda4ac97577858124d62ea5d6c69add1da94ce104f5ae4190984291a66a71fd4c9a6a861c3148bf9865ee16c291fc8aadfe33f0a96ac81c2ae06a2c1ff199ae7153a3fe311b4d44094413c59ba2dcb103ceda9a8663291f78a46d56b7cbbaa57ee5abdbb434b5629448ba121e7f9a4384c5193fb13a888303d2b2a6b6878fab77c2b074fab60899d4a83fc74ba187327e87b52ad32410ca4844fb2cda09bb7abc5a627134322eabe077a7b5c81cc0dd5276eb583b4771824a3262359249b151b06356c6cbc2f8aab1c10a0930b41a8a90244525b051db4675aa343e1202632c6a9f3d4c836d068aa1b73f9a15f0eacc6efc8c5853c4637d7b31ce3c2aa1419b7da55d10cb65b4cc0918298f3233710c16f5cbc53f398a60dd27dc5dc6e3d815c94c9460158cbed7c537a8a1d3f08b12831b8f8f016c366c9bb73ab8154b39e6093c69c4baf144386c89ce67606028710abc81a2c03a1cd8aced356049be33e80ab8ab5031e10d6a3a46b223c475f045c0361fb36e689ad0651bde9e8857023bc95f86b3fb7c238346411c0534b6177ceb0c23ec7911590425e0a837179406d464655836b83c785654231a02b4fc8d85bde6ca8e6203cc68196f7d8a46927832348836af8484a3634622929f4f4878013c59b976799f4cd8ac6668c9a6a0924b3e07139958b8833b5b227c2ce3bc29572a95d09393458f72822c65a9d969b55db86c39437f5830bb6970b2ea2628e6c9d9cd8030e458f2fd4941a67c0dc562332a8bbc7230d21e09efc503c1888b999c285d17b1f5f4026725340b8a77f1f12890079cdcb661851751dd8a4c4699a67b025c3d610852439818a25578e676f81c17a7f5112067808b4e597648847352c09db6c0cd1c8203ef28c43eb281f100ea243c97715b24659a6039ab6126ab51e787504182b003c4585b579c4579da5717a72a9ca13c375e8f3cb56d3467fb44f676bb7e66143010cc77f645304eb70b572cb6841c4f1786e526460a6a1856f18ab15aa9f8eb6aa55eac80861bc3ddb0bcd102e24c363b6a8582b0bc8c8e8a4d53954b3b10eac6c7cdf54c246c13763403b8d4ccd9396982b6cbafdd60d2681b7d1eace3031a7d665776b129b71d93b24359d341043cc465499827dfa74cf973bbef333703b60b0d1b19a43281b29b8624be61939cb4b8347280ddc045f008a6c97ca1ed2334740191182108a461d70d284908c497d2350fe4c2d2da54445844867e93f9dc
96765d3a55307b12e725b78754e0aa1c7c2eb2a06bc0b7900c7d7143948869b51f259a4144db4b43c3d12047cd53132fa946db7a73ed6912b018160d088526105be98ad8410b9ba31cf2232cc779a4961429524c080077c9c971a8d4f766c92f16065e2342bf906a861156fd9194053ac2ba55604e9c1f94a95e971c94f14a50ad00830abc8412bcda9a94f8dc4725f538f8f8b8ddf37cbee67cd17ac3521e860428b15bc3c099d0494c170ade4b22b29c6c99a529c865458e544b31d747963000b854b55a4ebb8902879d5ec175d229e60a85505305d0f7bb729a510bc110882365f7e1a7b4b25811591c63737aa0c241af3f56fd7b98314044839f11cfa4c014b5562739883e4f1c0f4fc2d6f1c02202a07e2e3b2079b8b9c9468d17b405aa07bab4978baa3a712e583b7d9c58b4c2e11385e110acb55d985f95328b0854e7955e53f1afb531a84766e886dd1bebddb3f54240d6416454eea987cf1ff691c2f748cbe61618e564ff271b5d6fe02371731a25e23392ca25f247b01b41c215a285569b1496f1b528727b526f67cb8a967134c7c4bebf9
+Ciphertext: 362e09c268a44fd94a6c510692956f25ea43fad5effbeb119d2a5fe0a80f80cd88aea8ec5ed98ca4257bf7ebe186b704b1896b7d045af33905adfae8d2339944958d1585896067b8328f6bb148cbb64c0e49a2580919cdef5ce1af6a3379e791225c152b19350239223b477a724687a376461f639cd3960ac619798252c3efc69ad2172dba84becdaa94f76274ea2e958bcf98acb0879c0e98efa372ed1bfbd6e9d1174dcce051cea043ba173605e64238bd9e13fde3e9fc6adb0653f615651d7f5d806c2d7d64cfe8af3380525c99ddb52a367a0dae9f266ce9855a4945eb890c71bf9dc88e9efd9c3a496cb0bfd19be5cebd983fd1c3ad0d4d567880da7624eb39433b0958bcd109954e56776b3a757fecaee005cbcef21d21e39a3c26b76e76015859eed40ef10b6311d6ac2923c94de2ae4302704527f78d889b760e9dcdf6aaef71861bb8316c57ecec5c29d00e29effd9c2e1df3acdcbf263752a37659f3b6491f69adb73d3c100f34c472346efa2e5a04a7e3270654e278f53c88acc1e33269306a0df3646643ac43722bd0189f4112b6aa20be2bcfc9732785da7f86be6b0c38ebbb20cff3d629ac444a73376b0b945724f709ef302e6948a6fb3c3bc3324e746dbdb2a0599b1fa92104e2029110988d7c295d372aba105d8658f3638dff77c95d0b16230bd1d6ab1714e7d3902ad414db60617dbe9f4495798f144e249f3d50a1e80b4ca270e2676c80c62c21a4af69ef065f38c9e9470ed4aec770555d9405196f70d5ca3c32c396f550fa695abe0a9168f302ae19fe6444c8f2e8f5492513bacb1fee13f3632e3fb566280302439e8772df1fb7f40b93f310abf1ba0c048b024b327476bdae6417cbb5beaaf2ff6432c65c6f1c774671980b91b6a5a7b7ac245ec65391a396f9ffe9510c70f8d3b1e08f700e5615fd3fa40b94ea31740a37c499137cdfc9c09d5097038a385dd9f86a57cad34e0e324e5a651660e7bdc52449d6771a7d0d1145160fcf59b5cbce8bcbb425b464edbb480df0ae601dff25db540725dda8b648ed4753fc676d55a5e26fd8ebbdf5b065337c57f98d028a859a52b59969c772362ac72e9f4ad58ca7bb540c5cf26e7b5fe454eb04791c499e557829928750ce9e7007bf200bab5ed17f4d8f9647cefaaae55525ebf84ff7c4f1fddde1f02ea5f5bffac647b4d81a04669e14570e13c95e0640842937177314c3c035ebe5f8ab678c4f17c982b856e445033ce133ed843650dfbe5d86e565b593377a1a55eda73ce801b961ff3adb5c40007b3a1eb93613683b42b8b605d698a962a87a8676827d0de73eeb8539b10ff98621a69f9cb024db2a6bd0f955fe23425c8974475bd07c48d0fef2f794fd1f0808b5d6931c8d5a432e488429ce2
3247bbe38862e2277f6544833c6c86219b19f97af12cebbcb59b2e0c4308c41a056b5302bee892e1e3f0db21d1c3b5ef4fdf2def0298028635751e18a5f3b059df9fd69ad525bbe43c7ea6f38232984cdcec6b4789eea8b95ae08405a43ab
+Shared Secret B: 04f3e8c6551f0c9b19681d168ae5fe594d7e8e0583217258ec46765ffa9d0ae6
+Shared Secret A: 04f3e8c6551f0c9b19681d168ae5fe594d7e8e0583217258ec46765ffa9d0ae6
+Pseudorandom shared Secret A: bd86301f655cf5fa3b4754c4104bb2efcd064627412522efb01e1ee82133661f
+Public Key: 12a4aea7f9327ed92d5ec9698beb50e66bae1eb46575264beec8a5f8d8328e66b775f29d041c369c8c80efc8227089320a706bb310ccae03c2bef91380c4988108111356139d1a5223d186eb965b99ac8bafb221426953fd2c9ed91ab179f5098f746e8723531bd15a376c447f80b5fd797551e2a252683fb59b3d6e148182d489a1a5384326662a8084d8d29830d45f5342304becc88eb53dce1bbf20e976554abde47652f2574c6979b420c3c772e368e7ec3f6fd33720f8459e5846f9bbb09d210bc2a5abb8806767149b8ddb3f5c7a70a4eb4b32b00e5c961d3107856419ae0c018de07c6711068ad16bce82b44a2464414cd0768fe93d9f29976e6522a6985887f6639fb099aa5a31d80c7fb51c43c433b75d272a53108a5e289d19d8192bfca1e1a6a56ae165e9a88566c0b2aed8a3afc88865a695ab9129a8b9a7bc091c71423bfcd9c6b0268d97a7ab83eb73e91cacd0aa439d0941d07297659645cd1a5024e5331186a07f549a3af022ef7600fd232dd7c166ece632abb7150ef3b3387a9deb55496bc47943b00fed40616edc009ba85499c2b2a1e90857996b95f687a8d2bb21e1cfedd65e7389ad02263d1322572c94a5443bad61c484c0a36d0516239377411db56dacd680a51962cb7a19228896062c4188e5c7311640da710a724a49b1a857a3f172e0d10efc6474ba1580bd76020d645fb7fc056d1a25c083ad07d6ae6cca50f5a083d3073c52b61d33cb6a6d4b2402f282ac266abdd369dd2b31bee640d2609d84c53ceb250d6508939ea4257e1b40bdbc135bc132a8e25a3a11713130a4c2d2ca0812272bccc2f1b89c3e965cd5b7a0aaf061df662004261997e79b9dd3b15f722693a98116e22a6714a30f69aa4af701ef0013c7315a9e175c61f0387f011f7ad022a37879b67bce84d00dadd642c2509729b78a64e5babdb191ab8a77c83718852b1dfe497673e0a289392b4b637454a4aaea587d68b67a270437da2b07f2801aa21306991ba897a7789cc74f848c575a02cfdc879f2d379376fb74eca96de47a075b6868c68c77253b8f7461aae5b58af286a7f341b27f270aba614ce5687c53c2675055905be7a9673c637e763becb8ccfee9a2d2a0a3cd2144484204303c7463609bf767118a2c0eabc41fb74249b5505e77749e68a6b94270c95af1b3eb14cba76349794184eeeb56ff58218164a3981312db63893974b5345484bd6041a3a49832736d96111fceac1389e066dd146612459854059f0819b0064356971187a8bab5b86104b410b63c621ab1626c8d70ab90b4480310a4882897af6486e7e6cdbc0a7f8f512e98a555b701479ab0aa9f2b30b8b5413026c2964532485390b8f1ae12c60474e9ae3447794708039b4827b69015d0189c2
73c05198cb93ae5ad42a1718ea2bd11e7a7e23093ff8a0245a157da032fc307129e81b794e74821dc254966b3ee358df2fccf811b2712452b5036023c118e3013634b408aeba99fb59152fd0536c07c127bb64e76b8a88adbabf39732f78124a64baaffb1cf8ceb176e2c6ba2903b27407868979431ab7927132115189b408c4e79cc2a9c66a988d41c403099599ba54e4a8f8ddc7176160c92b153c1fc2933baabcf8d838e62f8ab43b2bf0b425748acdd6102d085daeb6ca3ab90874d79
+Secret Key: b06c4d4e30bd3d51a38626461e30c2fc34ae6d873852101452f37bf566644012044d8b43a24b5f01f251626521fd024aa5bb87a506681ddca21745749ba666aff7cd67fa3a7f05b814367d87f95b51c69662ac8d21e39447a53183718eb8857c5f9a3218b74ded1c8e67acc179d5471cea7461542efbca8374fa102398417a3b028ee940c94051c97b6186604faff220a1f215a9959d5dfa17b5c745236bc7ed8a7717919b18a0a755134b2e656f85749f85cc283d6a5f699894db050a4775ca4da1cbffb579e75b08f25652a9c956eba351e61a8e0a3bc5e635cf94914ffceb0b31668e15a20212bba840355581c5431c0c0ad7d8c17ab217bc61a1a0b55375c48d71a446839986c41505422c9ff8d89c28763337d4c0c40202acc926daaa337dfa4c858a34f9e6b5fdd69d0989c09930393b4019f226b3177b095e307e2fbc7ac0492c196c9cddac55fb9bb482fa7774601ee508584ba162c743ab401a83caca83c530744ee3a78cf27b6e9b93f08ac60a208e4728b923cb54f8ca5196f64ad92434ebac6e54856c5c74afa1d9865050bb487c3f7e70be723a530ef93227517c05237da7920d92045022345d3b07cbc07b247f2b656a1638f495be7ea9cbe71c7731d283fada429ab322be43644017528f946c97b39c04b35cc5e01a30c21f097cacf61988e46520c757b59737798eb18941435770b607cd6515ec515b5ef0684f559edcb42792d126138c9448c5c191c81bcb54748b7b1869205438b9bb1b9bb938f87c3c5806516a6fbf00b9dab62b93e9afe4c723ea3a2e4ff3af8f44ad35d16547989efe9682a703329fe3a74a8463b3ecadbb307b0f42545da5b74bd1bbab842387148a7999559871434a4aa4afcb32e33937a69903b44cc075c9a04154003e130bda5599fd4b4fa500422e9c0f76f0c25ab244d043a57732bce01505bd34c618c3cdf94776547b2ca6c85bcbd2c75e16abe7eca60c114ccf196b3e8882213b675b046dcc2249b2232aabc6696513c74f084dc8e938d5a83f012197d9289716446ce79b8841e7314194469b697f0c0923e0797d358b1a507928f66960071b3147e236038243b3b93f260b664da7874bda1d29e837ec9260d5e4acdbf76794a025e9f158f97671078766050b50f794233cbb0017a5458de86895466af3b7224b2c3db3a3a4244213374658888b9bd753c19fe1c6e7dc7d095b7e10562674814d0f6b7516755ef11a5f32f48432ca99444751e6c96f0071943029056b3bca87a82aace656628282e6a09ffb6b5c91e81432d5bd22a22bda434443e62990e3a94bcb78f98948f0397553c25c099c043835bca8a5b6b4f5731af53a2aecc6189919b61c50e2f58e1116ca3f40c588dc10e0072e3ee53e616c9f11eb2e286a4d2
e255d29d68794143e11267773090085444723cc2edbb404e2379112792de744536d999f52ab2a960c1f55a28f51514f2af17ae405b6f70a915006962f0a9a0cf434ff3b6ef6bb8b0ee769255c32bf51b939e89cff20621fac3d747b78d7aaaadefcb645e06dc8005f2200c06076c72b6a2a4e4237927cc921214725164e7fa17fac7b48ce70189f6b9cf769903ee942395710f415bab6299b1b73ad3aac8f12a4aea7f9327ed92d5ec9698beb50e66bae1eb46575264beec8a5f8d8328e66b775f29d041c369c8c80efc8227089320a706bb310ccae03c2bef91380c4988108111356139d1a5223d186eb965b99ac8bafb221426953fd2c9ed91ab179f5098f746e8723531bd15a376c447f80b5fd797551e2a252683fb59b3d6e148182d489a1a5384326662a8084d8d29830d45f5342304becc88eb53dce1bbf20e976554abde47652f2574c6979b420c3c772e368e7ec3f6fd33720f8459e5846f9bbb09d210bc2a5abb8806767149b8ddb3f5c7a70a4eb4b32b00e5c961d3107856419ae0c018de07c6711068ad16bce82b44a2464414cd0768fe93d9f29976e6522a6985887f6639fb099aa5a31d80c7fb51c43c433b75d272a53108a5e289d19d8192bfca1e1a6a56ae165e9a88566c0b2aed8a3afc88865a695ab9129a8b9a7bc091c71423bfcd9c6b0268d97a7ab83eb73e91cacd0aa439d0941d07297659645cd1a5024e5331186a07f549a3af022ef7600fd232dd7c166ece632abb7150ef3b3387a9deb55496bc47943b00fed40616edc009ba85499c2b2a1e90857996b95f687a8d2bb21e1cfedd65e7389ad02263d1322572c94a5443bad61c484c0a36d0516239377411db56dacd680a51962cb7a19228896062c4188e5c7311640da710a724a49b1a857a3f172e0d10efc6474ba1580bd76020d645fb7fc056d1a25c083ad07d6ae6cca50f5a083d3073c52b61d33cb6a6d4b2402f282ac266abdd369dd2b31bee640d2609d84c53ceb250d6508939ea4257e1b40bdbc135bc132a8e25a3a11713130a4c2d2ca0812272bccc2f1b89c3e965cd5b7a0aaf061df662004261997e79b9dd3b15f722693a98116e22a6714a30f69aa4af701ef0013c7315a9e175c61f0387f011f7ad022a37879b67bce84d00dadd642c2509729b78a64e5babdb191ab8a77c83718852b1dfe497673e0a289392b4b637454a4aaea587d68b67a270437da2b07f2801aa21306991ba897a7789cc74f848c575a02cfdc879f2d379376fb74eca96de47a075b6868c68c77253b8f7461aae5b58af286a7f341b27f270aba614ce5687c53c2675055905be7a9673c637e763becb8ccfee9a2d2a0a3cd2144484204303c7463609bf767118a2c0eabc41fb74249b5505e77749e68a6b94270c95af1b3eb1
4cba76349794184eeeb56ff58218164a3981312db63893974b5345484bd6041a3a49832736d96111fceac1389e066dd146612459854059f0819b0064356971187a8bab5b86104b410b63c621ab1626c8d70ab90b4480310a4882897af6486e7e6cdbc0a7f8f512e98a555b701479ab0aa9f2b30b8b5413026c2964532485390b8f1ae12c60474e9ae3447794708039b4827b69015d0189c273c05198cb93ae5ad42a1718ea2bd11e7a7e23093ff8a0245a157da032fc307129e81b794e74821dc254966b3ee358df2fccf811b2712452b5036023c118e3013634b408aeba99fb59152fd0536c07c127bb64e76b8a88adbabf39732f78124a64baaffb1cf8ceb176e2c6ba2903b27407868979431ab7927132115189b408c4e79cc2a9c66a988d41c403099599ba54e4a8f8ddc7176160c92b153c1fc2933baabcf8d838e62f8ab43b2bf0b425748acdd6102d085daeb6ca3ab90874d79e9c4df6656709c7dd7394dd66b51043993e7fda5cb2a70907d149df521c082834b55603ce096f17606978ff74ccee68a3a010e01cc7d5fa855d9fccc994f8700
+Ciphertext: 1f831dbde8dea58aea7f908189403d6aa0482585ee8a81b28a74fef5a63a764bb380a3443c6854ce5c45096a480960ac522d19150af90dd210e616d64af300d2ece9719bb107468316396c2195c5a8fe2ce3cee2863e63f1aaf8390a851c99acc51b5743e8090a04a338fadec4cf612c88ac8c547baf227b408556d094f8888430883a80d2300f52ce16d83291dc4f9a7b61c1ddd090a525d4ae6fd396abc1af25d627931a28afaa22500c2bae02fa9ae5dbdb2e3a553bec39abc781a91a1d8f0c91723e4ec405d3ff74aa705a53f7f3301a35fc3559c240d422675a971ff79af0035b5e573cb49d921e819f53ec8f2d3a60777835f664517c2a6b2f14290a22a865cb2fbf331e2be5d76e212333f0181be5e8341b80d7a234b487c349074c7dbeedb78cc3cc20007474962adcf7c5eba47887bd06d9af6a10520e05de4df40adef1f14f1e73244e19a7f74d28a1b3c60a92f64a6ed8fd064a0392da830b894bf1a1dc70a23e3ab97ff18827e284898143bb43ca7057c27e000951cd33bea58fd92ebaca271819b14649f4704fe6e6c4eb6a43954c36ab889e765fb3069aeda12203a70456993ff03f74a049e34e70c25a5937a5887fb4bee454734264cc297f59aa4006151c86e81382f391fc2246ed3c68a915debd4f752bc2ae7754df511de3b410cb156a03daf0c6293dff8b2a24ba6950d0be37d5f29913073c59adda955ed85a92cb01327fef3e0dea57b1b65293b6f8104dafa622b00f5f31c396e7f01b2495c3f2ad0c06c3b2c05cd87f8ad2edd032bdd5c0ef0e6d744b0ae87f393528f3e7b25cf0d2eef1c0c61390cfb188ac5b95c66e47ca55fcc7977ed63d7a4cd47587fd7b5b0a5387482e3de30bbe70d125e20420d80d80582969131408249ece86590062b552cab231a1fe123d627e9eec8bd51b1d45f82c6ac5af0f78d5cee555baf3b066f4bc4d00bd1c2ebc727d8b70a875e75225503d99e9a5be850a205e486b593ba942d9c99629cba0960e13991a61c694e5c0634af055793887f3a43a2ff93fd7fb8dfc06804852d1346addee524fc4a466bd9e246428f877b2ae5e7b30d1c8786d353e19d5eea025f7434b727fbb91f6d7f02d7ad72fe52ebf82f894badd926954d5b3644d8c4bcb0311b039e887f2859df9da0d6c569cfe6e2505c8eac48c3a6033050434b75008b762da8ef84fa0c63c8213793c7ce5b661f2425176a6c46c90500390605efb8c0d3d43c3f94aab9659ea20815c8b597f3c59c5e713aac5ce04994cb918987383f3b6258c2706c7764af4cdfee9e37a50661e4c1e46182bd58a192f4ee90adda2e6eb711c1dee3d7a5a4d39aaaaf3239f85e901ebfd3c5e48f7aa7fa0602f042d8981012b565e56d7af0a0b4ee58134d16bc81a6b1
2bf645d87ef99a3449b145d96414a169ffb6bd44575bd5c560db9e211da0eff4c52775b081df54932fb58fe1177f921ef869fb95204248bfe2955f7a6defbd276dbadec29a3fa9416243df6defb549edc184ee1e3c9548a46dd68a2451ed3
+Shared Secret B: 1a18b60694764cf529615515f7a25be17bb0d8de6793b791e83565d9e94ad6b9
+Shared Secret A: 1a18b60694764cf529615515f7a25be17bb0d8de6793b791e83565d9e94ad6b9
+Pseudorandom shared Secret A: 26ef19da24235bd342acf71a1a7177d09d72d2d302faea26df17247b46c4a7b0
+Public Key: 49b69c93ea2e853b009429174948593270483fcb883954c72300a67e185ad8f9c9d5fb21ba89b72d82751ec37b2255c85f4ac9813625af8440ef19a73c8324fd2c3431a70687f6a2456594b1a33c7af87961168a6798bc0df82076bb0e188a661d4839c7c469342061ff47b36024331ba131607039a3f93295c1b67d919575da17c5aa1b04882b0c7a4b561cad3aac82aba6ca1d681f3019a3ac40ab6d7b0ad40b05db3589ef09b279b9c946027b3bbcc854d85969f6962755b1d367701db3835df82b35f927880938e49065317c9d7a0c88a427534f5ba4136b257e532cd487a9ee6c4da4771a48a81b94392593a6826259a20807a1b6c2abe277b0f6e65ac7a67f37e344019b9beaf382f8025409c0164c87431df418e580be04304e1aa14732dbacbed75049f69a9e528d0dc4c1090a46f6e7302b9986b8262c330397d8085adf2c7e8242b770cc42ae8bb4f6457ee8e604a707889181799ff423edc03b94a4759b376009b5bbe6bb3926e2ce750035a5db37508b44bc830b740563598581ac3a1ae41449d0ea224c3c0bed79b23b249e8395c590127bb3c48095782653571a8cbbc6a3347122f8bf58b61dbf84a1ebe061a46c4272f34381e4c76941553bd020d740cd02c0c6a2f784295c62b9c08f7a1a72d78707655a5ac4097ef19380188b3399556562ab30350272e8903478c1a001b750cd8a68e56879fd612009890554444acc670adc460128b76bcbc7395b929c51d8700aca7ad6fb85937a8f18a5c70bc4bdb06a7bf5a10898f2a5800192ed74518b7bac2e116cf6c305e4399e763489e692630ddca459727411fc5506e71ff18bc72c7b603686c8f9bc62d64a25f99630b50b7b82ca080340b08fa207a469c121f3a083e38ceec08338385acbda00f1c16c17115e0969b7ab0bcf8725855d721d3113160b942d6577790407505fa3aa1a13391c0a7e35aa7037d9c432e547062c3835554a67f726572139d3ec6d1a34197d20a9973990e63331b3e7388671cb5b4597697a1eff33230ea925352964bfea3b3da225184b0fa9527ac73245d8a9a6b00a64b151cdb6000bca7ac9d87b455fa632d6e140fd4254aae8a2224b527fc02a665167c4722a4cf578af80b1d934caa7f4a0e2794569aaa0793a675120765ff87219cb8e3cd0981d200dd0da779a3830df31ce13146a96962f22d412180a9263e8cbb9d420789b925c547e2d121b15970a5e97c73e629416f218d9f60276e2a062c69a7514a714a897ef7c3e20d46c8c5253c11b4e457221c216ce17512f7be61001e2095877b7ff461f26606aaa95b16747695a037784b26de4c3939c776480e4bb75da27ed897f7f552e6cc7820d2bb0efbb210df2caecd91b3574c00519855da6a382b5c85a3cafa735250
a1ca01ef26629a9bfa7989eacfc4f8a395046cc71bdc382733c23465b409a8cc88d5889b4a6052071a139a0435c7ab536b70970486ea00425dab455e234534771a2fcf193a695ae1f18c9d198057e676cdc8b6f6204c8f616670cd49da3210f5c85ce9cc1c4701cc4f9a18edcc90a2e3a3da7f98d2bd405bc71acbb36a022983ed56a62c6059d8ffa355f0552cf4c2bc5386a2ac8be133574def370444a3621306180aac5be7943d14b0719ac0df0001370c47010ef9b894d07bf7b499538
+Secret Key: b684437f2b356b46c081db2041042995eacb244126c9bb5dd75230cc50048dfa68f636b110eb4843d16d407abf07ba5206373f17e7c5d2cc0211ea4a8792a3734439661b5316978c90f884ab38c590f015b042c858517e2beb3febca1abf83b787fa55e4fa89782883ea8077f6d6bf81a2ae2e024d44076cec3720c779b6604960d77a13eeb32e98e3a88931a4bdf71f57612b57d8810e388bb7333a98233c85374b23ba9f9f083d9528019256a3fb22b4dc226255927e8e23bf641b5fbd156c36fa75e9f81d94c798e46c73db77a898b75705fb479483c4399292b82c19c65a83170b241a2299d888a3a5c3397af62cb020afe112851d0022c647a3c9258a4b0b7f69c5ae4506457ada82a397ac77b0b05bd52a131c359db8a7ddaba2adc4ccfe48b04a134834b82e55e15f70ab338d19274c05023690bb7668cffcf7aaeaab25d0d41ee3b602278b424a92234e4641273a331d40bf22f5c3c5a95445b5125ce24870863645e5bc71625f12e14cae47926fd08d1e49223a4a855495aef3c02b7a9a644c675f487771c5599fa0279b1c61619692cd1a193f87c9a567a02fb9dac263519bf3a5b1d3f6863fb67647c6079e994fb490464fd7b07a033e5d55bc0e86c842c421600851c3c63287960dac8071de136fbc88a705a19e8294aaa474c436ec4a5f17729ab4b2619cb5086a45ca41592359bee0f65d58e584c992a7ba54aa6468951fbc4fa7663a9e42bf50661a2db628a97590a2f3616cb2696f16a7b5ec6442934adb116ee4661e1077a0a8f088db91789b353bd381aefe1a0ae75b1ab950406213096067af9df204a85237a4e35c40539f42274513829719da7d0fb8519fa157e128307ada7cab72270a559f35c24eda9bc73ef16c75833db44882620c176ba00a362b09d223cacf965a4a1242c73b2b4d04c2cf218063b43dea2c93954b5783ec407cd7852fa82c85c70f54c6c6fc0553eecc0b550292afa3479a503a7360926614c0f2461b581a565917ab081885f91a4082c5a5e1fa7054345e8d274484b504ab882b6eb2cd8509546046112669017315b59d667964a75499290627615f598ba9c1b04dce5144372b13164bb99ea86aa5b80d4d583d20109224ba9c0d6393bdc000a85563d97a96eb7525626b0cba44c822b7bf51a5c6519414a087a1c11c33112c1119e86fc5127af4a582e69b4723bb0d23e371356674ee991553a48ce00bcf47a04b386b435c42b5253617b8ac334121422344257cc2a7a7d4884275c929d392eb942f64c72e513045b165ad91ac7e1448ccec024d248106480004dc055e84a18b12103cd4eb553dd947907407381548284158148b1ff2358be387c9d651ba35861902c1455b6a497ad19c8d9b97765878d1e471169205401ab25
e8a9d59fa90fef63fc05a064cccc3a2237a522619db633a98b2c55d459b113b4a3d893e243688c69a8a8fd091f9d6566da15568e4ad79671365f21fd2b8ac021b9649628f4440943bbcc15c6856b6b408e0d3c065b37828b03886419aee798050a4b8fdb88b4636a1a7b92b8a62cf90d421ed82bae542a3e8798505971aa925b580838586b7bb35f27944923250385da64ab1e901668eb3a8d22c184fc4a149b69c93ea2e853b009429174948593270483fcb883954c72300a67e185ad8f9c9d5fb21ba89b72d82751ec37b2255c85f4ac9813625af8440ef19a73c8324fd2c3431a70687f6a2456594b1a33c7af87961168a6798bc0df82076bb0e188a661d4839c7c469342061ff47b36024331ba131607039a3f93295c1b67d919575da17c5aa1b04882b0c7a4b561cad3aac82aba6ca1d681f3019a3ac40ab6d7b0ad40b05db3589ef09b279b9c946027b3bbcc854d85969f6962755b1d367701db3835df82b35f927880938e49065317c9d7a0c88a427534f5ba4136b257e532cd487a9ee6c4da4771a48a81b94392593a6826259a20807a1b6c2abe277b0f6e65ac7a67f37e344019b9beaf382f8025409c0164c87431df418e580be04304e1aa14732dbacbed75049f69a9e528d0dc4c1090a46f6e7302b9986b8262c330397d8085adf2c7e8242b770cc42ae8bb4f6457ee8e604a707889181799ff423edc03b94a4759b376009b5bbe6bb3926e2ce750035a5db37508b44bc830b740563598581ac3a1ae41449d0ea224c3c0bed79b23b249e8395c590127bb3c48095782653571a8cbbc6a3347122f8bf58b61dbf84a1ebe061a46c4272f34381e4c76941553bd020d740cd02c0c6a2f784295c62b9c08f7a1a72d78707655a5ac4097ef19380188b3399556562ab30350272e8903478c1a001b750cd8a68e56879fd612009890554444acc670adc460128b76bcbc7395b929c51d8700aca7ad6fb85937a8f18a5c70bc4bdb06a7bf5a10898f2a5800192ed74518b7bac2e116cf6c305e4399e763489e692630ddca459727411fc5506e71ff18bc72c7b603686c8f9bc62d64a25f99630b50b7b82ca080340b08fa207a469c121f3a083e38ceec08338385acbda00f1c16c17115e0969b7ab0bcf8725855d721d3113160b942d6577790407505fa3aa1a13391c0a7e35aa7037d9c432e547062c3835554a67f726572139d3ec6d1a34197d20a9973990e63331b3e7388671cb5b4597697a1eff33230ea925352964bfea3b3da225184b0fa9527ac73245d8a9a6b00a64b151cdb6000bca7ac9d87b455fa632d6e140fd4254aae8a2224b527fc02a665167c4722a4cf578af80b1d934caa7f4a0e2794569aaa0793a675120765ff87219cb8e3cd0981d200dd0da779a3830df31ce131
46a96962f22d412180a9263e8cbb9d420789b925c547e2d121b15970a5e97c73e629416f218d9f60276e2a062c69a7514a714a897ef7c3e20d46c8c5253c11b4e457221c216ce17512f7be61001e2095877b7ff461f26606aaa95b16747695a037784b26de4c3939c776480e4bb75da27ed897f7f552e6cc7820d2bb0efbb210df2caecd91b3574c00519855da6a382b5c85a3cafa735250a1ca01ef26629a9bfa7989eacfc4f8a395046cc71bdc382733c23465b409a8cc88d5889b4a6052071a139a0435c7ab536b70970486ea00425dab455e234534771a2fcf193a695ae1f18c9d198057e676cdc8b6f6204c8f616670cd49da3210f5c85ce9cc1c4701cc4f9a18edcc90a2e3a3da7f98d2bd405bc71acbb36a022983ed56a62c6059d8ffa355f0552cf4c2bc5386a2ac8be133574def370444a3621306180aac5be7943d14b0719ac0df0001370c47010ef9b894d07bf7b4995389d368adec0e7efcb5c8a3dca48362487215b21e9f7c504bfd4e4729ac30fb213e54cc37a9f2e94b014fc7f5b3889c88f53ccb545726b162b3a5a9e366b17c6cc
+Ciphertext: 907bf7e8c20537730045792885ef07141886c7f8740544bc6b440e8d37c013d9a5174372934af751cac3b06e48a39197772b1bb652ee4f795df01ed62fa9a49d1edb1ffa2f49b1c28ce311655a6fe0da90b918f7e3e7eefbf23a40dad8dbb12a4a6bd454f4cb93e2a601e5c342236a454037868ea5ea09e4bc1793f939b230006ef558e067736e9bdca5e5f56abe8d94480b0de2b587c24ecef78330aafeaa680fc6f4ad389824ee136df784794c2b9929337072a89e5c938e95a825adc1ed224d72076ab5e8b5ddde515e6f57dc237585ceb60025ea551ed39fb21f153595da8db2cb30b7cd2415256797eaaec2003900ed5d86ef9ce15987327d6773734bbcdfe59e5f61d62022b72bad7080a85d507672b166a60783076415f3fc155108beaa2e7ecc197feec173a686e0e667b349a64b6924266c99010f9edb26e41dae62f8f3cb8f32cb8ea5914eb67be3b840b83dc40795898d486c33459005024ea0c1120a9c1552cbb40cd9202816ff240ec150501fc683dbd87dfca9ec7cab4e72e9fe144fbec9cdacd1d8aef8357d7246f058eae579c6e9b994c3a16b7cd250140e9b91b24222984017ae285a25ec9028942514dce0843e188b8e6795e7500e232c45da1f344221de3ef24a32313965d6ccec75596922134464318522a9da2048581059068172a2f20585b0e8bb1390124d78e51f9a8491fd6081aa571b823e3d7eeefae8d39c0fff7ed0f5b69f1bfb9ee663f4c93123b6a790347f16fede00f2ddc67a5005988b9dc80720d22df7ac10293602e590da263340a72007f1a8bca1e2a70f5ac9908ef8e4db858f25bbb17378434f870b0c12fc02032b4f12bc7c010de22b387f33f5eddb8fec76d327e272229ca306384c05cfefa20ee57876023bdf5a94e3458c63c6a6b313f6e5473f26f9a44e47ed5a2f3d8f98c0530e821378e01a9c813028d3b4f5e741a8178b905d130545ddb1a71ba0561a06461dabcc97188e0fa251fbd3e3a54cf49c48fbef3193cacf2bba7ac3fd5123f463e2d3319e9af4e7b7725195045f4bf521a4b63085bbdd08a4fb8c6da7092a1c6dd25038b9bd04e85245c3fa0150d3738a652e9448d775e7b3b54920a4525e264d823313231aea9b0a471df2c259924ef1b2cb886766952a45db163f23063fb6e978acfcb767336ef81a746d3ac841cb95abad440647189dfd2a7150015e36c3192fcf9cd9f718be4dc91851f440eb95bfb2b223aa31085e58d3e200c241df24957255487a2da5538a5340757b22606bd9b4059bc737747497464c023ad494286d9d1dac5a15726017941054583f2147d2d9b326afb5e3cd4e2a421778f86bbaa2513536afae9878eb18049e2ef7a32b6263f9ef54e736ec8ed8920648433174d9a0aa2ffe099cc
c6670cfa4b69342e374815c44b320629cabd5f7f3ad6eadea9c2c82a90f9ba36fdbbe8c3b6ae0bb8a1da513e7d4d509d9dac2fb04125555c23de9b0eb040a80f54fabbd6f9473961f4407125bbb734750723a1777c06cbf1bab3b45f77b3d
+Shared Secret B: 6073e667497869de533c56ac92a0a026cb52bad69b57b5d23c6416cbc09ebbe2
+Shared Secret A: 6073e667497869de533c56ac92a0a026cb52bad69b57b5d23c6416cbc09ebbe2
+Pseudorandom shared Secret A: ec0bec913a757d3f387c221a93b2ebff5427c8a1906a8c7114f144a6c52ae4c9
+Public Key: 42854126b08760677ee6496f18209d491513d1b6cab4636fc4685ae3d0971dcb20c2668a8a748466f96e71a7cf27d873f0601afae8895c053c27fb7f8555a46a009af1311c2624be94d4c6b28b8ce16676ab9c4f9b49337eb82964c932baca109f663b71a2977885c4aa5a2081814c0c8b2be1e8056fb21f36545c1772537863772f8c892ce5b43487859d82310708bb47f4460ab2ad8d24084c505a7f79b3cdfa86fe883352ab50a9b62e42e0bd6962a54c348b674627be016ce8b417d1046a3370b5a4320fc5eb993b513b07027d18e6319dd56122f4842819980dcb8efe3909fdb652264ca49839332be33a184012b0b06babf6b90d714b76b3a490c6a44bd10650d8c415141115a713eb09a94bf582f6683ea58a186fb418f0822689504bf307874e13697a85bc06351acd0625e84895e8f7c1f5ccb809bb7b6f42b018434af24887f6cb534054490b94bdd057ce5cd1993725a049c497c91969df8569de0a1a22d0c2ce214c3cc4a29108931c4b486309c859eb8acedc58a8840d4bd12b4bc0ccf7da54fb03021f0ac4c31ba76e6007afb40cadb024fcb74a43a50e43e6cf540001b5083806a63eb942bdbda2063db41908941c53bb46f7da9c10934f109a7343f9cde68889bd79bc8ab617b9e96980207af156031a1b4df3a3423a1583d4e75d5d487cb63470262b15b705404c2b53103148492bcacc208fa1e59ec986244d199755b94acf673f28217f63877a7241ba7a24741a41086fd248f3462db2d12934fc039ec448e4497aac58a715ba98ee804ae6697284964b8d3679b6b1c63a123f20559947056d774c4f08603b31ab133183a025e49c52c2c7564bc0e0c096311342af7c6a896ac0c68b4574f7ab53f671d9287c47a3a15d1a4a71045ace12627db07bfef98b6a194fb8993e6bec463c9915b3abb0822320e3d5141b85483c2817fcc460cb8741c6b879522b8ce82a10ae17c2c330c925e48c7b0b0682b72263c9affde7acf6b33c19523337b797b0e082d380cd0a72554be08a4f2797c72caf477061e241252813673e5173019339f84bba667317eca59b73d98b59e228b0178cc7b223dbecccbf70c16921630214c10024bb9a1797d26b663bbb4147e39fd4c65206336c76b029470ab62fc454b90b6a02a0414c4826542accc4e069c1db30de1c62a8ca8191c3ab0c436b6a47793ab46a86667fc5fa7ddfb3ae13155af8032b57c0cb58b932cebac0a49a3e53a1171e2c811af6018a159cdedc46d3dcc88a69cd936519a40917e4705c4b621f3f2285e7936148661583b1a145c5c4cfe67655d86b0386c46bd2303a6a984b5044768455231648264a8d285ac3b3cbafb880200bb50c94acca11f417988a3c4f989b715503e70b26dfb6ca291b94e66718a
478b0d954b848749c21f7b1faf963d861cfb34ac1711c75fd070431221fe08b2b4638cb481c4ce9f956a6a890ba60b3558908d2b07a2cb64162731705232b5a2c3543c2758c5ccf9c30411c49a8fd957b8e7714c2c66cf8512588c8be6b1528b3529c19c5c62fe28d2882cce7c68d2ad7a24517a71bc3a9c029ac7aec520d4b91e1f09958aa452d4988ebf4a3b60ac9f1512ffaa9383a25375d3b7d808530a409ce571a628ca0aa98e2669e6aca5cda0c9ae7ed39d0ba33c2f7665cf49f2e
+Secret Key: 359b7e5923994374a9d90c2ac4f1895157715fe02591ea2b1dc2b0bf3120f80537e96b22d8da97151044bffa4e7e5783650078161a70544547ebc05605f668381a53fe2b02d0d9cd9e75aa0dd088b79192098aa56782856f069186a0c256156cc7260edd4602d36c32acbbce89a53915f501d4d29a21119fe5a82777d26a79238eeeccac9054855c69bc6e0abe982c512ee51c873a9c20db54d6157c42508535d790842995a7b198d42a1d36a808f2737ce5937d38b37d0558089be2a0d7804976f25b84426b8f9c2f27c4bb7976ad82051328754614a56441aa1c363a9bcc7a6d8d6348554a9feff59f61fb951611afb3fc94d7ec72a65c800fe435ec210dbb618b49d642f6c79060205dc47a03751cc7732b83eff26fc9d7939fe41e06d3cbf0f65ff5d6397a99c29dd220ccfa600ad390fab86faad9397eb66f59ec8f5719af88cab154b0b8a6909d28f8373b9a1aabb86e4b44915f86c33fb2505f837802994680b2aed75773995a8dcfc09c88c6664c11cc2be2cf42f31732893f4f43080bd7176a733190ac57318849cddb73ba0163e2022d3873b2810b1eb7386f3c40c98f294c8a4193b3c3237078bde217cb6b9849bb0114dda6256162bb0fd2464cf10ed76ba173779af4882bbf1a9cad7b1fc7864990481b5888a3d2910124d4c06d787facf9584021b7ac7252dcb91df2a645d59bb242308b60f90daecb249f3cbc17e610c31434194a647095a85c674010862756730b8a586b99207b18a4673d350d3e4a1663459f67ebc2ec0477fb0c2a739bc7555cc5c73a273a23afbfc7187100541e092e0c8a5eed55b27956290fc72992361cfda89706805467648204542545d091bc4321cc7acacd18084b826e0b2340275b530ae82a84996e78188594f43c5c72c580b55cd1731ca309b38cac18cb89194461316af795ee9c760cf48b878bccbf9915913c6964738316811c7a9a9258e179e959bc1e2116acb7c7fcfa317e971ed4582ff8c54204dc27d8674eb2b178fa7c29b80cc200c44bc90ace1680c8c528aedeb913ead905384b2fe9a405a4282525a237e27b3adb285aa4bb8a2230696957755fd3578a204fc1cc9e0bf39d11d469d07b10f429548afa03d4c238c2357f937752eaf94e26626dbad69c11381957b9b8a843ace4c0b7b5ccbb68033afac6290b1b2c8e00885a565a58f07121f9ac656abf835868528ab875557611f26e75377f23b50bb1c9bf1eb65ec6c38b2dc47258542236ba4734c54b13c5cfc47b39e0121f422172ab340ce26c0feeb95933f93d32221574c6185334a1dc881af9435f010ad093c394e35a5c32f22751e4a39c147c49a5c281145f8370a3cf7b14872b566ed3adee32ccbc78969dc87c18a314c5844b97ba288422b927007da
d16b974ab43f256c6151a7ab1617de390276f00102b4c111579c12f226acb9551cdf4a52ac3b433d13379223c686313b6c9bb261aa1bb91471d0ac1b1a956668933e269161a26844761aa65b284e6e025ccfa3f7560ad3897a6d7399e40857509c6c00b37a8d2a95bdbaa26b1cccfaae213bde49ca41653bb36aa1bfa04dada7efbfba733ec1e13fa91af747427751891bc09e3ecc1af4a1583c30096ab8242854126b08760677ee6496f18209d491513d1b6cab4636fc4685ae3d0971dcb20c2668a8a748466f96e71a7cf27d873f0601afae8895c053c27fb7f8555a46a009af1311c2624be94d4c6b28b8ce16676ab9c4f9b49337eb82964c932baca109f663b71a2977885c4aa5a2081814c0c8b2be1e8056fb21f36545c1772537863772f8c892ce5b43487859d82310708bb47f4460ab2ad8d24084c505a7f79b3cdfa86fe883352ab50a9b62e42e0bd6962a54c348b674627be016ce8b417d1046a3370b5a4320fc5eb993b513b07027d18e6319dd56122f4842819980dcb8efe3909fdb652264ca49839332be33a184012b0b06babf6b90d714b76b3a490c6a44bd10650d8c415141115a713eb09a94bf582f6683ea58a186fb418f0822689504bf307874e13697a85bc06351acd0625e84895e8f7c1f5ccb809bb7b6f42b018434af24887f6cb534054490b94bdd057ce5cd1993725a049c497c91969df8569de0a1a22d0c2ce214c3cc4a29108931c4b486309c859eb8acedc58a8840d4bd12b4bc0ccf7da54fb03021f0ac4c31ba76e6007afb40cadb024fcb74a43a50e43e6cf540001b5083806a63eb942bdbda2063db41908941c53bb46f7da9c10934f109a7343f9cde68889bd79bc8ab617b9e96980207af156031a1b4df3a3423a1583d4e75d5d487cb63470262b15b705404c2b53103148492bcacc208fa1e59ec986244d199755b94acf673f28217f63877a7241ba7a24741a41086fd248f3462db2d12934fc039ec448e4497aac58a715ba98ee804ae6697284964b8d3679b6b1c63a123f20559947056d774c4f08603b31ab133183a025e49c52c2c7564bc0e0c096311342af7c6a896ac0c68b4574f7ab53f671d9287c47a3a15d1a4a71045ace12627db07bfef98b6a194fb8993e6bec463c9915b3abb0822320e3d5141b85483c2817fcc460cb8741c6b879522b8ce82a10ae17c2c330c925e48c7b0b0682b72263c9affde7acf6b33c19523337b797b0e082d380cd0a72554be08a4f2797c72caf477061e241252813673e5173019339f84bba667317eca59b73d98b59e228b0178cc7b223dbecccbf70c16921630214c10024bb9a1797d26b663bbb4147e39fd4c65206336c76b029470ab62fc454b90b6a02a0414c4826542accc4e069c1db30de1c62a8ca8191c
3ab0c436b6a47793ab46a86667fc5fa7ddfb3ae13155af8032b57c0cb58b932cebac0a49a3e53a1171e2c811af6018a159cdedc46d3dcc88a69cd936519a40917e4705c4b621f3f2285e7936148661583b1a145c5c4cfe67655d86b0386c46bd2303a6a984b5044768455231648264a8d285ac3b3cbafb880200bb50c94acca11f417988a3c4f989b715503e70b26dfb6ca291b94e66718a478b0d954b848749c21f7b1faf963d861cfb34ac1711c75fd070431221fe08b2b4638cb481c4ce9f956a6a890ba60b3558908d2b07a2cb64162731705232b5a2c3543c2758c5ccf9c30411c49a8fd957b8e7714c2c66cf8512588c8be6b1528b3529c19c5c62fe28d2882cce7c68d2ad7a24517a71bc3a9c029ac7aec520d4b91e1f09958aa452d4988ebf4a3b60ac9f1512ffaa9383a25375d3b7d808530a409ce571a628ca0aa98e2669e6aca5cda0c9ae7ed39d0ba33c2f7665cf49f2e8934f0a7958dd1752977bbfbf87883fda0957c7084f8ae7bd7e418fd583e6c0e7a09d13a37330db4d7dc10605a823595b3a3a784d530e2e1e11c315527ec2bcf
+Ciphertext: 03073fcba8f45e2a1594d1430cddf2aa7e875f10b59322dacd9b8590d57e054a6bbe91a5f831e0147ac81556455c4170799707de5bfcd44b4fe2058ff7e3d91090c3bda2b9f9fe2dd568451e01d94d44b01407ae7f5f324589d712af1e9c49e4f449a13752d6501b2683bc0b2864301ee93a5cf74213891bd1d9edf36983581e3de2378c76e5de38f88a02bc116d91317c1ad6c8e149893f56f1b6afbf526f88c43c5ee9c8d7aa5b05eaf73aee2760a246f35578790714942fe496a299feb3625c67375afc85107e34627714f88812d3337c1ccf970680736219069f25974d407b04ca7205762bdcfe17195d785b81de0ff3bc1153d9cbd1e259a819c9b0a49621aabbfac4dfb7552f7a66e2dad593636a1f2f92f92e78277fc0793dcd21f2a77b14148256866066ea45d80c0cde5a3e88f1bac13a1a82257df8b2ebe789325c86efe268f9e3e4db128fd4c89a0faa4c7cc7ce4bbb29d1690f850ebbe4680988ac791c2b154cd89f945aa11e5c8fbe74080b2f5a118a1273e5a7d7b2e37df974a0dc50cceaaca62584ad248f3ef3ce33de9bebf3306834ef6f6bcd04d37168548625620a3d2c2a220124b20fd6bce232160b18d99f38652ea4bbd5808bb60b596c6227d6a3fac118699bb8a7c6c1c6b4922b766aa649bb6683640538ff9a6977591f56b1532abc033ec0aeb3096e796e23a14a3d998eedea285d10f8d5af90493fffecdcbe14b5ae17f3f0e5b8852306f370a1d2acb80f9963b87214949c23ace694f97ebcaccc451cc8980289b35775af86e9635444b540a0239494154626e60d165faaebd7faf353801d9bbea28df436376a0bc86fdbf23572615c25663297a414399614602df64122a07df1a50dced74d8e7ef9da9645849c8f7a00a0ef2e3b7e0f20f5f07abd3454dbbe29474fc158d28189dffeec50d500328a3301697c9cf57eeee8e7b2fd6a885a51844fded64cae513ffb248ed98fc449a3201151574c5b4a9fcf2fc0e1776413f28bdf867b8bd659d05b6627cd0e8965afcd7fb0533b30b6990e413916cc6f52d2f01777e28f92d36a475ddc31912baba7c91d98737dc0c245fced10aedeb495bbd862fcc1512a8e522eb9389b3fcbc88e21940905f7284e18006bbd8c664656660b261bb746854121f29dccbab68500ef34747e6ec3931b8129b82f673383cfa913eca784554300ee6afb0d5dbab35aec8aeb0f50ccdd193d8de33e7e917678ae78eb3912ec0d407605b8d5b7488ef2367d692cfce95491499099e9885971187822870d5db9995d3bf1600888f0587ac5013d9d8b7329fd5f0065433fc37b7bbf0d3944bbcf92a8449b7da0346ab397579f1103f3bc00a468d75d183ca543a99ec15f2c6af6841d19bd626146a2cca7850f3899be7f1
7a2e65eaccfcf88f643c42fdff9163ef1eec4ab6fdc6be81db8cf13e1405d74bd6816449fbeb23afd212792a4bdf72c4ba06c9f838f928be020960557a3fc2407ae608e494724e22023493f0e286207dd9e3573309ff7a41e7089ea67ca3b
+Shared Secret B: 3471f3f3e511e19e286cf2ef9cde315bcafcb9d3010afc3e92d2e0527449cee7
+Shared Secret A: 3471f3f3e511e19e286cf2ef9cde315bcafcb9d3010afc3e92d2e0527449cee7
+Pseudorandom shared Secret A: 9a78e3995573ecd766419a3c47a2902e4fb9c4271a8c41273233b139ce49e239
+Public Key: 4b022e127155dec9717d7038d7322a85772c752806560771351a9894b19b76312cbe5b9c7a54344f241eb81a2aef728910990e69482e5bc4cd5271617803a4b7eaab8ba95aef0012397b50c7e1af34b8c384728f62f51f49a2af694160faeb37f43339f75988b5c6ccff3bc2db669d748c94734b84ac763407721fd2a47a24f7564a4480579c1ef86134d5455db95445c5249364a94f74a33186f9298fc7329316bbad54a249cc2d4343bbe2ea2ca1d09e4dab85415525bff4cca75098a750c510ab7e9db63294cc97c1a36083282bd5f69882a86a154949657c2bb6b3809c279c5230b998b06acd9a4a61ac755019bf395bc8635843d642ccae4b423f542d90743cc86583554904c9e443f6cc9067a57be0b751a88238eca77d4225832968bd358772446028a549cb19845ca56cc04ebb965186b2f76a01daeac1ac60cadbfb39835837daea5cdaaa191f9a88581017be42a0d6410e495704311553bb320633490072a5891b251234a3684b4b7a4610986c861216203c7bd700aeab64ef882977737219224d9410cbce10ac955a6b39497fe468163a9230653a5303d07bdc56081fcca726f982a3b882fd59b5db23283b29b5b4a6b0becaced8762097001813e829803b53752c4768157d981419edc09e0eeaba2238955f02ba7e8c2a13f6ce4bc8991ce38d5a8170eb486ccd587a106531fa6ba03ce98e243901203a6052e26b71f5a7702a7cb1959d12667e0420c1acb3bc52d68e7a8b19d0c43ddd077094ab1e7485837698c22a895357e9483f7b166f92c7996b6c3b36152965867b353f8cf184009b2b3488a624e203ad72bfb6788799622d0822645d6412a5437e97c603b653457318b29111609420827c5c13b9498a762243e1e48a682c6bb3c624dedcb3962251ee4201fb4085e4c41677f88d41f34b99041735010246d46ce939329454472b752e8eec0bc48c718fb52d9aaabf9b95a9befc32bf8279fa88aa21e399cda6794dd76205c4529f09484f8a1e3588cdba570abb9b73ba37582979a4bfe3922dac0048b56d7971962f38ceef915e90603cc8d079cc465012c198e023b95b1cb027881b095367a1d840f798954dd96ab815b9765397b96114a554aa15d154eba02a7b8c89e413c749229e1b759680e2be8e0745cc730bc7276ad20526371106fd316e200633f945ac5d571f53eaacb8a26069938462399b7d83c608f895fe10971f468ab752a1cbc97752f1cdf67c23e2a10455e758b7b9659528142165049dc75414e136acb89fe1aa2cb7e1bbd7221d0121a4a9089d0fb2a50bf65b73514b9e231994e5145a46105870962321c199e9565336634d351b7c65af87544905c8582609629002bcdc46a360bc8d7b2c216189cefa473c0d33a4546aca961acc2
3c48b0775918a697e42576980f24d95a619db93a8c34b5ff9a93dd673b699ab0b34fccee7d2936d35392f2336715c14f897069e9b38f02b75d6192fc8ca6ce5d78dd61776f61acb06b80ccd1ab771a39eb4792b42668a788a702dba8f9c74c8d341b1079b610dc3b183f6570046559b351f64bb0cd288b1af22b6cf0631aa5384a30985fc8386c9a1c9d87970ec78a75204b4c346abbfc69f4660b74a3aa1e82e7cfe2383ef76b1706b1abd0206e846e35b417f8b5e8b36e29849ad9acb4f
+Secret Key: 65dbafe7756f7101853632bcc48c6c900b402fbc78d4b41efbc155a55aafd8925f876c52327b5b2f5a32d16182778809ec6a56013b1053297366179326917e9e8995bcb5ca572939fac0315ef328543bb1afc29062a2665c4b61e477072b80ccb4b2991c6830a7139a2173a889267048f00bebaa4d1cda29d60256dbea8127506bc135be6a3c870c3737356cc24e04c95536cdef0866a4c13531243488fc446e91382bf178303cb65a67cea1f7ccea276ca3706ee7bb9c8e259a50b8090fda564f355d442635975a498f64ce1804ca9de282fed67a44864a81a53cef99904d591e109839aea69da53cb84ca31e6f441ec499b9cfd730a71c00246c7a7e5954d4919e36f1a7bab858bc6a4f1abc418f7b87ea19322f749f3610b267b444468579a5a5be06554f6f1a5fb7c97c8da38cf62052e4519301181337b644af5b4a6dcc21c687a29c57956e7501ffc90d3b754f3de8aab3a63a5deab82fac4a7eab06d82a39b588ba6bfc4db72c35a74944e2d3696c765d52e92c475090b4a50cdac51757e13a08f2088cb9747167581420a072d07052841326a3b5acf23c0c417d34050aeb8b33562bafc0e0372d009b5585c1e22a4929ec3878f7980670222b014001872b9870698b55a519ab301089041504b4bed757e8c54949748a0a7963440612c6fb31d6480d10c8bd8141ce390b691673738bdab1f30273762c149f4a6325b139923bc8e5019a5fe858ec435232a3b4a489bce6811e569c324d363d487615eb6380f1e87450670b3c37bea962abc5f623190162712693d6fa57d3b5917a021f00482acd7ab61bc722240586498039d36748ab786dead54ba79525435995abe07045113db6719bf169846e7251e1231279ab427e5a0c9e3c36df0020067a7183a6115a25625ddbb676575f1129c818749a14d6572cfa7379e6ac94b2436d6c96651a0bd1d875c36667a6119b9ac197651865935c8f44dbc005b4c1b2372e77cb6183e8a24678c61f142da9187626542fc3473995272a3c68bf4d45544e4a58cbba0e84e0b81e1530adf0c4073a51cad33fec24a485105aadd62291455045e40d34e7486cdc21f32ac761d54db2423fc2c6c2430124615731f9903a3a132a5bfc19a3e6749da052592c79ecfa5569768dd9f0c62a660122d54e76720d4f488e1638c887b281d428a57d093c4e4b4fde520a19b73da9d4545bfb7a8f530fb8971df4f73f392a55d7e4bf7f595065d70f720b5da5cc5f6365262a6ac93901407e1ba94765031ea332ad0854ebd23f6fe03815e28c9bb32c4b226b94966e62d6ace56a605a3c1e2e865aefb55c3378b69be4400b05abcb68c30eda4a55a695ee38359d0c7584d60dada07667191399709a135126b86416c70c39759723f6d49238b34b0
6349a7d85203d901c1ee1cb8af20f03d84817a23905f97a9492b31c7a5a933a77998b99e7659dda3b5ee4b3a06de88a1042217811285eb06d9b8ca5f5236918e2ad0f9286a3040ddc6b9908b453f0a925aee2541e2a6348321b39217a19c60485d05a7246b6737433a810b95de67c4aab33f2bccd1e95acf36961f2a4808e0a2011293473149215b32b831702718c2dac324ab05280cb1b51cc522d2f981c4b022e127155dec9717d7038d7322a85772c752806560771351a9894b19b76312cbe5b9c7a54344f241eb81a2aef728910990e69482e5bc4cd5271617803a4b7eaab8ba95aef0012397b50c7e1af34b8c384728f62f51f49a2af694160faeb37f43339f75988b5c6ccff3bc2db669d748c94734b84ac763407721fd2a47a24f7564a4480579c1ef86134d5455db95445c5249364a94f74a33186f9298fc7329316bbad54a249cc2d4343bbe2ea2ca1d09e4dab85415525bff4cca75098a750c510ab7e9db63294cc97c1a36083282bd5f69882a86a154949657c2bb6b3809c279c5230b998b06acd9a4a61ac755019bf395bc8635843d642ccae4b423f542d90743cc86583554904c9e443f6cc9067a57be0b751a88238eca77d4225832968bd358772446028a549cb19845ca56cc04ebb965186b2f76a01daeac1ac60cadbfb39835837daea5cdaaa191f9a88581017be42a0d6410e495704311553bb320633490072a5891b251234a3684b4b7a4610986c861216203c7bd700aeab64ef882977737219224d9410cbce10ac955a6b39497fe468163a9230653a5303d07bdc56081fcca726f982a3b882fd59b5db23283b29b5b4a6b0becaced8762097001813e829803b53752c4768157d981419edc09e0eeaba2238955f02ba7e8c2a13f6ce4bc8991ce38d5a8170eb486ccd587a106531fa6ba03ce98e243901203a6052e26b71f5a7702a7cb1959d12667e0420c1acb3bc52d68e7a8b19d0c43ddd077094ab1e7485837698c22a895357e9483f7b166f92c7996b6c3b36152965867b353f8cf184009b2b3488a624e203ad72bfb6788799622d0822645d6412a5437e97c603b653457318b29111609420827c5c13b9498a762243e1e48a682c6bb3c624dedcb3962251ee4201fb4085e4c41677f88d41f34b99041735010246d46ce939329454472b752e8eec0bc48c718fb52d9aaabf9b95a9befc32bf8279fa88aa21e399cda6794dd76205c4529f09484f8a1e3588cdba570abb9b73ba37582979a4bfe3922dac0048b56d7971962f38ceef915e90603cc8d079cc465012c198e023b95b1cb027881b095367a1d840f798954dd96ab815b9765397b96114a554aa15d154eba02a7b8c89e413c749229e1b759680e2be8e0745cc730bc7276ad20526371106fd316e200633f94
5ac5d571f53eaacb8a26069938462399b7d83c608f895fe10971f468ab752a1cbc97752f1cdf67c23e2a10455e758b7b9659528142165049dc75414e136acb89fe1aa2cb7e1bbd7221d0121a4a9089d0fb2a50bf65b73514b9e231994e5145a46105870962321c199e9565336634d351b7c65af87544905c8582609629002bcdc46a360bc8d7b2c216189cefa473c0d33a4546aca961acc23c48b0775918a697e42576980f24d95a619db93a8c34b5ff9a93dd673b699ab0b34fccee7d2936d35392f2336715c14f897069e9b38f02b75d6192fc8ca6ce5d78dd61776f61acb06b80ccd1ab771a39eb4792b42668a788a702dba8f9c74c8d341b1079b610dc3b183f6570046559b351f64bb0cd288b1af22b6cf0631aa5384a30985fc8386c9a1c9d87970ec78a75204b4c346abbfc69f4660b74a3aa1e82e7cfe2383ef76b1706b1abd0206e846e35b417f8b5e8b36e29849ad9acb4fbc81d2e934b56e773510727e7bfe175b6a4a4ac2e554986478aa665dff722217f012556160316f659cab6895d02c47deda1ac4ae347cc23c51d8ce6d6a8d29e7
+Ciphertext: 45ed1076e1ab87bc266592522c9a0b53b2a475d42bd204acfff529d0b7afa8033020d42abd0f9c334aceb2c9ee44b86df9b7ccf390bafcaaf01657f0b1be666d3162d9caf9edbebb903f495c14a07a98378435011f6c7fa525101d7e09ce91b004b8fed6f50c7bc6dd8495c42d893e09b790f0cf2fc008a8fa26e69fe250881634c141e67e658cdea784ce15ab6d8ef658ba9dca98d33ce4a8463923a458dae78888302e9bfc34c1bf57bfd3d9347b08bba547949839210684559519f648d2aa07d83c715a95f5593e0570ba489686f7606ed584a43afe4fd75c40072fcf990f8f82be95d47327778c56cb43a4b497ee6a244de81dd0cdc298c112000adfca45dd9391cfd0a979af224804b0c09e1f17a64415ee71faafa1942da363086651085b2bc2ce3daff3609a0e6efe117f7888f4ce0b39ac84c5ca3b99169e657dc093abbb6e150d92c6af4836e4fdfc5a149de37b603a02588f5caa3becfb1d7c9ad49141f453f6e8c6cfcf5c178177ba4d6016db9814dff5df643f07671e12e2aa89a40b52017ef08120a89247780813038ea55924d9252626d2f01708942fbe4eff53e7c476a864bfdad08f5a60e7799e998534365b1d5e4b775fd1bdd6a3213fecad0c24533feaab3026177bb262e096ef862f58f2c01b62b0a300068b01dd3cc7ccd94d64564af8d70281b82fc1fa57dd41281a5c434c1462a1c09b95d9ec51b6382621b250031d6255c142359e04ff3ea56d737876ae4d6a3ceba9953a64a05a3724c98c95498d6bb6cc9192ac7bdb20a78fc6d40c1289d9727e99d74512323b32b77028de72576f8e1d40f6ce92b00a77b42d8ef4a7a90931358f9494e188b437640867e742388f7108c6f86edb4b7cb834fc1f229f4c48ab1ced053a60fda259eca3ecb03cc50266a4f7cd4c39e0a226dd9fb55847d95c85ff41a75592f1122e038b2f8a5142e7b4539a3fcf62f016f5db3a8c4895f1dea240c17748790b1771cbdfca8e57981318a9b042a6294504358d94fd1ed438621f1a72a86312ed9c7cf440eba2697c126893af2dec0cf844819e0d5099a7860806659cbc80c040a5f83a2dd9b73af41813495d35e8ad22424f978d2168dd5a4fb8a591d8c1553be7883381d006a18af3595ae13341c935ade057188cb62231061d17f402811245bd759b3fca95cec7c825d414c8bbdbb58c60804e4bdef42b3fff773f0dc91b5f1ea12bb07d6246c0c3f1008293552dfc0e9bd207fcb6ed32bb905106745d99a49a14fdffab1921faf52a411aedc6d4704b1ef94d7f8a01a384825cd4ed13e55c5ec6c6581736280eb57ccb212df2fbf2f016d740d4760520fa2a793c1dfaf976d9450ecaca57051a37bf804f64efda029ba93df1f2f66c878441e286fa3cd7b0092af
facf0b6613023eb2dfbd48530e1c8fdf16db4e8eed0c977e6e664483a60a1644ab21e54dc4687858b02db373c2b698279977a1226d6ede22da319d85dde4966b81a910c9afc1ffb036b8601badcb4fb3ed379755cebc5657e4a69ca710540
+Shared Secret B: 59eb3cd8b3db755fc50c25e7f2d8de6b1cc83098ccba0c285e92319fe47b3529
+Shared Secret A: 59eb3cd8b3db755fc50c25e7f2d8de6b1cc83098ccba0c285e92319fe47b3529
+Pseudorandom shared Secret A: 425357bd9c736b2ce0c6c548bb2a76eab4fdffbe0eb602d345dcc2aa2278ca9a
+Public Key: 62d17fb6c88480031680fb60047892396672cda6ca76c97ef633408319059e292189910b64fc6078e514642970f1a5187df5522fa585f0e6673c53a8076065f2e1a3c4b761a037064446cedfca6cb8375de0185eb472b34d14a0aa7128ed39b0960861630975c3c038ded102f658109224ab58333db4a556a3d15a2ae2a1657c8c82b1b260b6a798715fd6dc21854c86045890a755058c003d67c077c248c5fbc37863a56ee3808acd8879f06319daa8bece7b07a6323daeac275e193194528db3115a5e47950d23ad1be76e31048461783f477900c8882793c28ecf8358e8e8016d0156d91790643a14dcd17ba77b3413d51d40684018d63137f61f1262c18db534ed460cf5f2484643232887c67c23cd9f7ac9557420b3f6c8f1b64c3bb423c46b7c8272c05cd79113466bc664123d94853350bd7ceca1130238b5b3070f9563049ba4c6628fe012243a384f5aa1455a12ab8dd322186121b3e2450be97405a20b54a193353baa9e11cb8d15cfbbf31945564fa6180ffe100e1afc6572956b091a46b08c180ff91dd9592684535c8e3433f2e367a5e6c1ce622e8c3033e7d79f3a0c7955945871d8a75d3c81438b40a4c790e4ca39ebc6509f9472155acc89795b386007912685d0dc0cf66cbe9a6478cc674a1061941233c4523b9e54b54dc889334b3a7757e043207b644c079cb8f18f6361ae3db78a29d445ac36ae7147a5287b24c70bad50006f73f5ab53e03266a2913e016277a89133224060e80cfb56925f234432fa81fa20c22caa6255c3a15777b6403a6ed0245d32287c97827362c489f118c7dce97bcb9a8c0fc7b664050a0e8c28ebea3855b5b11ad426026cc3295c2c6cb27315950ea56a8023277534ea5fd046c84f358677c3b9a967c8cb8b786399c0d4117f12ca2cef82ad8fb09ce7809c0bf8ad001512c4da16f86048b8120893b042f5a147880a7a65eb8be8fa97aa87aa920260be54bb1b0121f58124e8dc1960e566e3d73bdf303235b74fa8362dbbe7be50b5296575c6536690304cc8a8813b7f0b039ae13d9925a66af067f0d5a8f7d8c00adaafce3477f079317ccca6c25bcdbaa1c0ebaa27ea34c3d0e3276a42bebde3185604b290446cd74b6d6638c6a1a979c6843b583a7334660092f525fd7a6d87905bcebc1a28faa840a38fa2d667c5e75c416a559ce190007b74b7e0ac84d2c1dc385f6d74a551e01caeba1e9ed64845c893f07b204eb57c0aa93248fa44cc45bb052813c60c4e637732a09978a796071beaab0770393cf80089778327e149262a1e39853e43e5a8e08ab9264a1a71a7cbec339249d03e04f5c7a7c63d4c31c58e43ac3c88a22e5c823320016b44a41a50a6747468a839b44cb839ef76679da3a109d995e183a4ed37675
d4a76706a82b2f49f3f5801742169a6b94768367bb7d03ea77127b6f80fd8a73dbb1b32a696429f2811504ab3ef4cc9cec8aa342939d81822f7a43b2310abb47109eef1298dd9c725bab97836b571ab8f115b85416187467069baf0a35b4763fa2546360caf16bccf6ce5c02d583956b45a7e684f672104fb774acac67ff826701b0abdbf4b0dc7114d7db4a921200211fc2eee24ac41b51ca895c25ff8b002238042068b99953fe582d7eb1d95c417c5f9861fdff70bff188da0e379f2cc
+Secret Key: 4ff3c0e9c1923614427a346a6a74aa9ddc9e82040518e8a6c73874ae7735cf45ca523922b687ba198472ef47b13b653cd927130c563aa3560f79a120e16bb84660ab633cb9f6d993820b7273b6656fac751e745411c894336841b8138241475594cacda98427a4d63104d415278740a299afa077ae1a1acb2efcb536070eab481318a19a890407827b9b00cb70cfd776cc4988022c30687b6ad3d5889ce277bdb8244db8122076a0bed75d5e187a6520c7b1192f9d3564cfe5acce6918e8fc1d7296309648b9b352a7fa17235658a0ea3b25a7666bebdcb3165a7ebd72c005a9b469b064233872047358fcd367070a446a630ee9631cabf72077701153b084a5746a9490b0232c1e11e90914c784bd068df5f05efb5a58ae8b59786039e27974a13c88ec998160035e25c8125c87be70dc609d08634982756c81558ce37cb2eb7d86d00187a24958f1085c71a34c354da49ba45d8696114235127300ebab3fa3a674f1f474d0fb1f9795118788a318f63c48e674b31298092825b9002e0edb2582c4a95d617b2dd44eb3e867b9e1a52fbc94dfb586f6eb1aa9c0835131bb1845a4a3d3089ab8ac7364bc13e2b895d1802e52b330f194c6a552b57c6d718a3b6b042c42a53c4cd784135418fda605a346a1919651d4d81a3917c792385b485b8856dbb74861878cc7344f48b2f7a403cda1894406680b9c9ed97c36bfe595b2aa63037c3e51dc252da5370eb3cc554a7c4e1c81f187c5c7d0c1dce22335b30ba9717377ca4b1e1c8fb13134d2b3104d875d38c02fd8a43c12e1598c111afb4a06aff91b7110a95537324b552794f4c84605c3df36a01419cb2779aa73d683fb83bb2250847e6c3951c600d42214ad54cd0137532a85679659a9cc1b0fae05680bc92d0cc52c76843f63d17aab27747973038e873f3006244b911699c13cc477be9c2c4319cb3cb46c2e48393d7691c5c345cb9cda3bfa887d0354a24bdc8881667dbf3889759c3379b011408185243b069ef6060a901c38971466400072c9860b3a3c4f71b133b445087301f1b22b2f73528889cdf9d46bd1905eb62a4343c3568178b9ee4849aa1277f5444397e63d49c3ce3d19782e73007e1686296406b38a3d502b3909058d15d6c58167b263408314450a1d474e6d507f82b3c346638165a0444df883da940be53acaf4a301e15619ed9c99ad7989456ca403a4a17ce29f30dc02f8e288573b4fbcd23444e1b574d8307c324fb4269b5d43aaba0b27b8897df2864f42a63a80c0a0359ab8900c42f6bab36645b3c30b0cfd5b8fef65ca14466d7a43ac65ac871f6968dc21c6d7807284d203dfb1c3493b779b910f8ea147215b0c5856234b980bf42b3c5cb255e7151caa7456657561251216407c3e2fc96ba
730874685769501c40f6163406a9d20da032d66691855bc53dac32e350700cc2656cc39bc2ac9be8a40486b0e7fe1b04258b2656cc50b5896ee54b9869650570170c22bce4cf507b28ccb42a8cc21267be200c719c49c3230bf9bc0724861ad11db8be8b4972df989e2a1c2dd83c5e411a40e1480e96ba7b69b0f0edb5ece349a39b63247fc8df62203bb14c07a9597237ac1ce864e2758862153374ff65d62d17fb6c88480031680fb60047892396672cda6ca76c97ef633408319059e292189910b64fc6078e514642970f1a5187df5522fa585f0e6673c53a8076065f2e1a3c4b761a037064446cedfca6cb8375de0185eb472b34d14a0aa7128ed39b0960861630975c3c038ded102f658109224ab58333db4a556a3d15a2ae2a1657c8c82b1b260b6a798715fd6dc21854c86045890a755058c003d67c077c248c5fbc37863a56ee3808acd8879f06319daa8bece7b07a6323daeac275e193194528db3115a5e47950d23ad1be76e31048461783f477900c8882793c28ecf8358e8e8016d0156d91790643a14dcd17ba77b3413d51d40684018d63137f61f1262c18db534ed460cf5f2484643232887c67c23cd9f7ac9557420b3f6c8f1b64c3bb423c46b7c8272c05cd79113466bc664123d94853350bd7ceca1130238b5b3070f9563049ba4c6628fe012243a384f5aa1455a12ab8dd322186121b3e2450be97405a20b54a193353baa9e11cb8d15cfbbf31945564fa6180ffe100e1afc6572956b091a46b08c180ff91dd9592684535c8e3433f2e367a5e6c1ce622e8c3033e7d79f3a0c7955945871d8a75d3c81438b40a4c790e4ca39ebc6509f9472155acc89795b386007912685d0dc0cf66cbe9a6478cc674a1061941233c4523b9e54b54dc889334b3a7757e043207b644c079cb8f18f6361ae3db78a29d445ac36ae7147a5287b24c70bad50006f73f5ab53e03266a2913e016277a89133224060e80cfb56925f234432fa81fa20c22caa6255c3a15777b6403a6ed0245d32287c97827362c489f118c7dce97bcb9a8c0fc7b664050a0e8c28ebea3855b5b11ad426026cc3295c2c6cb27315950ea56a8023277534ea5fd046c84f358677c3b9a967c8cb8b786399c0d4117f12ca2cef82ad8fb09ce7809c0bf8ad001512c4da16f86048b8120893b042f5a147880a7a65eb8be8fa97aa87aa920260be54bb1b0121f58124e8dc1960e566e3d73bdf303235b74fa8362dbbe7be50b5296575c6536690304cc8a8813b7f0b039ae13d9925a66af067f0d5a8f7d8c00adaafce3477f079317ccca6c25bcdbaa1c0ebaa27ea34c3d0e3276a42bebde3185604b290446cd74b6d6638c6a1a979c6843b583a7334660092f525fd7a6d87905bcebc1a28faa840a38fa2d667c5e75c416
a559ce190007b74b7e0ac84d2c1dc385f6d74a551e01caeba1e9ed64845c893f07b204eb57c0aa93248fa44cc45bb052813c60c4e637732a09978a796071beaab0770393cf80089778327e149262a1e39853e43e5a8e08ab9264a1a71a7cbec339249d03e04f5c7a7c63d4c31c58e43ac3c88a22e5c823320016b44a41a50a6747468a839b44cb839ef76679da3a109d995e183a4ed37675d4a76706a82b2f49f3f5801742169a6b94768367bb7d03ea77127b6f80fd8a73dbb1b32a696429f2811504ab3ef4cc9cec8aa342939d81822f7a43b2310abb47109eef1298dd9c725bab97836b571ab8f115b85416187467069baf0a35b4763fa2546360caf16bccf6ce5c02d583956b45a7e684f672104fb774acac67ff826701b0abdbf4b0dc7114d7db4a921200211fc2eee24ac41b51ca895c25ff8b002238042068b99953fe582d7eb1d95c417c5f9861fdff70bff188da0e379f2cc0ce81e947f18f3c1271a374381506186c121fe5c8dfa3e3fbd4f3dbad5441a5f96cefab1f18f92ae2b79365c284ca5f3a6db2d987fac3e983bc8ee0c06c59b55
+Ciphertext: 4a57d84735ad02a0bb66f6238c1673ff74000c7647c2432784471e274667eb5a1c619dc1a6f2b52a3ade658707dafaec0da91f468776f1f9e2f1af17d25c7d2b06bbd25b321d57afb14b1b66487f2e7d649780c1d6297538e8cf6e2bdef271214d0b9df47f13d3deb1aa82512ac82bd2277c37d8a1710bec38cef32ae4402abdfbdbe73062d8d05ccec9b8d30fe3311aa27b42c9db1418c3fbfe630a66fda9e66673ebba772ec47626e29ae21383b34276ef3e88ee78aaf90f8239457be4682b50f3a3fed7b9b9f67e06fa3c35870a35422954b8975c3a05b752a5fbb936d053f6e3dc39fc74e89f1ab84967e253cd550c3b614209a3ce49a7277b181cbfab1043dd8e06d2678f77b502b44539f06672b9e8cae517c10579b81d93a2e50c5ebc8c772b55fff8830c54b375bbedd6d94eb540300dbb9efe74b02bc57d7d8a9ae2c37a63830629290da2aff4f57a2fa57ac78120788de2975904f46fdf0acb9d61c72169fbf597ac9022c65b354d53d983342563e7a7ba47459188f275a0fa7e6cb41ceacb34e93ea888e60668b587ef538b5d17d7858d2d499686c9be791af3837c615c19054990109edd1e1a454634957bbc594fb6b774037cf383e69e226c57439fda9dd2d9c785ccc19a5c876cc5f5ce420540f50a31a4e8675ea02c4099731016c8ad0495f836a96eebbb3977b062ff96f83001ced29efd835370b7923766606dba4de9a7de787edc9274f77ee2a048138486e5569744f544f1e66e85aa9578c0045adb544ad5b3eb4cf5660874eca8b5f5b16ec47313f722faa5e4f1b1f9670a335a69b136e21303025d565969fa241f1f2efc75d25b0ecbac4034fa60c9cbd4f9636b943fd0e1c58c58bc4797b1690420f0a2385c6525d737a117d84b9901e34906f91bc8185693821735c7b60ac582b702c9542e68a1e3699e43169f6fe7a1bb9e07d554eed9a3a2bd851015a1d983e9d4c2bbc7d6cf1083e3c6841140aded7ca98f10da28f9bdfe91bd5b0baafba1145d9b45efbf22aa7b5bcc1eafba49d6032140295d7c8f7114d2f3175a261f41339a2a1f321c63683a629772e86c589614a1e5ffa65127bb84360769a0d3ec3914ca96d199efcf717f84952272e65ac2ba2cb3f4a123b551af06bc0fd4f94130d89c7d75346c9a20332d00d9a9f4cc0994f0c10a5473c34970dea614ee3a634c3afcc1dc94ec596a23a74d0b7d06d95a207349b5a2abf5ca21ae17f675b5091f83f61ff2cb7bdbb50ae9959de50385405bbf7085c353775afb4b1416a19853fe4affe327c7c14a7ac3223a69153c57782ab04a307dda70e22c976e00514acf7625595d98f6540132aa4c000401f5f883ff1cae50c4db4b80c4e0b26951adaa9870e16e370bdef8509a3510b590a5a93
4c2f17d309e0f05938cb7dcd7946d86bc719dd01cc159c55d9201b2d0c69e6a9ff4ec72d4f847a0d053809fe22cd438cc1a3b3a79e9c8633a8ecb6e7c311406ffa3ccb1599ecbb4d14e5022d9e63284401da695814fe63fbf1bb368849237
+Shared Secret B: ffa7b27111745eb0310da0a0044a201b1795fd28bb8bbc90141034a505bb7666
+Shared Secret A: ffa7b27111745eb0310da0a0044a201b1795fd28bb8bbc90141034a505bb7666
+Pseudorandom shared Secret A: 7a99648b5cfc933dd49f5c1fc52da8e548bbffc88292d0035df2496bfd84d9ad
+Public Key: 478b6c5479bad596748527184b2c5100d91d85fb1696459d9ce4bb2b6406f20a0547014ca9ca6decaa3605869426a756ca12b1173ac73e34a23e32af7b383eb1283c8e05245248911e97af22e33dc52b286e26bc726710ad815c78d28d14e34160f931a44173a5a4303aaa4a83ca47a3c118026760e381b6cac8404fe537950b920f04329f299251583da0a06dc053109bfa61e4e0953ac10e5a048f9d668062436e735782d8c496ef32ad20d99aaf749ba7e63b09ccbf39621cd27373dbfa74b2fa967b2710c813288ef94133f07cda635b27a53e93a0649aa8aae25c3cd7f378008b07f9c7a7b212a9dd46be26125190e712dfa202bce6a95ff944de950a64a82ea62a19ba2ac3c359436532a3996298b6664996446f816204855c23da1b0e91cb0587823bab961d1e5269518a942bb2a2821c84be79b0813c8597200f1e8311e133588dc742cf91312ad7b0fd5289cc2a970ee58b29a3c5b1074a078ccdf2696181352fb09c3262811f23e610e4c15c2cec640845c893b40d7b2bb44c71c5092a25f5d18b5ad963e60a5767222e5da04782a50da268c3c95c54a4854a10676a5ff78721bca09d8567a06033d6a49b5a5b28c9776962bc25d45b0fb9097e07a55e52a6b134815b6cab212840432b73755f579f6cb49349399a3895222895c7aba1721114741ca302ee9cb3a1acc83d5720ba6c40486497a4a408a5eb4f466c0c5fb0151654bfa961ccc2b93e4a9a4b60db0ceeda46d0bc9844193176faa5c52111a94ab1ff4549a21aa122ba4823a1641abb4ee75011bc2177a4bb9d1e3c918618b8ac21cdc7ea32f7f16fed5a36e716453de9657441654e8414e4a2911e053f8b951a77569fff349bf29320bcf9c7768a4293b7346539c2fbf90c360a101df2b9c1c938b3b18b72d9746b241ee6a48e02c0c26067b4e2a543294271956cbdb449963bab4c6362c903b6cd04ca4e5155321dc58bc7d91f1cabb7c6a051b2a4917c2a66844169a15924f178cf817c7665852c38877e0fe5c84466a101c1035c1a43e7e9c1e17c31d26a484344bd99683920502f3d937ccad6c4d535360a09457065bb9140194a9725850c87fdfb2696a8867e33be1506093578038b873788120a321078b4f0596849b142d62c8ef1081fe4547e7865dacc006d310ac56bc3da6258fe469e10d5c5bef0ce9b08cf4be51cf3d74631e3a174918b5b75c8d10429f22bbabfe62bca64411703ba61a40c70b982f17898298800c0cb5537c37e6f9a16aa064264b29703d599608254922ca8a2e42af0688c817b4259e00853c103bd0b1051333a0041c6c9179e29d8427dfa036e77c26037097d25b221b5c2f38a1dc48c20a0e3386c5662012645042492ce6646e2d82c5245a910e824a963a8aaf09fd
7a9336f6381d332269c6431130576a2a093e1124c9356a97362543271432329cb2488ae22aa4add5b58a4016c12213f6fa000142bc40a64a55c37ad13d7c549fb9add1b8408ca8beae4aaab0740ca16189ef597f6230e64d47026c75a79fa5a8880b791a589e54b2db3aab762f37dae5a1ac5995b23e645b7d01a16139538b01bcc9a0aea3a15f3c9c95aa25a1801842d54518c98a0b10856fca7706e350380a5e63312300361637585654c1a404f8c942452276dd52335eed6e164455099
+Secret Key: de62b1f32294f3451d27797e04065569acc68a15048a415937b95a2bab4b117c9c01689586083b4d55910a237eae56862a204cf8f220c28408c7097c65690fd4e985c743a05076478b6960ad40923a165e5bb727839b528582cfb02ab68ef15dcb3b00a4e632966122bc11a59eb3978f049c0ffb05a9d311f60c7c424a067d26a993da9dab52314c1048b729a7992b97b1671e554381d6105c0e4aced9770e4cb30fb4f2a145c34f446a365e4363c4f2811dcb63d8f445e05a497fa256514b0a9d7799945382e268274cc29a8e278d99a9a4cfa5425df945a0a5b1521663eba2037c66ad0f8c48d112a082da6d6bb81f10152c59016188ba4ace7b259bb58e4db47bfb58b00560aeb7d92498d0712f1c35be228caeecc3bd70a8c49cc7ae410f1bf0885c0a5e6cc4a16c621736a08a0194aa3958ccb6dba5fefc14c4e277238541be98aff7a0b7717193c7701c2d402c9fdba64c816483145f2a459eca7b7a03e921231a1b3b3a9b45905dd7f057e52288a46b54829715c46439ae2c9c95b3cdd227ccbd439311338b43c4be54317633c351c07aa64c05b8a5e16356383cb7b96f11630823085433b1132d87a8d3d18ce289556f00907e334a50b683fb9bb69d6b1a81c9c86a93650cd281e9b0ae1ac643bb1177f8fc54d451a314d9ca23d240cc075d9aa53ea0922800e6c849b9ac937076c7b8c9968467ec313faf40be4b9843d4f71780787dea0b5e8690b9e8e1a6d75b14ebb754926682a560b662e123c49502bcc0a62358b7ace083b449a153a2463873797c16b26ce82b6806caf40620b1f283dc499d69f9a79dc8af227c5bf36c9433216feb0351822237c75a52cdd95cf35871ac0672359cc016d09388961d5cda509fe8a89c9858789501ab73c8bc08877d35b444389f6648b65bdcc3affaa3eb2583c01880ee488538aa275d3cc226c732407439ba3b75f2023b6ad7cde6324bcdf32c314c01efa316ece394797a157db5c59325cefbc007cf54990d1c0a1e966d6da83f90299a2b10af3abab6bffc2151f56b0499276a565d27b8645a5ab556375452c5240b6427a0f652b3865073fb91e3931a33e63f66463fdf582614b67815060bd58939b2521cccd566971c2d5668bd9d5c98fc6aa0609680f318625f934b9e6c5ac686b7ad45435547cfdfb9c2baf43c9ff79298e74d5f02199bbc8957bab585888f611a5406709c3dd869e43145641b1609664d20227ae9f93496c1bfb1987ac6653abe420378583f4ba8327198915bd174fe442a257bb5503396da99cdfaaa6abcda334b0c287b3502d9f72646e45d4264153bc5b698f0b7be6787827a11e5f93ad00506788c0411222ccdaca64d74c5fd81b5a9507f5ccccdfb9cc46f067db6e496b29b218446ba36881f0
38c4cfd881d546836dde426191a3217198dfcd36b9612c83fe4c974797406d9b5827289dff180b5e9bb0fa6b8e45646558559cfb57f6d1321c82a4ca4eac43e2a93d3e4ad709c98abeb0402b9527ce9cf23b8a701d282a83549432825746a9b810593ec294332d11a1b211928bb9d38709bee45b006d6aa43c611938196f83877d82a5533bcbfa8470380eb10ed54644ff07c87583d80976f988b9375a843478b6c5479bad596748527184b2c5100d91d85fb1696459d9ce4bb2b6406f20a0547014ca9ca6decaa3605869426a756ca12b1173ac73e34a23e32af7b383eb1283c8e05245248911e97af22e33dc52b286e26bc726710ad815c78d28d14e34160f931a44173a5a4303aaa4a83ca47a3c118026760e381b6cac8404fe537950b920f04329f299251583da0a06dc053109bfa61e4e0953ac10e5a048f9d668062436e735782d8c496ef32ad20d99aaf749ba7e63b09ccbf39621cd27373dbfa74b2fa967b2710c813288ef94133f07cda635b27a53e93a0649aa8aae25c3cd7f378008b07f9c7a7b212a9dd46be26125190e712dfa202bce6a95ff944de950a64a82ea62a19ba2ac3c359436532a3996298b6664996446f816204855c23da1b0e91cb0587823bab961d1e5269518a942bb2a2821c84be79b0813c8597200f1e8311e133588dc742cf91312ad7b0fd5289cc2a970ee58b29a3c5b1074a078ccdf2696181352fb09c3262811f23e610e4c15c2cec640845c893b40d7b2bb44c71c5092a25f5d18b5ad963e60a5767222e5da04782a50da268c3c95c54a4854a10676a5ff78721bca09d8567a06033d6a49b5a5b28c9776962bc25d45b0fb9097e07a55e52a6b134815b6cab212840432b73755f579f6cb49349399a3895222895c7aba1721114741ca302ee9cb3a1acc83d5720ba6c40486497a4a408a5eb4f466c0c5fb0151654bfa961ccc2b93e4a9a4b60db0ceeda46d0bc9844193176faa5c52111a94ab1ff4549a21aa122ba4823a1641abb4ee75011bc2177a4bb9d1e3c918618b8ac21cdc7ea32f7f16fed5a36e716453de9657441654e8414e4a2911e053f8b951a77569fff349bf29320bcf9c7768a4293b7346539c2fbf90c360a101df2b9c1c938b3b18b72d9746b241ee6a48e02c0c26067b4e2a543294271956cbdb449963bab4c6362c903b6cd04ca4e5155321dc58bc7d91f1cabb7c6a051b2a4917c2a66844169a15924f178cf817c7665852c38877e0fe5c84466a101c1035c1a43e7e9c1e17c31d26a484344bd99683920502f3d937ccad6c4d535360a09457065bb9140194a9725850c87fdfb2696a8867e33be1506093578038b873788120a321078b4f0596849b142d62c8ef1081fe4547e7865dacc006d310ac56bc3da6258fe469e10d5c5bef
0ce9b08cf4be51cf3d74631e3a174918b5b75c8d10429f22bbabfe62bca64411703ba61a40c70b982f17898298800c0cb5537c37e6f9a16aa064264b29703d599608254922ca8a2e42af0688c817b4259e00853c103bd0b1051333a0041c6c9179e29d8427dfa036e77c26037097d25b221b5c2f38a1dc48c20a0e3386c5662012645042492ce6646e2d82c5245a910e824a963a8aaf09fd7a9336f6381d332269c6431130576a2a093e1124c9356a97362543271432329cb2488ae22aa4add5b58a4016c12213f6fa000142bc40a64a55c37ad13d7c549fb9add1b8408ca8beae4aaab0740ca16189ef597f6230e64d47026c75a79fa5a8880b791a589e54b2db3aab762f37dae5a1ac5995b23e645b7d01a16139538b01bcc9a0aea3a15f3c9c95aa25a1801842d54518c98a0b10856fca7706e350380a5e63312300361637585654c1a404f8c942452276dd52335eed6e164455099b21205bfd227a507256c07570aafea393131603c882b7948d24b7cc922309edf538cf628c175047f13a1690ef6f389506bfbf7c0260fc69afe912d45c70f4b46
+Ciphertext: 89f1bfc3a367939412ec982800cce48c2880ac83ca5a0a2b973ed2ba7d29f9cfc6536475997467825f51091b68ce82de2c05fea92815b709667738196734434d3acb97c4fd56d2abb08b9f018553ad7e3740179584ad5bee78fe31d5725889d79856a4d757afbae6ff701a6c8b089f4d0e82db15bdc553c53ac5cd8d095afb9c3979e2807378264fc2968338ebfc51c42dea3dbf1b480fbb7f0ae2d55308b2d9678bba2ef72fed0685a877a90891e4a573c70becdd84a29877ec7bf8c738fa4a603f00abc6ec4f096f391e3f3abfe80481adf63fa884975a6e4a66c22f8bd0a70362886f7c1fbc5d07108e57d911176937403d6d4308393f489d664837352f397675b397a1d97b3f4a376f78e2ad682e94463c4366effcecec2cd690443aa654931ad3e36406e198b5963389e86ddece6930ab10abc42e54f5db26cd09e9865a316d8fd99fd30330fac9d87dfb5d4ab21bae7248235a192c6ffa8066b3b593f0544da7508b2ae01b9831ff24b38d92e85e4880b56225ac8f22b0bcef9c3ca36560484bbecd7b7bb9d3c839dbcf810986619b683041a8dd83066b941d4fa7b35313523eab0d4f0394e57caf9dbd3e7781a675813322cd7624b75642da49f12798f55a1ec79bdff561209350c3a4c286de41a9c09cdfab6a060cdc974e09208525ba1fbbe4af078ca1a1b863fbc4d9bdf839e13bfdcc95817f8a4ddb02b8b37c2aaf933db47390968e3391553411e3f90d950ec491cf599146db0b78d3c6275531db194e7b82fa7e5afc110819ea3e6cb001940fb7f6a0b722b03cf246fbcadcf39da58ad619b2e9a2aad56b177f85d1a3cee3179ab3f038150d6c6bc0374d103aab46ad2ce2725330f2bb90d32fcfd9f0c45920d78262aa1ece6af74ad3e5e08e79e62fa85b250973464accbc602797366b7b6e055ee66c4903191b755777a97716e5cbd2195667f7f5d45f1a47dc08b25bee8175539164bde35f6555f8451363745674cf227f406f0da08eae61cbb9b1ebe762c61a672f347cbaa9530873718b6ea3a65eba3da781d13d05f80680154ccd138484f867b505d124fd621dbdcf0b9dce57c33a998f6007ee2c41c738851132dd4886d1dc0072c0c33b19e17021c37a0d3fe987e6a2847d68e6d4c6fc47c00eae04976cedfc0b39c2ed69f0ae361ca58b329dbb69e2b8925b7e08719bb356b127dc0d2994e8699c286e0418495666573815b5d046be9aefe4300f985e8972fff2855763b0deb7b3f1d8f1bc07d82db79a3b416b134b3b8b912d0153f67250da098ee0d27defc83fa9131ec1289ed29f4439a5db57a5646825eb066026116096c839fe8eb33cdc5db2521f25b90d332f0a051292da8866539f1d2d11745737197c7b3e18774711d5f7287f8adef54a347
cd6b0bfeec98613a6bdcee0c67517487d545cb4876224353099698fe7ce94b5a8f42325a30c34ab340089883feaa26ec18501f292485d2596b172a961254afd25693e024af3c33c6148bc409ac5dd811b5f4305ce61ba47fc0aeb995b07fd
+Shared Secret B: 1df23d827fd1b2f1bc71a50cba8d823561687a4c3413161b459e7768dbe53a4f
+Shared Secret A: 1df23d827fd1b2f1bc71a50cba8d823561687a4c3413161b459e7768dbe53a4f
+Pseudorandom shared Secret A: 0de26918e3f574e6785aae1867b8432bed1047184b08d068e36212a1b1048d66
+#
+[Kyber-1024]
+#
+Public Key: 780c725a25151f1b09e9c71d26975dd65220e059ca526b6e6f874aa5e8b62297ce0f037da312a86a05131f2923d49928b7450ecbe996d8aa21ae6a25a9e44d4b8c991bccc59d6313a61c652bf904790a063cb84b1d870b11828c6c8c7df7632dcf717d5ae33e4d487a25637e4d030e0a76b21a2142eac1024fc5a4603ca08d1081c954c8a9368e7011169a00a4994a56f9fbaecccb022775c9a2679dca6453eb7b90db61354980c03cf17a7405140497b23aa996d637a4cc027c5e3911de634e3ea65ab49455efc253261ab85efc5a55ba703a8a0c1c789d94bcadec88b319bb6918354eb33498d6680d2197206c35820b7b193952b4c6e81d7222844ba31bf446a93e817be86b7b406165ad9b45bff86304d3546092cdd1113e49a1be90e388d0a0155de603b906ccb8870ae162b231fa3ee4f20163cb0322d23a2b217fccfacf4a6324eee4b853b42209e44786ccae52571282d238a85b838d4787a1852cc28561384b201b13a2df710c7c4c3aaaeb7fded4620afb43fcc8167914c7d9a3176222007975c4b378ce7cd47135c072ddf03cc69b7bbcc3bbc737433ee05090c324835a5c5b9162b8cb2bc8c409b0682322f45d36626fbc58c0ee34054343507b20b79b2a00c929213487bd84072303023f65b8c68f3c5c6ffca1f7a77a3d1a0da8a237a30518ecc8a5b73c20c8354fe0167bd39c64f0720503f711fe1839445768504548b3b87a0deab387507dd80a4cce69994a652a362366be86a2f75182f278796f672525e23c83271be4265cd6288035b49e365224ed1405b1214f45db48dd28c8f3138224ea177dcc56720bbf754c2d5304148a95c4e724760e0166b9104bb0970e90f20f32aa75c838c6f29321bf161a0cb518903ba3b53a9caf5234d2b24ecef2cfb560201f738908099ec4ca90bf2551c7449215b32b8816bedbf69d816b583854ce85dc729e820accf15efe9c7fad724a654338d3a36eb1a116c20aad05046d2bfa4481d0a5e1c29315e94d85e16d344892c73a66a0d10e924c2e3c650648da8751465aa5913a92884fd3873491db5b9e108360cc8d33e08e508375ad36a90df865cdb11a617cb82bb0940b6063331200c0185f7ff8515d554bd9c71a3b97706268a14c60382dd169f7762969681d0d759c497b86baca5082a31d3db4978887172bab9573fab18b4198cb71cfc25c9396a0976863348d13287f36bb016a7cc031169f629eb3d447521b7d9b3b22e68ab8f4089077e35c874843bc55556f2508502c8aeaa931c9d3519d1355bce3ad87d719a3030c5441b498e4779261ca64fa24e4fb04fb98751c8848fe847a84704333700516a349f00ccf90506e725a5dd9e46d9bd83f8a2b8c89f957a316a9f6bc21a0e69c1ea42f975756580a8b5
c378b8332127f6a5d94b639ead24fa0c80b0ff1bde9294111c1c0eab00f796a9d6882164d246de3f91c83583e551970e633cd3fd00fe2624ff0bb9e2d461e0aa760e8f7b9935b7fadda8e14a2024238578b34b0ef7c9f8178ba90b4cc73d90fdcf2b65102a7f6abcba83492f109500c4b166538099045c5d50bd0c7d73e9046317ea19819b34002cc0ec6691f36f064a6880cfbe450887b38498932402561222325d1e808e804aa544444407c944461b326f78f66399d920b2271f357299472e0f41ad7c727722413095978afb80e889b2d52873ce1747b98f29b75b103838b84ebaa979bab1cbb8317a5d36251073ab5ba0047b9bddd75b38bb01459d6210eb613ee260425045e2ebaadb95884c5a20cb6fa73a6f4280ae51ff9dc5b8492c21af36fbd49a70b06a698aa032763464886aeac2170fd4bb8080355c6a5abfa8345aeb72e0df67ce01835b294408231b7ae4581a5d82b58b631d041939d04b31ff0add36a4f00a7adde9b27f6f1c154400a2f8854865c5005fc2158226cc263532005c9b7165bf52c72ec372ebc15bb5621003d16791f6671f4f9cbc463171e101009e4bbc5d1aa54634e73c77b00684209d49c3a616969231dec430e5989aa6800185ab7b6e14080740617456ab7b9dbce4ac1017d2c6f823cce5df3a0bc3585cbc97d771437d01b485fa40ea04871ba59b89b22b59de2ae7c9648ef3206dbdb9502856868635a224239a1db01e9076e15882baa050943921d241297f172694cfa93f56ee922c7d660937b5937c3074d62968f006d1211c60296685953e5de
+Secret Key: 24c59d1c7603e7b74bc7aa1bc2cb3a214b3cfaebb63bd85b65408427c498ba394371bb271f92a3b506b81d54a95a7c0ddfbaa1519553d6f3cd5a601b7db6b0e91a5149468f1f68ad26478bf3c6670e093ac4c49e7a90ba46595de94c50e04129a811a841b39534a87f0ae7b1116553e20c9a566b9b8ff7c7e728b8b201893403a4f252a55230874c256b897834cda349807b25cbd75a30867bfb80328200017f1cb70b56cc546b65d3dc9cdb45107cf10dba349619043ac35c0b9546309a239039813ed5c40f353a5e8e42193564496112bda56cb38c081df252ae9c2c7e441a062e92a7c8da7a240c9952d86b5f1bb6a53b38a5ac0a54a84b43f12da1d0525655684a12090b60b28b0c628db092015547d1070af5d6192e639636615d03c654bb90008ca15b784119f6178a00d7bef4a54a274ac922e55c61a3a8840aa258639484a3bce2e43b6c969b11275631daa129a61ea0e2939f0877e1a110c8a44b24c54fbb07a958db9feeca1eb52b086c87bf43a9b02a5b2c4762117c3a99ae4c4e2eaa7a33b9a714737215c10317514f6c4299ef92acd64c4858e85ce737a801890022d7381f3540230c0c8ef50a848a28b09ba0bf8b50619c905751601d7629767449c9c0b2bae321f438a77f412a55e45ecab4b39053c6561801c639be6495be8fa144ef6029af663407ca9181946de5f3aec7236343ab3bc5a38a09c01b412baf0afb23f9e9b8f2b40810f2ce4ffbcdbfd87972323e98065160bcba34b3afd6c25b664745fca99a9ea75cef019d768485ec23336d9b39e4d05d8d587b30633d4f69ade5753a39680235e44f27995da96798f3a85e184a9fad19320829629f4140417bb7dbf5851ab79258134146d088452774991a087a1c2beaea89f218087ba774ae253b494c27750b1de04b44d953c5e47ab10f65205ee212f9c30391e5299553954916873a0b41164543e801c0b099cb44f48995675823c10b40f4bbac9177a558ca0c30765c2aabfd6a4da54c8413e33902d63f064330f0464982429de2604cd03b4de84a9f821a5470423a40a964dcc41863363d77b02c3127304f942ee71c98c643a427533ef300104948b825277953aaabfd855588f75a77d199a213ad348116e9e539f6d37068a551c710548b7a2c7ee95f9cd9b3483332673cc44bcb18a778a49455c768e0b340f81102ac6b76b064057151ef101ae143787f548553558df8035a3ce00c9c43cda43142cca39034b09a7e6089867b4c64980a69ecab2e6818724c35cb909d5d45bc6a349c71b306567664adc0cc8ef698049b4b4b432dd0f69fac07580f77c4f79b22bb90cb97b341880716853431694c9120f6724ad58d57127fced999ff6229a5d4c3c240129cc812acc73698f949d8e73661f2528
262bfccfa5cdf5a2104649806e295ea161217083365aa26cee6ae2f1356e8e1c5cefcc85703447ef1160a1b4a0e8c017b173802c66c88ab70d39a6c96c1569d5a86245a7eeb087d682219080768745b44bf244f65b567b2658dbae6962ba52b322118e214cfadd7cf3502582dc9cafba952a9637ad3600710259778d99d23f8235da90791604b4f0a4f7640680f59b633d93dfb84282ba54c674b115684a4ac1461a4c880e305116c5354ffdbbaebe0c067b63240b96c47f27e9c8129c6183ad904785c486999b7aa8c1c8029153ae73bae6997558990998b809a25f86092b1267e70983aa7cd0fe8c19091c0b780c95f2632f5eaaf49fb4abd60c976942b4e0cd0ef17711ac5388beab01da0883b8c4c9a286701e36e54c46aeee0847a2308909b8ddb3c7b37b9ceb0334bbd3b4000db0980b81cfe7857dd543b09d71c98c0aa12d658ddd93948bb7f4c874e449ba6ee0c4fc94a70ee353580b19a04693cc6f480abf96db4305ecc031906c9b4cc167c9c932ad5203ddf143111e6463739c337a2426a01c88340785638b05df5b81084885c8103289142e4bc6bb89598f85b852ca72594e732010860bca635418984fe8631b6d5b514f38128800f7feac7effb1e072a4d0ce336ec9769690c80c9579e1ff3893daa1c5b11552aec0ca3b37ab3927674778b4b3367542926d70bbba796b409330f06acc913b04b45c81ed5dc8abc2629eef92c712c0b9b22a6f8252e7ca148c955cfab8017b9077fdd18a2780c725a25151f1b09e9c71d26975dd65220e059ca526b6e6f874aa5e8b62297ce0f037da312a86a05131f2923d49928b7450ecbe996d8aa21ae6a25a9e44d4b8c991bccc59d6313a61c652bf904790a063cb84b1d870b11828c6c8c7df7632dcf717d5ae33e4d487a25637e4d030e0a76b21a2142eac1024fc5a4603ca08d1081c954c8a9368e7011169a00a4994a56f9fbaecccb022775c9a2679dca6453eb7b90db61354980c03cf17a7405140497b23aa996d637a4cc027c5e3911de634e3ea65ab49455efc253261ab85efc5a55ba703a8a0c1c789d94bcadec88b319bb6918354eb33498d6680d2197206c35820b7b193952b4c6e81d7222844ba31bf446a93e817be86b7b406165ad9b45bff86304d3546092cdd1113e49a1be90e388d0a0155de603b906ccb8870ae162b231fa3ee4f20163cb0322d23a2b217fccfacf4a6324eee4b853b42209e44786ccae52571282d238a85b838d4787a1852cc28561384b201b13a2df710c7c4c3aaaeb7fded4620afb43fcc8167914c7d9a3176222007975c4b378ce7cd47135c072ddf03cc69b7bbcc3bbc737433ee05090c324835a5c5b9162b8cb2bc8c409b0682322f45d36626fbc58c0ee34054343507b20b79b2a00c92921348
7bd84072303023f65b8c68f3c5c6ffca1f7a77a3d1a0da8a237a30518ecc8a5b73c20c8354fe0167bd39c64f0720503f711fe1839445768504548b3b87a0deab387507dd80a4cce69994a652a362366be86a2f75182f278796f672525e23c83271be4265cd6288035b49e365224ed1405b1214f45db48dd28c8f3138224ea177dcc56720bbf754c2d5304148a95c4e724760e0166b9104bb0970e90f20f32aa75c838c6f29321bf161a0cb518903ba3b53a9caf5234d2b24ecef2cfb560201f738908099ec4ca90bf2551c7449215b32b8816bedbf69d816b583854ce85dc729e820accf15efe9c7fad724a654338d3a36eb1a116c20aad05046d2bfa4481d0a5e1c29315e94d85e16d344892c73a66a0d10e924c2e3c650648da8751465aa5913a92884fd3873491db5b9e108360cc8d33e08e508375ad36a90df865cdb11a617cb82bb0940b6063331200c0185f7ff8515d554bd9c71a3b97706268a14c60382dd169f7762969681d0d759c497b86baca5082a31d3db4978887172bab9573fab18b4198cb71cfc25c9396a0976863348d13287f36bb016a7cc031169f629eb3d447521b7d9b3b22e68ab8f4089077e35c874843bc55556f2508502c8aeaa931c9d3519d1355bce3ad87d719a3030c5441b498e4779261ca64fa24e4fb04fb98751c8848fe847a84704333700516a349f00ccf90506e725a5dd9e46d9bd83f8a2b8c89f957a316a9f6bc21a0e69c1ea42f975756580a8b5c378b8332127f6a5d94b639ead24fa0c80b0ff1bde9294111c1c0eab00f796a9d6882164d246de3f91c83583e551970e633cd3fd00fe2624ff0bb9e2d461e0aa760e8f7b9935b7fadda8e14a2024238578b34b0ef7c9f8178ba90b4cc73d90fdcf2b65102a7f6abcba83492f109500c4b166538099045c5d50bd0c7d73e9046317ea19819b34002cc0ec6691f36f064a6880cfbe450887b38498932402561222325d1e808e804aa544444407c944461b326f78f66399d920b2271f357299472e0f41ad7c727722413095978afb80e889b2d52873ce1747b98f29b75b103838b84ebaa979bab1cbb8317a5d36251073ab5ba0047b9bddd75b38bb01459d6210eb613ee260425045e2ebaadb95884c5a20cb6fa73a6f4280ae51ff9dc5b8492c21af36fbd49a70b06a698aa032763464886aeac2170fd4bb8080355c6a5abfa8345aeb72e0df67ce01835b294408231b7ae4581a5d82b58b631d041939d04b31ff0add36a4f00a7adde9b27f6f1c154400a2f8854865c5005fc2158226cc263532005c9b7165bf52c72ec372ebc15bb5621003d16791f6671f4f9cbc463171e101009e4bbc5d1aa54634e73c77b00684209d49c3a616969231dec430e5989aa6800185ab7b6e14080740617456ab7b9dbc
e4ac1017d2c6f823cce5df3a0bc3585cbc97d771437d01b485fa40ea04871ba59b89b22b59de2ae7c9648ef3206dbdb9502856868635a224239a1db01e9076e15882baa050943921d241297f172694cfa93f56ee922c7d660937b5937c3074d62968f006d1211c60296685953e5dee3f8c83144b3bac8b5a5c98197832b44c8912900853f84b20f8af5fa6fd852543cb1eea988004b93103cfb0aeefd2a686e01fa4a58e8a3639ca8a1e3f9ae57e2
+Ciphertext: 590fba1aec97703d04b4529c082832105115ad6770db68ec94e6bd35c355e4ffa19d1a7b22e616dce1c5401cc03bf717dbaa2997afd8c9d4bf57f0490156d8e78fe419a0e6d3b560ea42e43eef45ef59c1e274b865bab45a5573329fd51f7ac10dfb024e8bfa48258fa7ad9dbd033afbebbf23f4ad40019bfa4d7d753214a55af78c63129f005deebf1281bf53b6eb357a1a83f84ff4cade284cf14159b9000646c23ea74da2e386033f8534eb6d2b49e5f6115847786eb68c4a2d185b69cdb05a88e20a0f8e50a577de6a9364c7952ab9cd11c95ae164c833df4ba5e6e5f91b18a7a1dde540e8343b4144c902cc34b1a088e261d122512236373d164f571d261dbe7ba1f3657fe7c806e04a19ae8c1894d35bfd67ccfc4cd5dac38a40377edab55c5463d88908e425c8957eaa022141a3fb424e673f20d739f2c28e987f519fd78af78af3ed2c3016ff56760eb84d9deabb4bd0e88e15b2b2fe2e9ebb7a6d767c61fcf28c6bf53f7f341ed501f4d2aa7131051cc77e90d92463618fa1b3759d1acef040003b6cf70426df39a427833e14614ea328c881d5f2ab7463151e85d16899c19e423bba6744d48f3937b052df92f89d90bb6569dc7d0d89ca1e76574fa6eca015c74f26b20d73f3f16f952e90ce5c3138271145116176bb50cb3806b6d34942df2b8e3765781c46d048b56e43f4deb6d83d6a191d2eb3e56e15e1e9aac22711cecec41ad8e03cabe06dc2f204a059b8a1df5734a767ecc7f8b3f33eb960bba3a43c17c90c39a6df0c4488bdd7d7890840491fd491f89b17f5ae1064018105ac74a830a5782b314ac3fdab36fc361b17db7b0d276b42088bfec2d5a183b9798fa0f071d70947e7d6ade2e311690955d284a68ce83b82eca013e90f4becea95699b5fbff5b12a5823b688a83d18dacf6764b001d48d1d8ebc5cff6695c9806e2b819313d54632b782a8a3d1a085d1c32386d36dbd41a6b6ba233c205cabd3f35d2c765d4537cedd04923bf8b67325cc75d78f0d8c950e435cad7054dfe500b5bb3311a4989073d05ac84d43309b13f2bb564664b8382a65d536f62585e48d9e3defbfa195a1fd6af8f09dceeb8b131502650e8902caf931dd49fcf57f0d05522124cf245ab1aaf60f85577f3d3d58b40adf3625745399b710823787a89cb9854192345458afd5004f7b333eeb2105b1e7216a3aa4941158cf2ec6cce9709d21c63735586b0bd7e7b8e999508037f72eb44c616960f90d41e5e736f355121a5c00e84a61b7723a7ab75dafbe1432f72929d519ec28ae994a6fcd2d24ed871a9dff2ec5fed871e6f63314e2450793022cb1aec1d712e45f6314166187c634d5ee84e45543fd1f1238fa3b6a9413fa9e817e8e1b3d9f8830509c35e6109d60a2d
ec8222c1c67528d65b0ebef6da66a3ac6fdbe025b9a2b4764e4769a92a2d2a459d296a08bfdaee03587b3641e9fb6040406e4f2202da0cdbdd5104d78364a75a58752e87ba946102c522fb869c89b7ab5e72fed38f2d7e315627d48c14f066035e79287d004266defc302ffe2af7ba90ad1c49dca037d007c53051058182ed412b9f679b63e1a6acd8e02e6e566092db9fd7d1605db6c2b06e406d25d02340571cbbe7642aa015c271bf7d1d0278ac3c714d8247b74d3d0cf20d3b652e2252f35903310208cd9e5465257439a9cff43edb87fd94b8f9fa94b714e2c878a3ceda179c87443f27c24988ba3273e06f932136e4e6f714a5a52e1e0b6c5103286cbf332fed8bdd97e7438c7d3e4736eeef77529dabe8a309ed3c3d9adb6cecbd178a87ee50b71eeae1c7b575483b0480c065c02397059ed672b2e553fe44822a8e9b5066397178c243a84a4e3d76493354d1c9dd21aa67d473d2a2610deef921c99f703092394838ac8469692210f2310ad4372535b29a6be036f8eb95f3ca9761be97451980ba766b27a2d8ff5d71c0bc9e31851f9ffb0b9be30cbe9f190491d8781a7978a6561a67db52c898c3f364d841bf11090fdb2f857918e1a958d48f5f7d47a8cb504b7dcb5cd1ddead67069fa8ed461925beb97e25334074c7d4235021b755b27e84be438939a293b5248d574cb341856cbdd679fd49bd181527c0444a8a377dd8d2163571423b705d53c7177c229fe4bbb85162c95a0c3fb21431fce9e899e918c5c9b1dfc6279bae489e1dbbcad66955ed0c99c73eb287712e62c1
+Shared Secret B: d526390b916937f6bf5cb9562e32968ddda71701bc2f9cc270efd427fa46c2e3
+Shared Secret A: d526390b916937f6bf5cb9562e32968ddda71701bc2f9cc270efd427fa46c2e3
+Pseudorandom shared Secret A: c91f344f49fe943509991ef30392fc3d25cf65be3c3baf6d0d1487bf95d59f42
+Public Key: 4595ceac1b0dd1aac964e755ce455bc11a5859b91e6b14b403b52f15eb76e71794c9b1020fbbb6e8618566cc5ec451424d410e5f3bcd0eec84f8e7b95ff553bba6139e012e19a21378386db3e5ac04a36c9bf65cd485554fba6e3aa8ca9007aafd306c3a703cf10503b916743d6455afd39e6a32058f069a23c2196f075d17c62e23b296fa837fe570c2c6c4b5bb87116920b14291646ffb9bb2e2c5b925114d06323fb1665f3c6da5e06980e8c5e0d6c9fbb02e88f57cf480a6c990a473d04fd9fa9244989b30490b3569ce54c96c01730c8d117d34aac685164f9aaa81141b571c69ba28b2b5fac8074367ccdb7271265caec2833dc2bc9de6c70e3fc49a3a17b35fd64eeba0c472d947b05702345928bd28411602aea0344eda194857661adbe45a1a460a4fc1677cea57baf2b21c9b402008b5a2e0cdbd98433f213654a5cf65c79fbd7c1e843c340242446bbbaaf8f82681f26227386a29d92200034ea1babb6e079c05f201b4a3b28950ab32406eb3351c99c2a9c6575242e7c8c83a07c20a91037c8ca85618cab98ba0732bc9f43f559a0458a066e8b194a7305565152951a1819e0bc91e6bc9b870445f9791cb056d4e176302c139d2e070ba18384e19986a39bdd936c8dba6b2db6a9aa39b71f6f35da0e9a146d0caa02c41eea6674e382b53128772f2553b3b860912a76e13875f762bca83c1c78756dd67542b3b97b0e65cedd39d74151a681a0a987183414591e780237212ca074794e8e029a4baa1e5b3a69b49b6b21b287ba4a33cbc6c736b674886a71f437f383c2fd6051694791b2c657039b92396127287b51373e3bf4f725067508965919ec77a7b313c33e7d1ae55bb3f74d381ee0a9c85a6889cf89e777444657ac9e10697b7570d67322b85c02088f80c96a2ae8eacb0820cad5f07195ab99353ebb794013d0edb8483ac83c7098e41510118e3482b698254296148a87fe11b39ca059bb456a553b9437650aca293a4b7a6c2f02989bbd55e5a83cfff1831479346cd34443b674ea0883a7a1a1ccd2010150c99b91850727c8c0b99196ba468ed374adbeb7bc958a839d08498b75d62a1891b122c7d8930ffd723eee87eea3421da7b2e2e830400e71dca623b0c177de97c0a2fe041f14a9e1b431e28da9699a262f2a8a16af781f6102e20e02dd373658f61609d9a1a38100186581c45f99ab520c157cb6c52d6375e5688ce9161f674b2d422b499f9189cf1b7ecab416c3c93d61727b75c6052b78e7150ca97404393c04dd6b67d398c9a1da2cf29b243a003d0c9b6a459a7c15ebcbeee703d52f40927789bb2462bd9968982ea774206bb87566a4b22b3c060451f0722d3a61f3f526b779bcd69f460aa432f7d59ccc38219b39177efdb8fbc439a8
6e769140508f65098b9527f59e75b937cb7bdcb6395270ee489be46f5357b63ca64656071b720cf5b2cbc3072cf4b2571fc4e49a30143a269abf33b00485a59da0b1919aad18ab54dac157fc6b902c472576cbf2a19cca20991b3d5087277b7eab8b01eab373dc56bb3577c8857be76b5402e90ac6dcab1db856258d53479987cb611a421c359d00bb1f03b9e3457c8be32212283b7664a1489faa6c21419c118c62042a8c0fc61dfb386360461d461b8eb16717d3216390548bee70c1430abd1e0792e899d4413913274ba0ba787a5a45146b00c32c66a29aa71b4e77327a2b8c7d3579ec06530f827ca550410f21ef7111d52831fe52871f4aa3c14536dc845cb7ac84160bb4a74f2a203001238c6a1efd527e0c65ac6d24adfa35bba5328c918482305828bf187c62654a1c7bd4282a2141c8a2f11c27f959143eac5b4338f4f808de2348637702ec0069ebd0b11c524b44d766b0c55770b401c30545bd4c331d9fb9185f2554482b7127ac0d066cb884808d6a8278afb29e6e76ef335aee4c64c9a2aaf3923c91081529a00890f3a7d2e9b5088199321c0c8ec9b127a9108742b4a76e038cb340330c6bded19937c192eea808fc98022c9ecb86cf46b65e93c6d8c0b661a7edc56226eb1862c267c20227e8e833414e734bac983a5572002f4b2fa0a107b453e9ab1403508c5c7db2299071b70e01c06732279739c57a484c055a2df466dc98c4a51d66b43f584abf0cadad501dc7928b2cb48bbd0af14678e709e03b07a96eaa90b631986dc386f389a21f799c3851bb772b215dea1
+Secret Key: 304a67a96cc9d6f78d0c0341e577052ba6542f262e9d230c08e35fcfa06ee8dc09af39cfb9742d2171c8ccd04e5e31066e863ba4f10990f0821ac0b0fb2c4157b49f9224c2adb3270a60b54e57893e3b346c19a0ae164746406d3ed5a6e5faccb3b3a5bc75bf9a12b494b2158e3743f381b2fc04083a65a3cd12273526801325bb69b30a77803c8f66a4db72b3a5993837e28bc1739306eace218c335cb713a461cb39702226a92726cb0686e60f6751c89d5b147afbc137f328b1cac1238310b92ba2dec1186d63c26c16c8a71079a1755a0e698be1304e886099140b0ac0f95867a0309d179f2c5086de251addb68abfeb4fca6b259b8379cfb4a76550007f93962d5009bd70c81ed091064c451ec01eb64c47ba862397e5223b1b3751e19103764c1ef8a129e089806aaff172259dea83595997e42b436d424a9247bcbe391ab9bb9cea4c491e37aa4de8b2aeac40222856a091255c47b94d787f826c528d90be487598d4253fbae04e935b3dacd8351f81a2f41430b47a158e7397ca18a750511055ca5763182e19627343d4b26d082dfcc2193d064ef903850e27c206a11a78d745c54aa53d186b6ee3baeb0c60d8c914cc3755d9e1c0f27b9d225485b5d9367d459144202c91a429b75a0d6bc78c5fd000540856ab085804bbaa234c7aeb81af28613b89c511012cbf8560a46f822e6dc91ce0b5c53c96000ef651b967b7ae3b55a0e61d9d92c4fbda42cdd078196ba1599b93ef655a6d95179e2b65f2b64190b05cbf482a9f628c798570feb329fc30ae2f524fc3703a2de19ce4e28ed407aeb6f9cafb8479f65ab75ec608d10057b458972c38799331cf3787b4b6e614ec34a54ac2775a39ad02257952b3a6fe8404656c653ea32363606d6637006482164fc505f0d7154c31b6ab361c1743bcf1297c11278dbabc29b7469a31830bc50795a0a38d7503be475bb89f44921cc4669f6b485f561ebb169d45ec218b23a442848513e45cb5d37325757d9e183c057881cbd7b625a41f1b42613b002a7f7477b51ac05e39500ca6b267783533a509355c1c84725e7361a21c3cc313d45890d72770ac6e2263a26ea730f65119a96725b9c043a42077f3925fdcb5064f0775204c02695b201f25877f4c2730fa0940d9276d617965f406ed340fc771b3764318f7028f27e344c922aa052461d3a1c9bb5c01b14c5c3d269b0131490ef662e55c6132e00b61f3bfb3fc6d911876a2301c0c326629bac672425305e19df6264c9529aedf231c64396d114733293ab30ac845a6112f16c77c3b9754a05150b1659132315912839ab160078dd698acb31a03687e24b942e87299e5bc2f6032ba9f27702635809d961b7ec453c7977e058b68b9fcaffc5073725543b5242c70a0b93
ff44547d5c60b357fdd47b23414c7b0d0286018b8d1e73a75a27e0c6677682386bd5bc4841664eed3a34e55000082100ea44fb7c179153b88bcea67320a0da2568fd6963d705b50f9e49be6cac6ad1c22ee151139572f5a650956b8762d7316d275bd1a29a8d57aa403e9bd6be6024748466db1a1464b3dab08c4acb2b53e0567d436b19ad2241ae178eba2a37b3b2947879f4fe9b1c81768cda28e2a7619abac212dcc00cceb24806b5ce53a387bc38fa0668d5b620c93dba2071685db23a98988294f383466aa912eaa8d982697e6dac82f29c568363076aa7948b967d961a3dd3cc86ee54b42db23cc309575b18ff969c34510c07bd43948513a3721c9bdf4762212aed6f686d2a02d9483b368859f68298f3ba6a9b9e98e2f52bd021603b48aa1bb678542cacc307bb6d67ac1a5aacaede0688b02591c55743d3cb9a4950aa95750f3312b02da96ffb8b6acca5cff0a5259b37eaba91e14b128d05ba5b92a02325a9abca0b0589c2b60884d655966634886d3d0497c1a574f478ea7a941c3665b1ddaad73e23b80a6160c7c665940bc6b2a69fe83863037998ff066a844801c73246ebb7f500146bdb4b9cae32a246572fc363039fc29f27876f1198703f939f6c82093c45c72064d8ca518948823f9a007e082a6b0332a136bb630eb6e258a1056442d6ac0bd994b74b37a3849e3a9eacb4f1591029b8a02839576f48a253fd4ae6eecae41eb370dca9568040b961b8cf3721bcb57546a81767ba1bd4595ceac1b0dd1aac964e755ce455bc11a5859b91e6b14b403b52f15eb76e71794c9b1020fbbb6e8618566cc5ec451424d410e5f3bcd0eec84f8e7b95ff553bba6139e012e19a21378386db3e5ac04a36c9bf65cd485554fba6e3aa8ca9007aafd306c3a703cf10503b916743d6455afd39e6a32058f069a23c2196f075d17c62e23b296fa837fe570c2c6c4b5bb87116920b14291646ffb9bb2e2c5b925114d06323fb1665f3c6da5e06980e8c5e0d6c9fbb02e88f57cf480a6c990a473d04fd9fa9244989b30490b3569ce54c96c01730c8d117d34aac685164f9aaa81141b571c69ba28b2b5fac8074367ccdb7271265caec2833dc2bc9de6c70e3fc49a3a17b35fd64eeba0c472d947b05702345928bd28411602aea0344eda194857661adbe45a1a460a4fc1677cea57baf2b21c9b402008b5a2e0cdbd98433f213654a5cf65c79fbd7c1e843c340242446bbbaaf8f82681f26227386a29d92200034ea1babb6e079c05f201b4a3b28950ab32406eb3351c99c2a9c6575242e7c8c83a07c20a91037c8ca85618cab98ba0732bc9f43f559a0458a066e8b194a7305565152951a1819e0bc91e6bc9b870445f9791cb056d4e176302c139d2e070ba18384e19986a39bdd936c8dba
6b2db6a9aa39b71f6f35da0e9a146d0caa02c41eea6674e382b53128772f2553b3b860912a76e13875f762bca83c1c78756dd67542b3b97b0e65cedd39d74151a681a0a987183414591e780237212ca074794e8e029a4baa1e5b3a69b49b6b21b287ba4a33cbc6c736b674886a71f437f383c2fd6051694791b2c657039b92396127287b51373e3bf4f725067508965919ec77a7b313c33e7d1ae55bb3f74d381ee0a9c85a6889cf89e777444657ac9e10697b7570d67322b85c02088f80c96a2ae8eacb0820cad5f07195ab99353ebb794013d0edb8483ac83c7098e41510118e3482b698254296148a87fe11b39ca059bb456a553b9437650aca293a4b7a6c2f02989bbd55e5a83cfff1831479346cd34443b674ea0883a7a1a1ccd2010150c99b91850727c8c0b99196ba468ed374adbeb7bc958a839d08498b75d62a1891b122c7d8930ffd723eee87eea3421da7b2e2e830400e71dca623b0c177de97c0a2fe041f14a9e1b431e28da9699a262f2a8a16af781f6102e20e02dd373658f61609d9a1a38100186581c45f99ab520c157cb6c52d6375e5688ce9161f674b2d422b499f9189cf1b7ecab416c3c93d61727b75c6052b78e7150ca97404393c04dd6b67d398c9a1da2cf29b243a003d0c9b6a459a7c15ebcbeee703d52f40927789bb2462bd9968982ea774206bb87566a4b22b3c060451f0722d3a61f3f526b779bcd69f460aa432f7d59ccc38219b39177efdb8fbc439a86e769140508f65098b9527f59e75b937cb7bdcb6395270ee489be46f5357b63ca64656071b720cf5b2cbc3072cf4b2571fc4e49a30143a269abf33b00485a59da0b1919aad18ab54dac157fc6b902c472576cbf2a19cca20991b3d5087277b7eab8b01eab373dc56bb3577c8857be76b5402e90ac6dcab1db856258d53479987cb611a421c359d00bb1f03b9e3457c8be32212283b7664a1489faa6c21419c118c62042a8c0fc61dfb386360461d461b8eb16717d3216390548bee70c1430abd1e0792e899d4413913274ba0ba787a5a45146b00c32c66a29aa71b4e77327a2b8c7d3579ec06530f827ca550410f21ef7111d52831fe52871f4aa3c14536dc845cb7ac84160bb4a74f2a203001238c6a1efd527e0c65ac6d24adfa35bba5328c918482305828bf187c62654a1c7bd4282a2141c8a2f11c27f959143eac5b4338f4f808de2348637702ec0069ebd0b11c524b44d766b0c55770b401c30545bd4c331d9fb9185f2554482b7127ac0d066cb884808d6a8278afb29e6e76ef335aee4c64c9a2aaf3923c91081529a00890f3a7d2e9b5088199321c0c8ec9b127a9108742b4a76e038cb340330c6bded19937c192eea808fc98022c9ecb86cf46b65e93c6d8c0b661a7edc56226eb1862c267
c20227e8e833414e734bac983a5572002f4b2fa0a107b453e9ab1403508c5c7db2299071b70e01c06732279739c57a484c055a2df466dc98c4a51d66b43f584abf0cadad501dc7928b2cb48bbd0af14678e709e03b07a96eaa90b631986dc386f389a21f799c3851bb772b215dea1cf6aa0bb38a7d9761c15752cfb881447ea78dc4ee7bee98a1c872551e7c17b0df4999f8e6038b6b6b7dc806a23ceb21c757ca6c46071154f20b46d2f2fd2cd7c
+Ciphertext: 44326f94cb54a2e5ec9b7489e67a9bf350ce464df01cb1d85d3239738d1ed92c98107624e0e83f5b7674b1af0fce0cbc13f4d5e201314bd8038743f68ee763bd1772ae3e811c811c33d0d970b7c481ec55af821b844129fa406208817cf56c68ea4178b84b2bad7e13dce352ce9537b719b3bc1c3c05cd6c8b690210b833bce1e563a3aeddb32457a6a6adf0577550dd400fef4f635ffc0993ab4dbb345e9b7a616991b64f280a83628a31018931cea7537553a6d075693c0d0655baab0fd2ad0f652d1d546cb58ae318af8070a2bb581c00510fc7c272b8432c39ebea24eaacfb5890435523258d6b52e2a2d7b20970b47c538cf381ee6ee664c34fe19ae76a0db5bf101ba214dc7359e8bb0ad53749562f3382893e605ab7e56a503acf732572b3b5f05db04bcd81baac8cdca77a25f1332e59d33efe305fc1a313e1eb3bacf12a8858f1011c3aae570a625d041bcadb709026c5c8fc47b8e97ef73ee6dde23174325eee9d1455559a5289df91791dc1638514267b7a895b22ec6def044fcf99da53df08108d01696d2dafdad29d73d0422b5e939d4693b37e986fd4fcd1077ac37b90e920d90b6501512946a4e545f48924c308292f0d2a446febf262a1771503fe5c2308c888730c98e57235e6ce6da0cfb8d97206c1f44b836d5af3014a84f1bff4c23f1489180394c0ba165eb388ef8edb88ba0523145352121d21ccf9dad76f9e356ba072c9531462db83f2f101ff501b31caaa2f6206cb8d282b8631054dba94156ce4e045f2efc6845772f7754145be0701dc083556751f81f894f4e4147014ebc0e2e8d1bc9337aa21c5a94a8995f998d4a23a3627d2c80a6de270ff1a4d1be45e87d44ee387aac3edda86ba4f187ada2676cd0ccbd8905984cc0252f00d2b14ea13d1577cb760ebfe46bb7f872eac3f1aec683fcfe00f9c1db6c2721e051c8e706b73b98d9f38c6db34346c3d3b3c591a758497afd53d53387a9a8feb09edfd583f2e24bd8aee8ba45b19314f6b843f04bb1f2d14cbd524aa5f4d3ca031427cd16ba121b6eb3864742c4fc61e67e60d7a575d8d23a3bab31b28e1a26217c2b053764af89bb3b844abe9d1090a69bd5cc2db67b915d0d3e8b5a566632300497e90e3efdc1c18ba5a51c51060247567bb5feb3f2978e321ceff1154c1a55682ffa8d8575400d64ef91723d9758a06e78620aa244dc51b7e18bd2954bc28ed38edf3dc50b2132156070405bb2b35766dac74f22bf03bd38a9c2e9c8cde9be2cc7dc58039989026897ad431585bf51d1e6b4b81a7d9875c6ecbb5e8e98ec1c4eea6793fe51059dc09b1c1fccf12510730ed975cfa1f08bf03b25bc323220c9739b00ed894fbdc6b44950ca5a9492e91a39519da73e1ffbf13fad5abb7339
69537acbe255b09d3b40d2483ce4cc083eb0e7cd27e572b957910337e78c54bfba3486a98cc8e566f7bbe840d16563de5e7f3ed4719f6d489e13e22784960ed5fb5c8e253cf7b9738baa28ae6d48b26b106a6ca339ee33872a33c01a937f67c0b377755dff5f3ce6616b2433ca3fd7fa6995f649636d6a70810da8b5097d3e6a4b5997258deb58868f830bd363bbe860cfa63cd4c785e0a55027f5d19de36fdcd0bac4e6e3d925e6415f5a31677f97f8579edab07de2a0915d0287911f83a56113d10f2d3b3468c6666d707551f7eab0de18b4725c9d38231543cf5728d0ee8274c6dbd0625efdfdd2d2544ad137ff3a28e4c32542a3ce6fa552a0af19d40bb25949a8e95a3430f6a5460792b5a1a4f905501954b7437d4bd29f39219c50be4fd0d933ff6ee54c37527634f749e3c6c8458053378454826b4aa6cda9729334c44779f4b69bb00d4e6f128f9bef5f337fcdf3d45a802cfb17e910d4c378aaa6529e7faffb5471b4d60a234df3e0e0b4bfb7635978abaa334333107b2b098e580581c2cdbb6e7dbbd9fdde8d4c6872c95b124d5c095a1ce0f566ca210457b839f96d592d2b8de0a2043eb04879499723f58ab175fe04ca40f7c275fd9ff7a39a1bd418b16ab5c270f19ef1fe9e5b19030ba2ba2193fce88812338fd4a19c061ba68c773cb0372ac02d9a212e13f28c73797f0cc21ef3da7f840962464eed0ac11dde61d71ab7b2f94a3af6ad24a6875cb91f906774b2d58a222540509b5a638cc868f80c8219227baeb281f65050d4267f1755a37a7088546f93fd5db61a726
+Shared Secret B: 788ec1c83a592680c19d30da63d7054e8557435995a5f2e1cbbfc957ee6211bd
+Shared Secret A: 788ec1c83a592680c19d30da63d7054e8557435995a5f2e1cbbfc957ee6211bd
+Pseudorandom shared Secret A: fc3f5bcf7449dbaad42a59644dea8e0d58603ed8ee0080707140823ec4df832f
+Public Key: b42744685ace38d673844acea83c7bbd9170f414c7e5c322420184a217b4107aaabe4386a8f99f52b7c5b0d9a60cc33a68e32113216a381a2835d499e19947fe0512a0113dd0415549138ee6138a0c05a5d308bbc0143ccc513d59c935e820afd93529343163ffb9cd91e9b0f0945bde3c38fbe80385c25b189a12215341dc973c3b5b07c803a2bc882702e151f3c71ac8173c5b975b2b75b25703b094ac9cd1a63a6391800505b3ba09b771a6b84faa8f744ab336b2ac97274aed1bc11e2960e4b6b8be5908f9d283f2f7cc46d05e703438c97b53f9b06ae3690c9b7acda5a9b359f9c1a1d0823d4c9fb7a35f8129348b38574ad832f744ae606826cd5ab1e3aaaacfc9053e90bff14923457b9991d66c9dba558d862efe49b90750081d61c0b8ab323bd10da60c370d2804b3f20b94165bfbeb11dfa159617a9ab837664fa1b3e3b17b3d63c9f382888cc3bd4ee8bf02da42da407daecc356ab2a09e80b59ab40719dc23f0622f5112bc697041cbe3b2acab03732370b736b94cacadd2e0a614726bf8b866870987eb67c8c6bc7143670d84d9bf640cb8baf77b08720a54d215c984219c2ab474a10adbd38596f1b81518b930fac994e611a1791d735bc9e420284cc49d84b95e09f34b45f273af0a081584a5f03ac8ae3c0f8a5c0466e98db08a0160e2c452d24a29157c7e9a4e41d731f41971d603bea51805286a4130d258f063adaa4a6a46740d011892ed9aa77b37adde05a52afba7442547313b26a71c5cd23377c5c5813e968ca4d490271a7d60a0cd5587737c300ac296ae3ae149ce8a81708470e859809c025e25a94bac2062777397dec4ab699158c13cc709c42d35206d1c305a111b55d0c09b6ee471b4e445cca2c86a1496d5d30ab04a7887e28b4f3aa4e1639b43bcca8c9747f50307928b0d36a5194fe7a159d81a1b81053e55b7d158255f958e4b7a47ba7b632e1a5a12893a89530dcc2a176a5ccaa69c6ed5b58513b27df078719083905ec9892ad2b72f84b10943564e90434e0a6c5c152cd7a55843623b6541cd70c940856ac36fd07d223a1ebe21ccb92ca53a62a996197cbf289ec3cca39b2248519840a33825151330e9b881097aa7ee7c8a4b5ac74778bfd5d9b69903714fe64dd2f181505a450b821c0fd9cab00067ff33c9174972bc742d586965b4a5b1c883c6afba89dbcc60edb991de540ee6fb9c7c32a8cd962a3fd003c9a99e7a4c44876915c9c32cf23ba596091d87033402caa1e3178577a8bdabb61ba1ab8f6736531e68a2757a37a5fc053671507e697d60c33097897ed827c087395454d8399cd9b26fa7386fb4a4f445cb36a23021ec92b8322ddb82ba50f6ab3cb15fcf124ad9d7554861241e8822931c0ab3d7610dd86fee470f6
6b645f89589ec748313bb6d852a320bf517c87b8431eb64f6760dc3c646b10478d9837e781a5a83628dda740cd39739c8e5c557c72696cacb506b4e6ba1897d293ab6a25b962b7dfe4457af964e3f4792470bbc51161dfa4ca6e5d23eb0b09aabb20b2a7740ea20814283bacfe39ea82a852af229faec9f162462646a3923681661612f0803c2fe15c2179299aff7896a5a4f8a254c7f61726ddc35bc89b64cd9b462d176c277959cfc8f3cb892d0e854af1b3071c6c37d7576e205a5cceb7fb06bc9bca48656c3498aba977af1a91d4a61383ba97385c55669be0374b63ba40b5ffca11a230c703903a9a56c0b693702d45333843b3b5cb29f12af0eb036c85ab5d8db30a3692a10ccc03d307d6b97ae1e097aa5f9ae8b8754a92105f7a72bb97074988064312109f0ab07ce1bb25ce9b4a6bc237e5740df587fffb4c89847b2e2b74995b96df477795bf1232d31a73a30c1a0512ad5192c2a162127e164746ca57ea222f94c671ffb43f7ba49b9ec3c915111df3b009d40463693c92a8098f9f3289e13776718c7f546bcbf7ccdf9f1a6e0384da5f2825d111b26a11cc4b250461148bfc17152e40eba73063d096896f6bac014b9e4aac49241998334b6678158e0c29e05433316d64688fb4bb7f07857f641cb90c4efe928505b108a3201191216810b9e21643a2ff1cd4ad66f5f757432654a4e328352938f1e4b7e4dfa5274e601202aabbba78aab3546fd9935eaa99986bbbb839954258c2fa7a87ea97c77f4c7aeb5889722dcf6f13ec43c01ebbcf9de78eabfb29718dbe3e2d430
+Secret Key: a3292e7c922ccf1a4f3d6c38e6013163d8c8b2163b4c193b8bd82b755cc237198ec1a8480b788a8e0b4c1263812816283ca98d64e2a33f8246d7953898d5b0e9a71aa16a755429672770b77afb38b3d60043dc14b5a816ecc39814a5b2e7c559ac4c6679406878131f023cab5bf64c56f3a8b95273ccac4c26f3c5748541aa5b89e7ca86b2e62ec6700c4b78c370dac3dbe520d8dc9768e038dc4cbec2101582c94c7ea62c73c776a554943602717b9846a8119753091f24d165d664aabd5231cfeb943fe49f11647bf2c1775123123f2c94245bba7e64c1da931c1c1901d1d7ab508857fd0a3c3d15af4f91bd6458b3633666bd01acc8b7a7fc83adbde990f062576af8619b0c0e3b9517ccf0ae75fb7ffaa2609d21cbd72ccb6236695dd24eae0c551bf47d96cbc0bc8ac5d59b9b92a83a84c90dd36cadf1357ab8299eaf154ae7b69e1b47c8831388a3c5385f382ef7718205185567a1ae73d77869e64dd53c5b8605b6515c195317368dc407b11b3b56148547a11fb1d08d714abfc8d8a9a14a0fbb0994f0b24feef9b927665173f89417988f0d5baa5dd37fc3a775f88c126750550d04a5682732e9436d819a2d304a5ff036ce1ceb2b5aa1a021c6cc0f9876ee66b072c4057827089dc347f00b5fddaa354d0628036827f8f33bc40b8606994fbc99a80bd3870d000ccf955352b2030f2a64ec1b3c015138aa944ad7ab96ce1ab47aab266fb882d36cca31067970e67b8b3406793c0c33502a90e2a695c74b7a6cbf66fa2f6c6c4320553422126cc516005bda656fa9433195864e711c3930494c509ef72b1b69784000927fcfd5350338023b412e73a95914900c0450acc0b52fef98be0f310929b4788ae8350721120e4ac8fd50068aa45c3f499b5ec91080ea0db71ac0389c07fed76cb63670f3b67db3d96e9ea26f5539143cca27d0ec618df46726a619a06a1954c5ca4e251bfa260ceda0c0fd767bdc56b63f17ae92867f41b6b9bb3b9c3f8c66b531cc9f28c48dc0a659757002d68575c82f00ab1642bb9adaab4ccd76271d75bf13f1697e058567978816e10413c4cab691c4742241e6c3ca69ea94c9694b663c3be31cc9fe4ac39d38359101871c94c27323046720b485b85950195a4e1cbabb93bb14174534c36a1d9012f86284c396ab097aa50438264d316247524091d8c9ea2046a97888d1d089d6350a8724a8d03856bba40fb9bb7cc232ac235cc323941a38385e9c33cc23a0bf42884dc7d9b527500c43725cdee3963af688c960371c580e9f978169c368f5a546e9500e681008eca90195f062695c36a3d32b95250cbcd140c5866bfb440683f06d1bf555185a2c15552e53d15b894b3756155f485c8f892abc47f80c7e45071d457b9d934bdf17b51
509134d18612c06262eea8b441aade3160129d425757b87e2c2b00bccb62cb33ab807a3bd9541fcd5259496a328980247047cca35bc4a1b6c6a6200d764ba24a9072d4434fbe63c99bc53f6495dd6646c80a1843801a01e1bcbeac2ce3c1bb1dd670d2a915a02d34eadc85fbe9432a8c64837f0490fb0a7741a0c6cdbae864a440ac8445c2a7c0d41a091d2323ff79cce9394a1c23717352b9d463dcb3741a275b2e7d6a55db9438d9265cf795838e36e46541190d30b472047e4b8949e254e1ab7c050e39105e7053e2a7a19d97fa5096d43d3abe93b8f3118416e0ab04c519b7eb01dce9b6c953ba884349f7ac4326a5463fa3964760120550b01d0857d1d1c96fbda7f693457fa465ebb49a82c42aa9d59240aec2b1674bab8b603b329beb2ac31a85331b49b991f575402611606026a40273a7a8cb235c0826002b3f01ab1791086e1d8c1f9e0b20ca6c1bd7094b45055d7d1004f8839a8cba2ac158f2b565e866c076ee161b743b7e9bc358011218cb928e9969bb6d38234111503573498d4cf695ab9f0806927a8418e32b797546aab502e7aa20687107360c49338009b277115a1475c2efbab91da4247bc84046a1e82c9c554e4b2f0752fdcf5c238443717674d25240016accfc84cc768f23028c30bd1087dcb690575bc07aec442efe6b1d45c9e58886be9d72f89948187876bd59b7287b569ce538c1a47741d82ae2882410612632e3683647331a357703647a354399a701615301b65c5e526b42744685ace38d673844acea83c7bbd9170f414c7e5c322420184a217b4107aaabe4386a8f99f52b7c5b0d9a60cc33a68e32113216a381a2835d499e19947fe0512a0113dd0415549138ee6138a0c05a5d308bbc0143ccc513d59c935e820afd93529343163ffb9cd91e9b0f0945bde3c38fbe80385c25b189a12215341dc973c3b5b07c803a2bc882702e151f3c71ac8173c5b975b2b75b25703b094ac9cd1a63a6391800505b3ba09b771a6b84faa8f744ab336b2ac97274aed1bc11e2960e4b6b8be5908f9d283f2f7cc46d05e703438c97b53f9b06ae3690c9b7acda5a9b359f9c1a1d0823d4c9fb7a35f8129348b38574ad832f744ae606826cd5ab1e3aaaacfc9053e90bff14923457b9991d66c9dba558d862efe49b90750081d61c0b8ab323bd10da60c370d2804b3f20b94165bfbeb11dfa159617a9ab837664fa1b3e3b17b3d63c9f382888cc3bd4ee8bf02da42da407daecc356ab2a09e80b59ab40719dc23f0622f5112bc697041cbe3b2acab03732370b736b94cacadd2e0a614726bf8b866870987eb67c8c6bc7143670d84d9bf640cb8baf77b08720a54d215c984219c2ab474a10adbd38596f1b81518b930fac994e611a1791d735bc9e420284cc49d84b95e09f
34b45f273af0a081584a5f03ac8ae3c0f8a5c0466e98db08a0160e2c452d24a29157c7e9a4e41d731f41971d603bea51805286a4130d258f063adaa4a6a46740d011892ed9aa77b37adde05a52afba7442547313b26a71c5cd23377c5c5813e968ca4d490271a7d60a0cd5587737c300ac296ae3ae149ce8a81708470e859809c025e25a94bac2062777397dec4ab699158c13cc709c42d35206d1c305a111b55d0c09b6ee471b4e445cca2c86a1496d5d30ab04a7887e28b4f3aa4e1639b43bcca8c9747f50307928b0d36a5194fe7a159d81a1b81053e55b7d158255f958e4b7a47ba7b632e1a5a12893a89530dcc2a176a5ccaa69c6ed5b58513b27df078719083905ec9892ad2b72f84b10943564e90434e0a6c5c152cd7a55843623b6541cd70c940856ac36fd07d223a1ebe21ccb92ca53a62a996197cbf289ec3cca39b2248519840a33825151330e9b881097aa7ee7c8a4b5ac74778bfd5d9b69903714fe64dd2f181505a450b821c0fd9cab00067ff33c9174972bc742d586965b4a5b1c883c6afba89dbcc60edb991de540ee6fb9c7c32a8cd962a3fd003c9a99e7a4c44876915c9c32cf23ba596091d87033402caa1e3178577a8bdabb61ba1ab8f6736531e68a2757a37a5fc053671507e697d60c33097897ed827c087395454d8399cd9b26fa7386fb4a4f445cb36a23021ec92b8322ddb82ba50f6ab3cb15fcf124ad9d7554861241e8822931c0ab3d7610dd86fee470f66b645f89589ec748313bb6d852a320bf517c87b8431eb64f6760dc3c646b10478d9837e781a5a83628dda740cd39739c8e5c557c72696cacb506b4e6ba1897d293ab6a25b962b7dfe4457af964e3f4792470bbc51161dfa4ca6e5d23eb0b09aabb20b2a7740ea20814283bacfe39ea82a852af229faec9f162462646a3923681661612f0803c2fe15c2179299aff7896a5a4f8a254c7f61726ddc35bc89b64cd9b462d176c277959cfc8f3cb892d0e854af1b3071c6c37d7576e205a5cceb7fb06bc9bca48656c3498aba977af1a91d4a61383ba97385c55669be0374b63ba40b5ffca11a230c703903a9a56c0b693702d45333843b3b5cb29f12af0eb036c85ab5d8db30a3692a10ccc03d307d6b97ae1e097aa5f9ae8b8754a92105f7a72bb97074988064312109f0ab07ce1bb25ce9b4a6bc237e5740df587fffb4c89847b2e2b74995b96df477795bf1232d31a73a30c1a0512ad5192c2a162127e164746ca57ea222f94c671ffb43f7ba49b9ec3c915111df3b009d40463693c92a8098f9f3289e13776718c7f546bcbf7ccdf9f1a6e0384da5f2825d111b26a11cc4b250461148bfc17152e40eba73063d096896f6bac014b9e4aac49241998334b6678158e0c29e05433316d64688fb4bb7f07
857f641cb90c4efe928505b108a3201191216810b9e21643a2ff1cd4ad66f5f757432654a4e328352938f1e4b7e4dfa5274e601202aabbba78aab3546fd9935eaa99986bbbb839954258c2fa7a87ea97c77f4c7aeb5889722dcf6f13ec43c01ebbcf9de78eabfb29718dbe3e2d43037ce1262753670aaad84e69accd1380f84acd48b9d76fbb843c5452ee351e12a6cf3c13f459d34acac62c2b8999ee364e92f3e8ad5c6f39944611478cd059176
+Ciphertext: 844fbea813e653b36e9f0db068f3e24a43d55040d7e13832bae3f9f5b133919ea1d06544c4d4b1299429c8c6c9f03b44a5e1ec53c73328e7beb8111a44df506959cb88ba675c03fa1eb8d984f65823cb1b55738b49bef4641068ebf8aba5587f38b0b25b6b4de5aa3fd8c295ca23b2905034422a64031324e15c5b9a412f1dbf0d125afbb4eda3b0866e6225b2c6b3ca351d1dd7b873b134535fa0c8792b1752a70e4b4c382ea4c3275e3c5eecfd58d6c57d36903e26108d7042549529114811f3cb0b756769f2376938195b1769c807618b15ef26ffffbe099bbc0abcc5763e0bd65c1d650e7e8b695ae562fab2c519c11e294f79f2da21075f4fc1286faa0735f71af457fb18b80b54cc8de202da6f4b6d43deacc4599c335f7db89e28e626d427568e188efd2743cdc5c8a92609210fa3f443bd00250b48ff7499dc2b23ed822297d948c2a1bb8f355c9a93a24eef87b40f35c03bb291ad1e4652a83e0163d3a1e39a566ef3d648bf88c02bead238b259fc5cd2509628e3b1c47f648de18f58ecd9b66e1791b38981d0fec8bc5b60f8f51302e09a11208498a115a8be26223ccaceac5d78ed0ab53933ea2b95442babb31a2a2e7e5530d1cf98f761a3bc7f9cd59ecac7ed3b80c630e5cd6305db642daf7dd8c291c8420a7e9e054bd06b540f0f737f895ee93d7de82fca14cd7f5c363abcb8a1e5c4e3123d84767a6ad99a58de56dd5fbe373925c969fd5163f72ec3ade65907967f9cfedbedafd09622935238331f0db21863052f4e45c74f680e44dda9a0e2bb8ec2ddddd4c927e42d353ab9e746b6ec120c689106541a9560834c5c87808d50ac075849bd471f39548d5eb4a0a6b19e697638363dab814d80bdd07bf29ba6f3dc972563fa1e7f9ca467f878c5459a5efa3e98a5025df2956736d40e127dbab0fa0239ef3e72f8e405538fbb3317e95673d3549cd8022496926c67db0627ea90133e73751f1923879c652ae24e75097c693d94d70e95bf8e83eb530c8a3569a79f21661d1f1df82c58381c190c6dc4da2630bdcd0db25c08c160d5a70a2b9fcbaca0ec7a419e6510a17bd4a2318f4f1dacb8129f1f404acaa9f07a5b509113221c43ebb2a160a68aaeef0362e98a0883eb85f3f0656dec5e52c159f9ea9537f0aa80f2808942786fb8b6558f774a70abbc5131b385ba0f8e9a193d7ecf8c8b664da02c39fd5241a3bc3adaeaa5d79bc6c460574f9642b48d9369f9c860f6eca68205ae16aef45549aa4a985607b07b79d93a57385fc2ebbdd61002050deccf3369a6443bb97719603b4ce3799a7b091d2d24723ee4bb8418baeed2bc77c3ef611b74412feee40c2b367d29a78b0c89c397065f316dc1949c611df99a219688a4a4ece7da08b1a5ae72907de
e59e4a34faaf098d459a6be9850b97a3cdbf50188b6553770992305757c5f09c67a73d0cb5cc2036a7e3d2ed565d5c0b7d7b9f08df233e2433861ae2c8f28320539b0b2bc8b7b6378cffb28f6d90295e2e226fd2e166f60346239f379c269dd449f013bf715f3c4aa20efcadedab8415019ece2472f11627acf9aca6bee79aad02c8bf5d874c498786be01e026b08d219dc5739bc84b75daedf31eccde707fbd5ed78e542a2b7e735ab2e4f86f323842a3e04310c2194c1c6de9f5ba537cc52850d7425a13d0c161f15842c8683433cbac41c89e0a6d86781f54428130f6e14a332dc4cb77a7328f365d8a7776dcb6858b05aeefebb40be8c6ec918fa3c710bef541b0392fce5ffcb18ba3d32c1ef1f11d05078a760707efcaf5e42bae63a13eb51461b2ba8fd4fc858674f9a078cd9fae95455020da206fa1ee437f580a01960f0c2c0d44b52b9ebe06c6f624672f125a69e4ee924ee8a2aafdd2c8ebd450e0e0c4b3529c634345c9e21d16042c5e508066847ede9632e193ab24698393607dc8b268649ecfc01a47c75ee4a5ab3b7803ae81646b1023aaa8800f29f50606f9242e88c36e58a0b313b8a4764532203cd810b54f054fa61aedad6574175511cb9c5598b44132606d6320c9d0dd8fff8fd0a8e538fcba6dc0b166246377232a6bc19e5bb6c4174585d29ac8ad23edfbcae9ff4f9c7458af5daff7676e28854e8f8ca6eacdbefedc5608fe0c413af87cb99e000064b72cf8f5716d694a22e8d1caa300d5a88a4fbb06efc380f6c48ea0878072df0b3fdcdb8469c4374eb685c
+Shared Secret B: 326e0384896975c5936cfd8ec747843b86d73569cda53cd40bf0aba6b0a3c0f7
+Shared Secret A: 326e0384896975c5936cfd8ec747843b86d73569cda53cd40bf0aba6b0a3c0f7
+Pseudorandom shared Secret A: f7f9462b50e493bb6a6c36aa187b278fb753125749ef3b96780d9a1173df5a6e
+Public Key: 93ea84d5989ab22652de9067e3339820039d80e44a24945a6c154c730a83457976aa29a354cccfe7fcad5023362fa80422faa2fea1ce16b05aa43b581f8c8f648940f6bb4a5a87b6f15940804603e23880ee5028af5b4c1f040832d56841a4b981c80288b65f6f8278f33a16f912a0a36161435b61857b65fa20bcb9f7beef8b3902359c66e25a766614118c5caab32055783103593e0c76426f36c394f391de6712efe1c6e101b42e3251de5135c91089307bc03712ad85b33e9e8b5bc21309b4ccbe49a022da5a983a6679c5b4c84620a88124276ff83d8a0383ab84223362173c740c8241b74d7032410ccf59ebb158b59823b45def1532d4097361371697dca44bdcac57360b58912ac43a0c12e34b234753d8682ae881b5c9a314fc2ac58d465b4569552d2477531a66df4c4cb2b18491c161aaa8294ef1cf4643611d70a2bf585044db640deaa6b00b81c3cb889bb547f5ac154c383824239dcd45787024cd388130a53c4791bbae643c7447fccc57b91ef6812a3cbc97e3c691b7b18d781912f88a39aad39779e54610d4430a1c76fafa0ad7bbaa47a45b4a44244e63862bf8347bf465e830be276803d412b205183c3f59a50fc47eec27225ca9888c7a4c510304d5ca5ed5792656d15b63a30bd526b1b3bc820f8336497baa81669f0bea2bc220ba31275455d09028e119e1c0b4513b3cd4438ca52abc496b5b2e731c9acb5889757200371d003c7efc4c2923896cb5fb48ff2a581335bbb7a4c5bb68107eb6146e4375f43a43ae678a833c773dd2c460607a4a40cb5e8a5251514c0c692d2ac21dfa2c296755543601bcced30f656a4ccc82340ebca175c4191102446275a1d9f669d405014e881e98b676afc82a1c5a6e7fcb482a1a13ca311fa6765eaf474ed2ca8ee2e056cbc469bef18668405023926309710cd1d4a2ffc42af54a8c3fb499781579812007fd4a63929a1d945a38d88b205a51035e31710007c218f23e06e83408507f76ab6e382c1c52940f3d0b0d67a6c478a69a2835a82c3674c0e29d430806d2097fb2b35c5de1940041cb13c910db779b64f9a2af01718235a3382b19eb853cc1999998c00090881cb45c89c770c79d5a8772230a9d22791ffb14ea53aa96b66cca4acd8fbb03d642174552682d98c8089801a1376c72ab743db20e7c62977e4c9232cc44f3c95d6761062f9b73456228fc935264059fe476bab4f0a920917acc7989afb69126f5c2d105780da3a3c4a38ffb932ee7daa6fdb8c5ffb0577c367845f75f2eb545db05ab7d20c9bfc57788ba8725c62a06c728b84b5b3ea39fbfd7af0967703ee7b64e26050a741f69fc70b0bac44396a7be3a90018734ae0a76b5239ff92945babaa880d3907f347b86d8a834e02f660764a
e88cb300c3f6e77634409249d471f64938c7e70b08492c66372753af46d71691c95a00b86124d552b482266b6f463620a510f823bb406f509bfd7987cfbb113e0104cb04b5861698599b8aff57180e898d494adbb15897efa3fb7c76c86c21a37013f61e635f5bc422865a585c9c6ad34a804facf6b035bec34a65f645b564a3aac7597b6911b4c90006f66cbf338a802714e7567b30c0770c82b6cb9f042f8614cc25a0348f9599fe5c65638737157cc11eb0d80866c51634ca6b3435ba6ca918557ff4c3174eb8e85449cff2b798e054cbb32531e633e1a9810f61988a0d60cecf62918257769941ed9c1618cf45e78c028f24bc1ff2a0b745665fba04869f38e597bb0007d511d90abac794ca3cbba9aeb391b846cb702b0ec0c8ac23a318798c31851710b572feda34f00362d36d623db678254e32662b6a5fd326e9697010ef4078ef25c099990dc082e2100ca83f36cfe8b9052320bcda70a2f641bbb65bbcbb5647fd0cbd6c0af7c1bb55761785e9716e2480519714e6a12bb2283aa49b3743b3965286b45f6709890953719385a656ca270e902805771ca087dc4e9c504656aba35834db581cf578efbfab089498724a34af043468c517e754489149051a1e77a1332231970c9f43c1fdc513e902364efd60e42f965ec0018259276f8c2b9ca503188d8a612b5a1fefb013f17a97dc0cf6e2a37a06261f4f42f4daa556493231f81a915a2734b7922929b6d15a6b9c6383c77715ba6610ce9b416dd44a424940c88db7808334323ed107046a705c670bff4311280840de3e264f3
+Secret Key: 5f4a2b7dc30c06841640ca3cc90140d4f010004b112c0b085d0b4e7e245ef6c51eeaa18c09b054887343c2f5394577a379c6806aa005b1b64d8b585bb6f42e27494c5b4c42d1a555a5295e1e050e54c1731704b0bb174e7dbb68d69a51189854d2432cd9f5bb32456276c8a42bec6aea78a784396ac3a9355514b7d419ab2bbaa0e8d428a1a014f2535941d416ab6a39f81a85e71429e59b70e51241c33558e0a50651073f41f62d6f044b6f80a1f5f80b526cb5776ca1cf13af009119d59893e9c77372278661484aeb090e86cb7481421a22c7618e99226df73e30891856e5721500afaf56c22c539e59954f56317934b43d8a84b8d61cce7738264be010c5db91bb14c21db23ef1ca4ab6f5b3d08bc3fd3548408acb6a95ba1c046af149b9e7915f05e80501f73f183384a3e45366215f2d0b357127adc217cd58e3b5bebc370d9a00d04b4147383c73f504d9eb3081bbc11a13b9c4da162c65b41d32c55bf25ad41c7160778ec1c11138a3816a9c23881bc804420873bc13d9f4630deac4bb399e1990986ac63de176c11a1b1dc2fa3008f82685b021012ab9c7ab5a8bd320aecb7a46989f6ba510a3b038059a67b5098cae282fcf4045638c6ef7210db232a43ee09ccc7252dfe7ba480bc7bb7119351991d42965bbc64172c572bf1b3f6d8906ccbc4f7000412171ca06890405198597211644b0717e16899c36bdae6c88acb0725bd9aaaae989fef280bfb72c6ab9b51a4423a39004fa3292eaf2ad9a2b001be06fc55c32803b96ba75c779ca182a707eecec8d3ba79583ba1f0f65a2c4aa3379223e972729f3a7304ea8566990b8479c86d6742edbda25585849ffea962eba334cd018b839a27f30200679a65de727da18cfdb138eeed80a450c635d82161756354e7b6d197782efe6b3437510a705aef84b5867f56a10dc33ce214a7cd80fe6a474f0babf1377400959aba9d6140f0a6464b69487902337c40de796431f23340e1342e9629d2e7b86442a5700d27d9967a8f3da1da755c008f909c518c05d2439697065148cce4b0ba107c3003597bc43cc8edb0b51bfd77e937631f395b469c22a3d252d99bc3ba59ca47a54cc40e0a8f61796aaa0cb703caf00f29b5b61cd54e3619d4b03ff315720ba58c7168208da6a7bb1886cb04b8a21936218ba231ccd392a9ebb19ba23f3733a08a76e6a81536a383076871af98af2541558a83edd117e4b896704a3910aeac63dfc80a90170d9036e133a0ef7f2459f4964ef0923b1e02338b24339bc39fe66ba3fdc42810232c39380d136c019dc504e770136736f7c543caf8a6493a9b190782b86036ff5983221dc029c6861a574a908ba50722c82a7712ee751c144d27b04a754a3d62f0273b4781bb56574bd2c0b5d1
13b8ae97968abdc763a6324a72734bc59a9780783678a2c11896998836165a41f295a8a5e38c55de7c707d449c7a991bd17954ac29cb043048616037f97b044bb312eca07a15354f569174c1956bef673046231ae774e738bc7a2d9c130d8573997baa907bb65f722de59b9988c992d2b6258e3b5dc6b83aac06e9a71c2e20761f2419ee1065dc7715b41bc78d9e36d4ab92d91dba378c4a4a246aa7406a52c8413c0717840071bc06558b0d780e5f98608b216aa995c2429bc353270ffe14f40057b42fa004265696d8333b3016abdc80065a4770c2a2da21c1c3fd802fb0ac1cd302781d37c26601fccfa8ebb1a6518436bdd68c7b9f197f45c33189352b6338af430bc01962f614686b60713391a050628a3bdf1a06b695a69e07c164ac6f331517e13b27e4cb0abf7a348f122ac72726ec76f74d98ee708b82c67968e106077050ecce329b030335a6a69dee6520be6c4f0a8040b39c7d2ab65725716b1c847693c9ecc122da9d7326584a281f3cefefc82da1b1f3b1634dcc323dd695d72d0342f5750c847b03c969974c2b8ef606110c60467cb9840d47833d49d3a02c8521a38d9c90eb6109f7bd4295263b2124a224054bade4c87d0511bd699a21b9c52b650907b8c5f0334ac8a54c2b87146166647064088e73b65b75a5b9243bf2f9c3b357988e809c87fa542eb4197883067020091b92560637599b086c7b13953e3a448ad3042c4015c12668df13a5e92cc564879a475495724866d8d585f93ea84d5989ab22652de9067e3339820039d80e44a24945a6c154c730a83457976aa29a354cccfe7fcad5023362fa80422faa2fea1ce16b05aa43b581f8c8f648940f6bb4a5a87b6f15940804603e23880ee5028af5b4c1f040832d56841a4b981c80288b65f6f8278f33a16f912a0a36161435b61857b65fa20bcb9f7beef8b3902359c66e25a766614118c5caab32055783103593e0c76426f36c394f391de6712efe1c6e101b42e3251de5135c91089307bc03712ad85b33e9e8b5bc21309b4ccbe49a022da5a983a6679c5b4c84620a88124276ff83d8a0383ab84223362173c740c8241b74d7032410ccf59ebb158b59823b45def1532d4097361371697dca44bdcac57360b58912ac43a0c12e34b234753d8682ae881b5c9a314fc2ac58d465b4569552d2477531a66df4c4cb2b18491c161aaa8294ef1cf4643611d70a2bf585044db640deaa6b00b81c3cb889bb547f5ac154c383824239dcd45787024cd388130a53c4791bbae643c7447fccc57b91ef6812a3cbc97e3c691b7b18d781912f88a39aad39779e54610d4430a1c76fafa0ad7bbaa47a45b4a44244e63862bf8347bf465e830be276803d412b205183c3f59a50fc47eec27225ca9888c7a4c510304d5ca5ed57
92656d15b63a30bd526b1b3bc820f8336497baa81669f0bea2bc220ba31275455d09028e119e1c0b4513b3cd4438ca52abc496b5b2e731c9acb5889757200371d003c7efc4c2923896cb5fb48ff2a581335bbb7a4c5bb68107eb6146e4375f43a43ae678a833c773dd2c460607a4a40cb5e8a5251514c0c692d2ac21dfa2c296755543601bcced30f656a4ccc82340ebca175c4191102446275a1d9f669d405014e881e98b676afc82a1c5a6e7fcb482a1a13ca311fa6765eaf474ed2ca8ee2e056cbc469bef18668405023926309710cd1d4a2ffc42af54a8c3fb499781579812007fd4a63929a1d945a38d88b205a51035e31710007c218f23e06e83408507f76ab6e382c1c52940f3d0b0d67a6c478a69a2835a82c3674c0e29d430806d2097fb2b35c5de1940041cb13c910db779b64f9a2af01718235a3382b19eb853cc1999998c00090881cb45c89c770c79d5a8772230a9d22791ffb14ea53aa96b66cca4acd8fbb03d642174552682d98c8089801a1376c72ab743db20e7c62977e4c9232cc44f3c95d6761062f9b73456228fc935264059fe476bab4f0a920917acc7989afb69126f5c2d105780da3a3c4a38ffb932ee7daa6fdb8c5ffb0577c367845f75f2eb545db05ab7d20c9bfc57788ba8725c62a06c728b84b5b3ea39fbfd7af0967703ee7b64e26050a741f69fc70b0bac44396a7be3a90018734ae0a76b5239ff92945babaa880d3907f347b86d8a834e02f660764ae88cb300c3f6e77634409249d471f64938c7e70b08492c66372753af46d71691c95a00b86124d552b482266b6f463620a510f823bb406f509bfd7987cfbb113e0104cb04b5861698599b8aff57180e898d494adbb15897efa3fb7c76c86c21a37013f61e635f5bc422865a585c9c6ad34a804facf6b035bec34a65f645b564a3aac7597b6911b4c90006f66cbf338a802714e7567b30c0770c82b6cb9f042f8614cc25a0348f9599fe5c65638737157cc11eb0d80866c51634ca6b3435ba6ca918557ff4c3174eb8e85449cff2b798e054cbb32531e633e1a9810f61988a0d60cecf62918257769941ed9c1618cf45e78c028f24bc1ff2a0b745665fba04869f38e597bb0007d511d90abac794ca3cbba9aeb391b846cb702b0ec0c8ac23a318798c31851710b572feda34f00362d36d623db678254e32662b6a5fd326e9697010ef4078ef25c099990dc082e2100ca83f36cfe8b9052320bcda70a2f641bbb65bbcbb5647fd0cbd6c0af7c1bb55761785e9716e2480519714e6a12bb2283aa49b3743b3965286b45f6709890953719385a656ca270e902805771ca087dc4e9c504656aba35834db581cf578efbfab089498724a34af043468c517e754489149051a1e77a1332231970c9f43c1fdc513
e902364efd60e42f965ec0018259276f8c2b9ca503188d8a612b5a1fefb013f17a97dc0cf6e2a37a06261f4f42f4daa556493231f81a915a2734b7922929b6d15a6b9c6383c77715ba6610ce9b416dd44a424940c88db7808334323ed107046a705c670bff4311280840de3e264f3dfd30b29a24513331df9c6a913e0b56efde554779cda1279fa6a9c35e27229bb5b5ca733950b6973c249e642de1c151111c1787616e68e4a1e6b897566546d9b
+Ciphertext: 5f86dd4ee5869abf7b0c80e4a19603a778a314f7eca435d5833db32db14800d771fbb30161302740e78f82e3753ff4ef376d505417dee9f7fed73893eed52a4df5e5c10ae6c6021cdb31caa6858c698def95e0ed1601aa94ad3a22002e8b42ad63990cf9e6f944f017f187bf3cece7a85c26f1bc30664bac1cbcbf43e095708ba3d19510ddf0a3d31043a2cae1c741686ab00da2ccdd698dfc5c500f6c4585d32857abfeb42eafff6b808ae5fcba9712b89dee53527981e35ca75a8168eccc4ae60c5ea8a8d7368f0a731efdc2e6df0f96181b711bc6beb7f12fd488442fb7c549397bc022dd7d74d8f60305e68534e807a46fa8ce8aa96ddc92166a0624a9928502f448a389bd20c9af5e9216a1a258e0582451473454af4c747e975177f7477fd63581c70f02f31e50a5843a9d3beab3113d1696c80b572e0c4b771ee68cb599d8f124d58f18881453bec62666dc9d36ab3f013b17d96ca886cd6c6dcf4a47d11c602db9737804fd74d04e9c46c0844556d52b710a1a8c298ae7895df02b274717b8038e2c3f98e0c375c0747fd50139efeba3fbbe69df2045781341e9a0c4442eb4f15d570c6d04f78b17fff094bd4c40c5e0c12f77aeaaf4b05789240e0d3d4d8c8a9ba33cb550e4b5e7b9538d0d871927792e36bb7382c9b6c937851395eb5651449998aa4410b8c2c85323f8a088cfdc4ed3574e41b8a4aef7f57d963a4eced714f5f20abf0c89de4d8b8a1039b027e296931b55520c963e0aebcb361111dc5d5633ef9d10edc1b9043ac0727a7298e7858a2ed61c6a18371238dd5544657d8568c8ce5c7e4f5809ea3e254acc50565d41e9599e3f5856235c7d9b2dd8cabd1075a1500e6bae4ec3a9510b22d2a70d9f89feb0df5cae0466d4a86e1b3603673c173e0fba2fb8c0c74c3da79908bf7bcd94c2618c2a5c8e0d3b00deb931999f2b180ad95ef901aa1a345ddb4eae90ed81e9d115145fd216097c9e8095d78260adf5ce60c3f5ebaecb40ad218471451d44ea0f647c5f144cc37d89eef590388191cf15b09b88c856184078db33d89324c20963aad1875d7019892fc47fe435d361caf7c6eba2e743ec5cf6ed4518891a212032c1890bee8196b3a59311ade8a3615f4b0b6b46e8571a98bdb7a571e3e01487da6bd6872a0caf38d97fd7d8fd7a43a19ca1acf039a9bfb72394e75fdafd772a842411fa9a4fb061490226433ceef3ab95d76b4354163dcdf4ffc49094515db2fbc3ba398642598012cad605d39c833cfd8074ae2586e8555fb455275c513472841b9f80d29d5654e247145b1d907f201f849512ce85daa20f45c7b25f0cb86c26f00a83ab5abb13c7641601a56581ad93d1034226a0c9fe14ac082e69684875eadb87133229b9623159edc4d03
e119be3496f3c3b0eceaabd0fe43f10b66b117f64903d244664ccdcc387a06c1fb51c74c6de710c5d2414e3d01eb3593d0295408f741df2aa6b610c298f03ce75811849642c8c9ee618b78f453b6ad6d52f2a2d93d47570a54cbc32ad93abddc594a0d5f707e1357e9e62ce86a3feed3b92fe5102dea36b1b794a8cf746383962387af88a0777f7ac28f8cde4e94b2534ed4ec79a113bdf5770642589769857b1d9e8f0d5224629ba963c423406b56db5a452bb386733f5bb63e26f4e6582c8620c883d137e5eb9d90a189bae88fe2145a9e239b786f10477636fd3b4e05c3ed39a5f8400ab353d0c4cd27d5cc20ed6d9ca462302e2cc30f5af5975a657b27dc17a9bc0f5172fa4bf84551db6effe0d806b2b22f7eac1a946435690f70d64a3dd1108f24e726239359927d57311ffdb8992832b04ba76e0608fe9a64b93d7dda1c726e36a8fcb2ca94d558d2af8698653c77bcf8a22c298772f5b31038f76af6dcd6fbfa6c5bfbf0d0b7b40f20fcd40ab085a97f03daab5a38a9e3baefcbca04769e685cf7c39ba18b1b5ea4847c63a08dc61a397632701d86cf137ac8beb2706a19854613dc4d5667d9453c58b868469a5c3cc4239d9bbf999d2eaa808f2d0df66c075f2e5b65954c44a708fa7b424d0028923cbe27a40747cfa2f78485bbc33ed7c58bcd2664a2e1b257915759b1863043f91073b7a2c7d30c7ea420edaa60d1385b83890ef136a9839e0893bd12e92a694d5afcbfcb78a4f84b9a207bec0b1b65062836aace99f7a3783a3f239bdc0a7d9667b0f1a00a328982121f93a
+Shared Secret B: 93eff5bf398b09469240fbf9af440f0606daba069415645ec37ea990fed9019e
+Shared Secret A: 93eff5bf398b09469240fbf9af440f0606daba069415645ec37ea990fed9019e
+Pseudorandom shared Secret A: 9a4bc1b2cb506e8e51b61a3d3ed76dd3449df2b443d6c288c100f1cfc03b1ce6
+Public Key: 36c22505c463e8c513b2f146da7c6f5cb72168d39f11e8b428361b40eb4d48187000e59e6a082a7ec6770093ce2558b14d131edb865e47f799559ac0d95a25f2a25b3e09325fd4621a8743b04cb0cb81864d345b3d6c6186c703395c783a8abe6d10b0bf96815ed17435a29f48e59ade23a56ff44825e3647b7407eaa9454d522303873c24b503d742ba851bcd1177c4dc3880991230df0b20976c5cc673231073a5e202287c58ccee800c52bcc647709de9203620183c0f817a4cb85ee71a017b196704052d0bb99faac375e1099e43257b456ca5ea2a3ff263a7ede58e1e807718730218e19750fb0cec1b67e29b33762229984691d01b7cac0225f323a86b316a69907b2b0c4c2d0a19c4eb9788a58df734c4ace16667e138c31b4e5de96a5f34301a7069d70274f3153ef3d560d426024dabc23327b8938ba18a089e3538bfc6c48478484757804507dc460d479712810a04557b7ab8c5ac95b3c57120d9b6c64f25299d243da43c121571a2e71b28d1b5a424c10b76004e35a422e21a3723d996d4895683ea56b240468e0cc3a2e2686558ccf6c6164f6a4ebfab3b242b437dbb8812578303b7058bdcb2f8e785355c37e70737b60864d330153e0657af7676aef87fb8027f3d17a7fcb53a470b4f18120a6047a599c92fbf00a7db1b752cb45581dcaa3e8acc454a2ecd0a38f5c59d7451b778828ab93b8c13fb440aecad8da62d808950a5564dbcb5889b4c829d41824776bcf4029576f2249bd16747f620b91639f0fa1ed62b659f75afa342b4ff2c7fa9859e59a6124ee90b6546595a0b875841660a61c0e3a43d45009ade81b1abf576857b9c7e94447c8abd719b32f836629e8b6115901585c815b18476d014824c5b5df7c7c27461a7d72630a17c852b7516174a5cef2903081037a5689bd3060906406612bbba150bbe2755263ff7492a201e80559000b453b77246a7f70b03ac0fb5042002ac76754742e3eb5d4cb090c8ab41a8290dc9936ee6dc9a703366046755ea7b3d8b105c01e59905e47a20e4758a3a7fc630b36a4b4b8de11be3f84fa0648c7263a5eb2b736673ae1b619f28ac1087ca36d1c51d11b7c071bb1b8283c59db64bce9b2755b62f531582c1740421c5640c69cac7206f9cc95c14490d5da8905df287cdb58e30e8a6495565a2d32436c55fe074b297c16cc0794788629f45c66a0439ab06a88c0e57536c265bdab290a8f80ebd7447a11665c851aa3940cb9aa6be7cf936e8f070a39b8c2f892ffdd192939031108085b1569785503c2d7503c98a6f141c016d853c1621bef53224b892c552207d7a5255179b0ff88aa98b79b0db39386f929cfea3176aa82314a038056cc7899537f79c9055c438f43a680b6671e670848f199635f65b1
aa38fa4b55ba6c7a9d13a75120c5ff2141e2397cad37366cb14cd67fa5bc3f474ae219c740c6392c1225f52bb2eb61a5e3340964128d5e22444d149535313067c11f2a95c8ec89ed5158fc1569fefa68cec8b285e473d9d0a783f736ab427adfe508ded57af03dc468e59aa9d486f0e45572d5807be512b5bccc1805105cf240c6b8313ccaaa55065712d012bdc13c6472730b9d098cf7b9aa4e63fac7a0c5559aae0578db146074c472142153f53c43ca2ea561e0c44d1eb77d86960c13a08a52bc200e3253052bab7f34215bc06bbd74e618281093527df911c692c5fca1b9072234494a9cba47296d2c946174a341c2675161559d2c5404d26bcd92234ea516be46b40ef208c82c079c645cd60e51b06129cf207b47dda8d394887e78292a7032840144a31e37cf5cb0f172a6e5537188dcc1c7e858020a601001caa40d0c078e1916d9b78f8e40cbf0c5dc05b599463aeb65ab4758cbb84a7cdc98458da5a8bdfa6bb27f44b6a38722553b3fdc1a163c28eeebbadcac491d48ba1f9736cd942b5220bb2d22ac12b236c5a9baeede0b2cdf29beb137a90965773071eab457886e2a61148502c5c39cd27cc4ff2690b9728a609ba7c142c4fdb6989b76e8308922683ae2bf47780f06308b260c095a3b4c0348d273fd24ca281b842f2a981bcc85f325869fd3cb96b94928e180262b55ebd3caa895ccc784983eac8a35bcb0cbf948101f6af67e02a52377783e67c2a01bf6ff79803f483efe6635e94639227a11ba34c4133572f1bf0aa4cf2d2f10893bc8ee5e4fbb5dfd2555e66adee
+Secret Key: 4f7888df201f71bc8d79fa3be946a5b71b9cbc577789d87f1f0b3dbd16089711b7b2ac33e3822fdff485bddc4b1f1639c619369cdc21d0398204410f260801c1c84a0cb0b52187b1f6f00f46d57b3f2ccf0e02485e63458b15c48d688890ba2377d081da70bd5f4c1669c2bd2ce96f2ba238b0b5b78aa02c6e1189147893cee09982c75820486e3c6598def52cf7da46038640c343a1a4d15aed66342034721eb0938749137d30716ca36325b576b63563641a13f4029d10f0befff250d050392a6947260638ecf81e480373e3b11191b2cab2d55b5ec069400c2c4ad79909cc8206237f887bc870c128ab0b680cc7bd27c98ce4a586445b42072aa62ccab0363344cb160bba3473b320b4d653be1c054ef3d3838e20b88e07235846839739563e643ef8c6ac904322ae10523ea952d1615f080545c66033e692a720d895b4d03084d9cb2b28bfad0b42e6cb24fe57a783193cab99661bbb0339283e42d89731d023cac14ac98722d5916e66c3b4658a6026eb965e8889b114687c617c0b897f4d920d065c963852c298f54b4b380cb8077c22719d17998b626cca22e89d3ad8186b5663ce8741f3f38d01f8ba01c00489469af3e3ca285678171ba6995baff974a86620be58e85f64198e360b40db2979c7d36972db1299c6a59517c3ff044fb6c6ba6ef29460c095ad217d997c6db28902d094127b29198fd9291cf08200104a253a71a0dc387d92653f82cef9416e6a3254a71444280276dca4373517c98595b2a5c070af482f9d5a39f74144b76247c357b08eea7fb0e6beed3b87e6767a17c88f2aa4c83f781eb902b38af2a6ce72aa2ee2670d8134c7f30d27bc930640b958ea17588c3f9f55c1de103745b849caa929c36194e1521bffa6494d396a2ad08199423872eb1dc5b588c7792c87b2b53a757c7c55a764d66e1b2976c5a7bb75028d5aac84f2e6ac5cf0808d49cbc5255dcc75195a4915e3728492d8604d76ca5fc239b68a1da05a303029ae8df3b4ac73ad22b5564d0a53a3a3c34c9517caa43ab30172e859932af50b4d0b174b23cfbe1104a3084c72b78d8d2c07d61686553683fb0c672720c285ca7797d9696e48742d058cadb50c1c1347c23b6b21b17cf8e988a238c7fd2871f2b8b10498a823f37f1918b205dc627234416c697d1b394b001ac2ddd8c7a6772c5954bd309c0c6f4637a0036d75ca2204155ecfaca64b35bf4ffca1a820cd6e0820e26381b115b2846ab687d21a9dab97d63a787096c744768c20d596efd6a12a9acd9f122494da8d1c588b9324660d7644f3e478c4fba40de1241dca8e4b871c7ac961cc4acfdf87696f1cb590c60420b280f517152a9b2fa82ccb2fb700e4196c54b2b62889cf61f1c9c4f6642cdc5dd6104e0bf9a91
c03b85d6aa1ba0315a2819ed70825696750e0203e25da891fb37c08401aaa728a279ca0b33b47328290b6c63ab0e034a8b9b2f44c87c0f924d0f04b6b1c5d5741462ebcc59cf6186e306396c5cc157cbed7d39e7d1576580189bd901923368da92b3909758f1ac4c8cb4806b7123cba60376be4a3b1c0837908b132f8c25da1cd2e7c123f7441dbc769a0704719898cadc20ddb886c24787b0778b510e6716a39a1c9830e76e81ef08467eae06728115293ab012b2768dcb8a0d562739ae49916e1979fcb76bb99a2f3fc66a781061bcb05210425d29679104a15c398766e329582a71f0c10ac73826b9ae918f9928a4ec61b859c67ee4cbe57244a3700b0c2dc7c24a338f832a8ec8c39df9387c3dba627b94a66034431f12da8e0b2ee18229d42698cd415a4f450248952ed46ac9d8b521b2a4835d5991192bddfb36c4dd98b7e77769499adef753c6540a03353c2c2ac4dee307edb4a211faca5e5286c9a49878ee2ba6e5ba0c4697fe79791414caae6a44207732d59862ec67c09ed0bbe0dc7666232b49b066218134b882c48ddaa747d35b6f41c4a6244cc3f6b9057589fd58071b63591fcc6c635947b48e86fe2ab095556859ec40e5c0371b981355113491099c90b4b38e05293e665639e3174b1610f504c80ea9c62082063d28287e34a6d2a2c7bae818509f5265d928fab436179062dabbb1fa1c8b1b92589cfb24a812a061f511548f654a1c8c508157a909596ecaa7158e0c171d48604fa6e36c22505c463e8c513b2f146da7c6f5cb72168d39f11e8b428361b40eb4d48187000e59e6a082a7ec6770093ce2558b14d131edb865e47f799559ac0d95a25f2a25b3e09325fd4621a8743b04cb0cb81864d345b3d6c6186c703395c783a8abe6d10b0bf96815ed17435a29f48e59ade23a56ff44825e3647b7407eaa9454d522303873c24b503d742ba851bcd1177c4dc3880991230df0b20976c5cc673231073a5e202287c58ccee800c52bcc647709de9203620183c0f817a4cb85ee71a017b196704052d0bb99faac375e1099e43257b456ca5ea2a3ff263a7ede58e1e807718730218e19750fb0cec1b67e29b33762229984691d01b7cac0225f323a86b316a69907b2b0c4c2d0a19c4eb9788a58df734c4ace16667e138c31b4e5de96a5f34301a7069d70274f3153ef3d560d426024dabc23327b8938ba18a089e3538bfc6c48478484757804507dc460d479712810a04557b7ab8c5ac95b3c57120d9b6c64f25299d243da43c121571a2e71b28d1b5a424c10b76004e35a422e21a3723d996d4895683ea56b240468e0cc3a2e2686558ccf6c6164f6a4ebfab3b242b437dbb8812578303b7058bdcb2f8e785355c37e70737b60864d330153e0657af7676aef87fb8027f3d1
7a7fcb53a470b4f18120a6047a599c92fbf00a7db1b752cb45581dcaa3e8acc454a2ecd0a38f5c59d7451b778828ab93b8c13fb440aecad8da62d808950a5564dbcb5889b4c829d41824776bcf4029576f2249bd16747f620b91639f0fa1ed62b659f75afa342b4ff2c7fa9859e59a6124ee90b6546595a0b875841660a61c0e3a43d45009ade81b1abf576857b9c7e94447c8abd719b32f836629e8b6115901585c815b18476d014824c5b5df7c7c27461a7d72630a17c852b7516174a5cef2903081037a5689bd3060906406612bbba150bbe2755263ff7492a201e80559000b453b77246a7f70b03ac0fb5042002ac76754742e3eb5d4cb090c8ab41a8290dc9936ee6dc9a703366046755ea7b3d8b105c01e59905e47a20e4758a3a7fc630b36a4b4b8de11be3f84fa0648c7263a5eb2b736673ae1b619f28ac1087ca36d1c51d11b7c071bb1b8283c59db64bce9b2755b62f531582c1740421c5640c69cac7206f9cc95c14490d5da8905df287cdb58e30e8a6495565a2d32436c55fe074b297c16cc0794788629f45c66a0439ab06a88c0e57536c265bdab290a8f80ebd7447a11665c851aa3940cb9aa6be7cf936e8f070a39b8c2f892ffdd192939031108085b1569785503c2d7503c98a6f141c016d853c1621bef53224b892c552207d7a5255179b0ff88aa98b79b0db39386f929cfea3176aa82314a038056cc7899537f79c9055c438f43a680b6671e670848f199635f65b1aa38fa4b55ba6c7a9d13a75120c5ff2141e2397cad37366cb14cd67fa5bc3f474ae219c740c6392c1225f52bb2eb61a5e3340964128d5e22444d149535313067c11f2a95c8ec89ed5158fc1569fefa68cec8b285e473d9d0a783f736ab427adfe508ded57af03dc468e59aa9d486f0e45572d5807be512b5bccc1805105cf240c6b8313ccaaa55065712d012bdc13c6472730b9d098cf7b9aa4e63fac7a0c5559aae0578db146074c472142153f53c43ca2ea561e0c44d1eb77d86960c13a08a52bc200e3253052bab7f34215bc06bbd74e618281093527df911c692c5fca1b9072234494a9cba47296d2c946174a341c2675161559d2c5404d26bcd92234ea516be46b40ef208c82c079c645cd60e51b06129cf207b47dda8d394887e78292a7032840144a31e37cf5cb0f172a6e5537188dcc1c7e858020a601001caa40d0c078e1916d9b78f8e40cbf0c5dc05b599463aeb65ab4758cbb84a7cdc98458da5a8bdfa6bb27f44b6a38722553b3fdc1a163c28eeebbadcac491d48ba1f9736cd942b5220bb2d22ac12b236c5a9baeede0b2cdf29beb137a90965773071eab457886e2a61148502c5c39cd27cc4ff2690b9728a609ba7c142c4fdb6989b76e8308922683ae2bf47780f06308b260c095a
3b4c0348d273fd24ca281b842f2a981bcc85f325869fd3cb96b94928e180262b55ebd3caa895ccc784983eac8a35bcb0cbf948101f6af67e02a52377783e67c2a01bf6ff79803f483efe6635e94639227a11ba34c4133572f1bf0aa4cf2d2f10893bc8ee5e4fbb5dfd2555e66adee1dd8471a5b03a297ee48a09fc8d43ad0afaf6aebaa9f1d2f73abf7701f51cf5349e0aba620d52322f200540658c370ae5ada496969200b6c2cdc179acd2c49d7
+Ciphertext: 2756b4955d04ab3e5c68a229251c061d9d038d21e8d1d608f0d07aca6b4d695220f4e144141551c16b525a126b419c0be25a1e0c5c0e6b8cfdc33ace2dd926bddb9f786913a2072aaa9344bd76c6d2d625d8523d8915f81c4cb021c449fca20bcd4389f42947801b486ca608ec0caf700417887b81261e42e990f15aabcf2c9322aacb2941a2dd0b74a0262d1dfefdb8431dd2ca9bbb653be44298eadc1f84a21763e34c27692e8c614136f66b1cdcbbc2658fe8d3e3f043f99d07600e66afddb3ee15195e14eb2aecf691f334251004c2802085a504da0caffb7b7b5b1393b98577cdf48eb35c4396c06c157f17d7a23337d645725df667ce7a974961b5b99aca0e22c49e8164b22472e5d2762517a33f0a8b013d3f9018be11159f4cff18a7a8aa6a94dfa8ab991b4c227edae3aed69dfe76f0b36ac10b6a0189b52bd299e68e3402b873a624cc46790db3c6898502df35ba4e3363c400c681852226676a95512589aa169e1d8573b51472887c986d045fa6c6ec5a9fa4f71e9312392fcfd6e4fa776b945525017fb7dac94c001363a251a509a611c6cf9240d1c67e9be6d4e31c22e43ca7f1130964ce9767a05161245144815beb02a8482ea5f88006066c027628a803189af5d42470acd3caf858a42dd461ea98d394da7cb3dfd155037131c2514b64e4d398011337b5ae4f5b6e02cd0e3d656809850797a066b603f3cc26a037872e380d750021c7a487e70506a401ed09ffd8c26b78bf6d8cb209f6af80183012398d2ddd1e4c4b0d78fc86de395cdbd61040fb6e2b0d1924e3970cb022767a036c0abea7bb5b292be9f212b4c1eca25835f1c5b0f5f84601aae01384e930d547dba7794769c86a16efc216e041e6ec74c09ea581fae9ec01f539c1d25eee88d43628c6431b81b854fc6a7c28c44acd9e0e4cf82cd9047f3097b69f09d573256d9870ea7855be96c72259cddb54dce497f516d79e0b7b0e8d652ee8714cbeda8724ce487dc869891742109616654ad3857447e371744d9e6400a2275a6db7cc24dd516bd20f8e20924251478f20d16e466232cfa06c7392e6a6bc6fcd4151448062a2c931ac740a8ed61fc8c5d8bf1fa80dd6f11d349eb5cf39962fd3358e4e21890556878c7ea9768211b5b6d87ebddd2b763028d71ca0b415bfecc6fb4fc4e7dbc991faea49a166c2d716c1c71b4d3a6bae4fd77ff2adf946ce99c36a4ece6dc18d9246ff28a7f55c6e5e85841627ac5fb1d62fb0009c24fb083d0dbbfa2770854a2bda6811115dc330d6018b77a4401ab99fd4da348e7c3018ceb4d5910fd0656405121a9e4c00139b99cd584e3bbb43c16fea4a904ae66c984a7a5fe21d6c3fdcad6476899ae8cbc5864ad971c98b2950a60f40e359c205f9243f195
d1a5f5e1046f493687a666f4e05200fc36cc5a6ccdf4db35df25ff125d819dd755f8bcb9c28bc675059f77ce91a732feb3bfca44335c843fbc2c7210897148c53cec56520aedd7c4c602ef711f9cd38192b206eded8d589f65db9c9fee494bce9c794ebc4436c97b4e6f8dc263d382160818916ab73f128a64f755bb6ccfec40fa1e7c34f11fb358bb7547e65b9f9760315ddc29fc9b996911e3c6feda716b44ffe74228fac035222bba3ae3fd627c3d6c3614be94d19eb31ffaf867a47e77e9a8f6d106c2d2bbbd337bf2cf3dadaf3f3f46b116034e7edaad8356779457a1880d23dc9add91567f57f56340580658cdf969a48bce8067159d7d7f252f7bae85472fd3c0e4b95cc613bdacebfaaba17e4a6f489b275194b1b1893161a234f72bad8a95bf83fb1d1476718178fe6c49aa0307ebf1a724fa59e8958609beec304bfc3a3a26b252b3b26084a6f69d82a375b415758e0f1613749f5bfa8a9ca95ddb0054f5c6e680aa42f4b2d4c5d5800c34e9a05d1037089304b5f257edb816d72c160bf4b4398894cfd5a73b5758491b39fa3b5e00395d578bed67973a6d22a611d326844a0a0fa976a81ae115fa6bcf2777dcce5d88d1608084372aa7a5fca48544e997113e2c6d4b79807b5ba877b77cd6ff13d77c34c3b4f9c8cecd1a54dac4d0dd3445dc7ac6115562611756fa5ed943a48317e2eeb2e6e9ee3db07cad6f989cc089d52479811a416839eb0eb35648e25be560bdf4c76dab49209e0a7d820829f33835eae1cd1a3c156b5bd09a7dd6b86db6c826676759fea40d5366731
+Shared Secret B: 4834268965d5f54903d7840c351a3878d22a6e05bf89f824137f3f4592733994
+Shared Secret A: 4834268965d5f54903d7840c351a3878d22a6e05bf89f824137f3f4592733994
+Pseudorandom shared Secret A: b0a109bedfe12344070b38264b1d2ffd1551f06d77463a3b93e19497d7d8749f
+Public Key: a7030074a0c54bea09e8b7a3b2cba68d411ad908b991a0ad34b26f378c820b74b6e8a4bebe4b2f6eb7cf4d435c390555de44b4b8a5a60030aa66b84178d9b85f34a7ed03b9891097e09057d3220ce4f07f11cc603149877fb02eb09155dbb0c040b2668cc1661272cb1a80903ac72abf128e564496d465058d601220f0850c1390cc662149b0cbbaa7885bf18ec49335befbc04da7c310161a3f6170dac0176e707f00e45aa945ba99b93eb65703f6f733b7b462809b6c33770b2adc4898f6b55e5a08b83113ee55be00a5a5198539fcf0387ea38cc897b474f7a345bc7907b9cb9bdb0ad1d103eec1c16c27a20a497398644cc7228644c0201e549b9933a443f218a8c5bc3342ca65a707c7e70a73148a9b924f8b8b65f914947681950b8126c80356c8f51e5445132c43c53252ceb643676e75c5f3b1c4d0d844d8f513e266070777005295cc34a0447d7c4bdeb56da426a5ce52800c3a2d4b9515f019cf86b1ce5f95abc759009db18398488d89640c38b0081d22488fa4caef2206b58280fbd9a859a94099d503f66066d0f86baee19a462cc36c7b6ce4617e87020571a3a8c61554f38307ac8141857a80fa2c8ac1e08aea819009046fd6b901ac3c2d54e91ff35cab0d76358f6ab6c3a0011fc186615501b5533c9351cfa23510f222bff803b202012eb3e790bf4555f9bb0022f424f660c5db05bec4b0c403b7b43928a8f327ace3399acec7985bc89fae1caad269c688a77d1f32b4537b9466809526161b26bcb07e0b212f093a98caaff3336b21b2078dc632e1bc5d70c5c6e2c3006697997aa64ee0905081872b42e168b5a10c76883776a0075ac697423a17e502c685ba64d59801dc06586c141fc73a28cd535cdb009e59607c1124002d7bb44c2671b3a57f6f4067de425fbc5528004a5d68e89e0804b417853f19711bb6d32230324155030bc2c30e4c8687921505ac446b7deb6383cc66f0c169ca812576f5394d03c8a6c1305890842412a0255c361926765c46bb0ab6b61a79ae5275cdd0e449c7880494f977742c92961228e68c30f2e172aec61bd7034ece85811e10aa10254335368c05d5ad1c1490f73a0ae8498e49025d2f5cb06e825682518f554735b68807e1c1bd8c229acc8c30a9a3a7cb01a93382ab47fcb9c30969c79c9ff406badafb650b74c6eafa6c9696a043b17f57184125d9734a2c2fcae10602b74fecdba04ae285c031153f905037c87ee9c07540ba3468b8afc0c1b69f55bfacbb69c9323200e63ff0ca7e6023233cab0f88456f07035e6d3b8b3e0a1ed608977f2890d85778f164392b209d47c003f6f398fb5b4686e2389433247d10026531245ac3126ed862609493a31ab7a9154a30561477d6b628e569ad3bc4aa34b54e41961
9e3622d9904bd2c74496509db1102fa575f8b048193d41cbbe4686ec00fcc76c2af856ff016c046a863f4845f7e737f02d508bfe3b46bf1bd36c664f16a747d191d1f83726c577e554badc52b7e1b7c9a4668afdae8523ef641b2538a9d114f1ef6272af55a8b6a5d65174432ec6884c88646e0b282d7b112216848c44cc5b9805fcc77fa44b685dacb49e664e525c04dc692fa018d6ce51822c85840e6b26319b518cab182f7692cc24db098b3e689ac418c590a606fdc720f89f08f11d3062c67957ae6bb9e8124288a59b4b236d030522361460848213409b15557385678adc6892aa6945e06b9bd5a919477710562cc3a58f42a2aca7427396968d35bba7b6d1cf40097eb26e1966cdc403c25fc5a45011ff578cd52825ffac89e935366ddf644856878bbc400f53cab49291ff3631e9f2a2f6991cc264b6d1d583b2f84ada6458796d5a836b845000454acb1202aa4329c41421b002827a675b16ac90bc400d9257404d7206ca7b18b873fea71794a580221f07e7dec443fd83998ab3a3e573df5e7be29c8cdb5b933b2943e0dd64b88db90d01cb068101f48f8404e7a9aaf073b21e98ace354430c7a86cd6cc751ab57589a6f957ca23a4a9962b9eaf429ffc24c35d06a24070cd6cc16c8b380278996938225891689488e59e8e605e72269a4767c9514670621cbeedf0a7be34a91656c5c2688595b2a30ea59d178831c819100aa1b1e1c75d3ba45265607bd771650a42abaa622a6608bbf72a34b4da7ea7a373ef3a1af4d11a642c57f5915637ca7fb88ad8d405c0d22683f250
+Secret Key: 6b1352aca0519505bdc79ccbd4a24a7b381f62f63e30e2748bd4112e0c4490db4449794fdc1c63c4c0affc560eceb6afd4d95491755496519e121a48b12a1480da455c8c0c44cc712d3467912222d8317d0ee1c1d36c9610620b17328775165247ca1bd8c7b38a4a86fd4683f66420e5a45ba6533341510e45439b29f948a15a440947749cf592b33c13d8c9afe6919910d73d651a5920a93dcacbad14c818f301570d8c6f69e465d3b53f19954391803fd3e0687c8bce40190c04a421c8f09d5f3104eb9372bc1b58b9e83b1c169a3ad34c3c53546f05bd1314078f476fce805a9dbabd67551a3f10580f00b64298683eb82f1349554917b83915bd0f439db6610f40699bc627969de5b254bab335f256a2812ee94c1532b263787bcfa12b5369873f99455b31c5a6e7780c797855771532f5770d36087fa0412a45f7c4aa459980283e331612ba3c04520834c65c60275c899cd56b60601375f9807a91afb5d005fc86cc17cb56dd83a6f7957dd0f89166aa0ce3969ceef1c5161ba24dbcb552908b04955a71526b3e093401e96217c280ff06184e9540c4d395396c36af91818c7b672d6bad12c14ecadbbc377c4885076ae5bc2b4d2b25ac90b76ebc1b071390201b6768e29398b9798ee448498889f952b8450b6f53ec4212a01a46f41fe1025f94317e32474936b707f0417ba87c2ac3632158a9c640046511047179b81202d42a07a10ba4831ef34a0585930659c6c86e98b8f80281e7082b7fb8976ae4a4e4049a2ea976c6507eca8329ba260f43b5adec82396c2c93821298dfc492b8219b5487c52aa09296db6b96132ebf759f33257d0edc92573269cc9088666bc22be4c43cab58ce9602ca34357a2bacb8f60460f0914bcbb6fae34d44e06607c9099c8618a24cb5d52063e6e188779c2194e4a5521b89c1e80c0de00a87e67e2fa59b29794a0617c295a88f42d998918caaf1a9a2ea85a39461ca945c6959e353d5f9222074393d02906397016b558d04fb67bf609242403587bc71506820b6256a092c7f7a22993091169f51aec26c0a06ab90c03104493589d1e840e00bc982f83bd5f48a1ed25842c6c0a2daba2bf06db4098373f24b7e6a6ecc3020148c9710e53525fc5dd8b75c2bc6bbe4d9a5a312260c30c83e35172a75c61b699421252e4371ba3bf4105f19a0bc9a5c8d58cc40c60d02b80768437c8ce4cd5c8a7bed967eea029d154b4c5a5c3c2c46cd412c423ac8be6622cf5ae36c8540b001f67db2a367ee522ef6827e672430c028cc0d9c06f072533ae7a9b3186eb25435d3a6cd3eb7213743241bf1421ac20bd254b1f8239fbf498fafea06fc6552f1902427852fc7e383f5f3c678d48e47fc0348338e71b85b3844630bfbcf1ab15daadc73a
0b17e193cc4d6fa1a2cdc2bcadc6c4ba3bafcd54156427ac3852a18a45bf1c1030a876e03b34bf2a2ae47d8b85de935c1c9742ae857eb647f6ca70162519949dc90ecc670c5064f69a8aa1eb6c362da7e4c8a59d309cbdaf4c02bb449dca5908ea120126bbff7561f67f61ba3809e438a18cec0afd9f6abb5422190ba731a95404a29cf59a62a05599a37912de4d126e8093a1dd8ca0e92b0ae7c8a78f1470aa2986c396c3220c2b69818b854cbd68193b775826b2502020c6b29dca52f997cb949099fb9036435052999235d889fda40a185dac3f514c97d415992c78bde52ab50cc14e64278aac9c5727b7c4215aac24a12da495beebc9f908006bfc6b86ec100c839676f0b61ec2a2425b50918a0807ed829882c4ec2da22615ba3f68227d2bb1da56b02c0614e681b6e6e720074411e43b99136e1b292974d599712f01767b4853bbfbb0c927a349ac70304801c467c8813347c9e22b78c26b9a0a37627da8deb4b98af370ec46bbaa40709d48979420283852b1a03b009a472bee8fb4112c484a211c158305806271654680d0ba859bbe92433476a932cb8719b8e04d8b280f0cdeba8289a98c9d0782dba7a7bdf49b34ec6014b2a4ede5ac3e4056e49d5a114b7b6e3052de96879d9318bee74b7ccd5b303e0230bdb7428f640cadbc04beca13d3b2a5a4aa973db98b895c1acc3bd182618525bcc33c260df038a16556ed924ca0ce3a2c12b5c8e7474cbc66efa0546b6a900c7c89b961114d0451fa7030074a0c54bea09e8b7a3b2cba68d411ad908b991a0ad34b26f378c820b74b6e8a4bebe4b2f6eb7cf4d435c390555de44b4b8a5a60030aa66b84178d9b85f34a7ed03b9891097e09057d3220ce4f07f11cc603149877fb02eb09155dbb0c040b2668cc1661272cb1a80903ac72abf128e564496d465058d601220f0850c1390cc662149b0cbbaa7885bf18ec49335befbc04da7c310161a3f6170dac0176e707f00e45aa945ba99b93eb65703f6f733b7b462809b6c33770b2adc4898f6b55e5a08b83113ee55be00a5a5198539fcf0387ea38cc897b474f7a345bc7907b9cb9bdb0ad1d103eec1c16c27a20a497398644cc7228644c0201e549b9933a443f218a8c5bc3342ca65a707c7e70a73148a9b924f8b8b65f914947681950b8126c80356c8f51e5445132c43c53252ceb643676e75c5f3b1c4d0d844d8f513e266070777005295cc34a0447d7c4bdeb56da426a5ce52800c3a2d4b9515f019cf86b1ce5f95abc759009db18398488d89640c38b0081d22488fa4caef2206b58280fbd9a859a94099d503f66066d0f86baee19a462cc36c7b6ce4617e87020571a3a8c61554f38307ac8141857a80fa2c8ac1e08aea819009046fd6b901ac3c2d54e91ff35cab0d76358f6
ab6c3a0011fc186615501b5533c9351cfa23510f222bff803b202012eb3e790bf4555f9bb0022f424f660c5db05bec4b0c403b7b43928a8f327ace3399acec7985bc89fae1caad269c688a77d1f32b4537b9466809526161b26bcb07e0b212f093a98caaff3336b21b2078dc632e1bc5d70c5c6e2c3006697997aa64ee0905081872b42e168b5a10c76883776a0075ac697423a17e502c685ba64d59801dc06586c141fc73a28cd535cdb009e59607c1124002d7bb44c2671b3a57f6f4067de425fbc5528004a5d68e89e0804b417853f19711bb6d32230324155030bc2c30e4c8687921505ac446b7deb6383cc66f0c169ca812576f5394d03c8a6c1305890842412a0255c361926765c46bb0ab6b61a79ae5275cdd0e449c7880494f977742c92961228e68c30f2e172aec61bd7034ece85811e10aa10254335368c05d5ad1c1490f73a0ae8498e49025d2f5cb06e825682518f554735b68807e1c1bd8c229acc8c30a9a3a7cb01a93382ab47fcb9c30969c79c9ff406badafb650b74c6eafa6c9696a043b17f57184125d9734a2c2fcae10602b74fecdba04ae285c031153f905037c87ee9c07540ba3468b8afc0c1b69f55bfacbb69c9323200e63ff0ca7e6023233cab0f88456f07035e6d3b8b3e0a1ed608977f2890d85778f164392b209d47c003f6f398fb5b4686e2389433247d10026531245ac3126ed862609493a31ab7a9154a30561477d6b628e569ad3bc4aa34b54e419619e3622d9904bd2c74496509db1102fa575f8b048193d41cbbe4686ec00fcc76c2af856ff016c046a863f4845f7e737f02d508bfe3b46bf1bd36c664f16a747d191d1f83726c577e554badc52b7e1b7c9a4668afdae8523ef641b2538a9d114f1ef6272af55a8b6a5d65174432ec6884c88646e0b282d7b112216848c44cc5b9805fcc77fa44b685dacb49e664e525c04dc692fa018d6ce51822c85840e6b26319b518cab182f7692cc24db098b3e689ac418c590a606fdc720f89f08f11d3062c67957ae6bb9e8124288a59b4b236d030522361460848213409b15557385678adc6892aa6945e06b9bd5a919477710562cc3a58f42a2aca7427396968d35bba7b6d1cf40097eb26e1966cdc403c25fc5a45011ff578cd52825ffac89e935366ddf644856878bbc400f53cab49291ff3631e9f2a2f6991cc264b6d1d583b2f84ada6458796d5a836b845000454acb1202aa4329c41421b002827a675b16ac90bc400d9257404d7206ca7b18b873fea71794a580221f07e7dec443fd83998ab3a3e573df5e7be29c8cdb5b933b2943e0dd64b88db90d01cb068101f48f8404e7a9aaf073b21e98ace354430c7a86cd6cc751ab57589a6f957ca23a4a9962b9eaf429ffc24c35d06a24070cd6cc16c8b380
278996938225891689488e59e8e605e72269a4767c9514670621cbeedf0a7be34a91656c5c2688595b2a30ea59d178831c819100aa1b1e1c75d3ba45265607bd771650a42abaa622a6608bbf72a34b4da7ea7a373ef3a1af4d11a642c57f5915637ca7fb88ad8d405c0d22683f25040b3c378e0150b87b451e2680a442e3d47500a8e8b953373b426e32f46c19f82c70fcca770dd361562b612bd2731a4738c9b7af62e7dbf290b1f530c6984148a
+Ciphertext: 28637c38ca57b3fb1faa5f6394c13cc59de2967811af63e48e9b85549a3c950b6f21f2db81cc1ee256a05c8fddeca8f1162bce76f1352784aea3376ca9a085c60405e9d86892a9d0203990a55dac7f4db1ffcd8a0e293afc8e663a9200ab99529c1eacf72aa369db3b1648be7c9ce164bfa5949198ce985e16dc816f9c148488cbcfc28d7a24c923240965c5c594b994e6f8935ea4b94195aa4f03ee68c3a6b93984451e7f380da139c64bfaab6faf96d315af662d782168d609b8b63b71d6b033771f3dc9c62e858916a299feabf07406cbf578da5c54a879cceb265ebb0bbf418eb242181854e8643a3797c817a77a5a01f2711d726bdc16899c5f42f6ee9a24c884ab45b7c54566b0703fcfb356ce60f47ec826fe40e1460dec81cb020bbfce87642f2c05fe7f686adee464f4f1a9c6455c80d6eb5e9807b5e814968edd11e3aa2909e9f497ad83d76b0c2fad4f2fbda6bedae9a8e52acfe0b3491bde4949b6e6fce6583ce776da53c9c5dbbb4a84a48e6a4081bb8af1fc8065582b71eb2e2abdbfa8f6a8b065aeb1705caf1eab89d2cd03253ab3fc22ff62ae7d2d9120b4d70319fa005429714272d376eb5d0a844a3134e2bf193c56f2acd2d9c93cce32cb0bef3a327abef9f656d634c8e0ad786753ac2dcffa5878a42ee5341bdb27adb3cf1378605fdb9a75819f213da33badc33d81db729671f21f4ffd98543bbd137403968d09fcc8a97c9218f3ce96847a8b1146484e938b018050042eb5e0d3d457877d4f39ec8777693c47607fa016169cef9143525c5d4eea20089771d65ae5e4f81490ad78afbf3e42da35902ac2ed619ae61510e05c33060536436a31fb8841f31968ab27b2f6b9b2618f316674aac4738414ee47b9ceb35ea1c4e56ee84261c8a77e7269193913cacceff7a09d50ea92c9855a2598a8b291132e58d406be6c4c0ebe7659885e96b13c2f3d61667f6a31c6a5fe17893aba1b1df2520660b91666b61cbc29927b96a420ff955b28de21c1e4723b19fc675ed68bbb3c557c3ab749a618dc89bf06b74d55df59752fa450224e90ee54fac9f6716ac3fba45b6caa6b442c57a8612330b249abd34c9c379ea357263fc837c6d04d40403f78a063926f0ea7c617e46eb0e35be4462099ce3ccb27cc4e48a11183bf37df42e77844965ecaaa432be9d4f62a20ef20a76ae6639dcc148a9f8ca2b19978be52ceeb461df14dcb88955123a3a7568b7dee76db2420f46f305c4a8ad44c0a99625e58278e4573f981c71c953dc42a66270944352307030f989744b694b467b332f975899ab6c84067864b5cd97e631bc1f6a2c4140739676fd518ecbf24f184549c60ce42f7299336865405c7b7d3f6c8bb4017e06c8764af5c56d946e12c8bc2f3979217f
909dc2c0dd27a3f159c2fe4e958c3b64017cf5d5a36d926cb3442df2a16703cbb001b35e06ef8ceaafc21db03520fceb4b5ec1494aa8a062e48da1436a21295c1b22117999413a9ae7fc6577c4974c3930401f0ee5c5956a69e2de14a09a71ba893211f76c5b94bddaa8026670f837a895137e82b57c400cb7827623a87673d299561da2bc85818062c49029700a5ae40b94ac51a29ef8432eacce4f15f102331379754824b4ee17845c032a334fcfda5a0154e5e6368d4554dd1068b9e39206b65e851adf2b7838da299442fc602405608167c2c9aedc0bafb44dbc335f8af196e0d589406aaa5cde3138bbbf60f3a4f463ec7b6751f1ca1a2bfa8c96d1af4ebf9b240f2c1160a1034065183b2092fd6f7dd3ff635352a342f19ddab7975c8ed4931ecf00a626b72fb4078980900d999b233e5807fe6c9a0ef4e6071e1b64055513d1b2e5ebddf212fb2bb6f5858da33fd514c702f2a0c5c8797a55d35bdd7d49d3eb4db30ca176433e510bf2a9b3341488087efe0b9f079a67a6c77be76513f3b2c9ffea6096b8aaf6068248733ab0c78bef54f61724bd6b25b03b00a09a4f17d76d5fcb73abcc347e3f807851529e975e2b04518f24776a412fef16c4e8ae5363500d9b6403cc273024040024ab07154c4b10a3f429c91224cba6ee01711fce37aeec7730be813261bc16b94346da345ca62eab474928309b7f52b888cf0ebd1dbdc4fefbf2690ab0467c1038c8e4f8505af4492eac686211b55be542a913a16b1a093670044b5e603cfe00cb0a77fb0f9acb895efe8000b3e2cfcf8b4
+Shared Secret B: 71d9d2ea636d0691eb5a9bba3c264cab22b56b27932ee38fe1f2ac9baa411000
+Shared Secret A: 71d9d2ea636d0691eb5a9bba3c264cab22b56b27932ee38fe1f2ac9baa411000
+Pseudorandom shared Secret A: 9e21f9365d560462b18f0481dcd4abc62ab264f8b9764962258ec8752c90f1bf
+Public Key: 750414aecb5889f845a9361729742cc27cc5937361fcd60826f510bd4b2f29950373fbad8fb3c5e8724e46fc5a01513dc48c1cf2959a49f07897220fd095c0e43bc02ce19b4a985adf543a54983214e36b5d7958c6569f3a98809439b5c9a580c8347526105f91317c6ca16c58e406f8176443030853d92ef357c194837333ac8e4338008968296d558404c0487823623d87c4efc029515ac150500ae5c723991b2ec72750e1a4b87530673c29c4873c70af26605aa22d1450895eec579c985e2d24c0cdf0b7a8444a0764a87f9758d4e64df088946ff39887081429336be020aa0461b7b19257c4d3be2ac86918b27cff9932031bb96bd27833c203de871762f52b8a269059dba553677be6db75eaa615683356a4372d445ab25c6b3bc32a86f5c5cc3fea88053a1fa228268f98c92f7a92d1548230b8aae1054b82c290258b183817ac5670768ebc80d30668b3179fb8d5af00233f22d6c9ebd33ae4318491c4349c712e6675a103b969e72072ed5811fedc61188b93993078a7ba4219fa1baf982dd655430408aac622abee33b0a6878e7f9b906d67cf11650543d07e39a4cfa20a660063b9bb40974e39a3978c8718e413abe99e8774c3e3b13ddcf64d25a9cb89eb5aab5165156556679b5be7b4136282b655393ff9e792e92670d9a93ce65714daf7716199cc6ad7c6774914688c3e7db13bfd525b01c567b72853d1a23d9864b059401e24aa85e8d7178c020275450f97829a50e234251097bc2691362829ebb7482b309b1b10acff00af75569eb43a94731a5a40eaaf9505a3992935064cbffd34c88e6415f605010d585fb666c9a3282308cc891cb3b9ada1075a6514f0d1b816970d3b425234f0af024b1f8e3a108d5a724357a1e625970631554c32453b7b17b1e9905683a61c323019c6b641f916df67aee03ab9f664460521b72b51b247bc5a64c032645a8e8fd6218771a32ad001d7854386615c50a57d28a04ed29a93e9138c5ec46e54e31cd5f04b930b881ba7b88290b1de741269b912f90c612488319c3467d413cf8d895383e09bc7d1a13cf80176c505ef51941691ad9db6205a871338b532e873612fc929ac710d8c4334ff7692345b8ffb066ccc69b2ac5084e43466f31c6104964e1b61928b4743155a4a681c906aa8034bb432edd68b121758eea472eb6426d4415f3c46170361546b17776596c8f5f9842af35e21a882aa912af924a404a6187dd0b8fcfa5bf678c224a10deab865328caa2e352b90c20c2ee38c3003a5423c055fb0261cb61ea27c4a0a30a40ca75998c470e2575bc6147fdd4492cefb45cc0192d20129294b345c0823442576037aa26b6528c090844eb820000b709f985a750178c28b7bd7a63092b5712bcc5e94c72b09cb95b
e183623324d102156cca67f73493f64b9c5e2da19215c9fae3154afc93702e41102432fc454a2e6fb3475c61ab234ad6c403580aab7654cbf301516eed866f84719ebca8279188709a74ce228755859a48b13002ff100e4fa5e2611a57df524c31a57ef125a41935925c6434d956e63934b4a0267a29b1c1da23988aa7b1c97cc466c4aa88651f1ecbc090ba2f56b9a70089fcad9238cf0ad4012abe4659e99f0926f948f114c20025746bf2a7482730a2254c340257b42469fc126786fe91f19dc3887c8714ae130be6620df977881606dcfd3ae63c70723a5bcfc95531a175da633a197e73c2ad3941fe85513d1786df77039fb142f55ca11d2ab92003453dc8773233bd0ab1eb1119f15e853fab5204c7c538d31032fcc968658812e173299e374a6e99faa40605485cebf4c454c99c5339918a0c74b1d30c05f9b3216b68dc96c7e9e256f0512a16a3b0f76e43782789e0b856de1b05e81e90dc82c0260a3b28f510af2f5300a4731ac064669f58119630813acb3d06b738b01af77d116f6468730aa277b62a00ab60038cb1acea20419e6c117fb132e271f57f84ae53ca09b508bfe1ac49e6a5b63b6000f1b6e73d96406a91700448e292c3c86d076aae9482b4188a069445e10cd2afa7d6dd496b1951e7de38ec9069a65b94042171df75c420184326e324192653d2aa843080694edb17720029960a8bfd53a4181145fa877be04811b02f8af5611366a2c1bb9306ae4c1a524aa191430a21b2949629be7527f1ef217a6aaaacd33828090352c6f812462a01cb808c7f3ba81b6ce
+Secret Key: f1216dc25ca478557ecf35c6f854157446339ce408f2519e5ebb41b2f59442b26f92c196d1ea0f9b410006e37d5db97a7e263679b211ffbb630fd8bfb4127e68d3a1b59b26515a9e41c4300ddb87eb119671eb87f4f9626e1c7925e70b780579fe060fc3545ae2062f65857741c148c17439aa0c4189008ec6227c77754d7808278e72b92bdc3054d7159dd22a5b7b75fa92ba2cc7791ab2c1dd65114a298d8181a7a6a15596c0552335a311730ffe131b646312abe935cb0cbe74548d0c5a08a5aa097a65bd494613dafc51167893c45468454569148784a4033e5edb24dde89cff4561fc2bc05e313315db040da608792c26884212c6981d56913ea6709ce729baa8e10502f8418536c318da648558c9fb92c22d765722aaaad5ea04478ac318306a29e04679d2399b6588e0fc2af25b44f119c2ef9118798b75f702186f5b5c202cb230e25969b1c34c9c3ed52bbea5779fcf399cf0d1112f88a3cfb45a17245769a790825152a5111d4a1a71b6c973c3471412b8a72fc85d67ea9d442bc34925c9a8494035eb645e655f622421b9e17800077d5ef9a5dec91fce4b7ecd09ccc59598b5ba4cea09559f269e5018983603874f604c1cb6ba406846f262bd77b1132d9119523371f2845a9cc3b67a0c79f3da536c20732127704a435543c7cbd70b00c0dbc24ac45d21c2a47cc37b1a52383c14512959935f072f0e1945224a5d4975c81c63694b9169dfc7b477f08319a5bf6555a3b8bc14dd58b297d789843501b3018050400e419c88afa450dff2a5db8b185dac974a43150fc038b3016777d7494869c8575a6966a85d9c0a9b8a2c6cb2b0768c246484cc0a7b7aa29292040f33878cf03fbb302e7ad4b91d0c5591087b6dbc1582398cf4155dae3309f7b5857093c62e9b7de170b6a2e765f21956bc3a47ac8bc545843499fccf21b03f5a44bd57cc2c0f909f321b2d1edabbde1b6b8b26b4e4b02ea2558f648655d8611d12f699a029399c036c195297e86b1f6595bffe8a116cd54e9c902404639ad308a42a78c6ea80ca55948188c52d70303b40043e7056a8e6dc23bbd249d39c465506a6b422512b50bab0195e9abc29fbf948d31458a0c3626a69b3ad2bc8116a1f05a7401ce50da66b8b2b416d22d96a88512f1e07433c63923c26b036c6cc8fc0b16c8865fd8c0d16210996ba1859d3879e2b7237b54a5a068c644188d53b2a460849df766dbf818e8d475a79c6681f632df3849279e2c96c88c4b19386252b3728db9eead3644e035cd9361f42109cd8bb63139a2a3b927da3da48fc7ccbc1c23a6dcaa980c480d37b37fcc896e0e49d3a006fc95aaa4e57c0665b35d621a6dab3c9670404df97261bec5187b35aa340ab6bda2cfa5277ba713a198a20e368b96
43c7b06594e39dc673861b1b9c6c776caa47a2079e0f04956f6c89a74a9e960b4f4795ded59113ec95b935b4e36fa587870be6b28275972a8d456754aa275b317a4d48b97cc88bb38d8666ff55093c2c77df311db50b5edc110bb522893b769741b9e03b80213dc0313a205252065719469ac04b1fef9b72f921a34c91b62919e1121cc74972c7ca9539bc21a7d846a2e92007f618927f77a52b8b1fc8592b59bb782618cdc489a1b71b5b3a5b0d8369b2ccc0a7ea93809282efde2a9ceb9097f3807e96004ba375ad7547ece870a7b8b4b63e1935165509a9c5fc17cc155aa8a80875f37229a1eec6fca02ac99348aaf3c095b9b4f2853681f668f6dfbb9d44452cbabcd7c3916c9c00d11d8caffd17256392835e483c52634f0b4640a52ca6915127a1a03d81a92d5c950a64316c2f1515b21076d6b8bd2ac17e8234eac2402588817702c16dff77b7e3038bfa057e61b8024d329c93166c68a6ef05148b8222b26a21bdb389a53d59c73b78debb804de05aa52549f6cc81f7f02b1fec747a7094e4ba9925790bfaa5a0c8085953a55373eaa8b78282c48726c2cb6c087e07cea8aa44d609b7488b6d5c8195cc415687a6d5ec20744466956aa9d8aa56e8fa6b7a320b604a9671f7c864feb2616a45c0b362b4d186d2cd3742e3382b8a7c6f3f720d91c30e75972b5340d6f97cf43339535d81340e37675753cea6cc376bb4178111c1ffba9b506ac5632bab5d984213c4e2b95bb6d865848949b971295750414aecb5889f845a9361729742cc27cc5937361fcd60826f510bd4b2f29950373fbad8fb3c5e8724e46fc5a01513dc48c1cf2959a49f07897220fd095c0e43bc02ce19b4a985adf543a54983214e36b5d7958c6569f3a98809439b5c9a580c8347526105f91317c6ca16c58e406f8176443030853d92ef357c194837333ac8e4338008968296d558404c0487823623d87c4efc029515ac150500ae5c723991b2ec72750e1a4b87530673c29c4873c70af26605aa22d1450895eec579c985e2d24c0cdf0b7a8444a0764a87f9758d4e64df088946ff39887081429336be020aa0461b7b19257c4d3be2ac86918b27cff9932031bb96bd27833c203de871762f52b8a269059dba553677be6db75eaa615683356a4372d445ab25c6b3bc32a86f5c5cc3fea88053a1fa228268f98c92f7a92d1548230b8aae1054b82c290258b183817ac5670768ebc80d30668b3179fb8d5af00233f22d6c9ebd33ae4318491c4349c712e6675a103b969e72072ed5811fedc61188b93993078a7ba4219fa1baf982dd655430408aac622abee33b0a6878e7f9b906d67cf11650543d07e39a4cfa20a660063b9bb40974e39a3978c8718e413abe99e8774c3e3b13ddcf64d25a9cb89eb5aab5165156
556679b5be7b4136282b655393ff9e792e92670d9a93ce65714daf7716199cc6ad7c6774914688c3e7db13bfd525b01c567b72853d1a23d9864b059401e24aa85e8d7178c020275450f97829a50e234251097bc2691362829ebb7482b309b1b10acff00af75569eb43a94731a5a40eaaf9505a3992935064cbffd34c88e6415f605010d585fb666c9a3282308cc891cb3b9ada1075a6514f0d1b816970d3b425234f0af024b1f8e3a108d5a724357a1e625970631554c32453b7b17b1e9905683a61c323019c6b641f916df67aee03ab9f664460521b72b51b247bc5a64c032645a8e8fd6218771a32ad001d7854386615c50a57d28a04ed29a93e9138c5ec46e54e31cd5f04b930b881ba7b88290b1de741269b912f90c612488319c3467d413cf8d895383e09bc7d1a13cf80176c505ef51941691ad9db6205a871338b532e873612fc929ac710d8c4334ff7692345b8ffb066ccc69b2ac5084e43466f31c6104964e1b61928b4743155a4a681c906aa8034bb432edd68b121758eea472eb6426d4415f3c46170361546b17776596c8f5f9842af35e21a882aa912af924a404a6187dd0b8fcfa5bf678c224a10deab865328caa2e352b90c20c2ee38c3003a5423c055fb0261cb61ea27c4a0a30a40ca75998c470e2575bc6147fdd4492cefb45cc0192d20129294b345c0823442576037aa26b6528c090844eb820000b709f985a750178c28b7bd7a63092b5712bcc5e94c72b09cb95be183623324d102156cca67f73493f64b9c5e2da19215c9fae3154afc93702e41102432fc454a2e6fb3475c61ab234ad6c403580aab7654cbf301516eed866f84719ebca8279188709a74ce228755859a48b13002ff100e4fa5e2611a57df524c31a57ef125a41935925c6434d956e63934b4a0267a29b1c1da23988aa7b1c97cc466c4aa88651f1ecbc090ba2f56b9a70089fcad9238cf0ad4012abe4659e99f0926f948f114c20025746bf2a7482730a2254c340257b42469fc126786fe91f19dc3887c8714ae130be6620df977881606dcfd3ae63c70723a5bcfc95531a175da633a197e73c2ad3941fe85513d1786df77039fb142f55ca11d2ab92003453dc8773233bd0ab1eb1119f15e853fab5204c7c538d31032fcc968658812e173299e374a6e99faa40605485cebf4c454c99c5339918a0c74b1d30c05f9b3216b68dc96c7e9e256f0512a16a3b0f76e43782789e0b856de1b05e81e90dc82c0260a3b28f510af2f5300a4731ac064669f58119630813acb3d06b738b01af77d116f6468730aa277b62a00ab60038cb1acea20419e6c117fb132e271f57f84ae53ca09b508bfe1ac49e6a5b63b6000f1b6e73d96406a91700448e292c3c86d076aae9482b4188a069445e10cd2afa7d6dd49
6b1951e7de38ec9069a65b94042171df75c420184326e324192653d2aa843080694edb17720029960a8bfd53a4181145fa877be04811b02f8af5611366a2c1bb9306ae4c1a524aa191430a21b2949629be7527f1ef217a6aaaacd33828090352c6f812462a01cb808c7f3ba81b6cec1d84fe9136f889118e8bccb10175bde58ba45ead34e9535264ec0b43ba0f10ab9d1a594d9245ba3201545235146f9a25134a46ca3fb04ab0230724055bb7998
+Ciphertext: 4b1f5cce913864feac0e971a1c3d685bb42db1314569bdeccb12c103ecffa40127d8d75c1313a1127741fd843ce84d6a55dfd6657e4c8741eb05a0d766b0ce3dc0f4bd3ece9f3405e0322803319ca7715cd56f0572141083af1145fd6de956b3a19b3e6145549dcc0fe9b59fa15680acf7ed4b16d14565f648cd97a96e4e4ffd7a13b881721392f41156548885f830a74155ffc6729063e9471130cecff5a9d0c457051c81b73e1bb1294cd4ac00f73f3bcbfd5590850d6fa7b32f8a7c16e62d5f94171591ae6677bde6504b2e4313e0937f37b0ea4484d0e87c794e5efd1f2ce72ca87c3ea45f152dbc64e7aa4a7d27f21b4f3a8df13d90d72913375d88f43e5e26f92002134a3f4f6b8f40f71a51146d983cf77f81d86ce978952b2d732dd6d121e0b816eea0676d3e0f766adbfba8698a29d42f6a0860b2ba4d0c1b54d3269ef172fe437e102e4c16b022087e6b00dfc98f4901f9bdd61f1656a58fcf3ba8358132e979c1e13bce8a5629a5f2fef7cd5606b545224345dcaf309ba655adb749a8031cb05e2539a12b921a58cd27552c8cb0419fe16eba7a2c5c332105dd53969b608ea22ac82539c4297b24bde2b6dc2827d53fbb5b70004bcb54bd266809c0f34d2c395b7d59bd5a919987289e9af7c37346881b8d14bd1e200ba58ae47b972724f6b5bbe4e089e004a570169f57a0b1fdd76ad08af705cc3255151b72e8ad4615c53d63d5d9bc8bc31a2b7731d627b7c7fa5b0a5ef73d699d878cd68404d6ca5e2506a7ccd2d9d5bc8a0e9f7320a353462e4e80c5911c6942530c546c466ee6c8698613981954d3c0998b0109f8bba52e34e368af20025e284b0772d16764b384b234a86dec4b83efc82e92b46fc8a888d921925fff63917811ff30daf35623fd32d3d9c2ebf88dfb7f3e2e3637bbb0db57783288d3be73af238e4b811378a4381b38c2e4f33ec8bf857bc83368ef0ce1b28ad791bd4e2562760695a116bb34337f94a41d1eada69db4dbf4bf3037cd6efd388ec673d953cccf3c4cee19c70dea0459e0742ed1cfefc4edb461ce220b0edd998ee7b5bc0d5ad9367fb218812a2842b00bec092fc337b7445697c632ebe8dfab81512e12f9dc8a045dacebecaafe3334e1cb66a99c4a50c499a97c270695d99d45ac2b004fe3b1b0129db5f77770e531c3b6b0b8fb9c6bc74c0d51b0b867a947701c49d09f7532c3d75fee2b4092ada584585bf260fedcf150ae47c17da61e97b66e3483dad093fcb8a2dae15e7342972a737071dd8f11cc9a78813bf46d470fbb03b9de4838c8e8dc35d730740da80425f831d09ed4f25c90ef73b25cad49e4ea5e132cb45573afb49dbb0a746a22e23980a257a957c1da93db6be9ece374bf2240db1e66d0a92e11ad0cca2
ff6f9796baf26261cb0eca5a42717d2c421cc29436dd55fd4a1771535c298c5ddda4777fc49aa6e16bc7e4e65dab7543fa0476607bc006556b839cdbbcdc5b777a6c74c29f76ea88c4813cec0de9978172083d679d707980af35bc95cff1abcfb570b7e6e6e8af13a0258fee47ba499fe4c6608c6aa7e964a061f769e39f74c9e0f31fb2bbcdb3c73945d22a872252a2ec54f9e3a9b6a75ef0ccf8c60594062b4c60ff5173c013edf0c13ca5e6d0ee83c29b6fadc4f0bba968148337493da3e794aa8973f8d8a9a45d206ae33a3ef12483ae26645bc4ae024d15475d7b362e036b9b7100896337294deb1f09385cb7447ce73895dec7f7d13c59fc7a8671f9ed96319eeec25f19f88ace86ea4daf4a276b6c3773ebbcc7df4d28e2c174590b60e21ac8ec2dbeeeb8c6844f9f45386b5e5a740e3134ed3260039d624abd57d8ee767fba78d5872009257df866cc723b6acca0228c78517087306096f047b57248f9786c36276da977b8df84b41249285b17140d42deff1360b42d80672abed438bd7b6d6b27c818a3412d31b00f24fe21a5469cb5cea9feecb1d0803df46797fec6600a8e3987f42d6bc32799c73400a41c76ade62441b5d1153a513ee8137127204845380621d1ae0cb35578dbb1fb619e4ad25e636d8ab6dd4e484f1c9b1ed4ef446fd75dedb5d3d02708e009f4a5de5e38ac8d646d78be9e43cff7acec2150625e247cf1454748e127bc7160367f6c7e794b4c701d169de68f54bcdfa3b386e14080b72a94dfff4421ba7703b74a0caf5974dedf087c4b26294bdf9f477
+Shared Secret B: 7f5500d36fb86cb45cd6c0c03b740f387b96792223ff081fdcf522327e2df928
+Shared Secret A: 7f5500d36fb86cb45cd6c0c03b740f387b96792223ff081fdcf522327e2df928
+Pseudorandom shared Secret A: 97d889eb6ed005097b2a8657a2ba280f0a538d06bc164840f0c6957ab50ac4a9
+Public Key: 7b021071172f7d9795b1b519c37474ac2a618331b89fc36e0ea7131cb69e23c6158e430c9d7c2f2da595c5970166c16e21150689262302ea3661e7601c881805f679c71545c797379310766cf96cd2e17a65f9168c2ccde530a5cdac335d88622d667610821848362384e7c7256232c3000083179e3a73c9c10a324698cbe401484fb262a3773536d35c5186ce511b4be17c3dfa4c609e7b0370e40f6bb0811e5043b6ac22eac68ec1d12359011bc0514a81730cf93b1b164332f0735c6c6827cb08bfe6901de255c15dc728ed3635046087f70462e3832fe6e9bc11a5a1390847d36286dae81085c51e2719cc66f1146f1b92ce079034d083065c2df90c1caf15ab99336690d0994ad83602518cac742b4128442e221c971a48d89c78c88b5380cc10325ab82f22564ca85c85c8c6cb421013c9b7007542e4c2b4203bb2bdf8abb8ec9f42b57e022c728c108d22b27aea556a84b29d26265a1dda788dacc54ae55c9ffb949b7778462706e867abcf48838b2630a1597c46d841c61740ebf39d26d0a15ca88270a19af024b101a16350c0a974d4ae52ab35d0bba2a334ceb45836ac9cbf63055d6af575a53bb96dab7f2fb82d3cb2676498a8a50160dae1522be33341321ca69185c66aad0f741cf1a59d36493aa61476176574c6aca50271046496bee6fa1bdc4526255baf0c06c24c450785161a0fc36c7519aec7c74149c638e512bfeb190fb1c7653a017572992f71d2bf054391c28b01be219bc1b1b667f6bbe1c6c4764b4d95628602f6bf61fc5e6d89669851802194069ed774ba115d17e80cecda151f408b6999528df701d95064e22917af766a33f2352293063be7842640b93a19895ad7a5fff76bbb94ad4138b4a3997ad8166684b51a1a7450b6b84cd8b12b6ad62ae1f22a49e980e524286f24a93117444c8643d5d13bbfb54fac3a95b3ba20cbb55a054686f0e2cb0955b84d6a9c77410178f13d1319a981f0b95e16a0b9b17707a718176ca5acf6211bf0974689c306c1a01f0bc03558674cb008ac2200f502a8149a2c11fb75e6aa1436da642f82b0fe68382646b8c833140ee634104213b5f8502cba0c119c02f7b10c3d2c8d5cd01618941bc378026f07cf728c0714c458e7b38e36bcb47557992c9697ba41bef1811784d1ca7f92a622b88236f9a3ab4c802eecabf1505dfdc956a8ca24008b050c54ae4efc65af9b1fe40b703a56a990a55b6ee97800b6b3fbd7624b4c768990060fc476beb7ad888181a608c5a09200345379f3dbc5df4831f4320c7fb3a47d1ac3de893fd8935211970393e0258cf21bb797040ac5854aa30bcc7005ae209e09a564f50703c6094a06f9494f8b15f7681428f1b781a72700847ac9f568de67c8962284ca4b78b5bcaf3
43c43d69396deb6bccdd92e0e8c6e42a89e421b971e25679d7464b87a15149778294768a7cc893d924369a59550f1bb4f8846ea2882e9b9014f33312902071fcb1642d69ba114c6f247bd68ec9f88821687468ebab91123d9b618cc75ad1b43285cc7db89b0c0e68200d396a52077adda2b8d747358f743d3d34c1580b3a32496a4cc5cc3abb8cfc75f8c6627a7fbacaf8856019080b9203b5f811516f383d661cf3fa767ff22afc2c71f1b1310daa03faf235ee8b57dae7a8ec35bcf1e422a7d861e01d93044e4873e84a94208255a7b9bb0511cfe719a1fa06faf3345e2e0114189207537b51761024f849298ab7b49b36976552de2640da6049ccc8c6701c78fbe6865aa8b6cf5c03ae07ba486c215a42180e05714c3340b53d17e0a8cc0387291d394135c271b47b71c5a1353a963bf71116bbf10991e1858cbc729d94b8cfb3c46ed181782667efc6606aad58795915ddee3140e566d839c1ba67abea7acced4c4608ad8c8cd5c3ea6685601676edb81a152fa5fe3929a31a6656d6a1d922405e03ab813b2248f396b9904ae3ac10c025068df4871485c7b0b6219d23b81fff03594ca76cef375ba6b17dc2c9550ab7fe372140444971b461f65468f0255616f2ba2d009289224a943e930abb7558e72cefd237cf1e20514793395d47491fa97b1d5ccc0c6448ba9c17c18985a302d63ba425a6876b856957ea99643b77529a44350763038482003eb5537b2cf00f4368ef0c738562ecbb46119153a67bfbc9cbc68af16a983991c9d051e71eb543c4f785adfac913f646270f63f71
+Secret Key: 08454c00dd757409a928f91868977ea3ba8ac207a70e157f3b2004fa22aa14b9196cd4346977439ca42bfba8247c7b21b79581d2d97656501da8044d804c69ac5302b4004d1b7597173b2181da173d22a36fa49f30d0a6f9d8836b4c9190dbba1f0805668655c9bb3df0a1206a57a35ec5a0de36cbb15917f816b8ee9140a2c87e93dccb3445ce65468b7d87b030329981d0cfa9142142a95ff94b108b1445f7431f56625aa3949a26e9450945295b43abb645aeeb486dee7b124b99782ef11ca75b3e93fba9a8d117500046bb76a5cab01513247d98f0bd0a192f79ab8eeee15e879ca885374ba5b18fb16a27d8c5473d53b43df13f20f110c6d518a615944dc00a6d22ba1957ac91469de5623c0002b236e92f7bdb9e50acc1c3e2ab964cc5703a0c89e6ce2849c95d751b7fd439ea943c5454ad5385238c71351645887d1b5eabba1fff174d6f08a108f7c3bfa96f7308942618cbbb3117ffc111ff31c60b36955af3913579037d7345632a464ed0b83fc3b095c2b09d83558839b2e648be3c87b73e1272ec39a5f781b34f615bc8b00befe4ce8319884a909868b7834177a87ad595b18518011350b9a9ac96b80d3005607ac879fa2a7f322817ab909d67219807fa5e07cb4271d40e110c7cdf622bdcb5317f120de298b961a89a96e58332d0ccd0b38b8072364cb271950ccf990a6d0bc4b3554b77eac2c61cd7984b1560f1f07f5ca6226dc79978134b5d8ac768abc6121a6b749852eee9bfaf307b14aa146446192173cf7ce0195be05adfe52d57d108d3b49dbe805d25970ebc291a6a097052d6b5a731885454b68bfa86949c9fa4a86a1790c4585553d682426bb25014680f4728cadf7468b29255b19b68a40693ae020cc4e166a95c434d84764896924a4a5325ba1c33d0a23ba44d5fd529a50ba7c7f4830f34930f1744b680ca45dca960425f8a558bdb3582376a469a51b205541977d693344cb823798cfe49568b1498135353c3f37b3080632f7725aad64f5c042347d842ed944983f15bfc7a2ee7502bfdb41c1ff32790ab827b70703ea4a6e79c09cfa789a802539683956d0c53c4b3a6c75b1c83307761438039041ce9ec873eb43b55307ddf651eaac9987477b71e5c70a50c05de594856bc13d6a35a4505c6b125261f5acf2dd93557a8a7a9e524081328d246ccd2160f24d36e17eb4714c548fa569937bb7959aa076858b2053a1365850d01b1a278a3212132a8784cc676ca45b0e8ba8fdb65269a9c2bd5bbe7b0b1e17b4d1738a7f079ba1c92bce0b1bf6693c665aac17f929ddc4a75ef151eba741a4094bad948726dc57dab56b41a354da6805b0b4bc2a26c8158049d7a8b585d8aba2d4b2dcb399d977a0586d23d41658eaa9192829c143de545c
783b32edc73ead50599fcb5ba866eba2a82580311d8c962fefc9a3b1c2d6de66634031a9837ce14847fcc8a111bea8badd82310899a5c543ddcf9b0090b1da98830455c93a968068af322537153d9e083a09295e0ab5c5c989906578ed2da9c30a4871bb06e86a086ec541129a6c845293f25da11f5b31a42a9c76a8881d4d2638a56790e4c346d03448a685e36067acc4a433a7a864a603bc461906ee233fbb951ca3649de598c9004c873738fdea337d688046c388ba3aac6ea24768581b02b1732b5f906cd53cecf938c19e8146127adb8b28be2517e77e245db7072c0f08f5dab71697bbbdd92042a40bc7e46822af051154742e6ac7a7ffc41701b898f31521621cdf8e91260563c15a521a2007f6039208b714346a06d0c81a66f02142f1077f6188791e5c0d73a5fe413679f319d043814df101dd23a2ab736322259029da074ad23c023517f24a1c95e3a751d88631b6901d6b61714d08494536dc65b151e099284b7a967014cb4a1658b596a14d0c147db2010190e04e46aef47b8dd599a47a7780740537a1756d264bae0133093e352dec249cf88a4b855b0ee509623334eb94869be828cc1123fa3ca45ae0207fc177e3e9009d684bc5f576761d3c400618c913b56224b50a1221611c366c508ad67e06a1d628a55ba700a427994309d0d18634ad4bc75fa49ad2793a6f60ab4243698d949c6e330f2dc4b78a05311394119ecc6925339a39660758c8e1e1471e9676db809877be20e4dd7727b021071172f7d9795b1b519c37474ac2a618331b89fc36e0ea7131cb69e23c6158e430c9d7c2f2da595c5970166c16e21150689262302ea3661e7601c881805f679c71545c797379310766cf96cd2e17a65f9168c2ccde530a5cdac335d88622d667610821848362384e7c7256232c3000083179e3a73c9c10a324698cbe401484fb262a3773536d35c5186ce511b4be17c3dfa4c609e7b0370e40f6bb0811e5043b6ac22eac68ec1d12359011bc0514a81730cf93b1b164332f0735c6c6827cb08bfe6901de255c15dc728ed3635046087f70462e3832fe6e9bc11a5a1390847d36286dae81085c51e2719cc66f1146f1b92ce079034d083065c2df90c1caf15ab99336690d0994ad83602518cac742b4128442e221c971a48d89c78c88b5380cc10325ab82f22564ca85c85c8c6cb421013c9b7007542e4c2b4203bb2bdf8abb8ec9f42b57e022c728c108d22b27aea556a84b29d26265a1dda788dacc54ae55c9ffb949b7778462706e867abcf48838b2630a1597c46d841c61740ebf39d26d0a15ca88270a19af024b101a16350c0a974d4ae52ab35d0bba2a334ceb45836ac9cbf63055d6af575a53bb96dab7f2fb82d3cb2676498a8a50160dae1522be33341321ca69185c66
aad0f741cf1a59d36493aa61476176574c6aca50271046496bee6fa1bdc4526255baf0c06c24c450785161a0fc36c7519aec7c74149c638e512bfeb190fb1c7653a017572992f71d2bf054391c28b01be219bc1b1b667f6bbe1c6c4764b4d95628602f6bf61fc5e6d89669851802194069ed774ba115d17e80cecda151f408b6999528df701d95064e22917af766a33f2352293063be7842640b93a19895ad7a5fff76bbb94ad4138b4a3997ad8166684b51a1a7450b6b84cd8b12b6ad62ae1f22a49e980e524286f24a93117444c8643d5d13bbfb54fac3a95b3ba20cbb55a054686f0e2cb0955b84d6a9c77410178f13d1319a981f0b95e16a0b9b17707a718176ca5acf6211bf0974689c306c1a01f0bc03558674cb008ac2200f502a8149a2c11fb75e6aa1436da642f82b0fe68382646b8c833140ee634104213b5f8502cba0c119c02f7b10c3d2c8d5cd01618941bc378026f07cf728c0714c458e7b38e36bcb47557992c9697ba41bef1811784d1ca7f92a622b88236f9a3ab4c802eecabf1505dfdc956a8ca24008b050c54ae4efc65af9b1fe40b703a56a990a55b6ee97800b6b3fbd7624b4c768990060fc476beb7ad888181a608c5a09200345379f3dbc5df4831f4320c7fb3a47d1ac3de893fd8935211970393e0258cf21bb797040ac5854aa30bcc7005ae209e09a564f50703c6094a06f9494f8b15f7681428f1b781a72700847ac9f568de67c8962284ca4b78b5bcaf343c43d69396deb6bccdd92e0e8c6e42a89e421b971e25679d7464b87a15149778294768a7cc893d924369a59550f1bb4f8846ea2882e9b9014f33312902071fcb1642d69ba114c6f247bd68ec9f88821687468ebab91123d9b618cc75ad1b43285cc7db89b0c0e68200d396a52077adda2b8d747358f743d3d34c1580b3a32496a4cc5cc3abb8cfc75f8c6627a7fbacaf8856019080b9203b5f811516f383d661cf3fa767ff22afc2c71f1b1310daa03faf235ee8b57dae7a8ec35bcf1e422a7d861e01d93044e4873e84a94208255a7b9bb0511cfe719a1fa06faf3345e2e0114189207537b51761024f849298ab7b49b36976552de2640da6049ccc8c6701c78fbe6865aa8b6cf5c03ae07ba486c215a42180e05714c3340b53d17e0a8cc0387291d394135c271b47b71c5a1353a963bf71116bbf10991e1858cbc729d94b8cfb3c46ed181782667efc6606aad58795915ddee3140e566d839c1ba67abea7acced4c4608ad8c8cd5c3ea6685601676edb81a152fa5fe3929a31a6656d6a1d922405e03ab813b2248f396b9904ae3ac10c025068df4871485c7b0b6219d23b81fff03594ca76cef375ba6b17dc2c9550ab7fe372140444971b461f65468f0255616f2ba2d009289224a943e930abb75
58e72cefd237cf1e20514793395d47491fa97b1d5ccc0c6448ba9c17c18985a302d63ba425a6876b856957ea99643b77529a44350763038482003eb5537b2cf00f4368ef0c738562ecbb46119153a67bfbc9cbc68af16a983991c9d051e71eb543c4f785adfac913f646270f63f719f2c8cee3a0501d1c3f41a94f08e50639987c952f238cc6f04db25fb41341b2decabb7746192ee9fea9bc1b717b7a7cf39b784554ea65847053a4970dda754bf
+Ciphertext: 4865d1a42737f6aa09c865cc1da0274d8a30d04523ef3aa637625cecd8869821a43c00fc53b66523c5cbef452324d75726909eb685b4d7b6937891922fc885fb6a6cfa03202ca7fc1b24744a90b031992d8d62926c8faedae229e1d529cc8e2271236bc0d3b7da841bd1069bf460d9298adb33f565471efeb8aafd4453f30c308dee22f9124ab03660ca6f40c72f0cef00f87575cfa5475afc8715398f73170e51e238adb2e887e0b8b5242593f6a8993edbf99e62c2e6df33ed35616acc0c4c6466fee057b9276e6abd1415df693591c06b09b79b8e508a4eff3e8c039397dd5a0827a3e3b799cd0dc3b9d3e3d7ecbe88b564c82d391d1ee006131564b9a730373d8cab128531fccdf2381a714c45048aa170c2416e3c8ac4635574d711dea1f47a36a231906524bcba3abd928aae1f49f8547cd13e853299e665f313af493d64cfd9478be475a393062b7fe2913bc3ef9e304e3f15b2b765f7b62dafd5235f724825df5fa0e43cc9f143528f71e3141034c7a22bfcbdd351fcabd9b0080cd5d22e37e24a1701b5e3be0b6da1d365513640c9a10853694232f6066ae85b6ba36d3ae8efb4183fff1ec1c19144bd12fda1a16ef595d958c88593438624f02ecbe71ee9d6d6f8f36b746c7ed7dcd3871ec27a90d98e40a8dd5aa1f64308506ae6d05ee033639611b849748aee5f4231103d76ad68a3a05d9b6f54faafe0efd36e4d4c17c74a0826b1f48e1df36e9a4580965fa76554b62e4700b48d704be73831c2f638525aa4eb7d8d719123c80602cf023d1183c3f7ce1ab6418ff0f3929f0df8eba71e136284dca088a18dad03515e6dd218323df07eeb3d49dfaba014ae68b7aa49937c4f2cf75c42a5def8e2e908141570d4cff08a4459cd80a2c17cd14562ce27578a3d4898d2a2717bbfa0cf08f88d70b473d4800461afab9daf26497c70a79b9a54cfa99ea96b641731c0c155acf33b9a3de6afc6d943dc817ba8aeffe362f1b7ea278efc9310bd8dd587ee7765ea34f8dcb0e85b46ab58b6ea5d9dba911b9456cd67d0572ba343e0886be75cf57020d1e1552b66c5841a9c18cc85542494c85d7ac6a15816b5d5c7d95c85dea0691b042ff613f6d3a6666fcfc3928e68d282f7d8dbe298e9a21eb56fc9dc66b893ba753601c9300312726d4f82b7bed8126309b02c9ca0d505cd85d2413ec73575cfdb98e75bd7b60d74f3b01eef56123b438a4568fc3660825e39c84959070144d738aba609e17f453b2b410a8cea59a0fdfabef598141d35ca03f06b45398c73dec2b6e3900802678f2a95087557c842ffceed180314b0028f83792eace294f772e37b671b043ff022d9fb9503abfc59818db46228b7c93a1bb2b68585a377916aa57bbb57bb2999db1b2b9430a6f3b
28d8e2cdd1f5d2ed1b5722d8245e177329dea897770eae8fd85e1e0c6a346bc04fdbfe3f81e9a9456b3ab21775deed7b45b40acf8876ff240448448481fef36b80214d17332b0b134e4455c69b7ccfec56e4345af2db13065cf9213ba474987c03ec261a434c62b3b75477ebb899017018b5a7f055cd01554a691a264e2a130da51019a7109344efc3bfb5caaf3b1ac51e847cb5f584be8306e4ef47397515fae7c8369eca08abb411ff6484285ac0adb76e5fc3975c73c2a3c193790758fedc7651da75eed3d457fb0b902d2fab03bf577b2ceedad4a5e1919bf0894f40838f4921709b2213a40eba8e511b7a3c347df3a4f6a9beede312576674fbf2ac62d3ca079f8231a13fe569c17512842f58397539d5ad824acc2754f466acff4c5f146d8761a90d20eac9ecdd9b6f3240353542b715f101ec90cc1f4caf1664d47253d7fe451e2a55ac6d79c30019ecf79bceb61350ac46673076684727e9071013fd866b52d219d90fdd92d96f584b9b5e9aae65680b0d8cc6ad71c1ae73729ef9e2424d6772a5b57ccb4add22bd412dbaec72393f0a0852e8ce62aa84be088f0301718b295424193e377de4151afa190584551117791cab51ff37f45ebe7eb7c5d532641c40e247d3355b9d6a988c38ca2afea3bd3a58d2596395216bf3ad02b7cf2c1c9d2ab20b96169a98b0b8986b1a3e826743a45181101157b45460c220f6a3c06b0b0382c3791e12db9a23a2092be4466a7582abe8e385d193bcb406e0be3fafd054fc171659c8be508ca0ee0fb569eebc6bce1f6fabcd965ef8074b016
+Shared Secret B: 222184947fab143b64ff3c64382c0ec20c903deae807e52de54796491ab70a94
+Shared Secret A: 222184947fab143b64ff3c64382c0ec20c903deae807e52de54796491ab70a94
+Pseudorandom shared Secret A: 7755f20ee19646dbf3911cc0e9abdee69059cfbbf7dade97f2ca42e71c3b0e0d
+Public Key: 60b4343108ab2751bac766285e63a324b301aa3c246f053b8f60288e6a2af162285c8aacea785bd967c9b89a994cc7a3b9f57f5b52865b441e1bd988fa00727a54281e6768c1ac4876b0655798b7e8c31b15d6bfc4961567517284313dc49724def79b1fc4a34261237da8b049f407bd03191712b30c01209aa13fb050699d8920c656566d9119fde057e34625498153c53c3f4d905a2698544a7076cd56c1fd5111c4299147a6745ea235bc6086b3600e1053557f432ef79a2eda30a808f2061fd898954269af775256c030fd600fd02b814b0160e6579e05c815e1cc8f7f3ccc8b017166848b6dda56a1e775cef20fafb126641a8a1de6b0cbcc944b4826fa770b0b9b9e0317b914866daa217662874879900490a4295f3a5a9d577127558abcab8a3a510d72597379a835e542ccdac127061764b9ea2b18c2c02dfa14f532945a685322cc3c52c78cd3c79ddda20f576298bc150742f3ca8d436995e311d5a14a394b017ab982296a2933d4a7504b01b24491bea3b07b753800a234a0873e30e855b2117e526c784cfaac5951672676b1ddd65bd37993078467014ac0f3b26fed66a29c07936a480854d445e06014f96876d55050ce3a824fb6056aa0c18f850f3641343cdc5eec1ccbf3d97b05222c65079e2e62c0758c241d729b28479977891541dc41454755ee177faf33b641f331edd739e862cfc7f39b67316e14e24ff509cb15f49e409ac146f2830283b5ea98c5e1eb437163984d27c76706287a44314e7955822810aca92f339b1808437a1cc92d27e24aa92a6eb19804a4160eaf6739bd71729ef3904fc8bbe203a0d573ce099478e0952bf8f1712e63cc53a41834142595648a22c915eeb798f27b0f796327aa091c52fb7d2492a19c46705edc4368540592010370b10c7214542de10a9f78a4c83aa9c5d55d20aabaf927b86cd0708f979f69c152a5f439b4281888f5505af98f9360326d7c8e9a6397d2813cd63cc56049628a54b4f22c9c680c070c8b7a5ba4805122449ce65ef4973c27955c89d700bfdb64e47c1c76c01797cac924792945da3a06c7be7f6856ee070072cbc7bce5331ac09fe7f4cc2b66498fc53c0afb4f824abf1a3b08549ac3e6408deaa446a223966dc817495a8dfa3217aa57727671289b263725299305a468a4b7394b25c94ea701d403800679283d11c50796604a5174c7545104a5bc7da243406bbb0298460f548befd90a64f14c8758c1d62c4cc092cf629286fbf43cc04cb6822247fd1263a97c3383aaaf0dd5b39ff2c661a0c1d108a708749e32bb41343795e0193634d28e3831bd32922ea690457328690e957e4e869dd51128944760095062c754cd127bbbeec3012d7786cc78afa022a5f2393ea4496a834b6dca55337
3378818750dceb26d761012e3787437a0c07027a8f41c03fdd841d20ad02bf1858f9c2ce71558bdc08302c5155d7a1740206ae12c7ddf1421a18194724c0fe580684462c02f32a71ebc40125abf1de897cf646fd676a6b1bb8e9b8b8f15822eb6e5a4c4c96fa30c811bb430711808b616046e777f3564a4191bb42d160ad434a5dfd47c3d496cf07c7a88802b2cf9cd5b5b4b7b301294c28acdb8633378ca7231cbf0a484a85366015773de67398d343e6a432017d9897cd16d9aa0bc186649f5389551604080b48705a9bc0cd58036826a1ce5b1ede108a73322bc56a196608497cac224c3974b00c05515942bc2696c1b4132a12d1d365d8116bfa4e66f00c7cf3026afc0e31336b00c39ba5a01a557a37bb54d9798b56a0c61d59b06b3b55531424af74350fc9d8bf2ccd983ca6c885cd3f0036d83c1df549660064e8ed6b0afc41cdd11b54196abade450a3388d149a1c004dc8d3804e9184b378c02d5d7b445fb60b765929ab97c1b2c60b22185828776d89b94438c1557d37a99088a5feb04b3274c70110917b642e74b427a4f3baa6217310a35b2e7b16e53545c078b728bac496e1958216b4b9bc1c9b5570fb32a760c4584da375c7a7ce1ae24c1a49792d4b45e34a991ea37ac19c7c1cd6ae20fc24cee859b1fa7ff5d7061f74209ed308eea90509fa268b93c078386a30b0c3da0401428083b0361d04e1041c6a097e19b36a3ca868ac9b1169971eb45ec4f6cd9144519b968ad121a83a712fa492cd7933b0b8334ee77a46942b03b50e59f287fae2abb52b4762f16827301d
+Secret Key: 31d570552c952913b86b92bfd810b56bc512fed756deb711f37849c27b76bc98ae9416bc84b06e1509278d5998117ba2ff69088001c49dd0b6cad40358fb5476f6254736b30af8871b317e07a84fdeda96855abde55c37e6f452f1a0371e79090b0578c3552d2dcbaaa3313b7f7bbc5db390fe513f02e18b48c9894d71ceb4a5961cab6578727473a808fe45a66ab420789a820fb6286f455406467b769167db5b5d496379a6e483b6e3abb9a06224fab05ee72863c7359d76bc24f43622c8480068407cb958f8a317ad9badf77bbd005c51faea1f2ae2892c5a2282c97707c9c6892c8887ca7aa45416ae80c537ac51225918d491500c7a288445b60b6057da89bb6c936b6eb3cb02e9b74e4ca6c59c00c0c9a5931b34a42347aef11a9ae02ec2ca785e5a627cc7c153d27391b3af20d533ccc1c2f5dca08c982375046404221d3fc82ba8691f7c10655c38021edbc878b8cb2749b7e0941466f23616527703320f89e2ba6325cd6b7b30f5351a62db9d040143194957d565cb17d5a728b18868416612bbcb3f208f53355f372b5c243c3f421bc3b01a9f610a7fbddb10aea2b9d64c753d0b8cfc719c2cfc4e36f196337a83f18693e24ac8d7a58ad6d6ac72567aead06712c8c3e09740f04c6274d0c53257cec7294ec0c46e5f3a4208a9bcda6a729835768ad90b71e5491f04242df447eb26b1e1ab39f3309d5e177f1f49271ff602804745b9e74ccb5ab50023b73032954e025a8f1622109765b5e8224cb11194d096a000256893a010d886e2cac75ba54934e2b909d08d2d5766c08baeb3784ac029ab96d80a70743cfabb7a9711305bf78c967a66a0c30f1db699cf341c187879ab833cbff75e9a600ce795487eebace2b43dea609c52f1b3e40019a0d63c39cc1a0ca25a57f6b0fedb1a7e72119b2c3051263df329c477798670e55e7bf387054593f9b867a7f16939cabb97d17e0dc5cb20cb28d6a7cf2881545f1159e10b4d88325e5c757f968384ae273477b0a4ddf2b7a4c5ca9026a2dbf0b63b3884006d5675e2091659cc2854cc5f50aae49492b6c7a43f8bbf3c199e0cfc7e7e2957d6a7170964a369c017c152cf0e01215e663f6d6b17a98a3fbaf2809fc67e1b83ade742c72b2c38b17b6f95097f18177e7440c955a863fa52cc595555d0a9268f975dedd41d07d60fd3f7afc5dc7ca9dc4169a49ac691211ecb10ada45eebb63567e0cd44f31e15c4ae53eaa8d1d116fe5a5230123ebe708c9b31800c399c3e05c543e75ebb878cb5e4839f16a93eba3f6b7345a7348065ab582f2cc3b52c83532aa9ea2ca439a8382b894db561ca809c92b58c6075f1694b1a64f4b1c9ce6a58c5294bc469cc9593c989680373e78f4cd019ee2b63dc2b9017b81fa9924d5
01a82e717cbd3fc9d20c098b421accd162077785a3e6684fd5aabb984b83a8621911ac998f3612b4b848bc21e9c649e1af6497c032527ac91d8caadadd5b200731c1276401e9abeaafa7a4c52c1085c9d012c942f92b45b5816fca3756f673bcee624c8c43e60f441663a3519945c69a4a4d309403ce59aa8144e382861439113df875f406168e77824535419d3dc1266b21477c793b526a67673612e809416a24a08b865b8c3b20c242b2431ae5081a56c3b2200d0b8857c8680034207453a97066c16b67fa0925002db28ba228a83b159aaf77c04256a7b44a3dbb2afa121232169a90d0c6d9d4aa8a4c022a9c3b24586329939550c8c39a4cb305af13e36540b0d52068b0b33708cb551d82f94da33eeccb5a180b1c8962e2245ab15e44b4be687c6bb524f5638a2023a8182b964f2b5b7403ba0650554e96191f8a0de214d36c3cc58001d32e98c50844ce976c7b73c6dc0f29e8c728d435c4d0e712e9f969c5e69672c92347df5a9ca01c3a0819630507f61e480d20169e6f4ca93725f866bb43a644b40952d87446fed2a1f07372761b8a435fa3bc4c9b497352a91779350a0889b08189fcb5c7c1775e673798e91428db32814c236151b716cbb4cfacb3a08952d777b45290936f9164f2a8795f3f91a33f88f635630bb216127395b5cd85f88e9439efcc00034127f52302a02c8c1433738864db47ca70766b1c3087cc252aaecb849d7ca66605790da89a344d9cae3560886d9c694515ee8b00160b4343108ab2751bac766285e63a324b301aa3c246f053b8f60288e6a2af162285c8aacea785bd967c9b89a994cc7a3b9f57f5b52865b441e1bd988fa00727a54281e6768c1ac4876b0655798b7e8c31b15d6bfc4961567517284313dc49724def79b1fc4a34261237da8b049f407bd03191712b30c01209aa13fb050699d8920c656566d9119fde057e34625498153c53c3f4d905a2698544a7076cd56c1fd5111c4299147a6745ea235bc6086b3600e1053557f432ef79a2eda30a808f2061fd898954269af775256c030fd600fd02b814b0160e6579e05c815e1cc8f7f3ccc8b017166848b6dda56a1e775cef20fafb126641a8a1de6b0cbcc944b4826fa770b0b9b9e0317b914866daa217662874879900490a4295f3a5a9d577127558abcab8a3a510d72597379a835e542ccdac127061764b9ea2b18c2c02dfa14f532945a685322cc3c52c78cd3c79ddda20f576298bc150742f3ca8d436995e311d5a14a394b017ab982296a2933d4a7504b01b24491bea3b07b753800a234a0873e30e855b2117e526c784cfaac5951672676b1ddd65bd37993078467014ac0f3b26fed66a29c07936a480854d445e06014f96876d55050ce3a824fb6056aa0c18f850f3641343cdc5eec1
ccbf3d97b05222c65079e2e62c0758c241d729b28479977891541dc41454755ee177faf33b641f331edd739e862cfc7f39b67316e14e24ff509cb15f49e409ac146f2830283b5ea98c5e1eb437163984d27c76706287a44314e7955822810aca92f339b1808437a1cc92d27e24aa92a6eb19804a4160eaf6739bd71729ef3904fc8bbe203a0d573ce099478e0952bf8f1712e63cc53a41834142595648a22c915eeb798f27b0f796327aa091c52fb7d2492a19c46705edc4368540592010370b10c7214542de10a9f78a4c83aa9c5d55d20aabaf927b86cd0708f979f69c152a5f439b4281888f5505af98f9360326d7c8e9a6397d2813cd63cc56049628a54b4f22c9c680c070c8b7a5ba4805122449ce65ef4973c27955c89d700bfdb64e47c1c76c01797cac924792945da3a06c7be7f6856ee070072cbc7bce5331ac09fe7f4cc2b66498fc53c0afb4f824abf1a3b08549ac3e6408deaa446a223966dc817495a8dfa3217aa57727671289b263725299305a468a4b7394b25c94ea701d403800679283d11c50796604a5174c7545104a5bc7da243406bbb0298460f548befd90a64f14c8758c1d62c4cc092cf629286fbf43cc04cb6822247fd1263a97c3383aaaf0dd5b39ff2c661a0c1d108a708749e32bb41343795e0193634d28e3831bd32922ea690457328690e957e4e869dd51128944760095062c754cd127bbbeec3012d7786cc78afa022a5f2393ea4496a834b6dca553373378818750dceb26d761012e3787437a0c07027a8f41c03fdd841d20ad02bf1858f9c2ce71558bdc08302c5155d7a1740206ae12c7ddf1421a18194724c0fe580684462c02f32a71ebc40125abf1de897cf646fd676a6b1bb8e9b8b8f15822eb6e5a4c4c96fa30c811bb430711808b616046e777f3564a4191bb42d160ad434a5dfd47c3d496cf07c7a88802b2cf9cd5b5b4b7b301294c28acdb8633378ca7231cbf0a484a85366015773de67398d343e6a432017d9897cd16d9aa0bc186649f5389551604080b48705a9bc0cd58036826a1ce5b1ede108a73322bc56a196608497cac224c3974b00c05515942bc2696c1b4132a12d1d365d8116bfa4e66f00c7cf3026afc0e31336b00c39ba5a01a557a37bb54d9798b56a0c61d59b06b3b55531424af74350fc9d8bf2ccd983ca6c885cd3f0036d83c1df549660064e8ed6b0afc41cdd11b54196abade450a3388d149a1c004dc8d3804e9184b378c02d5d7b445fb60b765929ab97c1b2c60b22185828776d89b94438c1557d37a99088a5feb04b3274c70110917b642e74b427a4f3baa6217310a35b2e7b16e53545c078b728bac496e1958216b4b9bc1c9b5570fb32a760c4584da375c7a7ce1ae24c1a49792d4b45e34a991ea37ac19c7c1cd6a
e20fc24cee859b1fa7ff5d7061f74209ed308eea90509fa268b93c078386a30b0c3da0401428083b0361d04e1041c6a097e19b36a3ca868ac9b1169971eb45ec4f6cd9144519b968ad121a83a712fa492cd7933b0b8334ee77a46942b03b50e59f287fae2abb52b4762f16827301d84c680df57b110cc1286418e7f9c757eb249a7cb2de2bbd3ea87bbcde1e73b768981bceb10ca7c4c3f749091e8fa6c33559493a7d179fdf23157c14c9f1ff900
+Ciphertext: 3092a396d5e6271e953294a36e5734458ffe214d97e4ef119a9d2913895abe10a7d912cca2eff274902c438d4bab54459dbb60eeab28c61cd3a82480df419949cfa770d5bb70d3dd139106d9fc4a6c961d8bc4932efd97a8d03335ba48ea3ca53c0475c77e55c3b25012142870772ddedca7a2711154620cb29385f0127d3cee9c4c3de0bc9e773522f26033fa1331e091b92752a70b7a24f7d512c44fd0954b1f7100d8898beae66493f9f8344f906bfdf76ba92e62fb6044d926cd130527b51f2acd3ddd74177ca145ef294705b689d406e44f72937fb860a8a3a8ee37f458f5c8da0b9871cbfb92b214cc947ad21b75b158bff964a061f4a34d4659efa1bb5c04866df44fe041db3cc005985e42ef9edd540d6ccda3ec869474461eead96534b9f73e303cebdfe9e5fc7298cdf906a74f247f8ce961013d29a8c79c11b0c9f4baea1ac4fb845ffc18938a697c7be1e76428f1663d0156f7c96f160a3d0add1bae1594a69b52e7f00345a84575c6dee0f4e01bd91c29630b981cc0a2e3997874b3cb408d567427f052bf45a13f3abe407f7e1023e1f56db71018dbdf43302930af3019209461355faf3eaf3811808e4ebe2dc1bf7596ee1569257c51095e6cd2dd9ce06bddcd2077fb8c264e6ed47ac890eb350746771f9806c7d6a2025534acd09bb59649f9a944d1711c36138159af543d237e03fe8c999545ba0af8e4db39125f709c0a4c3f5c15874d8f9f6973ac1edddf14cb9742788e477821acbc05cb6449e8186d9aa8e7f9382babea51e82f8926baadbff04b106d361e3dfeb9e451c7e110175082d753036c7229115c02ad7e8b7575221e4d114c9067fc8ffa02bbd92dcb508bea101d51c732bd4b55e7863308182eca3d6ab32e18b5ab7aff583be670a7a10c4cc38a5d7a68a21de8f62a5a977fffa2529ddeccff94d42d0bcf47250fdf967220f74b3f93ba865c3e55c6129e3c35b71706d793da43387562569dee0e9ae9412046ef333a1af5a6fc7d32804419d6247f63035aa61de02ca481db16f0b3cbbd104237445b8d860e7cb36629d18652c73eaf32bcc1e9bf426a8f8359bc0f26c96792e6049a92398b668ff8f3d7f986108680b8c3af9c84f3bac096dfead94a7f3f2a79cefcb6988048a9de4a3149749b2ffc672635468000d81ef31cdbb7a8b03343c810f9d42752335d7dbe8a8a3d334d2c59472f2f255049f2eecd09d53ab38013a6ee5b142a7891031a382262c597609a23bbcb37d74b602e814075aee24abc3dfa2f8adeb265c92efcbf17d6e7ebb722fbf373a8ea24b96c2cfa4da00539ba8ec3652faf0280559b7db416466ae9f505a48b04e56130dcc8132205e8ab9787d5800a8645f5af4e272e770a619fe1a377085b5193c0803f91f5e
2a77d0d314ce382410fc20cc4af7ecbb5ad8478ae444251e14bc3cfb1ece07a06b4f38f7a717ce31a6f62cf2114f3b0f6d5a29119565b110e83474f02c8f4b2c3de0bd1e79422eac771c22087a453b410f873ab42029860f525335ce44ae518bc005aa003fdbb74c051532470c02f486910703a960f9b5c1768b2508d84c65e5ded498a7430aaac7723942296a1701fdc9f3784fa1c3fab906793c3713c5422a90c64b43b0778894fa712c2c3f6851c165589c33b8b4cdc77a3b563f729dc912b29e7305a3a44c9a49c2ae68ed6aa58223fe6de0d4abfbe1035ca7df658ab808c8849cdc03a821d602339a37b45442b271e898c40e8a1fc3c191572bbb561489b2fa7bc327be23743eca71a89b7a9542a34677a1e26bd2502d0e5eaad1e7152e87b83127e6b293f74676e82e18d342c73abcb715662c97f1477e9fa04876159321af6d4a725630cbbfc1ba2d0c6848049d0486d4df4551f7dcef40845f8c4f184b4cac06054a42c024017db777e3ed3015245b991ff9221545a7359ee16371d26e6b015d35114e2fe6591db14fc51074d4481d8479743ad623fe8a3bbaf6e38ae591a3890790ee9f46bbdf39eda04ace9b6970265343952052bc898c8f3e55d5d64183f02d3c46564d4fd8151e51945cebe0132b2b7afd0f6ac0b5f0f352331b0ba4af107f251aa4ec3102669013cbf33aa631f56cd7e5deb53205161a267b0324e8a048bd93444e69f872148429b7f2bd6ead136a4349119d6987150849d80300be347b31bf8e57063ac7117c67bfff2272a4fe617494edd693d267fba53
+Shared Secret B: a7629f3cdc86dfcee5e03d1b81a2be9c8a067878c8d2edef332969e8279b4ef0
+Shared Secret A: a7629f3cdc86dfcee5e03d1b81a2be9c8a067878c8d2edef332969e8279b4ef0
+Pseudorandom shared Secret A: 55f4b50a369cab449018f5c68a9c53be7303b72d1c3613d594633fd68e014f4d
+Public Key: bf0c06ba3711922729dc720ec8207efec1132b39c5d75646ed5059fb636d1f206788087e2c7b0a9ef247e58567208192fe1b6fb5b67eecb91a935a67d40a35d1cc8b2893a598616695933629369dcc655e061c244ac8216b1b0b0cb41afc1918b9ea0125434d7045ac6dc50777db3829c246fc469cf33a8daea012a9e555f721945cb2679f0839ee36a85f25a53517c6c813c450b32238c7858bf8afb71c5c62910ed3da3f2c769034aa4a34e163de049a81a265bf835837ac5cc5377420f6a8654111852c59d69687b45041e4f04b8f6944cc876e4fe77322e5bca5093a6cf1913b4a3937253e5cd825b6e04d97211024b47255981dfc669122359249cb77c8f2a5f699400aa939bde5a121e17700982ba99b2a221a8a766a1e561a10829685b94a78a93cb79d15bcd2016de2132481289fe3899008c29a1944c5ed900c04f8a5f4978502670edf939b79c77d37f412f561a9a1174aead3ca08f22bf22033e283a54f12c66cb28aec754ac12cc5d73c07df2cc50b660b6b426e0d58a9d6f33d82b593305bc95d7b1377c00638f294b0983c98f242370c54dba65fccf93acc31c2f0709017ba8f0ac0b9eaf983d0432905e32c18d83ca04b0c4e0c7e4ecc7f6a285786d23f13e9680a4462cf90180ed816480666fcf613262440a517ca4fa702a425992667a5989c8ed3f935cd52b6456a6aab903634d50a1ceb484059c1ebfc729db451abd26444826270c4645d852ede38aa9df531fa649623f3ad90fb5a19957f5ba7556124630e39b723d114058b8b602763d2996c2e106c7aa623f01b6b3a154bf1123df956a3f4a319855722ea8856bdc342f6c0cd1a2bbc0ff697dce97c5fc341f87a4d6a92c312f689171972bacba9b42538b3c6bf055c841e5067c79921008c5527539435b221680654e7ea9894fa0ca1c10e21043109446885e1213bdb747e288db0d4af204bba169684a68a5c6e7010732caf169cce7f81421cd90100d0b90097393b7390eada62ff076a1b216860fc4f656a35979489e9f60a0a85623e04b085d2ce05f05a7335a18b9498c51083fac2c898c630afc5b34ae8175fb92fcfd90018d45ead23c00714620fd05d8d8607abe817f919363752119aa431b1235c4f37ca6fd06b4b959ad84c922af275ad71a770cc480cd77edc501d2692523a29bce78886e8f78c7dba5dfa322529ab9b2c11928d095326cb33f80620aa61519d08b4dd326fc83554f7dbb3073206e251b3c3cc3ddfaa02cea940b7f52c7c6c0bbf9a0bf5476eeec64e59a0190229a65e082c4f354cc52029a15abe62c372a6c1a136c621c0c00b607ace31a23b3fc602ec96769fca73a3b9641c7164bc309a1c85906d8539a6ac018c02455718597da441bc977b4a651d64eba74fa6667
2f3280b3737ca65c905380b65f814025abaef68c13230b2a4d4b28a31a29333ac5ac001c3c753964529b1574507c72a024c2f1928c28ec56689a76daa401f4300733e65a54c6386799871c472448ae4b2a49b5884bcb62ae7610df67edfdac08c361b719b2a17746b67fb778bfcb5578a7da8875920c297c4e876e0d03a244cb38ecb7ddbc58f18204ab12b8e3bc394fe15c2eb208cf50b0e2f8c144e5ca74bd165531b6a9d048f57cb05c7a4982e1a2324c33438b1a9e7a9c92f07b90ef389869b25673971ccf50906b7ba40da3af4f75ed4a6018d9368277816ac6cc6dbe7257530c5c88baa1c908917c08695899198bcab62808f1ec977da0c036b3c5c6b5b6c810b638532b6b19ab6103a2066d6487341a63a83205311a4550a5aa5553d3041bc5fa779a1a118f5831e28c473622b0013a351d8eba3ba4a6493b83a7737c1cf827732d65bc1d29eca4c1382f97216b81ac4996a90d025f1fa76e1aca81ee917c8c3939ce1b54a26b9bc4162424c3bc0f642f3f82df958891b8b76a76248d4d8549de9844d7aaeced7aa8c8485bd5759a0194040c671f4fcbbd1b416871307b12304fd000287a30635293dfa203184d996d71751be729245b60f5673b333bc4599b36b0fa705b0c15f63da3597616d600374ac0491f64b940b36618df7c84784cdebf3297b101b9771c68df5477f712b722918ebe11e13d494128c57c5914dd3257353146000e07f12a1c08fa35a9c60aff5b84f8396c8a97b65f9eb819bfa5dd7bccce0556a7e45a60bb22ca41ba2a0a889815c375edf5f6ed3e1b2a5
+Secret Key: bb996bafe2a8ac962dbb7a294cb85eee1ccf17222512f495fb151970b032cac8a9c1da4e39948462aa5fec999a2caa616b421c566ac995b5609b36ae8b7678c86624a9c4204416af525490ec95cdb5d033763a136cd75dbbd00b0948bcf32aab324babbfd9256c37badfe41f0db795f0c8268de2973a8312464350474c250c4651999a43ae651ef699a76c30bad6038fb8640ed7f474b7e502c6140a1c00bfcdb7c2844c3c49538c575172e913a108927f687943aa5cb4a6ecbcbee4b3d0ccae5434c45b819829e1559c950254b96debe5b04713b9d38949a8db25b56b02d9f80c0a058980363cdcb06e12b267f8fbb874251cbfdcc9001302bce1b2c32b3d4fb8ac5a832a8807cff8b22d5e5728742709f9c177a3f75a92c74b11e846c09b5e92426cfe809e2d3a7987076407d71c6d668daef969c48c9d8f1083d5b5985af85d9107766c678bdd2b169359b24b57b08c941b47b1758ec7be0f043e8ae428de57775e83a009079fbfe4a85dd23ccd503bc440aa2d8b3fa8ca095a78946403275c4c1f36d42168a85272e46e6d77bae8f62175b7c2ea191db082189bfbc7f25a8b175c3689d14b2fb39c5da084aa1c25014717c42a017066b392177721b19c2a595dbc895fa4ca93a2247a758a8d491242d577095cf764aed8a5d0710e3f9156ac41be8e97103d6559d93b3418e0b6f104b7a90966958b5bb669151959c93c950ef91a2f740a3d69f634b45b2298a1b69cd1289043ccdb0c3c5d26925637941e82a073893ecaa09008e444b72508eaa582e513c5e9d797ee8757a92c8aa9f5bd5b85b20b5cbbb2343d7930a836c4483b0141c6387affcbc5a1b76b74697ed89c4fd45649cdf67fed040c31693616f8af3437cafc0b6cdd0842f2aa4da2966e4c4ac198d728019604052533b3924abbdc031e12ac6bbc1805e887d14131999c4f89f22a09c767e0aa9d02b4626f1350ade2aff290b25e26cfa44aaecee25883a3bea8922912274f761bc9b858104bb6b521468d097b9d21aa35ae62a4f5d50d6201cd47076319556131936f2408a66f962f2d4023e44aa16409cf07391a4e727b87328353b66860d7776e24a1c61b09a76a6f8e62a2df17c0b8349710a9a7c8b82f6ed5cfa7a05db4d34a408a9395bb97f9570f480214354127edf20aa3899d176ab55a38ac0074290e6aa292738a108932b97b410c928e8a1b58349a340d81a380c8c65ff602e6c5235a8732c7e0c0b7269cef081a6975ca3e5061bb074088dc0010d8b4ad4c86b7552c50ec0bec41c1f00b2c0f2225f2b7cf8494b66bb122e22acf8ff818a5570214fc7dda426bedd479a1f41d3777a361c42af1113d775186f641cda0178f8d1c0e9a7799df260a9292744daa6c41c63f2a10424c470ffbf35c7
34967df3431ee073ed405521d0075e1563b51484eb32b8e58b8b9bd04770ba6b04e633343146688705cb4e1088ef80b1b5853b043cb450a7299f97a7933a5a527998a57ca8715189cc54878fb9dd26162591359c3c195b1ac01623a2916a13eb4e84be38c40459343e2723c52d2a60c2b001a6b4a561a23c02420333c572744babdb99e0be7c4ef76c216ec21f7308555f09e956c43e9b0912fc4a5941aa9b60902f5400933b4a66540c770e184e47c5c2697c3eab94dd3542a23f5051cf3378fd7590aac3f62664ba793531d09d0b4c3a84016ce49173a02aac440aa685f0bcaade1898f6043f9029f91a7808b22015fb11927e6066e96750f392a53b1a262c00a7f5c3afae8751eb2588dd21e0d592b29f35129427a305bacaf198aa606bf7f73cfc9816b7e17621de59bae349ae0a60b55b261a2479ffbc4cef3db7c8b0a802bf893984423454c434622bdddb67d462c723bc5208072cdfe792c8b266cde8914ca409dd45b03d9f7bbb459c119b36b7618aa0d14382727b19364907389aa588775e5b075ecc6059827040a9c4dd189a73c7724160288a4774b8fa23553b43d330538200721d28b8368fb154fc9b16d9a694119a7da5409a884b2c2fb7cbef43bdf20caddb6b788043767f1789eab19f1675285e566a6599331dc44a1e274bc0661a0b637742076224436136b1aaa419751b9a9d4e993d7b5b5f2bc0ef3b97bda939be596c72eb460f1585767c9369ed39402c52f29903bdc52a43fd356bf0c06ba3711922729dc720ec8207efec1132b39c5d75646ed5059fb636d1f206788087e2c7b0a9ef247e58567208192fe1b6fb5b67eecb91a935a67d40a35d1cc8b2893a598616695933629369dcc655e061c244ac8216b1b0b0cb41afc1918b9ea0125434d7045ac6dc50777db3829c246fc469cf33a8daea012a9e555f721945cb2679f0839ee36a85f25a53517c6c813c450b32238c7858bf8afb71c5c62910ed3da3f2c769034aa4a34e163de049a81a265bf835837ac5cc5377420f6a8654111852c59d69687b45041e4f04b8f6944cc876e4fe77322e5bca5093a6cf1913b4a3937253e5cd825b6e04d97211024b47255981dfc669122359249cb77c8f2a5f699400aa939bde5a121e17700982ba99b2a221a8a766a1e561a10829685b94a78a93cb79d15bcd2016de2132481289fe3899008c29a1944c5ed900c04f8a5f4978502670edf939b79c77d37f412f561a9a1174aead3ca08f22bf22033e283a54f12c66cb28aec754ac12cc5d73c07df2cc50b660b6b426e0d58a9d6f33d82b593305bc95d7b1377c00638f294b0983c98f242370c54dba65fccf93acc31c2f0709017ba8f0ac0b9eaf983d0432905e32c18d83ca04b0c4e0c7e4ecc7f6a285786d23f13e9680a4
462cf90180ed816480666fcf613262440a517ca4fa702a425992667a5989c8ed3f935cd52b6456a6aab903634d50a1ceb484059c1ebfc729db451abd26444826270c4645d852ede38aa9df531fa649623f3ad90fb5a19957f5ba7556124630e39b723d114058b8b602763d2996c2e106c7aa623f01b6b3a154bf1123df956a3f4a319855722ea8856bdc342f6c0cd1a2bbc0ff697dce97c5fc341f87a4d6a92c312f689171972bacba9b42538b3c6bf055c841e5067c79921008c5527539435b221680654e7ea9894fa0ca1c10e21043109446885e1213bdb747e288db0d4af204bba169684a68a5c6e7010732caf169cce7f81421cd90100d0b90097393b7390eada62ff076a1b216860fc4f656a35979489e9f60a0a85623e04b085d2ce05f05a7335a18b9498c51083fac2c898c630afc5b34ae8175fb92fcfd90018d45ead23c00714620fd05d8d8607abe817f919363752119aa431b1235c4f37ca6fd06b4b959ad84c922af275ad71a770cc480cd77edc501d2692523a29bce78886e8f78c7dba5dfa322529ab9b2c11928d095326cb33f80620aa61519d08b4dd326fc83554f7dbb3073206e251b3c3cc3ddfaa02cea940b7f52c7c6c0bbf9a0bf5476eeec64e59a0190229a65e082c4f354cc52029a15abe62c372a6c1a136c621c0c00b607ace31a23b3fc602ec96769fca73a3b9641c7164bc309a1c85906d8539a6ac018c02455718597da441bc977b4a651d64eba74fa66672f3280b3737ca65c905380b65f814025abaef68c13230b2a4d4b28a31a29333ac5ac001c3c753964529b1574507c72a024c2f1928c28ec56689a76daa401f4300733e65a54c6386799871c472448ae4b2a49b5884bcb62ae7610df67edfdac08c361b719b2a17746b67fb778bfcb5578a7da8875920c297c4e876e0d03a244cb38ecb7ddbc58f18204ab12b8e3bc394fe15c2eb208cf50b0e2f8c144e5ca74bd165531b6a9d048f57cb05c7a4982e1a2324c33438b1a9e7a9c92f07b90ef389869b25673971ccf50906b7ba40da3af4f75ed4a6018d9368277816ac6cc6dbe7257530c5c88baa1c908917c08695899198bcab62808f1ec977da0c036b3c5c6b5b6c810b638532b6b19ab6103a2066d6487341a63a83205311a4550a5aa5553d3041bc5fa779a1a118f5831e28c473622b0013a351d8eba3ba4a6493b83a7737c1cf827732d65bc1d29eca4c1382f97216b81ac4996a90d025f1fa76e1aca81ee917c8c3939ce1b54a26b9bc4162424c3bc0f642f3f82df958891b8b76a76248d4d8549de9844d7aaeced7aa8c8485bd5759a0194040c671f4fcbbd1b416871307b12304fd000287a30635293dfa203184d996d71751be729245b60f5673b333bc4599b36b0fa705b0c15f63da3597616
d600374ac0491f64b940b36618df7c84784cdebf3297b101b9771c68df5477f712b722918ebe11e13d494128c57c5914dd3257353146000e07f12a1c08fa35a9c60aff5b84f8396c8a97b65f9eb819bfa5dd7bccce0556a7e45a60bb22ca41ba2a0a889815c375edf5f6ed3e1b2a54925fc9825ab6619e9f3d54ea33b67a9efddbea0413199642d0a8eaac27fc0c8ec5d3df80fa00039d70206d0a30483b1253212ac2f0177c5168a36d5245ef30c
+Ciphertext: 296dbf5fb07232cf366c266c00a17620410c7de2091ef7bd57fb9acf0bf1fd62e9da964e935ec5b0c2048f1b29a168c33fcf01472ddd6d74d1e623e637a2ebb8e3613a48e8bc3a58db85d50361f328e4b044f5f8c3caa40583658cdccd2c8a8f95885e30d8dbbf71e5e74bcc8a907b69810279727f2022d77fce8430a854a3c4d019324d0d4d56c3145774cf19fa7d16bcd5645f4fa232e402d448082ef04ac1a7e1a77b3714016d601bb5460f3ecbb9c5f3bcebc9bec5d0bbe5786e7d369c530aafd82bea4e29178960b5ada5fedd0a60ba8599d6bb1f640383ea0022217d56af252d47b174b7eac166436759b20123e63913e6440f871d171931b5902300559af41611a0da2a28fc1bec458097d68d3a1c5e4fba1cf730c6fec6bc3e67d3333a484e80c711bb8dd82e6a681222cc92d14b63c39c986dff23ed82da723785bd8385efdd6bdfbc3db8cb2e8873c7b1a4316923c9ae6dd85430ea4484a024a6e688542286b352a47b1d6d9ace7b15d4cefd1a06bf81d843743d657518df289f091f1ccf50596ed678f7fa90bb6d0eb11904e24535097d8d372ebb7fc21d8360844b2432120a1e4c982fef6d59532ac1e7f3ce99688f9e5432062e79d401c966933c46abbb56413e899d13b13d96f3495690c0842ca27c9b9133648ef7b7af94f0fcca577d4e5410c4a57825da5d31191f2ab477837eaeb6a84c4680b2786a4a1d096b71fbfce5a5a3adb166e2ff60b899ef297580a4c1e3267318998990bd2274fce38a71a861a80f03e416f21eeb8d433f8147e5c74eb78de10db8418c05125467893e1ca039ee344c0d476096cab8c28fbce7f7fbd16260e2bee1c3e6333c3590caa32f39d48a93ed93547009b4ba2d4744fc88b430ec834c23b0401d6d0d8c24c264e3f52e961a7d47424d5a82f19ada13b1f3d7811a40210595f5fc76f1def5e4492dd465706520a6458579ec272ca0eee13ff03d1ce2017844e9dd1338091873bc15f46cc4aa5127e7baa86436ee6512b0a41df833ffdc2c28cd48421b7457e91d6f2c43855e3fe3ebdc4a72efbd21ac54cae0cb670016ed2c58196b6b32d5edafa481c50893cb718010e3d108b410ff89c6771bfc19e6f4df29b32bc091567eaa64964052ab6b1cebbbc45cf5a00aac5232d15cfdf1c870a3c4604470f45fc9d92ffd748a2363ff88568158359f76836f30171514b6d58248eecb445655cbcd89da8338332d3097d752e74d9db6ced978c7d1ff6af50fa72e6679b46180852d366803a78de16e856dd06895995bae213bb49ef98b1cbe978a1a3e490b837d69743f1e60c3ddc445d4ac6eeaa844b69b2109b7bcf1e922d93412d0be1fabb74f2acff757d3b38b0712d57f024b7d6fbd77ee75c5651c9e953014d966f718a21
85cae26c25b180daf5fe424d4d895d3045f1fa37b9fdffb5fc87ec4557af5c8f661d7f3910b087aba0819f77ed24d18a30717223549e57df94b3be5c8af8413a0ce54e27d1f5d47b99a5c3984963fb1e95a01cde35bbe44a1b43751d0c31c6f05fb9c8fefc502e879ef891aac14193dfbb4a0bb898ac23188866274da8dc603170a36bcf88837cd75c14bf32de1bf43a675a4bd919d7c3cf91b9925c2477b1e0d6d3188b34f6f4e8759fa70e01da5c99b033fdb035cfd2faac0b1c409bc59d2eb018b6dc966670bc1cc24d8313f6802fd50fab4d60dcaaa52a94ba1d1c2a58bb956d7bd4cfb4413b2a4e2560e6f5056cfff2bbacaf24586dbe1a82388f2e87fa02820ed541edf34a542f2249c322de5ac8bc3e05a2374fcdbfea1a0f6dde5c196a7655d821857795995bf9ee60ed57156d8d5c1bf9b4131e28c38e670c2ae404c2ed6e455e27da936d91723915e986a1730623886914d8ba184b01eb0cd8b89476d1ed67dadf8b6ecc5ddc07f761ecdf5785d20226531fd6060a4a552a927f13f142b26fc3c47b8c8f6852ca7d5d43c38c5e2654d229942885b9e553f83c104f5271cf227f086f388348fcce45315cc0b1cf219ac30d98d6f4a7d9d701ddd61946955e610ca1fc2af2748dfb28a3a51d5a9b7bde5de00d746512a65f1067b8566f9b6f52478b08a5d51a20ce4d06e71351fdf57913fb805e0eb64334dd3372651940d63bb7e907711bfd2aee08f056561c336c4a0ab0d029d02d7ffc1aa8aabd220f785d80c94f37eadb3b7927eb4fbfdb9bea0d95a2d685bde56963504ae
+Shared Secret B: 1c04260fa850a0ab3eb1faf6124c180bfc35a4132aceaf1c0c6d9a20b2518690
+Shared Secret A: 1c04260fa850a0ab3eb1faf6124c180bfc35a4132aceaf1c0c6d9a20b2518690
+Pseudorandom shared Secret A: 290d2e3c41f3706a8a4a9fb7d7b136708e32ec01b22964f75a64c905e0227aba
index b66809f01e906d229b88f0be25dc3cccb400ecff..6947f9ac88b2f2b02910305c2e728955cf0094ca 100644 (file)
@@ -14,9 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301, USA.
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
index bfc22fc04e27943fcd975c0093aaaf34d206457d..d7a1172dec0f9408e68bd52463cadc0dce29880d 100644 (file)
@@ -271,7 +271,7 @@ test_cv_x448 (int testno, const char *k_str, const char *u_str,
   err = gcry_ecc_mul_point (GCRY_ECC_CURVE448, result, scalar, point);
   if (in_fips_mode)
     {
-      if (err != GPG_ERR_NOT_SUPPORTED)
+      if (gpg_err_code (err) != GPG_ERR_NOT_SUPPORTED)
         fail ("gcry_ecc_mul_point is not expected to work in FIPS mode for test %d: %s",
               testno, gpg_strerror (err));
       if (verbose > 1)
index 2355859fc92dee21a5baebce216e179997ef3b5a..f060bb07abb63d448667be3cc3b44d9604a52983 100644 (file)
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 
 #ifdef HAVE_CONFIG_H
index 0ccde32691747efa082823b45f6e1a6044b79aae..bfca4c23143bf7003447708d16a2f0f08ffd2c37 100644 (file)
@@ -78,7 +78,12 @@ static struct {
    { "t-ed448"     },
    { "benchmark"   },
    { "bench-slope" },
-   { "hashtest-256g",  "hashtest", "--gigs 256 SHA1 SHA256 SHA512 SM3",
+   { "hashtest-6g", "hashtest", "--hugeblock --gigs 6 SHA1 SHA256 SHA512 "
+                                                     "SHA3-512 SM3 BLAKE2S_256 "
+                                                     "BLAKE2B_512 CRC32 "
+                                                     "CRC24RFC2440",
+     LONG_RUNNING },
+   { "hashtest-256g", "hashtest", "--gigs 256 SHA1 SHA256 SHA512 SHA3-512 SM3",
      LONG_RUNNING },
    { NULL }
   };
index 9d3a3c240c48bb5c34efbb52a5723310b0da08f5..ad0c9cbc6cf4bb43ef2415f0b8b0b9c81ee68f39 100644 (file)
@@ -1,22 +1,22 @@
 /* version.c  -  This version test should be run first.
  Copyright (C) 2007 Free Software Foundation, Inc.
-
  This file is part of Libgcrypt.
-
  Libgcrypt is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as
  published by the Free Software Foundation; either version 2.1 of
  the License, or (at your option) any later version.
-
  Libgcrypt is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU Lesser General Public License for more details.
-
  You should have received a copy of the GNU Lesser General Public
-   License along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  USA.  */
* Copyright (C) 2007 Free Software Foundation, Inc.
+ *
* This file is part of Libgcrypt.
+ *
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
+ *
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser General Public License for more details.
+ *
* You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
 
 /* This test should be run first because due to a failing config.links
    script or bad configure parameters the just build libgcrypt may