Import aes 0.8.2 (upstream, upstream/0.8.2)
author DongHun Kwak <dh0128.kwak@samsung.com>
Wed, 22 Mar 2023 01:29:49 +0000 (10:29 +0900)
committer DongHun Kwak <dh0128.kwak@samsung.com>
Wed, 22 Mar 2023 01:29:49 +0000 (10:29 +0900)
31 files changed:
.cargo_vcs_info.json [new file with mode: 0644]
CHANGELOG.md [new file with mode: 0644]
Cargo.toml [new file with mode: 0644]
Cargo.toml.orig [new file with mode: 0644]
LICENSE-APACHE [new file with mode: 0644]
LICENSE-MIT [new file with mode: 0644]
README.md [new file with mode: 0644]
benches/mod.rs [new file with mode: 0644]
src/armv8.rs [new file with mode: 0644]
src/armv8/encdec.rs [new file with mode: 0644]
src/armv8/expand.rs [new file with mode: 0644]
src/armv8/hazmat.rs [new file with mode: 0644]
src/armv8/test_expand.rs [new file with mode: 0644]
src/autodetect.rs [new file with mode: 0644]
src/hazmat.rs [new file with mode: 0644]
src/lib.rs [new file with mode: 0644]
src/ni.rs [new file with mode: 0644]
src/ni/aes128.rs [new file with mode: 0644]
src/ni/aes192.rs [new file with mode: 0644]
src/ni/aes256.rs [new file with mode: 0644]
src/ni/hazmat.rs [new file with mode: 0644]
src/ni/test_expand.rs [new file with mode: 0644]
src/ni/utils.rs [new file with mode: 0644]
src/soft.rs [new file with mode: 0644]
src/soft/fixslice32.rs [new file with mode: 0644]
src/soft/fixslice64.rs [new file with mode: 0644]
tests/data/aes128.blb [new file with mode: 0644]
tests/data/aes192.blb [new file with mode: 0644]
tests/data/aes256.blb [new file with mode: 0644]
tests/hazmat.rs [new file with mode: 0644]
tests/mod.rs [new file with mode: 0644]

diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
new file mode 100644 (file)
index 0000000..4210283
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,6 @@
+{
+  "git": {
+    "sha1": "48242cc25cf9fa8af87acef6deb5b5edde1265f9"
+  },
+  "path_in_vcs": "aes"
+}
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644 (file)
index 0000000..cdf781d
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,133 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## 0.8.2 (2022-10-27)
+### Fixed
+- Crate documentation around configuration flags ([#343])
+
+[#343]: https://github.com/RustCrypto/block-ciphers/pull/343
+
+## 0.8.1 (2022-02-17)
+### Fixed
+- Minimal versions build ([#303])
+
+[#303]: https://github.com/RustCrypto/block-ciphers/pull/303
+
+## 0.8.0 (2022-02-10)
+### Changed
+- Bump `cipher` dependency to v0.4 ([#284])
+
+### Added
+- Encrypt-only and decrypt-only cipher types ([#284])
+
+[#284]: https://github.com/RustCrypto/block-ciphers/pull/284
+
+## 0.7.5 (2021-08-26)
+### Changed
+- Bump `ctr` dependency to v0.8 ([#275])
+- Use the `aes` target feature instead of `crypto` on ARMv8 ([#279])
+- Use `core::arch::aarch64::vst1q_u8` intrinsic on `armv8` ([#280])
+- Bump `cpufeatures` dependency to v0.2 ([#281])
+
+[#275]: https://github.com/RustCrypto/block-ciphers/pull/275
+[#279]: https://github.com/RustCrypto/block-ciphers/pull/279
+[#280]: https://github.com/RustCrypto/block-ciphers/pull/280
+[#281]: https://github.com/RustCrypto/block-ciphers/pull/281
+
+## 0.7.4 (2021-06-01)
+### Added
+- Soft `hazmat` backend ([#267], [#268])
+- Parallel `hazmat` APIs ([#269])
+
+[#267]: https://github.com/RustCrypto/block-ciphers/pull/267
+[#268]: https://github.com/RustCrypto/block-ciphers/pull/268
+[#269]: https://github.com/RustCrypto/block-ciphers/pull/269
+
+## 0.7.3 (2021-05-26)
+### Added
+- `hazmat` feature/module providing round function access ([#257], [#259], [#260])
+- `BLOCK_SIZE` constant ([#263])
+
+[#257]: https://github.com/RustCrypto/block-ciphers/pull/257
+[#259]: https://github.com/RustCrypto/block-ciphers/pull/259
+[#260]: https://github.com/RustCrypto/block-ciphers/pull/260
+[#263]: https://github.com/RustCrypto/block-ciphers/pull/263
+
+## 0.7.2 (2021-05-17)
+### Added
+- Nightly-only ARMv8 intrinsics support gated under the `armv8` feature ([#250])
+
+[#250]: https://github.com/RustCrypto/block-ciphers/pull/250
+
+## 0.7.1 (2021-05-09)
+### Fixed
+- Restore `fixslice64.rs` ([#247])
+
+[#247]: https://github.com/RustCrypto/block-ciphers/pull/247
+
+## 0.7.0 (2021-04-29)
+### Added
+- Auto-detection support for AES-NI; MSRV 1.49+ ([#208], [#214], [#215], [#216])
+- `ctr` feature providing SIMD accelerated AES-CTR ([#200])
+
+### Changed
+- Unify the `aes`, `aesni`, `aes-ctr`, and `aes-soft` crates ([#200])
+- Use `cfg-if` crate ([#203])
+- Rename `semi_fixslice` feature to `compact` ([#204])
+- Refactor NI backend ([#224], [#225])
+- Bump `cipher` crate dependency to v0.3 ([#235])
+- Bump `ctr` crate dependency to v0.7 ([#237])
+
+[#200]: https://github.com/RustCrypto/block-ciphers/pull/200
+[#203]: https://github.com/RustCrypto/block-ciphers/pull/203
+[#204]: https://github.com/RustCrypto/block-ciphers/pull/204
+[#208]: https://github.com/RustCrypto/block-ciphers/pull/208
+[#214]: https://github.com/RustCrypto/block-ciphers/pull/214
+[#215]: https://github.com/RustCrypto/block-ciphers/pull/215
+[#216]: https://github.com/RustCrypto/block-ciphers/pull/216
+[#224]: https://github.com/RustCrypto/block-ciphers/pull/224
+[#225]: https://github.com/RustCrypto/block-ciphers/pull/225
+[#235]: https://github.com/RustCrypto/block-ciphers/pull/235
+[#237]: https://github.com/RustCrypto/block-ciphers/pull/237
+
+## 0.6.0 (2020-10-16)
+### Changed
+- Replace `block-cipher`/`stream-cipher` with `cipher` crate ([#167])
+
+[#167]: https://github.com/RustCrypto/block-ciphers/pull/167
+
+## 0.5.1 (2020-08-25)
+### Changed
+- Bump `aesni` dependency to v0.9 ([#158])
+
+[#158]: https://github.com/RustCrypto/block-ciphers/pull/158
+
+## 0.5.0 (2020-08-07)
+### Changed
+- Bump `block-cipher` dependency to v0.8 ([#138])
+- Bump `opaque-debug` dependency to v0.3 ([#140])
+
+[#138]: https://github.com/RustCrypto/block-ciphers/pull/138
+[#140]: https://github.com/RustCrypto/block-ciphers/pull/140
+
+## 0.4.0 (2020-06-05)
+### Changed
+- Bump `block-cipher` dependency to v0.7 ([#86], [#122])
+- Update to Rust 2018 edition ([#86])
+
+[#122]: https://github.com/RustCrypto/block-ciphers/pull/122
+[#86]: https://github.com/RustCrypto/block-ciphers/pull/86
+
+## 0.3.2 (2018-11-01)
+
+## 0.3.1 (2018-10-04)
+
+## 0.3.0 (2018-10-03)
+
+## 0.2.0 (2018-07-27)
+
+## 0.1.0 (2018-06-22)
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644 (file)
index 0000000..e03d07a
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,70 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+rust-version = "1.56"
+name = "aes"
+version = "0.8.2"
+authors = ["RustCrypto Developers"]
+description = "Pure Rust implementation of the Advanced Encryption Standard (a.k.a. Rijndael)"
+documentation = "https://docs.rs/aes"
+readme = "README.md"
+keywords = [
+    "crypto",
+    "aes",
+    "rijndael",
+    "block-cipher",
+]
+categories = [
+    "cryptography",
+    "no-std",
+]
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/RustCrypto/block-ciphers"
+resolver = "1"
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = [
+    "--cfg",
+    "docsrs",
+]
+
+[dependencies.cfg-if]
+version = "1"
+
+[dependencies.cipher]
+version = "0.4.2"
+
+[dev-dependencies.cipher]
+version = "0.4.2"
+features = ["dev"]
+
+[dev-dependencies.hex-literal]
+version = "0.3"
+
+[features]
+hazmat = []
+
+[target."cfg(all(aes_armv8, target_arch = \"aarch64\"))".dependencies.zeroize]
+version = "1.5.6"
+features = ["aarch64"]
+optional = true
+default_features = false
+
+[target."cfg(any(target_arch = \"aarch64\", target_arch = \"x86_64\", target_arch = \"x86\"))".dependencies.cpufeatures]
+version = "0.2"
+
+[target."cfg(not(all(aes_armv8, target_arch = \"aarch64\")))".dependencies.zeroize]
+version = "1.5.6"
+optional = true
+default_features = false
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
new file mode 100644 (file)
index 0000000..ebf0415
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,38 @@
+[package]
+name = "aes"
+version = "0.8.2"
+description = "Pure Rust implementation of the Advanced Encryption Standard (a.k.a. Rijndael)"
+authors = ["RustCrypto Developers"]
+license = "MIT OR Apache-2.0"
+edition = "2021"
+rust-version = "1.56"
+readme = "README.md"
+documentation = "https://docs.rs/aes"
+repository = "https://github.com/RustCrypto/block-ciphers"
+keywords = ["crypto", "aes", "rijndael", "block-cipher"]
+categories = ["cryptography", "no-std"]
+
+[dependencies]
+cfg-if = "1"
+cipher = "0.4.2"
+
+[target.'cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))'.dependencies]
+cpufeatures = "0.2"
+
+[target.'cfg(not(all(aes_armv8, target_arch = "aarch64")))'.dependencies]
+zeroize = { version = "1.5.6", optional = true, default_features = false }
+
+# TODO(tarcieri): unconditionally enable `aarch64` feature when MSRV is 1.59
+[target.'cfg(all(aes_armv8, target_arch = "aarch64"))'.dependencies]
+zeroize = { version = "1.5.6", optional = true, default_features = false, features = ["aarch64"] }
+
+[dev-dependencies]
+cipher = { version = "0.4.2", features = ["dev"] }
+hex-literal = "0.3"
+
+[features]
+hazmat = [] # Expose cryptographically hazardous APIs
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
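The optional `zeroize` dependency above wires key wiping into the cipher types: when a downstream crate enables the `zeroize` feature (and depends on `zeroize` itself), the cipher types implement `ZeroizeOnDrop` so expanded round keys are cleared on drop. A minimal sketch of the consumer side, assuming both dependencies are declared in the downstream `Cargo.toml` (illustrative only, not part of the imported sources):

```rust
// Assumed downstream Cargo.toml entries:
//   aes = { version = "0.8", features = ["zeroize"] }
//   zeroize = "1"
use aes::cipher::KeyInit;
use aes::Aes128;
use zeroize::ZeroizeOnDrop;

// Compile-time check that the cipher wipes its round keys on drop.
fn assert_zeroize_on_drop<T: ZeroizeOnDrop>(_: &T) {}

fn main() {
    let cipher = Aes128::new(&Default::default());
    assert_zeroize_on_drop(&cipher);
    // `cipher` is dropped here; its expanded round keys are zeroized.
}
```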
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644 (file)
index 0000000..78173fa
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,201 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+   To apply the Apache License to your work, attach the following
+   boilerplate notice, with the fields enclosed by brackets "[]"
+   replaced with your own identifying information. (Don't include
+   the brackets!)  The text should be enclosed in the appropriate
+   comment syntax for the file format. We also recommend that a
+   file or class name and description of purpose be included on the
+   same "printed page" as the copyright notice for easier
+   identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644 (file)
index 0000000..f5b157a
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2018 Artyom Pavlov
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..ce95a3a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,95 @@
+# RustCrypto: Advanced Encryption Standard (AES)
+
+[![crate][crate-image]][crate-link]
+[![Docs][docs-image]][docs-link]
+![Apache2/MIT licensed][license-image]
+![Rust Version][rustc-image]
+[![Project Chat][chat-image]][chat-link]
+[![Build Status][build-image]][build-link]
+[![Downloads][downloads-image]][crate-link]
+[![HAZMAT][hazmat-image]][hazmat-link]
+
+Pure Rust implementation of the [Advanced Encryption Standard (AES)][1].
+
+This crate implements the low-level AES block function, and is intended
+*only* for use in implementing higher-level constructions. It is NOT
+intended for direct use in applications.
+
+[Documentation][docs-link]
+
+<img src="https://raw.githubusercontent.com/RustCrypto/media/85f62bb/img/block-ciphers/aes-round.svg" width="480px">
+
+## Security
+
+### ⚠️ Warning: [Hazmat!][hazmat-link]
+
+This crate does not ensure ciphertexts are authentic (i.e. by using a MAC to
+verify ciphertext integrity), which can lead to serious vulnerabilities
+if used incorrectly!
+
+To avoid this, use an [AEAD][2] mode based on AES, such as [AES-GCM][3] or [AES-GCM-SIV][4].
+See the [RustCrypto/AEADs][5] repository for more information.
+
+USE AT YOUR OWN RISK!
+
+### Notes
+
+This crate has received one [security audit by NCC Group][6], with no significant
+findings. We would like to thank [MobileCoin][7] for funding the audit.
+
+All implementations contained in the crate are designed to execute in constant
+time, either by relying on hardware intrinsics (i.e. AES-NI on x86/x86_64), or
+using a portable implementation based on bitslicing.
+
+## Minimum Supported Rust Version
+
+Rust **1.56** or higher.
+
+Minimum supported Rust version can be changed in future releases, but it will
+be done with a minor version bump.
+
+## SemVer Policy
+
+- All on-by-default features of this library are covered by SemVer
+- MSRV is considered exempt from SemVer as noted above
+
+## License
+
+Licensed under either of:
+
+ * [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
+ * [MIT license](http://opensource.org/licenses/MIT)
+
+at your option.
+
+### Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
+dual licensed as above, without any additional terms or conditions.
+
+[//]: # (badges)
+
+[crate-image]: https://img.shields.io/crates/v/aes.svg
+[crate-link]: https://crates.io/crates/aes
+[docs-image]: https://docs.rs/aes/badge.svg
+[docs-link]: https://docs.rs/aes/
+[license-image]: https://img.shields.io/badge/license-Apache2.0/MIT-blue.svg
+[rustc-image]: https://img.shields.io/badge/rustc-1.56+-blue.svg
+[chat-image]: https://img.shields.io/badge/zulip-join_chat-blue.svg
+[chat-link]: https://rustcrypto.zulipchat.com/#narrow/stream/260039-block-ciphers
+[build-image]: https://github.com/RustCrypto/block-ciphers/workflows/aes/badge.svg?branch=master&event=push
+[build-link]: https://github.com/RustCrypto/block-ciphers/actions?query=workflow%3Aaes
+[downloads-image]: https://img.shields.io/crates/d/aes.svg
+[hazmat-image]: https://img.shields.io/badge/crypto-hazmat%E2%9A%A0-red.svg
+[hazmat-link]: https://github.com/RustCrypto/meta/blob/master/HAZMAT.md
+
+[//]: # (general links)
+
+[1]: https://en.wikipedia.org/wiki/Advanced_Encryption_Standard
+[2]: https://en.wikipedia.org/wiki/Authenticated_encryption
+[3]: https://github.com/RustCrypto/AEADs/tree/master/aes-gcm
+[4]: https://github.com/RustCrypto/AEADs/tree/master/aes-gcm-siv
+[5]: https://github.com/RustCrypto/AEADs
+[6]: https://research.nccgroup.com/2020/02/26/public-report-rustcrypto-aes-gcm-and-chacha20poly1305-implementation-review/
+[7]: https://www.mobilecoin.com/
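Since the README stresses that this crate exposes only the raw block function, a minimal sketch of that block-level API as consumed through the re-exported `cipher` traits may help orient reviewers (hypothetical downstream code, mirroring the crate's documented usage):

```rust
use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, KeyInit};
use aes::Aes128;

fn main() {
    let key = GenericArray::from([0u8; 16]);
    let mut block = GenericArray::from([42u8; 16]);

    let cipher = Aes128::new(&key);

    // Encrypt a single 16-byte block in place, then decrypt it back.
    cipher.encrypt_block(&mut block);
    cipher.decrypt_block(&mut block);
    assert_eq!(block, GenericArray::from([42u8; 16]));
}
```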
diff --git a/benches/mod.rs b/benches/mod.rs
new file mode 100644 (file)
index 0000000..579b073
--- /dev/null
+++ b/benches/mod.rs
@@ -0,0 +1,62 @@
+#![feature(test)]
+extern crate test;
+
+use cipher::{block_decryptor_bench, block_encryptor_bench, KeyInit};
+
+block_encryptor_bench!(
+    Key: aes::Aes128,
+    aes128_encrypt_block,
+    aes128_encrypt_blocks,
+);
+block_decryptor_bench!(
+    Key: aes::Aes128,
+    aes128_decrypt_block,
+    aes128_decrypt_blocks,
+);
+block_encryptor_bench!(
+    Key: aes::Aes192,
+    aes192_encrypt_block,
+    aes192_encrypt_blocks,
+);
+block_decryptor_bench!(
+    Key: aes::Aes192,
+    aes192_decrypt_block,
+    aes192_decrypt_blocks,
+);
+block_encryptor_bench!(
+    Key: aes::Aes256,
+    aes256_encrypt_block,
+    aes256_encrypt_blocks,
+);
+block_decryptor_bench!(
+    Key: aes::Aes256,
+    aes256_decrypt_block,
+    aes256_decrypt_blocks,
+);
+
+#[bench]
+fn aes128_new(bh: &mut test::Bencher) {
+    bh.iter(|| {
+        let key = test::black_box(Default::default());
+        let cipher = aes::Aes128::new(&key);
+        test::black_box(&cipher);
+    });
+}
+
+#[bench]
+fn aes192_new(bh: &mut test::Bencher) {
+    bh.iter(|| {
+        let key = test::black_box(Default::default());
+        let cipher = aes::Aes192::new(&key);
+        test::black_box(&cipher);
+    });
+}
+
+#[bench]
+fn aes256_new(bh: &mut test::Bencher) {
+    bh.iter(|| {
+        let key = test::black_box(Default::default());
+        let cipher = aes::Aes256::new(&key);
+        test::black_box(&cipher);
+    });
+}
diff --git a/src/armv8.rs b/src/armv8.rs
new file mode 100644 (file)
index 0000000..4ecc471
--- /dev/null
+++ b/src/armv8.rs
@@ -0,0 +1,342 @@
+//! AES block cipher implementation using the ARMv8 Cryptography Extensions.
+//!
+//! Based on this C intrinsics implementation:
+//! <https://github.com/noloader/AES-Intrinsics/blob/master/aes-arm.c>
+//!
+//! Original C written and placed in public domain by Jeffrey Walton.
+//! Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
+//! Barry O'Rourke for the mbedTLS project.
+
+#![allow(clippy::needless_range_loop)]
+
+#[cfg(feature = "hazmat")]
+pub(crate) mod hazmat;
+
+mod encdec;
+mod expand;
+#[cfg(test)]
+mod test_expand;
+
+use self::{
+    encdec::{decrypt1, decrypt8, encrypt1, encrypt8},
+    expand::{expand_key, inv_expanded_keys},
+};
+use crate::{Block, Block8};
+use cipher::{
+    consts::{U16, U24, U32, U8},
+    inout::InOut,
+    AlgorithmName, BlockBackend, BlockCipher, BlockClosure, BlockDecrypt, BlockEncrypt,
+    BlockSizeUser, Key, KeyInit, KeySizeUser, ParBlocksSizeUser,
+};
+use core::arch::aarch64::*;
+use core::fmt;
+
+macro_rules! define_aes_impl {
+    (
+        $name:ident,
+        $name_enc:ident,
+        $name_dec:ident,
+        $name_back_enc:ident,
+        $name_back_dec:ident,
+        $key_size:ty,
+        $rounds:tt,
+        $doc:expr $(,)?
+    ) => {
+        #[doc=$doc]
+        #[doc = "block cipher"]
+        #[derive(Clone)]
+        pub struct $name {
+            encrypt: $name_enc,
+            decrypt: $name_dec,
+        }
+
+        impl $name {
+            #[inline(always)]
+            pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> {
+                self.encrypt.get_enc_backend()
+            }
+
+            #[inline(always)]
+            pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> {
+                self.decrypt.get_dec_backend()
+            }
+        }
+
+        impl BlockCipher for $name {}
+
+        impl KeySizeUser for $name {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name {
+            #[inline]
+            fn new(key: &Key<Self>) -> Self {
+                let encrypt = $name_enc::new(key);
+                let decrypt = $name_dec::from(&encrypt);
+                Self { encrypt, decrypt }
+            }
+        }
+
+        impl From<$name_enc> for $name {
+            #[inline]
+            fn from(encrypt: $name_enc) -> $name {
+                let decrypt = (&encrypt).into();
+                Self { encrypt, decrypt }
+            }
+        }
+
+        impl From<&$name_enc> for $name {
+            #[inline]
+            fn from(encrypt: &$name_enc) -> $name {
+                let decrypt = encrypt.into();
+                let encrypt = encrypt.clone();
+                Self { encrypt, decrypt }
+            }
+        }
+
+        impl BlockSizeUser for $name {
+            type BlockSize = U16;
+        }
+
+        impl BlockEncrypt for $name {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                self.encrypt.encrypt_with_backend(f)
+            }
+        }
+
+        impl BlockDecrypt for $name {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                self.decrypt.decrypt_with_backend(f)
+            }
+        }
+
+        impl fmt::Debug for $name {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name))
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (encrypt-only)"]
+        #[derive(Clone)]
+        pub struct $name_enc {
+            round_keys: [uint8x16_t; $rounds],
+        }
+
+        impl $name_enc {
+            #[inline(always)]
+            pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> {
+                $name_back_enc(self)
+            }
+        }
+
+        impl BlockCipher for $name_enc {}
+
+        impl KeySizeUser for $name_enc {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_enc {
+            fn new(key: &Key<Self>) -> Self {
+                Self {
+                    round_keys: expand_key(key.as_ref()),
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name_enc {
+            type BlockSize = U16;
+        }
+
+        impl BlockEncrypt for $name_enc {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_enc_backend())
+            }
+        }
+
+        impl fmt::Debug for $name_enc {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_enc), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_enc {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_enc))
+            }
+        }
+
+        impl Drop for $name_enc {
+            #[inline]
+            fn drop(&mut self) {
+                #[cfg(feature = "zeroize")]
+                zeroize::Zeroize::zeroize(&mut self.round_keys);
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_enc {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (decrypt-only)"]
+        #[derive(Clone)]
+        pub struct $name_dec {
+            round_keys: [uint8x16_t; $rounds],
+        }
+
+        impl $name_dec {
+            #[inline(always)]
+            pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> {
+                $name_back_dec(self)
+            }
+        }
+
+        impl BlockCipher for $name_dec {}
+
+        impl KeySizeUser for $name_dec {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_dec {
+            fn new(key: &Key<Self>) -> Self {
+                $name_enc::new(key).into()
+            }
+        }
+
+        impl From<$name_enc> for $name_dec {
+            #[inline]
+            fn from(enc: $name_enc) -> $name_dec {
+                Self::from(&enc)
+            }
+        }
+
+        impl From<&$name_enc> for $name_dec {
+            fn from(enc: &$name_enc) -> $name_dec {
+                let mut round_keys = enc.round_keys;
+                inv_expanded_keys(&mut round_keys);
+                Self { round_keys }
+            }
+        }
+
+        impl BlockSizeUser for $name_dec {
+            type BlockSize = U16;
+        }
+
+        impl BlockDecrypt for $name_dec {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_dec_backend());
+            }
+        }
+
+        impl fmt::Debug for $name_dec {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_dec), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_dec {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_dec))
+            }
+        }
+
+        impl Drop for $name_dec {
+            #[inline]
+            fn drop(&mut self) {
+                #[cfg(feature = "zeroize")]
+                zeroize::Zeroize::zeroize(&mut self.round_keys);
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_dec {}
+
+        pub(crate) struct $name_back_enc<'a>(&'a $name_enc);
+
+        impl<'a> BlockSizeUser for $name_back_enc<'a> {
+            type BlockSize = U16;
+        }
+
+        impl<'a> ParBlocksSizeUser for $name_back_enc<'a> {
+            type ParBlocksSize = U8;
+        }
+
+        impl<'a> BlockBackend for $name_back_enc<'a> {
+            #[inline(always)]
+            fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+                unsafe {
+                    encrypt1(&self.0.round_keys, block);
+                }
+            }
+
+            #[inline(always)]
+            fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) {
+                unsafe { encrypt8(&self.0.round_keys, blocks) }
+            }
+        }
+
+        pub(crate) struct $name_back_dec<'a>(&'a $name_dec);
+
+        impl<'a> BlockSizeUser for $name_back_dec<'a> {
+            type BlockSize = U16;
+        }
+
+        impl<'a> ParBlocksSizeUser for $name_back_dec<'a> {
+            type ParBlocksSize = U8;
+        }
+
+        impl<'a> BlockBackend for $name_back_dec<'a> {
+            #[inline(always)]
+            fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+                unsafe {
+                    decrypt1(&self.0.round_keys, block);
+                }
+            }
+
+            #[inline(always)]
+            fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) {
+                unsafe { decrypt8(&self.0.round_keys, blocks) }
+            }
+        }
+    };
+}
+
+define_aes_impl!(
+    Aes128,
+    Aes128Enc,
+    Aes128Dec,
+    Aes128BackEnc,
+    Aes128BackDec,
+    U16,
+    11,
+    "AES-128",
+);
+define_aes_impl!(
+    Aes192,
+    Aes192Enc,
+    Aes192Dec,
+    Aes192BackEnc,
+    Aes192BackDec,
+    U24,
+    13,
+    "AES-192",
+);
+define_aes_impl!(
+    Aes256,
+    Aes256Enc,
+    Aes256Dec,
+    Aes256BackEnc,
+    Aes256BackDec,
+    U32,
+    15,
+    "AES-256",
+);
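The `define_aes_impl!` macro above emits three public types per key size: a combined cipher plus encrypt-only and decrypt-only variants, each holding a single round-key schedule. A rough sketch of how the split types are used through the `cipher` traits (hypothetical consumer code, illustrative only):

```rust
use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, KeyInit};
use aes::{Aes128Dec, Aes128Enc};

fn main() {
    let key = GenericArray::from([0u8; 16]);
    let mut block = GenericArray::from([42u8; 16]);

    // The encrypt-only and decrypt-only types each carry just one key schedule,
    // unlike the combined `Aes128`, which holds both.
    let enc = Aes128Enc::new(&key);
    let dec = Aes128Dec::new(&key);

    enc.encrypt_block(&mut block);
    dec.decrypt_block(&mut block);
    assert_eq!(block, GenericArray::from([42u8; 16]));
}
```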
diff --git a/src/armv8/encdec.rs b/src/armv8/encdec.rs
new file mode 100644 (file)
index 0000000..ecf7d5c
--- /dev/null
+++ b/src/armv8/encdec.rs
@@ -0,0 +1,158 @@
+//! AES encryption support
+
+use crate::{Block, Block8};
+use cipher::inout::InOut;
+use core::arch::aarch64::*;
+
+/// Perform AES encryption using the given expanded keys.
+#[target_feature(enable = "aes")]
+#[target_feature(enable = "neon")]
+pub(super) unsafe fn encrypt1<const N: usize>(
+    expanded_keys: &[uint8x16_t; N],
+    block: InOut<'_, '_, Block>,
+) {
+    let rounds = N - 1;
+    assert!(rounds == 10 || rounds == 12 || rounds == 14);
+
+    let (in_ptr, out_ptr) = block.into_raw();
+
+    let mut state = vld1q_u8(in_ptr as *const u8);
+
+    for k in expanded_keys.iter().take(rounds - 1) {
+        // AES single round encryption
+        state = vaeseq_u8(state, *k);
+
+        // AES mix columns
+        state = vaesmcq_u8(state);
+    }
+
+    // AES single round encryption
+    state = vaeseq_u8(state, expanded_keys[rounds - 1]);
+
+    // Final add (bitwise XOR)
+    state = veorq_u8(state, expanded_keys[rounds]);
+
+    vst1q_u8(out_ptr as *mut u8, state);
+}
+
+/// Perform parallel AES encryption 8-blocks-at-a-time using the given expanded keys.
+#[target_feature(enable = "aes")]
+#[target_feature(enable = "neon")]
+pub(super) unsafe fn encrypt8<const N: usize>(
+    expanded_keys: &[uint8x16_t; N],
+    blocks: InOut<'_, '_, Block8>,
+) {
+    let rounds = N - 1;
+    assert!(rounds == 10 || rounds == 12 || rounds == 14);
+
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let in_ptr = in_ptr as *const Block;
+    let out_ptr = out_ptr as *const Block;
+
+    let mut state = [
+        vld1q_u8(in_ptr.add(0) as *const u8),
+        vld1q_u8(in_ptr.add(1) as *const u8),
+        vld1q_u8(in_ptr.add(2) as *const u8),
+        vld1q_u8(in_ptr.add(3) as *const u8),
+        vld1q_u8(in_ptr.add(4) as *const u8),
+        vld1q_u8(in_ptr.add(5) as *const u8),
+        vld1q_u8(in_ptr.add(6) as *const u8),
+        vld1q_u8(in_ptr.add(7) as *const u8),
+    ];
+
+    for k in expanded_keys.iter().take(rounds - 1) {
+        for i in 0..8 {
+            // AES single round encryption
+            state[i] = vaeseq_u8(state[i], *k);
+
+            // AES mix columns
+            state[i] = vaesmcq_u8(state[i]);
+        }
+    }
+
+    for i in 0..8 {
+        // AES single round encryption
+        state[i] = vaeseq_u8(state[i], expanded_keys[rounds - 1]);
+
+        // Final add (bitwise XOR)
+        state[i] = veorq_u8(state[i], expanded_keys[rounds]);
+
+        vst1q_u8(out_ptr.add(i) as *mut u8, state[i]);
+    }
+}
+
+/// Perform AES decryption using the given expanded keys.
+#[target_feature(enable = "aes")]
+#[target_feature(enable = "neon")]
+pub(super) unsafe fn decrypt1<const N: usize>(
+    expanded_keys: &[uint8x16_t; N],
+    block: InOut<'_, '_, Block>,
+) {
+    let rounds = N - 1;
+    assert!(rounds == 10 || rounds == 12 || rounds == 14);
+
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut state = vld1q_u8(in_ptr as *const u8);
+
+    for k in expanded_keys.iter().take(rounds - 1) {
+        // AES single round decryption
+        state = vaesdq_u8(state, *k);
+
+        // AES inverse mix columns
+        state = vaesimcq_u8(state);
+    }
+
+    // AES single round decryption
+    state = vaesdq_u8(state, expanded_keys[rounds - 1]);
+
+    // Final add (bitwise XOR)
+    state = veorq_u8(state, expanded_keys[rounds]);
+
+    vst1q_u8(out_ptr as *mut u8, state);
+}
+
+/// Perform parallel AES decryption 8-blocks-at-a-time using the given expanded keys.
+#[target_feature(enable = "aes")]
+#[target_feature(enable = "neon")]
+pub(super) unsafe fn decrypt8<const N: usize>(
+    expanded_keys: &[uint8x16_t; N],
+    blocks: InOut<'_, '_, Block8>,
+) {
+    let rounds = N - 1;
+    assert!(rounds == 10 || rounds == 12 || rounds == 14);
+
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let in_ptr = in_ptr as *const Block;
+    let out_ptr = out_ptr as *const Block;
+
+    let mut state = [
+        vld1q_u8(in_ptr.add(0) as *const u8),
+        vld1q_u8(in_ptr.add(1) as *const u8),
+        vld1q_u8(in_ptr.add(2) as *const u8),
+        vld1q_u8(in_ptr.add(3) as *const u8),
+        vld1q_u8(in_ptr.add(4) as *const u8),
+        vld1q_u8(in_ptr.add(5) as *const u8),
+        vld1q_u8(in_ptr.add(6) as *const u8),
+        vld1q_u8(in_ptr.add(7) as *const u8),
+    ];
+
+    for k in expanded_keys.iter().take(rounds - 1) {
+        for i in 0..8 {
+            // AES single round decryption
+            state[i] = vaesdq_u8(state[i], *k);
+
+            // AES inverse mix columns
+            state[i] = vaesimcq_u8(state[i]);
+        }
+    }
+
+    for i in 0..8 {
+        // AES single round decryption
+        state[i] = vaesdq_u8(state[i], expanded_keys[rounds - 1]);
+
+        // Final add (bitwise XOR)
+        state[i] = veorq_u8(state[i], expanded_keys[rounds]);
+
+        vst1q_u8(out_ptr.add(i) as *mut u8, state[i]);
+    }
+}
diff --git a/src/armv8/expand.rs b/src/armv8/expand.rs
new file mode 100644 (file)
index 0000000..8e5cf88
--- /dev/null
+++ b/src/armv8/expand.rs
@@ -0,0 +1,77 @@
+//! AES key expansion support.
+
+use core::{arch::aarch64::*, mem, slice};
+
+/// There are 4 AES words in a block.
+const BLOCK_WORDS: usize = 4;
+
+/// The AES (née Rijndael) notion of a word is always 32 bits (4 bytes).
+const WORD_SIZE: usize = 4;
+
+/// AES round constants.
+const ROUND_CONSTS: [u32; 10] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36];
+
+/// AES key expansion
+// TODO(tarcieri): big endian support?
+#[inline]
+pub(super) fn expand_key<const L: usize, const N: usize>(key: &[u8; L]) -> [uint8x16_t; N] {
+    assert!((L == 16 && N == 11) || (L == 24 && N == 13) || (L == 32 && N == 15));
+
+    let mut expanded_keys: [uint8x16_t; N] = unsafe { mem::zeroed() };
+
+    // TODO(tarcieri): construct expanded keys using `vreinterpretq_u8_u32`
+    let ek_words = unsafe {
+        slice::from_raw_parts_mut(expanded_keys.as_mut_ptr() as *mut u32, N * BLOCK_WORDS)
+    };
+
+    for (i, chunk) in key.chunks_exact(WORD_SIZE).enumerate() {
+        ek_words[i] = u32::from_ne_bytes(chunk.try_into().unwrap());
+    }
+
+    // From "The Rijndael Block Cipher" Section 4.1:
+    // > The number of columns of the Cipher Key is denoted by `Nk` and is
+    // > equal to the key length divided by 32 [bits].
+    let nk = L / WORD_SIZE;
+
+    for i in nk..(N * BLOCK_WORDS) {
+        let mut word = ek_words[i - 1];
+
+        if i % nk == 0 {
+            word = sub_word(word).rotate_right(8) ^ ROUND_CONSTS[i / nk - 1];
+        } else if nk > 6 && i % nk == 4 {
+            word = sub_word(word)
+        }
+
+        ek_words[i] = ek_words[i - nk] ^ word;
+    }
+
+    expanded_keys
+}
+
+/// Compute inverse expanded keys (for decryption).
+///
+/// This is the reverse of the encryption keys, with the Inverse Mix Columns
+/// operation applied to all but the first and last expanded key.
+#[inline]
+pub(super) fn inv_expanded_keys<const N: usize>(expanded_keys: &mut [uint8x16_t; N]) {
+    assert!(N == 11 || N == 13 || N == 15);
+
+    for ek in expanded_keys.iter_mut().take(N - 1).skip(1) {
+        unsafe { *ek = vaesimcq_u8(*ek) }
+    }
+
+    expanded_keys.reverse();
+}
+
+/// Sub bytes for a single AES word: used for key expansion.
+#[inline(always)]
+fn sub_word(input: u32) -> u32 {
+    unsafe {
+        let input = vreinterpretq_u8_u32(vdupq_n_u32(input));
+
+        // AES single round encryption (with a "round" key of all zeros)
+        let sub_input = vaeseq_u8(input, vdupq_n_u8(0));
+
+        vgetq_lane_u32(vreinterpretq_u32_u8(sub_input), 0)
+    }
+}
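In `expand_key` above, `sub_word(word).rotate_right(8)` implements FIPS 197's `SubWord(RotWord(w))`: because `SubWord` acts independently on each byte, the two operations commute, and on a little-endian target a native-endian rotate-right by 8 bits moves the first key byte to the end of the word, which is exactly `RotWord` (the `TODO` notes that big-endian targets are not yet handled). A small sketch of just that byte movement, using the last word of the FIPS 197 Appendix A.1 key (illustrative only, not part of the imported sources):

```rust
// Shows that a native-endian rotate_right(8) acts as RotWord on little-endian targets.
fn rot_word_bytes(w: [u8; 4]) -> [u8; 4] {
    u32::from_ne_bytes(w).rotate_right(8).to_ne_bytes()
}

fn main() {
    // RotWord([09, cf, 4f, 3c]) = [cf, 4f, 3c, 09] per FIPS 197, Appendix A.1.
    assert_eq!(
        rot_word_bytes([0x09, 0xcf, 0x4f, 0x3c]),
        [0xcf, 0x4f, 0x3c, 0x09]
    );
}
```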
diff --git a/src/armv8/hazmat.rs b/src/armv8/hazmat.rs
new file mode 100644 (file)
index 0000000..f094243
--- /dev/null
+++ b/src/armv8/hazmat.rs
@@ -0,0 +1,104 @@
+//! Low-level "hazmat" AES functions: ARMv8 Cryptography Extensions support.
+//!
+//! Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
+//! implementations in this crate, but instead provides raw ARMv8-accelerated
+//! access to the AES round function gated under the `hazmat` crate feature.
+
+use crate::{Block, Block8};
+use core::arch::aarch64::*;
+
+/// AES cipher (encrypt) round function.
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) {
+    let b = vld1q_u8(block.as_ptr());
+    let k = vld1q_u8(round_key.as_ptr());
+
+    // AES single round encryption (all-zero round key, deferred until the end)
+    let mut state = vaeseq_u8(b, vdupq_n_u8(0));
+
+    // AES mix columns (the `vaeseq_u8` instruction otherwise omits this step)
+    state = vaesmcq_u8(state);
+
+    // AES add round key (bitwise XOR)
+    state = veorq_u8(state, k);
+
+    vst1q_u8(block.as_mut_ptr(), state);
+}
+
+/// AES cipher (encrypt) round function: parallel version.
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+    for i in 0..8 {
+        let mut state = vld1q_u8(blocks[i].as_ptr());
+
+        // AES single round encryption
+        state = vaeseq_u8(state, vdupq_n_u8(0));
+
+        // AES mix columns
+        state = vaesmcq_u8(state);
+
+        // AES add round key (bitwise XOR)
+        state = veorq_u8(state, vld1q_u8(round_keys[i].as_ptr()));
+
+        vst1q_u8(blocks[i].as_mut_ptr(), state);
+    }
+}
+
+/// AES equivalent inverse cipher (decrypt) round function.
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+    let b = vld1q_u8(block.as_ptr());
+    let k = vld1q_u8(round_key.as_ptr());
+
+    // AES single round decryption (all-zero round key, deferred until the end)
+    let mut state = vaesdq_u8(b, vdupq_n_u8(0));
+
+    // AES inverse mix columns (the `vaesdq_u8` instruction otherwise omits this step)
+    state = vaesimcq_u8(state);
+
+    // AES add round key (bitwise XOR)
+    state = veorq_u8(state, k);
+
+    vst1q_u8(block.as_mut_ptr(), state);
+}
+
+/// AES equivalent inverse cipher (decrypt) round function: parallel version.
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+    for i in 0..8 {
+        let mut state = vld1q_u8(blocks[i].as_ptr());
+
+        // AES single round decryption (all-zero round key, deferred until the end)
+        state = vaesdq_u8(state, vdupq_n_u8(0));
+
+        // AES inverse mix columns (the `vaesdq_u8` instruction otherwise omits this step)
+        state = vaesimcq_u8(state);
+
+        // AES add round key (bitwise XOR)
+        state = veorq_u8(state, vld1q_u8(round_keys[i].as_ptr()));
+
+        vst1q_u8(blocks[i].as_mut_ptr(), state);
+    }
+}
+
+/// AES mix columns function.
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn mix_columns(block: &mut Block) {
+    let b = vld1q_u8(block.as_ptr());
+    let out = vaesmcq_u8(b);
+    vst1q_u8(block.as_mut_ptr(), out);
+}
+
+/// AES inverse mix columns function.
+#[allow(clippy::cast_ptr_alignment)]
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn inv_mix_columns(block: &mut Block) {
+    let b = vld1q_u8(block.as_ptr());
+    let out = vaesimcq_u8(b);
+    vst1q_u8(block.as_mut_ptr(), out);
+}
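These backend functions sit behind the crate-level `hazmat` module (see `src/hazmat.rs` in this import), which exposes safe wrappers when the `hazmat` feature is enabled. A rough sketch of the intended call pattern from downstream code, assuming that feature is on (illustrative only):

```rust
// Built with: cargo run --features hazmat
use aes::hazmat::{cipher_round, mix_columns};
use aes::Block;

fn main() {
    let mut block = Block::from([0u8; 16]);
    let round_key = Block::from([0x01u8; 16]);

    // One forward AES round applied in place:
    // SubBytes, ShiftRows, MixColumns, then AddRoundKey with `round_key`.
    cipher_round(&mut block, &round_key);

    // The MixColumns step is also exposed on its own.
    mix_columns(&mut block);

    println!("{:02x?}", &block[..]);
}
```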
diff --git a/src/armv8/test_expand.rs b/src/armv8/test_expand.rs
new file mode 100644 (file)
index 0000000..c52bda7
--- /dev/null
+++ b/src/armv8/test_expand.rs
@@ -0,0 +1,130 @@
+use super::{expand_key, inv_expanded_keys};
+use core::arch::aarch64::*;
+use hex_literal::hex;
+
+/// FIPS 197, Appendix A.1: AES-128 Cipher Key
+/// user input, unaligned buffer
+const AES128_KEY: [u8; 16] = hex!("2b7e151628aed2a6abf7158809cf4f3c");
+
+/// FIPS 197 Appendix A.1: Expansion of a 128-bit Cipher Key
+/// library controlled, aligned buffer
+const AES128_EXP_KEYS: [[u8; 16]; 11] = [
+    AES128_KEY,
+    hex!("a0fafe1788542cb123a339392a6c7605"),
+    hex!("f2c295f27a96b9435935807a7359f67f"),
+    hex!("3d80477d4716fe3e1e237e446d7a883b"),
+    hex!("ef44a541a8525b7fb671253bdb0bad00"),
+    hex!("d4d1c6f87c839d87caf2b8bc11f915bc"),
+    hex!("6d88a37a110b3efddbf98641ca0093fd"),
+    hex!("4e54f70e5f5fc9f384a64fb24ea6dc4f"),
+    hex!("ead27321b58dbad2312bf5607f8d292f"),
+    hex!("ac7766f319fadc2128d12941575c006e"),
+    hex!("d014f9a8c9ee2589e13f0cc8b6630ca6"),
+];
+
+/// Inverse expanded keys for [`AES128_EXP_KEYS`]
+const AES128_EXP_INVKEYS: [[u8; 16]; 11] = [
+    hex!("d014f9a8c9ee2589e13f0cc8b6630ca6"),
+    hex!("0c7b5a631319eafeb0398890664cfbb4"),
+    hex!("df7d925a1f62b09da320626ed6757324"),
+    hex!("12c07647c01f22c7bc42d2f37555114a"),
+    hex!("6efcd876d2df54807c5df034c917c3b9"),
+    hex!("6ea30afcbc238cf6ae82a4b4b54a338d"),
+    hex!("90884413d280860a12a128421bc89739"),
+    hex!("7c1f13f74208c219c021ae480969bf7b"),
+    hex!("cc7505eb3e17d1ee82296c51c9481133"),
+    hex!("2b3708a7f262d405bc3ebdbf4b617d62"),
+    AES128_KEY,
+];
+
+/// FIPS 197, Appendix A.2: AES-192 Cipher Key
+/// user input, unaligned buffer
+const AES192_KEY: [u8; 24] = hex!("8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b");
+
+/// FIPS 197 Appendix A.2: Expansion of a 192-bit Cipher Key
+/// library controlled, aligned buffer
+const AES192_EXP_KEYS: [[u8; 16]; 13] = [
+    hex!("8e73b0f7da0e6452c810f32b809079e5"),
+    hex!("62f8ead2522c6b7bfe0c91f72402f5a5"),
+    hex!("ec12068e6c827f6b0e7a95b95c56fec2"),
+    hex!("4db7b4bd69b5411885a74796e92538fd"),
+    hex!("e75fad44bb095386485af05721efb14f"),
+    hex!("a448f6d94d6dce24aa326360113b30e6"),
+    hex!("a25e7ed583b1cf9a27f939436a94f767"),
+    hex!("c0a69407d19da4e1ec1786eb6fa64971"),
+    hex!("485f703222cb8755e26d135233f0b7b3"),
+    hex!("40beeb282f18a2596747d26b458c553e"),
+    hex!("a7e1466c9411f1df821f750aad07d753"),
+    hex!("ca4005388fcc5006282d166abc3ce7b5"),
+    hex!("e98ba06f448c773c8ecc720401002202"),
+];
+
+/// FIPS 197, Appendix A.3: AES-256 Cipher Key
+/// user input, unaligned buffer
+const AES256_KEY: [u8; 32] =
+    hex!("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4");
+
+/// FIPS 197 Appendix A.3: Expansion of a 256-bit Cipher Key
+/// library controlled, aligned buffer
+const AES256_EXP_KEYS: [[u8; 16]; 15] = [
+    hex!("603deb1015ca71be2b73aef0857d7781"),
+    hex!("1f352c073b6108d72d9810a30914dff4"),
+    hex!("9ba354118e6925afa51a8b5f2067fcde"),
+    hex!("a8b09c1a93d194cdbe49846eb75d5b9a"),
+    hex!("d59aecb85bf3c917fee94248de8ebe96"),
+    hex!("b5a9328a2678a647983122292f6c79b3"),
+    hex!("812c81addadf48ba24360af2fab8b464"),
+    hex!("98c5bfc9bebd198e268c3ba709e04214"),
+    hex!("68007bacb2df331696e939e46c518d80"),
+    hex!("c814e20476a9fb8a5025c02d59c58239"),
+    hex!("de1369676ccc5a71fa2563959674ee15"),
+    hex!("5886ca5d2e2f31d77e0af1fa27cf73c3"),
+    hex!("749c47ab18501ddae2757e4f7401905a"),
+    hex!("cafaaae3e4d59b349adf6acebd10190d"),
+    hex!("fe4890d1e6188d0b046df344706c631e"),
+];
+
+fn load_expanded_keys<const N: usize>(input: [[u8; 16]; N]) -> [uint8x16_t; N] {
+    let mut output = [unsafe { vdupq_n_u8(0) }; N];
+
+    for (src, dst) in input.iter().zip(output.iter_mut()) {
+        *dst = unsafe { vld1q_u8(src.as_ptr()) }
+    }
+
+    output
+}
+
+fn store_expanded_keys<const N: usize>(input: [uint8x16_t; N]) -> [[u8; 16]; N] {
+    let mut output = [[0u8; 16]; N];
+
+    for (src, dst) in input.iter().zip(output.iter_mut()) {
+        unsafe { vst1q_u8(dst.as_mut_ptr(), *src) }
+    }
+
+    output
+}
+
+#[test]
+fn aes128_key_expansion() {
+    let ek = expand_key(&AES128_KEY);
+    assert_eq!(store_expanded_keys(ek), AES128_EXP_KEYS);
+}
+
+#[test]
+fn aes128_key_expansion_inv() {
+    let mut ek = load_expanded_keys(AES128_EXP_KEYS);
+    inv_expanded_keys(&mut ek);
+    assert_eq!(store_expanded_keys(ek), AES128_EXP_INVKEYS);
+}
+
+#[test]
+fn aes192_key_expansion() {
+    let ek = expand_key(&AES192_KEY);
+    assert_eq!(store_expanded_keys(ek), AES192_EXP_KEYS);
+}
+
+#[test]
+fn aes256_key_expansion() {
+    let ek = expand_key(&AES256_KEY);
+    assert_eq!(store_expanded_keys(ek), AES256_EXP_KEYS);
+}
diff --git a/src/autodetect.rs b/src/autodetect.rs
new file mode 100644 (file)
index 0000000..ac471fa
--- /dev/null
+++ b/src/autodetect.rs
@@ -0,0 +1,430 @@
+//! Autodetection support for hardware accelerated AES backends with fallback
+//! to the fixsliced "soft" implementation.
+
+use crate::soft;
+use cipher::{
+    consts::{U16, U24, U32},
+    AlgorithmName, BlockCipher, BlockClosure, BlockDecrypt, BlockEncrypt, BlockSizeUser, Key,
+    KeyInit, KeySizeUser,
+};
+use core::fmt;
+use core::mem::ManuallyDrop;
+
+#[cfg(all(target_arch = "aarch64", aes_armv8))]
+use crate::armv8 as intrinsics;
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+use crate::ni as intrinsics;
+
+cpufeatures::new!(aes_intrinsics, "aes");
+
+macro_rules! define_aes_impl {
+    (
+        $name:ident,
+        $name_enc:ident,
+        $name_dec:ident,
+        $module:tt,
+        $key_size:ty,
+        $doc:expr $(,)?
+    ) => {
+        mod $module {
+            use super::{intrinsics, soft};
+            use core::mem::ManuallyDrop;
+
+            pub(super) union Inner {
+                pub(super) intrinsics: ManuallyDrop<intrinsics::$name>,
+                pub(super) soft: ManuallyDrop<soft::$name>,
+            }
+
+            pub(super) union InnerEnc {
+                pub(super) intrinsics: ManuallyDrop<intrinsics::$name_enc>,
+                pub(super) soft: ManuallyDrop<soft::$name_enc>,
+            }
+
+            pub(super) union InnerDec {
+                pub(super) intrinsics: ManuallyDrop<intrinsics::$name_dec>,
+                pub(super) soft: ManuallyDrop<soft::$name_dec>,
+            }
+        }
+
+        #[doc=$doc]
+        #[doc = "block cipher"]
+        pub struct $name {
+            inner: $module::Inner,
+            token: aes_intrinsics::InitToken,
+        }
+
+        impl KeySizeUser for $name {
+            type KeySize = $key_size;
+        }
+        impl From<$name_enc> for $name {
+            #[inline]
+            fn from(enc: $name_enc) -> $name {
+                Self::from(&enc)
+            }
+        }
+
+        impl From<&$name_enc> for $name {
+            fn from(enc: &$name_enc) -> $name {
+                use core::ops::Deref;
+                let inner = if enc.token.get() {
+                    $module::Inner {
+                        intrinsics: ManuallyDrop::new(unsafe {
+                            enc.inner.intrinsics.deref().into()
+                        }),
+                    }
+                } else {
+                    $module::Inner {
+                        soft: ManuallyDrop::new(unsafe { enc.inner.soft.deref().into() }),
+                    }
+                };
+
+                Self {
+                    inner,
+                    token: enc.token,
+                }
+            }
+        }
+
+        impl KeyInit for $name {
+            #[inline]
+            fn new(key: &Key<Self>) -> Self {
+                let (token, aesni_present) = aes_intrinsics::init_get();
+
+                let inner = if aesni_present {
+                    $module::Inner {
+                        intrinsics: ManuallyDrop::new(intrinsics::$name::new(key)),
+                    }
+                } else {
+                    $module::Inner {
+                        soft: ManuallyDrop::new(soft::$name::new(key)),
+                    }
+                };
+
+                Self { inner, token }
+            }
+        }
+
+        impl Clone for $name {
+            fn clone(&self) -> Self {
+                let inner = if self.token.get() {
+                    $module::Inner {
+                        intrinsics: unsafe { self.inner.intrinsics.clone() },
+                    }
+                } else {
+                    $module::Inner {
+                        soft: unsafe { self.inner.soft.clone() },
+                    }
+                };
+
+                Self {
+                    inner,
+                    token: self.token,
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name {
+            type BlockSize = U16;
+        }
+
+        impl BlockCipher for $name {}
+
+        impl BlockEncrypt for $name {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                unsafe {
+                    if self.token.get() {
+                        #[target_feature(enable = "aes")]
+                        unsafe fn inner(
+                            state: &intrinsics::$name,
+                            f: impl BlockClosure<BlockSize = U16>,
+                        ) {
+                            f.call(&mut state.get_enc_backend());
+                        }
+                        inner(&self.inner.intrinsics, f);
+                    } else {
+                        f.call(&mut self.inner.soft.get_enc_backend());
+                    }
+                }
+            }
+        }
+
+        impl BlockDecrypt for $name {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                unsafe {
+                    if self.token.get() {
+                        #[target_feature(enable = "aes")]
+                        unsafe fn inner(
+                            state: &intrinsics::$name,
+                            f: impl BlockClosure<BlockSize = U16>,
+                        ) {
+                            f.call(&mut state.get_dec_backend());
+                        }
+                        inner(&self.inner.intrinsics, f);
+                    } else {
+                        f.call(&mut self.inner.soft.get_dec_backend());
+                    }
+                }
+            }
+        }
+
+        impl fmt::Debug for $name {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name))
+            }
+        }
+
+        impl Drop for $name {
+            #[inline]
+            fn drop(&mut self) {
+                if self.token.get() {
+                    unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) };
+                } else {
+                    unsafe { ManuallyDrop::drop(&mut self.inner.soft) };
+                };
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (encrypt-only)"]
+        pub struct $name_enc {
+            inner: $module::InnerEnc,
+            token: aes_intrinsics::InitToken,
+        }
+
+        impl KeySizeUser for $name_enc {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_enc {
+            #[inline]
+            fn new(key: &Key<Self>) -> Self {
+                let (token, aesni_present) = aes_intrinsics::init_get();
+
+                let inner = if aesni_present {
+                    $module::InnerEnc {
+                        intrinsics: ManuallyDrop::new(intrinsics::$name_enc::new(key)),
+                    }
+                } else {
+                    $module::InnerEnc {
+                        soft: ManuallyDrop::new(soft::$name_enc::new(key)),
+                    }
+                };
+
+                Self { inner, token }
+            }
+        }
+
+        impl Clone for $name_enc {
+            fn clone(&self) -> Self {
+                let inner = if self.token.get() {
+                    $module::InnerEnc {
+                        intrinsics: unsafe { self.inner.intrinsics.clone() },
+                    }
+                } else {
+                    $module::InnerEnc {
+                        soft: unsafe { self.inner.soft.clone() },
+                    }
+                };
+
+                Self {
+                    inner,
+                    token: self.token,
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name_enc {
+            type BlockSize = U16;
+        }
+
+        impl BlockCipher for $name_enc {}
+
+        impl BlockEncrypt for $name_enc {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                unsafe {
+                    if self.token.get() {
+                        #[target_feature(enable = "aes")]
+                        unsafe fn inner(
+                            state: &intrinsics::$name_enc,
+                            f: impl BlockClosure<BlockSize = U16>,
+                        ) {
+                            f.call(&mut state.get_enc_backend());
+                        }
+                        inner(&self.inner.intrinsics, f);
+                    } else {
+                        f.call(&mut self.inner.soft.get_enc_backend());
+                    }
+                }
+            }
+        }
+
+        impl fmt::Debug for $name_enc {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_enc), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_enc {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_enc))
+            }
+        }
+
+        impl Drop for $name_enc {
+            #[inline]
+            fn drop(&mut self) {
+                if self.token.get() {
+                    unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) };
+                } else {
+                    unsafe { ManuallyDrop::drop(&mut self.inner.soft) };
+                };
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_enc {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (decrypt-only)"]
+        pub struct $name_dec {
+            inner: $module::InnerDec,
+            token: aes_intrinsics::InitToken,
+        }
+
+        impl KeySizeUser for $name_dec {
+            type KeySize = $key_size;
+        }
+
+        impl From<$name_enc> for $name_dec {
+            #[inline]
+            fn from(enc: $name_enc) -> $name_dec {
+                Self::from(&enc)
+            }
+        }
+
+        impl From<&$name_enc> for $name_dec {
+            fn from(enc: &$name_enc) -> $name_dec {
+                use core::ops::Deref;
+                let inner = if enc.token.get() {
+                    $module::InnerDec {
+                        intrinsics: ManuallyDrop::new(unsafe {
+                            enc.inner.intrinsics.deref().into()
+                        }),
+                    }
+                } else {
+                    $module::InnerDec {
+                        soft: ManuallyDrop::new(unsafe { enc.inner.soft.deref().into() }),
+                    }
+                };
+
+                Self {
+                    inner,
+                    token: enc.token,
+                }
+            }
+        }
+
+        impl KeyInit for $name_dec {
+            #[inline]
+            fn new(key: &Key<Self>) -> Self {
+                let (token, aesni_present) = aes_intrinsics::init_get();
+
+                let inner = if aesni_present {
+                    $module::InnerDec {
+                        intrinsics: ManuallyDrop::new(intrinsics::$name_dec::new(key)),
+                    }
+                } else {
+                    $module::InnerDec {
+                        soft: ManuallyDrop::new(soft::$name_dec::new(key)),
+                    }
+                };
+
+                Self { inner, token }
+            }
+        }
+
+        impl Clone for $name_dec {
+            fn clone(&self) -> Self {
+                let inner = if self.token.get() {
+                    $module::InnerDec {
+                        intrinsics: unsafe { self.inner.intrinsics.clone() },
+                    }
+                } else {
+                    $module::InnerDec {
+                        soft: unsafe { self.inner.soft.clone() },
+                    }
+                };
+
+                Self {
+                    inner,
+                    token: self.token,
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name_dec {
+            type BlockSize = U16;
+        }
+
+        impl BlockCipher for $name_dec {}
+
+        impl BlockDecrypt for $name_dec {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                unsafe {
+                    if self.token.get() {
+                        #[target_feature(enable = "aes")]
+                        unsafe fn inner(
+                            state: &intrinsics::$name_dec,
+                            f: impl BlockClosure<BlockSize = U16>,
+                        ) {
+                            f.call(&mut state.get_dec_backend());
+                        }
+                        inner(&self.inner.intrinsics, f);
+                    } else {
+                        f.call(&mut self.inner.soft.get_dec_backend());
+                    }
+                }
+            }
+        }
+
+        impl fmt::Debug for $name_dec {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_dec), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_dec {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_dec))
+            }
+        }
+
+        impl Drop for $name_dec {
+            #[inline]
+            fn drop(&mut self) {
+                if self.token.get() {
+                    unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) };
+                } else {
+                    unsafe { ManuallyDrop::drop(&mut self.inner.soft) };
+                };
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_dec {}
+    };
+}
+
+define_aes_impl!(Aes128, Aes128Enc, Aes128Dec, aes128, U16, "AES-128");
+define_aes_impl!(Aes192, Aes192Enc, Aes192Dec, aes192, U24, "AES-192");
+define_aes_impl!(Aes256, Aes256Enc, Aes256Dec, aes256, U32, "AES-256");
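The runtime dispatch above is built on the `cpufeatures::new!` macro, which generates a module exposing a one-time CPU probe plus a cheap, copyable token. A minimal caller-side sketch of that pattern (not part of this patch; it assumes only the `cpufeatures` crate this backend already depends on):

```rust
// Sketch of the cpufeatures pattern used by autodetect.rs.
// `new!` generates a module whose `init_get()` probes the CPU once and
// returns a copyable `InitToken` along with the detection result.
cpufeatures::new!(aes_detect, "aes");

fn main() {
    let (token, has_aes) = aes_detect::init_get();
    println!("AES intrinsics available: {has_aes}");

    // Later queries go through the token and never re-run the probe.
    assert_eq!(token.get(), has_aes);
}
```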
diff --git a/src/hazmat.rs b/src/hazmat.rs
new file mode 100644 (file)
index 0000000..9b5555e
--- /dev/null
@@ -0,0 +1,159 @@
+//! ⚠️ Low-level "hazmat" AES functions.
+//!
+//! # ☢️️ WARNING: HAZARDOUS API ☢️
+//!
+//! This module contains an extremely low-level cryptographic primitive
+//! which is likewise extremely difficult to use correctly.
+//!
+//! There are very few valid use cases for this API. It's intended to be used
+//! for implementing well-reviewed higher-level constructions.
+//!
+//! We do NOT recommend using it to implement any algorithm which has not
+//! received extensive peer review by cryptographers.
+
+use crate::{soft::fixslice::hazmat as soft, Block, Block8};
+
+#[cfg(all(target_arch = "aarch64", aes_armv8, not(aes_force_soft)))]
+use crate::armv8::hazmat as intrinsics;
+
+#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), not(aes_force_soft)))]
+use crate::ni::hazmat as intrinsics;
+
+#[cfg(all(
+    any(
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "aarch64", aes_armv8)
+    ),
+    not(aes_force_soft)
+))]
+cpufeatures::new!(aes_intrinsics, "aes");
+
+/// Execute the provided body if CPU intrinsics are available.
+// TODO(tarcieri): more `cfg-if`-like macro with an else branch?
+macro_rules! if_intrinsics_available {
+    ($body:expr) => {{
+        #[cfg(all(
+            any(
+                target_arch = "x86",
+                target_arch = "x86_64",
+                all(target_arch = "aarch64", aes_armv8)
+            ),
+            not(aes_force_soft)
+        ))]
+        if aes_intrinsics::get() {
+            unsafe { $body }
+            return;
+        }
+    }};
+}
+
+/// ⚠️ AES cipher (encrypt) round function.
+///
+/// This API performs the following steps as described in FIPS 197 Appendix C:
+///
+/// - `s_box`: state after `SubBytes()`
+/// - `s_row`: state after `ShiftRows()`
+/// - `m_col`: state after `MixColumns()`
+/// - `k_sch`: key schedule value for `round[r]`
+///
+/// This series of operations is equivalent to the Intel AES-NI `AESENC` instruction.
+///
+/// # ☢️️ WARNING: HAZARDOUS API ☢️
+///
+/// Use this function with great care! See the [module-level documentation][crate::hazmat]
+/// for more information.
+pub fn cipher_round(block: &mut Block, round_key: &Block) {
+    if_intrinsics_available! {
+        intrinsics::cipher_round(block, round_key)
+    }
+
+    soft::cipher_round(block, round_key);
+}
+
+/// ⚠️ AES cipher (encrypt) round function: parallel version.
+///
+/// Equivalent to [`cipher_round`], but acts on 8 blocks at a time, applying
+/// a separate round key to each block.
+///
+/// # ☢️️ WARNING: HAZARDOUS API ☢️
+///
+/// Use this function with great care! See the [module-level documentation][crate::hazmat]
+/// for more information.
+pub fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+    if_intrinsics_available! {
+        intrinsics::cipher_round_par(blocks, round_keys)
+    }
+
+    soft::cipher_round_par(blocks, round_keys);
+}
+
+/// ⚠️ AES equivalent inverse cipher (decrypt) round function.
+///
+/// This API performs the following steps as described in FIPS 197 Appendix C:
+///
+/// - `is_box`: state after `InvSubBytes()`
+/// - `is_row`: state after `InvShiftRows()`
+/// - `im_col`: state after `InvMixColumns()`
+/// - `ik_sch`: key schedule value for `round[r]`
+///
+/// This series of operations is equivalent to the Intel AES-NI `AESDEC` instruction.
+///
+/// # ☢️️ WARNING: HAZARDOUS API ☢️
+///
+/// Use this function with great care! See the [module-level documentation][crate::hazmat]
+/// for more information.
+pub fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+    if_intrinsics_available! {
+        intrinsics::equiv_inv_cipher_round(block, round_key)
+    }
+
+    soft::equiv_inv_cipher_round(block, round_key);
+}
+
+/// ⚠️ AES equivalent inverse cipher (decrypt) round function: parallel version.
+///
+/// Equivalent to [`equiv_inv_cipher_round`], but acts on 8 blocks at a time,
+/// applying a separate round key to each block.
+///
+/// # ☢️️ WARNING: HAZARDOUS API ☢️
+///
+/// Use this function with great care! See the [module-level documentation][crate::hazmat]
+/// for more information.
+pub fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+    if_intrinsics_available! {
+        intrinsics::equiv_inv_cipher_round_par(blocks, round_keys)
+    }
+
+    soft::equiv_inv_cipher_round_par(blocks, round_keys);
+}
+
+/// ⚠️ AES mix columns function.
+///
+/// # ☢️️ WARNING: HAZARDOUS API ☢️
+///
+/// Use this function with great care! See the [module-level documentation][crate::hazmat]
+/// for more information.
+pub fn mix_columns(block: &mut Block) {
+    if_intrinsics_available! {
+        intrinsics::mix_columns(block)
+    }
+
+    soft::mix_columns(block);
+}
+
+/// ⚠️ AES inverse mix columns function.
+///
+/// This function is equivalent to the Intel AES-NI `AESIMC` instruction.
+///
+/// # ☢️️ WARNING: HAZARDOUS API ☢️
+///
+/// Use this function with great care! See the [module-level documentation][crate::hazmat]
+/// for more information.
+pub fn inv_mix_columns(block: &mut Block) {
+    if_intrinsics_available! {
+        intrinsics::inv_mix_columns(block)
+    }
+
+    soft::inv_mix_columns(block);
+}
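As a usage sketch of this module (hypothetical caller code, assuming the crate is built with the `hazmat` feature enabled), the functions operate in place on 16-byte `Block` values:

```rust
use aes::{hazmat::cipher_round, Block};

fn main() {
    let mut block = Block::from([0u8; 16]);
    let round_key = Block::from([0u8; 16]);

    // One forward round (SubBytes, ShiftRows, MixColumns, AddRoundKey).
    // Starting from an all-zero state, SubBytes maps every byte to 0x63,
    // and the remaining steps leave a uniform state unchanged under a
    // zero round key.
    cipher_round(&mut block, &round_key);
    assert_eq!(block, Block::from([0x63u8; 16]));
}
```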
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644 (file)
index 0000000..f43b21c
--- /dev/null
@@ -0,0 +1,232 @@
+//! Pure Rust implementation of the [Advanced Encryption Standard][AES]
+//! (AES, a.k.a. Rijndael).
+//!
+//! # ⚠️ Security Warning: Hazmat!
+//!
+//! This crate implements only the low-level block cipher function, and is intended
+//! for use in implementing higher-level constructions *only*. It is NOT
+//! intended for direct use in applications.
+//!
+//! USE AT YOUR OWN RISK!
+//!
+//! # Supported backends
+//! This crate provides multiple backends, including a portable pure Rust
+//! backend as well as ones based on CPU intrinsics.
+//!
+//! By default, it performs runtime detection of CPU intrinsics and uses them
+//! if they are available.
+//!
+//! ## "soft" portable backend
+//! As a baseline implementation, this crate provides a constant-time pure Rust
+//! implementation based on [fixslicing], a more advanced form of bitslicing
+//! implemented entirely in terms of bitwise arithmetic with no use of any
+//! lookup tables or data-dependent branches.
+//!
+//! Enabling the `aes_compact` configuration flag will reduce the code size of this
+//! backend at the cost of decreased performance (using a modified form of
+//! the fixslicing technique called "semi-fixslicing").
+//!
+//! ## ARMv8 intrinsics (nightly-only)
+//! On `aarch64` targets including `aarch64-apple-darwin` (Apple M1) and Linux
+//! targets such as `aarch64-unknown-linux-gnu` and `aarch64-unknown-linux-musl`,
+//! support for using AES intrinsics provided by the ARMv8 Cryptography Extensions
+//! is available when using the nightly compiler, and can be enabled using the
+//! `aes_armv8` configuration flag.
+//!
+//! On Linux and macOS, when the `aes_armv8` flag is enabled, support for AES
+//! intrinsics is autodetected at runtime. On other platforms the `aes`
+//! target feature must be enabled via RUSTFLAGS.
+//!
+//! ## `x86`/`x86_64` intrinsics (AES-NI)
+//! By default this crate uses runtime detection on `i686`/`x86_64` targets
+//! in order to determine if AES-NI is available, and if it is not, it will
+//! fall back to using a constant-time software implementation.
+//!
+//! Passing `RUSTFLAGS=-Ctarget-feature=+aes,+ssse3` explicitly at compile-time
+//! will override runtime detection and ensure that AES-NI is always used.
+//! Programs built in this manner will crash with an illegal instruction on
+//! CPUs which do not have AES-NI enabled.
+//!
+//! Note: runtime detection is not possible on SGX targets. Please use the
+//! aforementioned `RUSTFLAGS` to leverage AES-NI on these targets.
+//!
+//! # Examples
+//! ```
+//! use aes::Aes128;
+//! use aes::cipher::{
+//!     BlockCipher, BlockEncrypt, BlockDecrypt, KeyInit,
+//!     generic_array::GenericArray,
+//! };
+//!
+//! let key = GenericArray::from([0u8; 16]);
+//! let mut block = GenericArray::from([42u8; 16]);
+//!
+//! // Initialize cipher
+//! let cipher = Aes128::new(&key);
+//!
+//! let block_copy = block.clone();
+//!
+//! // Encrypt block in-place
+//! cipher.encrypt_block(&mut block);
+//!
+//! // And decrypt it back
+//! cipher.decrypt_block(&mut block);
+//! assert_eq!(block, block_copy);
+//!
+//! // The implementation supports parallel block processing; the number of
+//! // blocks processed in parallel generally depends on hardware capabilities
+//! let mut blocks = [block; 100];
+//! cipher.encrypt_blocks(&mut blocks);
+//!
+//! for block in blocks.iter_mut() {
+//!     cipher.decrypt_block(block);
+//!     assert_eq!(block, &block_copy);
+//! }
+//!
+//! cipher.decrypt_blocks(&mut blocks);
+//!
+//! for block in blocks.iter_mut() {
+//!     cipher.encrypt_block(block);
+//!     assert_eq!(block, &block_copy);
+//! }
+//! ```
+//!
+//! For implementations of block cipher modes of operation, see the
+//! [`block-modes`] repository.
+//!
+//! # Configuration Flags
+//!
+//! You can modify the crate using the following configuration flags:
+//!
+//! - `aes_armv8`: enable ARMv8 AES intrinsics (nightly-only).
+//! - `aes_force_soft`: force software implementation.
+//! - `aes_compact`: reduce code size at the cost of slower performance
+//! (affects only the software backend).
+//!
+//! These flags can be enabled using the `RUSTFLAGS` environment variable
+//! (e.g. `RUSTFLAGS="--cfg aes_compact"`) or by modifying `.cargo/config`.
+//!
+//! [AES]: https://en.wikipedia.org/wiki/Advanced_Encryption_Standard
+//! [fixslicing]: https://eprint.iacr.org/2020/1123.pdf
+//! [AES-NI]: https://en.wikipedia.org/wiki/AES_instruction_set
+//! [`block-modes`]: https://github.com/RustCrypto/block-modes/
+
+#![no_std]
+#![doc(
+    html_logo_url = "https://raw.githubusercontent.com/RustCrypto/media/26acc39f/logo.svg",
+    html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/media/26acc39f/logo.svg"
+)]
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![warn(missing_docs, rust_2018_idioms)]
+#![cfg_attr(all(aes_armv8, target_arch = "aarch64"), feature(stdsimd))]
+
+#[cfg(feature = "hazmat")]
+#[cfg_attr(docsrs, doc(cfg(feature = "hazmat")))]
+pub mod hazmat;
+
+mod soft;
+
+use cfg_if::cfg_if;
+
+cfg_if! {
+    if #[cfg(all(target_arch = "aarch64", aes_armv8, not(aes_force_soft)))] {
+        mod armv8;
+        mod autodetect;
+        pub use autodetect::*;
+    } else if #[cfg(all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        not(aes_force_soft)
+    ))] {
+        mod autodetect;
+        mod ni;
+        pub use autodetect::*;
+    } else {
+        pub use soft::*;
+    }
+}
+
+pub use cipher;
+use cipher::{
+    consts::{U16, U8},
+    generic_array::GenericArray,
+};
+
+/// 128-bit AES block
+pub type Block = GenericArray<u8, U16>;
+/// Eight 128-bit AES blocks
+pub type Block8 = GenericArray<Block, U8>;
+
+#[cfg(test)]
+mod tests {
+    #[cfg(feature = "zeroize")]
+    #[test]
+    fn zeroize_works() {
+        use super::soft;
+
+        fn test_for<T: zeroize::ZeroizeOnDrop>(val: T) {
+            use core::mem::{size_of, ManuallyDrop};
+
+            let mut val = ManuallyDrop::new(val);
+            let ptr = &val as *const _ as *const u8;
+            let len = size_of::<ManuallyDrop<T>>();
+
+            unsafe { ManuallyDrop::drop(&mut val) };
+
+            let slice = unsafe { core::slice::from_raw_parts(ptr, len) };
+
+            assert!(slice.iter().all(|&byte| byte == 0));
+        }
+
+        let key_128 = [42; 16].into();
+        let key_192 = [42; 24].into();
+        let key_256 = [42; 32].into();
+
+        use cipher::KeyInit as _;
+        test_for(soft::Aes128::new(&key_128));
+        test_for(soft::Aes128Enc::new(&key_128));
+        test_for(soft::Aes128Dec::new(&key_128));
+        test_for(soft::Aes192::new(&key_192));
+        test_for(soft::Aes192Enc::new(&key_192));
+        test_for(soft::Aes192Dec::new(&key_192));
+        test_for(soft::Aes256::new(&key_256));
+        test_for(soft::Aes256Enc::new(&key_256));
+        test_for(soft::Aes256Dec::new(&key_256));
+
+        #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(aes_force_soft)))]
+        {
+            use super::ni;
+
+            cpufeatures::new!(aes_intrinsics, "aes");
+            if aes_intrinsics::get() {
+                test_for(ni::Aes128::new(&key_128));
+                test_for(ni::Aes128Enc::new(&key_128));
+                test_for(ni::Aes128Dec::new(&key_128));
+                test_for(ni::Aes192::new(&key_192));
+                test_for(ni::Aes192Enc::new(&key_192));
+                test_for(ni::Aes192Dec::new(&key_192));
+                test_for(ni::Aes256::new(&key_256));
+                test_for(ni::Aes256Enc::new(&key_256));
+                test_for(ni::Aes256Dec::new(&key_256));
+            }
+        }
+
+        #[cfg(all(target_arch = "aarch64", aes_armv8, not(aes_force_soft)))]
+        {
+            use super::armv8;
+
+            cpufeatures::new!(aes_intrinsics, "aes");
+            if aes_intrinsics::get() {
+                test_for(armv8::Aes128::new(&key_128));
+                test_for(armv8::Aes128Enc::new(&key_128));
+                test_for(armv8::Aes128Dec::new(&key_128));
+                test_for(armv8::Aes192::new(&key_192));
+                test_for(armv8::Aes192Enc::new(&key_192));
+                test_for(armv8::Aes192Dec::new(&key_192));
+                test_for(armv8::Aes256::new(&key_256));
+                test_for(armv8::Aes256Enc::new(&key_256));
+                test_for(armv8::Aes256Dec::new(&key_256));
+            }
+        }
+    }
+}
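The encrypt-only and decrypt-only types wired up in `autodetect.rs` are also part of the public API. A short sketch of how they might be used (hypothetical caller code, following the conventions of the example in the crate docs above):

```rust
use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, KeyInit};
use aes::{Aes128Dec, Aes128Enc};

fn main() {
    let key = GenericArray::from([0u8; 16]);
    let mut block = GenericArray::from([42u8; 16]);
    let original = block;

    // Each direction keeps only the key schedule it needs; a decryptor
    // can also be derived from an existing encryptor via `From`.
    let enc = Aes128Enc::new(&key);
    let dec = Aes128Dec::from(&enc);

    enc.encrypt_block(&mut block);
    dec.decrypt_block(&mut block);
    assert_eq!(block, original);
}
```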
diff --git a/src/ni.rs b/src/ni.rs
new file mode 100644 (file)
index 0000000..15b49ef
--- /dev/null
+++ b/src/ni.rs
@@ -0,0 +1,361 @@
+//! AES block cipher implementation using the AES-NI instruction set.
+//!
+//! Cipher functionality is accessed using the `BlockCipher` trait from the
+//! [`cipher`](https://docs.rs/cipher) crate.
+//!
+//! # Vulnerability
+//! The Lazy FP state restore vulnerability can allow a local process to leak the
+//! contents of the FPU registers, in which round keys are stored. This vulnerability
+//! can be mitigated at the operating system level by installing the relevant
+//! patches (i.e. keep your OS updated!). More info:
+//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html)
+//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore)
+//!
+//! # Related documents
+//! - [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf)
+//! - [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf)
+
+#[macro_use]
+mod utils;
+
+mod aes128;
+mod aes192;
+mod aes256;
+
+#[cfg(test)]
+mod test_expand;
+
+#[cfg(feature = "hazmat")]
+pub(crate) mod hazmat;
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86 as arch;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64 as arch;
+
+use crate::{Block, Block8};
+use cipher::{
+    consts::{U16, U24, U32, U8},
+    inout::InOut,
+    AlgorithmName, BlockBackend, BlockCipher, BlockClosure, BlockDecrypt, BlockEncrypt,
+    BlockSizeUser, Key, KeyInit, KeySizeUser, ParBlocksSizeUser,
+};
+use core::fmt;
+
+macro_rules! define_aes_impl {
+    (
+        $name:tt,
+        $name_enc:ident,
+        $name_dec:ident,
+        $name_back_enc:ident,
+        $name_back_dec:ident,
+        $module:tt,
+        $key_size:ty,
+        $doc:expr $(,)?
+    ) => {
+        #[doc=$doc]
+        #[doc = "block cipher"]
+        #[derive(Clone)]
+        pub struct $name {
+            encrypt: $name_enc,
+            decrypt: $name_dec,
+        }
+
+        impl $name {
+            #[inline(always)]
+            pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> {
+                self.encrypt.get_enc_backend()
+            }
+
+            #[inline(always)]
+            pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> {
+                self.decrypt.get_dec_backend()
+            }
+        }
+
+        impl BlockCipher for $name {}
+
+        impl KeySizeUser for $name {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name {
+            #[inline]
+            fn new(key: &Key<Self>) -> Self {
+                let encrypt = $name_enc::new(key);
+                let decrypt = $name_dec::from(&encrypt);
+                Self { encrypt, decrypt }
+            }
+        }
+
+        impl From<$name_enc> for $name {
+            #[inline]
+            fn from(encrypt: $name_enc) -> $name {
+                let decrypt = (&encrypt).into();
+                Self { encrypt, decrypt }
+            }
+        }
+
+        impl From<&$name_enc> for $name {
+            #[inline]
+            fn from(encrypt: &$name_enc) -> $name {
+                let decrypt = encrypt.into();
+                let encrypt = encrypt.clone();
+                Self { encrypt, decrypt }
+            }
+        }
+
+        impl BlockSizeUser for $name {
+            type BlockSize = U16;
+        }
+
+        impl BlockEncrypt for $name {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                self.encrypt.encrypt_with_backend(f)
+            }
+        }
+
+        impl BlockDecrypt for $name {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                self.decrypt.decrypt_with_backend(f)
+            }
+        }
+
+        impl fmt::Debug for $name {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name))
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (encrypt-only)"]
+        #[derive(Clone)]
+        pub struct $name_enc {
+            round_keys: $module::RoundKeys,
+        }
+
+        impl $name_enc {
+            #[inline(always)]
+            pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> {
+                $name_back_enc(self)
+            }
+        }
+
+        impl BlockCipher for $name_enc {}
+
+        impl KeySizeUser for $name_enc {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_enc {
+            fn new(key: &Key<Self>) -> Self {
+                // SAFETY: we enforce that this code is called only when
+                // target features required by `expand_key` were properly checked.
+                Self {
+                    round_keys: unsafe { $module::expand_key(key.as_ref()) },
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name_enc {
+            type BlockSize = U16;
+        }
+
+        impl BlockEncrypt for $name_enc {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_enc_backend())
+            }
+        }
+
+        impl fmt::Debug for $name_enc {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_enc), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_enc {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_enc))
+            }
+        }
+
+        impl Drop for $name_enc {
+            #[inline]
+            fn drop(&mut self) {
+                #[cfg(feature = "zeroize")]
+                zeroize::Zeroize::zeroize(&mut self.round_keys);
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_enc {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (decrypt-only)"]
+        #[derive(Clone)]
+        pub struct $name_dec {
+            round_keys: $module::RoundKeys,
+        }
+
+        impl $name_dec {
+            #[inline(always)]
+            pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> {
+                $name_back_dec(self)
+            }
+        }
+
+        impl BlockCipher for $name_dec {}
+
+        impl KeySizeUser for $name_dec {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_dec {
+            fn new(key: &Key<Self>) -> Self {
+                $name_enc::new(key).into()
+            }
+        }
+
+        impl From<$name_enc> for $name_dec {
+            #[inline]
+            fn from(enc: $name_enc) -> $name_dec {
+                Self::from(&enc)
+            }
+        }
+
+        impl From<&$name_enc> for $name_dec {
+            #[inline]
+            fn from(enc: &$name_enc) -> $name_dec {
+                let round_keys = unsafe { $module::inv_expanded_keys(&enc.round_keys) };
+                Self { round_keys }
+            }
+        }
+
+        impl BlockSizeUser for $name_dec {
+            type BlockSize = U16;
+        }
+
+        impl BlockDecrypt for $name_dec {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_dec_backend());
+            }
+        }
+
+        impl fmt::Debug for $name_dec {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_dec), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_dec {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_dec))
+            }
+        }
+
+        impl Drop for $name_dec {
+            #[inline]
+            fn drop(&mut self) {
+                #[cfg(feature = "zeroize")]
+                zeroize::Zeroize::zeroize(&mut self.round_keys);
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_dec {}
+
+        pub(crate) struct $name_back_enc<'a>(&'a $name_enc);
+
+        impl<'a> BlockSizeUser for $name_back_enc<'a> {
+            type BlockSize = U16;
+        }
+
+        impl<'a> ParBlocksSizeUser for $name_back_enc<'a> {
+            type ParBlocksSize = U8;
+        }
+
+        impl<'a> BlockBackend for $name_back_enc<'a> {
+            #[inline(always)]
+            fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+                unsafe {
+                    $module::encrypt1(&self.0.round_keys, block);
+                }
+            }
+
+            #[inline(always)]
+            fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) {
+                unsafe {
+                    $module::encrypt8(&self.0.round_keys, blocks);
+                }
+            }
+        }
+
+        pub(crate) struct $name_back_dec<'a>(&'a $name_dec);
+
+        impl<'a> BlockSizeUser for $name_back_dec<'a> {
+            type BlockSize = U16;
+        }
+
+        impl<'a> ParBlocksSizeUser for $name_back_dec<'a> {
+            type ParBlocksSize = U8;
+        }
+
+        impl<'a> BlockBackend for $name_back_dec<'a> {
+            #[inline(always)]
+            fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+                unsafe {
+                    $module::decrypt1(&self.0.round_keys, block);
+                }
+            }
+
+            #[inline(always)]
+            fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) {
+                unsafe {
+                    $module::decrypt8(&self.0.round_keys, blocks);
+                }
+            }
+        }
+    };
+}
+
+define_aes_impl!(
+    Aes128,
+    Aes128Enc,
+    Aes128Dec,
+    Aes128BackEnc,
+    Aes128BackDec,
+    aes128,
+    U16,
+    "AES-128",
+);
+
+define_aes_impl!(
+    Aes192,
+    Aes192Enc,
+    Aes192Dec,
+    Aes192BackEnc,
+    Aes192BackDec,
+    aes192,
+    U24,
+    "AES-192",
+);
+
+define_aes_impl!(
+    Aes256,
+    Aes256Enc,
+    Aes256Dec,
+    Aes256BackEnc,
+    Aes256BackDec,
+    aes256,
+    U32,
+    "AES-256",
+);
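A known-answer test against the FIPS-197 Appendix C.1 AES-128 vector is a quick way to exercise whichever backend ends up selected at runtime. This is a test sketch (not part of this patch) using only the crate's public API and the `hex_literal` crate already used by this patch's tests:

```rust
use aes::cipher::{generic_array::GenericArray, BlockEncrypt, KeyInit};
use aes::Aes128;
use hex_literal::hex;

#[test]
fn fips197_appendix_c1() {
    let key = GenericArray::from(hex!("000102030405060708090a0b0c0d0e0f"));
    let mut block = GenericArray::from(hex!("00112233445566778899aabbccddeeff"));

    Aes128::new(&key).encrypt_block(&mut block);

    // Expected ciphertext from FIPS-197 Appendix C.1.
    assert_eq!(block, GenericArray::from(hex!("69c4e0d86a7b0430d8cdb78070b4c55a")));
}
```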
diff --git a/src/ni/aes128.rs b/src/ni/aes128.rs
new file mode 100644 (file)
index 0000000..b0836a1
--- /dev/null
@@ -0,0 +1,145 @@
+use super::{arch::*, utils::*};
+use crate::{Block, Block8};
+use cipher::inout::InOut;
+use core::mem;
+
+/// AES-128 round keys
+pub(super) type RoundKeys = [__m128i; 11];
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) {
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut b = _mm_loadu_si128(in_ptr as *const __m128i);
+    b = _mm_xor_si128(b, keys[0]);
+    b = _mm_aesenc_si128(b, keys[1]);
+    b = _mm_aesenc_si128(b, keys[2]);
+    b = _mm_aesenc_si128(b, keys[3]);
+    b = _mm_aesenc_si128(b, keys[4]);
+    b = _mm_aesenc_si128(b, keys[5]);
+    b = _mm_aesenc_si128(b, keys[6]);
+    b = _mm_aesenc_si128(b, keys[7]);
+    b = _mm_aesenc_si128(b, keys[8]);
+    b = _mm_aesenc_si128(b, keys[9]);
+    b = _mm_aesenclast_si128(b, keys[10]);
+    _mm_storeu_si128(out_ptr as *mut __m128i, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) {
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let mut b = load8(in_ptr);
+    xor8(&mut b, keys[0]);
+    aesenc8(&mut b, keys[1]);
+    aesenc8(&mut b, keys[2]);
+    aesenc8(&mut b, keys[3]);
+    aesenc8(&mut b, keys[4]);
+    aesenc8(&mut b, keys[5]);
+    aesenc8(&mut b, keys[6]);
+    aesenc8(&mut b, keys[7]);
+    aesenc8(&mut b, keys[8]);
+    aesenc8(&mut b, keys[9]);
+    aesenclast8(&mut b, keys[10]);
+    store8(out_ptr, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) {
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut b = _mm_loadu_si128(in_ptr as *const __m128i);
+    b = _mm_xor_si128(b, keys[10]);
+    b = _mm_aesdec_si128(b, keys[9]);
+    b = _mm_aesdec_si128(b, keys[8]);
+    b = _mm_aesdec_si128(b, keys[7]);
+    b = _mm_aesdec_si128(b, keys[6]);
+    b = _mm_aesdec_si128(b, keys[5]);
+    b = _mm_aesdec_si128(b, keys[4]);
+    b = _mm_aesdec_si128(b, keys[3]);
+    b = _mm_aesdec_si128(b, keys[2]);
+    b = _mm_aesdec_si128(b, keys[1]);
+    b = _mm_aesdeclast_si128(b, keys[0]);
+    _mm_storeu_si128(out_ptr as *mut __m128i, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn decrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) {
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let mut b = load8(in_ptr);
+    xor8(&mut b, keys[10]);
+    aesdec8(&mut b, keys[9]);
+    aesdec8(&mut b, keys[8]);
+    aesdec8(&mut b, keys[7]);
+    aesdec8(&mut b, keys[6]);
+    aesdec8(&mut b, keys[5]);
+    aesdec8(&mut b, keys[4]);
+    aesdec8(&mut b, keys[3]);
+    aesdec8(&mut b, keys[2]);
+    aesdec8(&mut b, keys[1]);
+    aesdeclast8(&mut b, keys[0]);
+    store8(out_ptr, b);
+}
+
+macro_rules! expand_round {
+    ($keys:expr, $pos:expr, $round:expr) => {
+        let mut t1 = $keys[$pos - 1];
+        let mut t2;
+        let mut t3;
+
+        t2 = _mm_aeskeygenassist_si128(t1, $round);
+        t2 = _mm_shuffle_epi32(t2, 0xff);
+        t3 = _mm_slli_si128(t1, 0x4);
+        t1 = _mm_xor_si128(t1, t3);
+        t3 = _mm_slli_si128(t3, 0x4);
+        t1 = _mm_xor_si128(t1, t3);
+        t3 = _mm_slli_si128(t3, 0x4);
+        t1 = _mm_xor_si128(t1, t3);
+        t1 = _mm_xor_si128(t1, t2);
+
+        $keys[$pos] = t1;
+    };
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn expand_key(key: &[u8; 16]) -> RoundKeys {
+    // SAFETY: `RoundKeys` is a `[__m128i; 11]` which can be initialized
+    // with all zeroes.
+    let mut keys: RoundKeys = mem::zeroed();
+
+    let k = _mm_loadu_si128(key.as_ptr() as *const __m128i);
+    keys[0] = k;
+
+    expand_round!(keys, 1, 0x01);
+    expand_round!(keys, 2, 0x02);
+    expand_round!(keys, 3, 0x04);
+    expand_round!(keys, 4, 0x08);
+    expand_round!(keys, 5, 0x10);
+    expand_round!(keys, 6, 0x20);
+    expand_round!(keys, 7, 0x40);
+    expand_round!(keys, 8, 0x80);
+    expand_round!(keys, 9, 0x1B);
+    expand_round!(keys, 10, 0x36);
+
+    keys
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn inv_expanded_keys(keys: &RoundKeys) -> RoundKeys {
+    [
+        keys[0],
+        _mm_aesimc_si128(keys[1]),
+        _mm_aesimc_si128(keys[2]),
+        _mm_aesimc_si128(keys[3]),
+        _mm_aesimc_si128(keys[4]),
+        _mm_aesimc_si128(keys[5]),
+        _mm_aesimc_si128(keys[6]),
+        _mm_aesimc_si128(keys[7]),
+        _mm_aesimc_si128(keys[8]),
+        _mm_aesimc_si128(keys[9]),
+        keys[10],
+    ]
+}
diff --git a/src/ni/aes192.rs b/src/ni/aes192.rs
new file mode 100644 (file)
index 0000000..eee1f21
--- /dev/null
@@ -0,0 +1,197 @@
+use super::{arch::*, utils::*};
+use crate::{Block, Block8};
+use cipher::inout::InOut;
+use core::{mem, ptr};
+
+/// AES-192 round keys
+pub(super) type RoundKeys = [__m128i; 13];
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) {
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut b = _mm_loadu_si128(in_ptr as *const __m128i);
+    b = _mm_xor_si128(b, keys[0]);
+    b = _mm_aesenc_si128(b, keys[1]);
+    b = _mm_aesenc_si128(b, keys[2]);
+    b = _mm_aesenc_si128(b, keys[3]);
+    b = _mm_aesenc_si128(b, keys[4]);
+    b = _mm_aesenc_si128(b, keys[5]);
+    b = _mm_aesenc_si128(b, keys[6]);
+    b = _mm_aesenc_si128(b, keys[7]);
+    b = _mm_aesenc_si128(b, keys[8]);
+    b = _mm_aesenc_si128(b, keys[9]);
+    b = _mm_aesenc_si128(b, keys[10]);
+    b = _mm_aesenc_si128(b, keys[11]);
+    b = _mm_aesenclast_si128(b, keys[12]);
+    _mm_storeu_si128(out_ptr as *mut __m128i, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) {
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let mut b = load8(in_ptr);
+    xor8(&mut b, keys[0]);
+    aesenc8(&mut b, keys[1]);
+    aesenc8(&mut b, keys[2]);
+    aesenc8(&mut b, keys[3]);
+    aesenc8(&mut b, keys[4]);
+    aesenc8(&mut b, keys[5]);
+    aesenc8(&mut b, keys[6]);
+    aesenc8(&mut b, keys[7]);
+    aesenc8(&mut b, keys[8]);
+    aesenc8(&mut b, keys[9]);
+    aesenc8(&mut b, keys[10]);
+    aesenc8(&mut b, keys[11]);
+    aesenclast8(&mut b, keys[12]);
+    store8(out_ptr, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) {
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut b = _mm_loadu_si128(in_ptr as *const __m128i);
+    b = _mm_xor_si128(b, keys[12]);
+    b = _mm_aesdec_si128(b, keys[11]);
+    b = _mm_aesdec_si128(b, keys[10]);
+    b = _mm_aesdec_si128(b, keys[9]);
+    b = _mm_aesdec_si128(b, keys[8]);
+    b = _mm_aesdec_si128(b, keys[7]);
+    b = _mm_aesdec_si128(b, keys[6]);
+    b = _mm_aesdec_si128(b, keys[5]);
+    b = _mm_aesdec_si128(b, keys[4]);
+    b = _mm_aesdec_si128(b, keys[3]);
+    b = _mm_aesdec_si128(b, keys[2]);
+    b = _mm_aesdec_si128(b, keys[1]);
+    b = _mm_aesdeclast_si128(b, keys[0]);
+    _mm_storeu_si128(out_ptr as *mut __m128i, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn decrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) {
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let mut b = load8(in_ptr);
+    xor8(&mut b, keys[12]);
+    aesdec8(&mut b, keys[11]);
+    aesdec8(&mut b, keys[10]);
+    aesdec8(&mut b, keys[9]);
+    aesdec8(&mut b, keys[8]);
+    aesdec8(&mut b, keys[7]);
+    aesdec8(&mut b, keys[6]);
+    aesdec8(&mut b, keys[5]);
+    aesdec8(&mut b, keys[4]);
+    aesdec8(&mut b, keys[3]);
+    aesdec8(&mut b, keys[2]);
+    aesdec8(&mut b, keys[1]);
+    aesdeclast8(&mut b, keys[0]);
+    store8(out_ptr, b);
+}
+
+macro_rules! expand_round {
+    ($t1:expr, $t3:expr, $round:expr) => {{
+        let mut t1 = $t1;
+        let mut t2;
+        let mut t3 = $t3;
+        let mut t4;
+
+        t2 = _mm_aeskeygenassist_si128(t3, $round);
+        t2 = _mm_shuffle_epi32(t2, 0x55);
+        t4 = _mm_slli_si128(t1, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t1 = _mm_xor_si128(t1, t2);
+        t2 = _mm_shuffle_epi32(t1, 0xff);
+        t4 = _mm_slli_si128(t3, 0x4);
+        t3 = _mm_xor_si128(t3, t4);
+        t3 = _mm_xor_si128(t3, t2);
+
+        (t1, t3)
+    }};
+}
+
+macro_rules! shuffle {
+    ($a:expr, $b:expr, $imm:expr) => {
+        mem::transmute::<_, __m128i>(_mm_shuffle_pd(mem::transmute($a), mem::transmute($b), $imm))
+    };
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn expand_key(key: &[u8; 24]) -> RoundKeys {
+    // SAFETY: `RoundKeys` is a `[__m128i; 13]` which can be initialized
+    // with all zeroes.
+    let mut keys: RoundKeys = mem::zeroed();
+    // We are being extra pedantic here to avoid out-of-bounds access.
+    // This should be optimized out into a movups, movsd sequence.
+    // Note that unaligned loads MUST be used here, even though we read
+    // from an array (the compiler misoptimizes aligned loads).
+    let (k0, k1l) = {
+        let mut t = [0u8; 32];
+        ptr::write(t.as_mut_ptr() as *mut [u8; 24], *key);
+
+        (
+            _mm_loadu_si128(t.as_ptr() as *const __m128i),
+            _mm_loadu_si128(t.as_ptr().offset(16) as *const __m128i),
+        )
+    };
+
+    keys[0] = k0;
+
+    let (k1_2, k2r) = expand_round!(k0, k1l, 0x01);
+    keys[1] = shuffle!(k1l, k1_2, 0);
+    keys[2] = shuffle!(k1_2, k2r, 1);
+
+    let (k3, k4l) = expand_round!(k1_2, k2r, 0x02);
+    keys[3] = k3;
+
+    let (k4_5, k5r) = expand_round!(k3, k4l, 0x04);
+    let k4 = shuffle!(k4l, k4_5, 0);
+    let k5 = shuffle!(k4_5, k5r, 1);
+    keys[4] = k4;
+    keys[5] = k5;
+
+    let (k6, k7l) = expand_round!(k4_5, k5r, 0x08);
+    keys[6] = k6;
+
+    let (k7_8, k8r) = expand_round!(k6, k7l, 0x10);
+    keys[7] = shuffle!(k7l, k7_8, 0);
+    keys[8] = shuffle!(k7_8, k8r, 1);
+
+    let (k9, k10l) = expand_round!(k7_8, k8r, 0x20);
+    keys[9] = k9;
+
+    let (k10_11, k11r) = expand_round!(k9, k10l, 0x40);
+    keys[10] = shuffle!(k10l, k10_11, 0);
+    keys[11] = shuffle!(k10_11, k11r, 1);
+
+    let (k12, _) = expand_round!(k10_11, k11r, 0x80);
+    keys[12] = k12;
+
+    keys
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn inv_expanded_keys(keys: &RoundKeys) -> RoundKeys {
+    [
+        keys[0],
+        _mm_aesimc_si128(keys[1]),
+        _mm_aesimc_si128(keys[2]),
+        _mm_aesimc_si128(keys[3]),
+        _mm_aesimc_si128(keys[4]),
+        _mm_aesimc_si128(keys[5]),
+        _mm_aesimc_si128(keys[6]),
+        _mm_aesimc_si128(keys[7]),
+        _mm_aesimc_si128(keys[8]),
+        _mm_aesimc_si128(keys[9]),
+        _mm_aesimc_si128(keys[10]),
+        _mm_aesimc_si128(keys[11]),
+        keys[12],
+    ]
+}
diff --git a/src/ni/aes256.rs b/src/ni/aes256.rs
new file mode 100644 (file)
index 0000000..bea090a
--- /dev/null
@@ -0,0 +1,196 @@
+use super::{arch::*, utils::*};
+use crate::{Block, Block8};
+use cipher::inout::InOut;
+use core::mem;
+
+/// AES-256 round keys
+pub(super) type RoundKeys = [__m128i; 15];
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) {
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut b = _mm_loadu_si128(in_ptr as *const __m128i);
+    b = _mm_xor_si128(b, keys[0]);
+    b = _mm_aesenc_si128(b, keys[1]);
+    b = _mm_aesenc_si128(b, keys[2]);
+    b = _mm_aesenc_si128(b, keys[3]);
+    b = _mm_aesenc_si128(b, keys[4]);
+    b = _mm_aesenc_si128(b, keys[5]);
+    b = _mm_aesenc_si128(b, keys[6]);
+    b = _mm_aesenc_si128(b, keys[7]);
+    b = _mm_aesenc_si128(b, keys[8]);
+    b = _mm_aesenc_si128(b, keys[9]);
+    b = _mm_aesenc_si128(b, keys[10]);
+    b = _mm_aesenc_si128(b, keys[11]);
+    b = _mm_aesenc_si128(b, keys[12]);
+    b = _mm_aesenc_si128(b, keys[13]);
+    b = _mm_aesenclast_si128(b, keys[14]);
+    _mm_storeu_si128(out_ptr as *mut __m128i, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) {
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let mut b = load8(in_ptr);
+    xor8(&mut b, keys[0]);
+    aesenc8(&mut b, keys[1]);
+    aesenc8(&mut b, keys[2]);
+    aesenc8(&mut b, keys[3]);
+    aesenc8(&mut b, keys[4]);
+    aesenc8(&mut b, keys[5]);
+    aesenc8(&mut b, keys[6]);
+    aesenc8(&mut b, keys[7]);
+    aesenc8(&mut b, keys[8]);
+    aesenc8(&mut b, keys[9]);
+    aesenc8(&mut b, keys[10]);
+    aesenc8(&mut b, keys[11]);
+    aesenc8(&mut b, keys[12]);
+    aesenc8(&mut b, keys[13]);
+    aesenclast8(&mut b, keys[14]);
+    store8(out_ptr, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) {
+    let (in_ptr, out_ptr) = block.into_raw();
+    let mut b = _mm_loadu_si128(in_ptr as *const __m128i);
+    b = _mm_xor_si128(b, keys[14]);
+    b = _mm_aesdec_si128(b, keys[13]);
+    b = _mm_aesdec_si128(b, keys[12]);
+    b = _mm_aesdec_si128(b, keys[11]);
+    b = _mm_aesdec_si128(b, keys[10]);
+    b = _mm_aesdec_si128(b, keys[9]);
+    b = _mm_aesdec_si128(b, keys[8]);
+    b = _mm_aesdec_si128(b, keys[7]);
+    b = _mm_aesdec_si128(b, keys[6]);
+    b = _mm_aesdec_si128(b, keys[5]);
+    b = _mm_aesdec_si128(b, keys[4]);
+    b = _mm_aesdec_si128(b, keys[3]);
+    b = _mm_aesdec_si128(b, keys[2]);
+    b = _mm_aesdec_si128(b, keys[1]);
+    b = _mm_aesdeclast_si128(b, keys[0]);
+    _mm_storeu_si128(out_ptr as *mut __m128i, b);
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn decrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) {
+    let (in_ptr, out_ptr) = blocks.into_raw();
+    let mut b = load8(in_ptr);
+    xor8(&mut b, keys[14]);
+    aesdec8(&mut b, keys[13]);
+    aesdec8(&mut b, keys[12]);
+    aesdec8(&mut b, keys[11]);
+    aesdec8(&mut b, keys[10]);
+    aesdec8(&mut b, keys[9]);
+    aesdec8(&mut b, keys[8]);
+    aesdec8(&mut b, keys[7]);
+    aesdec8(&mut b, keys[6]);
+    aesdec8(&mut b, keys[5]);
+    aesdec8(&mut b, keys[4]);
+    aesdec8(&mut b, keys[3]);
+    aesdec8(&mut b, keys[2]);
+    aesdec8(&mut b, keys[1]);
+    aesdeclast8(&mut b, keys[0]);
+    store8(out_ptr, b);
+}
+
+macro_rules! expand_round {
+    ($keys:expr, $pos:expr, $round:expr) => {
+        let mut t1 = $keys[$pos - 2];
+        let mut t2;
+        let mut t3 = $keys[$pos - 1];
+        let mut t4;
+
+        t2 = _mm_aeskeygenassist_si128(t3, $round);
+        t2 = _mm_shuffle_epi32(t2, 0xff);
+        t4 = _mm_slli_si128(t1, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t1 = _mm_xor_si128(t1, t2);
+
+        $keys[$pos] = t1;
+
+        t4 = _mm_aeskeygenassist_si128(t1, 0x00);
+        t2 = _mm_shuffle_epi32(t4, 0xaa);
+        t4 = _mm_slli_si128(t3, 0x4);
+        t3 = _mm_xor_si128(t3, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t3 = _mm_xor_si128(t3, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t3 = _mm_xor_si128(t3, t4);
+        t3 = _mm_xor_si128(t3, t2);
+
+        $keys[$pos + 1] = t3;
+    };
+}
+
+macro_rules! expand_round_last {
+    ($keys:expr, $pos:expr, $round:expr) => {
+        let mut t1 = $keys[$pos - 2];
+        let mut t2;
+        let t3 = $keys[$pos - 1];
+        let mut t4;
+
+        t2 = _mm_aeskeygenassist_si128(t3, $round);
+        t2 = _mm_shuffle_epi32(t2, 0xff);
+        t4 = _mm_slli_si128(t1, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t4 = _mm_slli_si128(t4, 0x4);
+        t1 = _mm_xor_si128(t1, t4);
+        t1 = _mm_xor_si128(t1, t2);
+
+        $keys[$pos] = t1;
+    };
+}
+
+#[inline(always)]
+pub(super) unsafe fn expand_key(key: &[u8; 32]) -> RoundKeys {
+    // SAFETY: `RoundKeys` is a `[__m128i; 15]` which can be initialized
+    // with all zeroes.
+    let mut keys: RoundKeys = mem::zeroed();
+
+    let kp = key.as_ptr() as *const __m128i;
+    keys[0] = _mm_loadu_si128(kp);
+    keys[1] = _mm_loadu_si128(kp.add(1));
+
+    expand_round!(keys, 2, 0x01);
+    expand_round!(keys, 4, 0x02);
+    expand_round!(keys, 6, 0x04);
+    expand_round!(keys, 8, 0x08);
+    expand_round!(keys, 10, 0x10);
+    expand_round!(keys, 12, 0x20);
+    expand_round_last!(keys, 14, 0x40);
+
+    keys
+}
+
+#[inline]
+#[target_feature(enable = "aes")]
+pub(super) unsafe fn inv_expanded_keys(keys: &RoundKeys) -> RoundKeys {
+    [
+        keys[0],
+        _mm_aesimc_si128(keys[1]),
+        _mm_aesimc_si128(keys[2]),
+        _mm_aesimc_si128(keys[3]),
+        _mm_aesimc_si128(keys[4]),
+        _mm_aesimc_si128(keys[5]),
+        _mm_aesimc_si128(keys[6]),
+        _mm_aesimc_si128(keys[7]),
+        _mm_aesimc_si128(keys[8]),
+        _mm_aesimc_si128(keys[9]),
+        _mm_aesimc_si128(keys[10]),
+        _mm_aesimc_si128(keys[11]),
+        _mm_aesimc_si128(keys[12]),
+        _mm_aesimc_si128(keys[13]),
+        keys[14],
+    ]
+}
diff --git a/src/ni/hazmat.rs b/src/ni/hazmat.rs
new file mode 100644 (file)
index 0000000..a2a735a
--- /dev/null
@@ -0,0 +1,80 @@
+//! Low-level "hazmat" AES functions: AES-NI support.
+//!
+//! Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
+//! implementations in this crate, but instead provides raw AES-NI accelerated
+//! access to the AES round function gated under the `hazmat` crate feature.
+
+use super::{
+    arch::*,
+    utils::{load8, store8},
+};
+use crate::{Block, Block8};
+
+/// AES cipher (encrypt) round function.
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) {
+    // Safety: `loadu` and `storeu` support unaligned access
+    let b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+    let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i);
+    let out = _mm_aesenc_si128(b, k);
+    _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out);
+}
+
+/// AES cipher (encrypt) round function: parallel version.
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+    let xmm_keys = load8(round_keys);
+    let mut xmm_blocks = load8(blocks);
+
+    for i in 0..8 {
+        xmm_blocks[i] = _mm_aesenc_si128(xmm_blocks[i], xmm_keys[i]);
+    }
+
+    store8(blocks, xmm_blocks);
+}
+
+/// AES equivalent inverse cipher (decrypt) round function.
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+    // Safety: `loadu` and `storeu` support unaligned access
+    let b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+    let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i);
+    let out = _mm_aesdec_si128(b, k);
+    _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out);
+}
+
+/// AES equivalent inverse cipher (decrypt) round function: parallel version.
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+    let xmm_keys = load8(round_keys);
+    let mut xmm_blocks = load8(blocks);
+
+    for i in 0..8 {
+        xmm_blocks[i] = _mm_aesdec_si128(xmm_blocks[i], xmm_keys[i]);
+    }
+
+    store8(blocks, xmm_blocks);
+}
+
+/// AES mix columns function.
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn mix_columns(block: &mut Block) {
+    // Safety: `loadu` and `storeu` support unaligned access
+    let mut state = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+
+    // Emulate MixColumns by performing three InvMixColumns operations:
+    // the MixColumns matrix has order 4, so InvMixColumns³ = MixColumns.
+    state = _mm_aesimc_si128(state);
+    state = _mm_aesimc_si128(state);
+    state = _mm_aesimc_si128(state);
+
+    _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, state);
+}
+
+/// AES inverse mix columns function.
+#[target_feature(enable = "aes")]
+pub(crate) unsafe fn inv_mix_columns(block: &mut Block) {
+    // Safety: `loadu` and `storeu` support unaligned access
+    let b = _mm_loadu_si128(block.as_ptr() as *const __m128i);
+    let out = _mm_aesimc_si128(b);
+    _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out);
+}
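The emulation above works because the MixColumns matrix has multiplicative order 4 over GF(2^8), so applying its inverse three times equals applying it once. That property can be checked through the portable `hazmat` API (a test sketch, assuming the `hazmat` feature and the `hex_literal` crate used elsewhere in this patch's tests):

```rust
use aes::{
    hazmat::{inv_mix_columns, mix_columns},
    Block,
};
use hex_literal::hex;

#[test]
fn mix_columns_equals_inv_mix_columns_cubed() {
    let mut a = Block::from(hex!("00112233445566778899aabbccddeeff"));
    let mut b = a;

    mix_columns(&mut a);

    // InvMixColumns³ = MixColumns, since the MixColumns matrix has order 4.
    inv_mix_columns(&mut b);
    inv_mix_columns(&mut b);
    inv_mix_columns(&mut b);

    assert_eq!(a, b);
}
```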
diff --git a/src/ni/test_expand.rs b/src/ni/test_expand.rs
new file mode 100644 (file)
index 0000000..6ab87c5
--- /dev/null
@@ -0,0 +1,275 @@
+use super::utils::check;
+use hex_literal::hex;
+
+#[test]
+fn aes128_expand_key_test() {
+    use super::aes128::expand_key;
+
+    let keys = [0x00; 16];
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x0000000000000000, 0x0000000000000000],
+            [0x6263636362636363, 0x6263636362636363],
+            [0x9b9898c9f9fbfbaa, 0x9b9898c9f9fbfbaa],
+            [0x90973450696ccffa, 0xf2f457330b0fac99],
+            [0xee06da7b876a1581, 0x759e42b27e91ee2b],
+            [0x7f2e2b88f8443e09, 0x8dda7cbbf34b9290],
+            [0xec614b851425758c, 0x99ff09376ab49ba7],
+            [0x217517873550620b, 0xacaf6b3cc61bf09b],
+            [0x0ef903333ba96138, 0x97060a04511dfa9f],
+            [0xb1d4d8e28a7db9da, 0x1d7bb3de4c664941],
+            [0xb4ef5bcb3e92e211, 0x23e951cf6f8f188e],
+        ],
+    );
+
+    let keys = [0xff; 16];
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0xffffffffffffffff, 0xffffffffffffffff],
+            [0xe8e9e9e917161616, 0xe8e9e9e917161616],
+            [0xadaeae19bab8b80f, 0x525151e6454747f0],
+            [0x090e2277b3b69a78, 0xe1e7cb9ea4a08c6e],
+            [0xe16abd3e52dc2746, 0xb33becd8179b60b6],
+            [0xe5baf3ceb766d488, 0x045d385013c658e6],
+            [0x71d07db3c6b6a93b, 0xc2eb916bd12dc98d],
+            [0xe90d208d2fbb89b6, 0xed5018dd3c7dd150],
+            [0x96337366b988fad0, 0x54d8e20d68a5335d],
+            [0x8bf03f233278c5f3, 0x66a027fe0e0514a3],
+            [0xd60a3588e472f07b, 0x82d2d7858cd7c326],
+        ],
+    );
+
+    let keys = hex!("000102030405060708090a0b0c0d0e0f");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x0001020304050607, 0x08090a0b0c0d0e0f],
+            [0xd6aa74fdd2af72fa, 0xdaa678f1d6ab76fe],
+            [0xb692cf0b643dbdf1, 0xbe9bc5006830b3fe],
+            [0xb6ff744ed2c2c9bf, 0x6c590cbf0469bf41],
+            [0x47f7f7bc95353e03, 0xf96c32bcfd058dfd],
+            [0x3caaa3e8a99f9deb, 0x50f3af57adf622aa],
+            [0x5e390f7df7a69296, 0xa7553dc10aa31f6b],
+            [0x14f9701ae35fe28c, 0x440adf4d4ea9c026],
+            [0x47438735a41c65b9, 0xe016baf4aebf7ad2],
+            [0x549932d1f0855768, 0x1093ed9cbe2c974e],
+            [0x13111d7fe3944a17, 0xf307a78b4d2b30c5],
+        ],
+    );
+
+    let keys = hex!("6920e299a5202a6d656e636869746f2a");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x6920e299a5202a6d, 0x656e636869746f2a],
+            [0xfa8807605fa82d0d, 0x3ac64e6553b2214f],
+            [0xcf75838d90ddae80, 0xaa1be0e5f9a9c1aa],
+            [0x180d2f1488d08194, 0x22cb6171db62a0db],
+            [0xbaed96ad323d1739, 0x10f67648cb94d693],
+            [0x881b4ab2ba265d8b, 0xaad02bc36144fd50],
+            [0xb34f195d096944d6, 0xa3b96f15c2fd9245],
+            [0xa7007778ae6933ae, 0x0dd05cbbcf2dcefe],
+            [0xff8bccf251e2ff5c, 0x5c32a3e7931f6d19],
+            [0x24b7182e7555e772, 0x29674495ba78298c],
+            [0xae127cdadb479ba8, 0xf220df3d4858f6b1],
+        ],
+    );
+
+    let keys = hex!("2b7e151628aed2a6abf7158809cf4f3c");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x2b7e151628aed2a6, 0xabf7158809cf4f3c],
+            [0xa0fafe1788542cb1, 0x23a339392a6c7605],
+            [0xf2c295f27a96b943, 0x5935807a7359f67f],
+            [0x3d80477d4716fe3e, 0x1e237e446d7a883b],
+            [0xef44a541a8525b7f, 0xb671253bdb0bad00],
+            [0xd4d1c6f87c839d87, 0xcaf2b8bc11f915bc],
+            [0x6d88a37a110b3efd, 0xdbf98641ca0093fd],
+            [0x4e54f70e5f5fc9f3, 0x84a64fb24ea6dc4f],
+            [0xead27321b58dbad2, 0x312bf5607f8d292f],
+            [0xac7766f319fadc21, 0x28d12941575c006e],
+            [0xd014f9a8c9ee2589, 0xe13f0cc8b6630ca6],
+        ],
+    );
+}
+
+#[test]
+fn aes192_expand_key_test() {
+    use super::aes192::expand_key;
+
+    let keys = [0x00; 24];
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x0000000000000000, 0x0000000000000000],
+            [0x0000000000000000, 0x6263636362636363],
+            [0x6263636362636363, 0x6263636362636363],
+            [0x9b9898c9f9fbfbaa, 0x9b9898c9f9fbfbaa],
+            [0x9b9898c9f9fbfbaa, 0x90973450696ccffa],
+            [0xf2f457330b0fac99, 0x90973450696ccffa],
+            [0xc81d19a9a171d653, 0x53858160588a2df9],
+            [0xc81d19a9a171d653, 0x7bebf49bda9a22c8],
+            [0x891fa3a8d1958e51, 0x198897f8b8f941ab],
+            [0xc26896f718f2b43f, 0x91ed1797407899c6],
+            [0x59f00e3ee1094f95, 0x83ecbc0f9b1e0830],
+            [0x0af31fa74a8b8661, 0x137b885ff272c7ca],
+            [0x432ac886d834c0b6, 0xd2c7df11984c5970],
+        ],
+    );
+
+    let keys = [0xff; 24];
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0xffffffffffffffff, 0xffffffffffffffff],
+            [0xffffffffffffffff, 0xe8e9e9e917161616],
+            [0xe8e9e9e917161616, 0xe8e9e9e917161616],
+            [0xadaeae19bab8b80f, 0x525151e6454747f0],
+            [0xadaeae19bab8b80f, 0xc5c2d8ed7f7a60e2],
+            [0x2d2b3104686c76f4, 0xc5c2d8ed7f7a60e2],
+            [0x1712403f686820dd, 0x454311d92d2f672d],
+            [0xe8edbfc09797df22, 0x8f8cd3b7e7e4f36a],
+            [0xa2a7e2b38f88859e, 0x67653a5ef0f2e57c],
+            [0x2655c33bc1b13051, 0x6316d2e2ec9e577c],
+            [0x8bfb6d227b09885e, 0x67919b1aa620ab4b],
+            [0xc53679a929a82ed5, 0xa25343f7d95acba9],
+            [0x598e482fffaee364, 0x3a989acd1330b418],
+        ],
+    );
+
+    let keys = hex!("000102030405060708090a0b0c0d0e0f1011121314151617");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x0001020304050607, 0x08090a0b0c0d0e0f],
+            [0x1011121314151617, 0x5846f2f95c43f4fe],
+            [0x544afef55847f0fa, 0x4856e2e95c43f4fe],
+            [0x40f949b31cbabd4d, 0x48f043b810b7b342],
+            [0x58e151ab04a2a555, 0x7effb5416245080c],
+            [0x2ab54bb43a02f8f6, 0x62e3a95d66410c08],
+            [0xf501857297448d7e, 0xbdf1c6ca87f33e3c],
+            [0xe510976183519b69, 0x34157c9ea351f1e0],
+            [0x1ea0372a99530916, 0x7c439e77ff12051e],
+            [0xdd7e0e887e2fff68, 0x608fc842f9dcc154],
+            [0x859f5f237a8d5a3d, 0xc0c02952beefd63a],
+            [0xde601e7827bcdf2c, 0xa223800fd8aeda32],
+            [0xa4970a331a78dc09, 0xc418c271e3a41d5d],
+        ],
+    );
+
+    let keys = hex!("8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x8e73b0f7da0e6452, 0xc810f32b809079e5],
+            [0x62f8ead2522c6b7b, 0xfe0c91f72402f5a5],
+            [0xec12068e6c827f6b, 0x0e7a95b95c56fec2],
+            [0x4db7b4bd69b54118, 0x85a74796e92538fd],
+            [0xe75fad44bb095386, 0x485af05721efb14f],
+            [0xa448f6d94d6dce24, 0xaa326360113b30e6],
+            [0xa25e7ed583b1cf9a, 0x27f939436a94f767],
+            [0xc0a69407d19da4e1, 0xec1786eb6fa64971],
+            [0x485f703222cb8755, 0xe26d135233f0b7b3],
+            [0x40beeb282f18a259, 0x6747d26b458c553e],
+            [0xa7e1466c9411f1df, 0x821f750aad07d753],
+            [0xca4005388fcc5006, 0x282d166abc3ce7b5],
+            [0xe98ba06f448c773c, 0x8ecc720401002202],
+        ],
+    );
+}
+
+#[test]
+fn aes256_expand_key_test() {
+    use super::aes256::expand_key;
+
+    let keys = [0x00; 32];
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x0000000000000000, 0x0000000000000000],
+            [0x0000000000000000, 0x0000000000000000],
+            [0x6263636362636363, 0x6263636362636363],
+            [0xaafbfbfbaafbfbfb, 0xaafbfbfbaafbfbfb],
+            [0x6f6c6ccf0d0f0fac, 0x6f6c6ccf0d0f0fac],
+            [0x7d8d8d6ad7767691, 0x7d8d8d6ad7767691],
+            [0x5354edc15e5be26d, 0x31378ea23c38810e],
+            [0x968a81c141fcf750, 0x3c717a3aeb070cab],
+            [0x9eaa8f28c0f16d45, 0xf1c6e3e7cdfe62e9],
+            [0x2b312bdf6acddc8f, 0x56bca6b5bdbbaa1e],
+            [0x6406fd52a4f79017, 0x553173f098cf1119],
+            [0x6dbba90b07767584, 0x51cad331ec71792f],
+            [0xe7b0e89c4347788b, 0x16760b7b8eb91a62],
+            [0x74ed0ba1739b7e25, 0x2251ad14ce20d43b],
+            [0x10f80a1753bf729c, 0x45c979e7cb706385],
+        ],
+    );
+
+    let keys = [0xff; 32];
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0xffffffffffffffff, 0xffffffffffffffff],
+            [0xffffffffffffffff, 0xffffffffffffffff],
+            [0xe8e9e9e917161616, 0xe8e9e9e917161616],
+            [0x0fb8b8b8f0474747, 0x0fb8b8b8f0474747],
+            [0x4a4949655d5f5f73, 0xb5b6b69aa2a0a08c],
+            [0x355858dcc51f1f9b, 0xcaa7a7233ae0e064],
+            [0xafa80ae5f2f75596, 0x4741e30ce5e14380],
+            [0xeca0421129bf5d8a, 0xe318faa9d9f81acd],
+            [0xe60ab7d014fde246, 0x53bc014ab65d42ca],
+            [0xa2ec6e658b5333ef, 0x684bc946b1b3d38b],
+            [0x9b6c8a188f91685e, 0xdc2d69146a702bde],
+            [0xa0bd9f782beeac97, 0x43a565d1f216b65a],
+            [0xfc22349173b35ccf, 0xaf9e35dbc5ee1e05],
+            [0x0695ed132d7b4184, 0x6ede24559cc8920f],
+            [0x546d424f27de1e80, 0x88402b5b4dae355e],
+        ],
+    );
+
+    let keys = hex!("000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x0001020304050607, 0x08090a0b0c0d0e0f],
+            [0x1011121314151617, 0x18191a1b1c1d1e1f],
+            [0xa573c29fa176c498, 0xa97fce93a572c09c],
+            [0x1651a8cd0244beda, 0x1a5da4c10640bade],
+            [0xae87dff00ff11b68, 0xa68ed5fb03fc1567],
+            [0x6de1f1486fa54f92, 0x75f8eb5373b8518d],
+            [0xc656827fc9a79917, 0x6f294cec6cd5598b],
+            [0x3de23a75524775e7, 0x27bf9eb45407cf39],
+            [0x0bdc905fc27b0948, 0xad5245a4c1871c2f],
+            [0x45f5a66017b2d387, 0x300d4d33640a820a],
+            [0x7ccff71cbeb4fe54, 0x13e6bbf0d261a7df],
+            [0xf01afafee7a82979, 0xd7a5644ab3afe640],
+            [0x2541fe719bf50025, 0x8813bbd55a721c0a],
+            [0x4e5a6699a9f24fe0, 0x7e572baacdf8cdea],
+            [0x24fc79ccbf0979e9, 0x371ac23c6d68de36],
+        ],
+    );
+
+    let keys = hex!("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4");
+    check(
+        unsafe { &expand_key(&keys) },
+        &[
+            [0x603deb1015ca71be, 0x2b73aef0857d7781],
+            [0x1f352c073b6108d7, 0x2d9810a30914dff4],
+            [0x9ba354118e6925af, 0xa51a8b5f2067fcde],
+            [0xa8b09c1a93d194cd, 0xbe49846eb75d5b9a],
+            [0xd59aecb85bf3c917, 0xfee94248de8ebe96],
+            [0xb5a9328a2678a647, 0x983122292f6c79b3],
+            [0x812c81addadf48ba, 0x24360af2fab8b464],
+            [0x98c5bfc9bebd198e, 0x268c3ba709e04214],
+            [0x68007bacb2df3316, 0x96e939e46c518d80],
+            [0xc814e20476a9fb8a, 0x5025c02d59c58239],
+            [0xde1369676ccc5a71, 0xfa2563959674ee15],
+            [0x5886ca5d2e2f31d7, 0x7e0af1fa27cf73c3],
+            [0x749c47ab18501dda, 0xe2757e4f7401905a],
+            [0xcafaaae3e4d59b34, 0x9adf6acebd10190d],
+            [0xfe4890d1e6188d0b, 0x046df344706c631e],
+        ],
+    );
+}
diff --git a/src/ni/utils.rs b/src/ni/utils.rs
new file mode 100644 (file)
index 0000000..1bd6522
--- /dev/null
@@ -0,0 +1,92 @@
+//! Utility functions
+
+// TODO(tarcieri): check performance impact / generated assembly changes
+#![allow(clippy::needless_range_loop)]
+
+use super::arch::*;
+use crate::{Block, Block8};
+
+pub type U128x8 = [__m128i; 8];
+
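+/// Test-only helper: asserts that each expanded `__m128i` round key equals the
+/// expected value, with expectations written as big-endian `[u64; 2]` pairs.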
+#[cfg(test)]
+pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) {
+    for (v1, v2) in a.iter().zip(b) {
+        let t1: [u64; 2] = unsafe { core::mem::transmute(*v1) };
+        let t2 = [v2[0].to_be(), v2[1].to_be()];
+        assert_eq!(t1, t2);
+    }
+}
+
+#[inline(always)]
+pub(crate) fn load8(blocks: *const Block8) -> U128x8 {
+    unsafe {
+        let p = blocks as *const Block;
+        [
+            _mm_loadu_si128(p.add(0) as *const __m128i),
+            _mm_loadu_si128(p.add(1) as *const __m128i),
+            _mm_loadu_si128(p.add(2) as *const __m128i),
+            _mm_loadu_si128(p.add(3) as *const __m128i),
+            _mm_loadu_si128(p.add(4) as *const __m128i),
+            _mm_loadu_si128(p.add(5) as *const __m128i),
+            _mm_loadu_si128(p.add(6) as *const __m128i),
+            _mm_loadu_si128(p.add(7) as *const __m128i),
+        ]
+    }
+}
+
+#[inline(always)]
+pub(crate) fn store8(blocks: *mut Block8, b: U128x8) {
+    unsafe {
+        let p = blocks as *mut Block;
+        _mm_storeu_si128(p.add(0) as *mut __m128i, b[0]);
+        _mm_storeu_si128(p.add(1) as *mut __m128i, b[1]);
+        _mm_storeu_si128(p.add(2) as *mut __m128i, b[2]);
+        _mm_storeu_si128(p.add(3) as *mut __m128i, b[3]);
+        _mm_storeu_si128(p.add(4) as *mut __m128i, b[4]);
+        _mm_storeu_si128(p.add(5) as *mut __m128i, b[5]);
+        _mm_storeu_si128(p.add(6) as *mut __m128i, b[6]);
+        _mm_storeu_si128(p.add(7) as *mut __m128i, b[7]);
+    }
+}
+
+#[inline(always)]
+pub(crate) fn xor8(b: &mut U128x8, key: __m128i) {
+    unsafe {
+        b[0] = _mm_xor_si128(b[0], key);
+        b[1] = _mm_xor_si128(b[1], key);
+        b[2] = _mm_xor_si128(b[2], key);
+        b[3] = _mm_xor_si128(b[3], key);
+        b[4] = _mm_xor_si128(b[4], key);
+        b[5] = _mm_xor_si128(b[5], key);
+        b[6] = _mm_xor_si128(b[6], key);
+        b[7] = _mm_xor_si128(b[7], key);
+    }
+}
+
+#[inline(always)]
+pub(crate) fn aesenc8(buffer: &mut U128x8, key: __m128i) {
+    for i in 0..8 {
+        buffer[i] = unsafe { _mm_aesenc_si128(buffer[i], key) };
+    }
+}
+
+#[inline(always)]
+pub(crate) fn aesenclast8(buffer: &mut U128x8, key: __m128i) {
+    for i in 0..8 {
+        buffer[i] = unsafe { _mm_aesenclast_si128(buffer[i], key) };
+    }
+}
+
+#[inline(always)]
+pub(crate) fn aesdec8(buffer: &mut U128x8, key: __m128i) {
+    for i in 0..8 {
+        buffer[i] = unsafe { _mm_aesdec_si128(buffer[i], key) };
+    }
+}
+
+#[inline(always)]
+pub(crate) fn aesdeclast8(buffer: &mut U128x8, key: __m128i) {
+    for i in 0..8 {
+        buffer[i] = unsafe { _mm_aesdeclast_si128(buffer[i], key) };
+    }
+}
diff --git a/src/soft.rs b/src/soft.rs
new file mode 100644 (file)
index 0000000..5f90b1e
--- /dev/null
@@ -0,0 +1,342 @@
+//! AES block cipher constant-time implementation.
+//!
+//! The implementation uses a technique called [fixslicing][1], an improved
+//! form of bitslicing which represents ciphers in a way which enables
+//! very efficient constant-time implementations in software.
+//!
+//! [1]: https://eprint.iacr.org/2020/1123.pdf
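+//!
+//! A minimal usage sketch, assuming the `cipher` v0.4 traits and the `Aes128`
+//! type re-exported from the crate root:
+//!
+//! ```ignore
+//! use aes::Aes128;
+//! use aes::cipher::{generic_array::GenericArray, BlockDecrypt, BlockEncrypt, KeyInit};
+//!
+//! let key = GenericArray::from([0u8; 16]);
+//! let mut block = GenericArray::from([42u8; 16]);
+//!
+//! let cipher = Aes128::new(&key);
+//! cipher.encrypt_block(&mut block); // one 16-byte block, encrypted in place
+//! cipher.decrypt_block(&mut block); // decrypting round-trips the plaintext
+//! assert_eq!(block, GenericArray::from([42u8; 16]));
+//! ```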
+
+#![deny(unsafe_code)]
+
+#[cfg_attr(not(target_pointer_width = "64"), path = "soft/fixslice32.rs")]
+#[cfg_attr(target_pointer_width = "64", path = "soft/fixslice64.rs")]
+pub(crate) mod fixslice;
+
+use crate::Block;
+use cipher::{
+    consts::{U16, U24, U32},
+    inout::InOut,
+    AlgorithmName, BlockBackend, BlockCipher, BlockClosure, BlockDecrypt, BlockEncrypt,
+    BlockSizeUser, Key, KeyInit, KeySizeUser, ParBlocksSizeUser,
+};
+use core::fmt;
+use fixslice::{BatchBlocks, FixsliceBlocks, FixsliceKeys128, FixsliceKeys192, FixsliceKeys256};
+
+macro_rules! define_aes_impl {
+    (
+        $name:tt,
+        $name_enc:ident,
+        $name_dec:ident,
+        $name_back_enc:ident,
+        $name_back_dec:ident,
+        $key_size:ty,
+        $fixslice_keys:ty,
+        $fixslice_key_schedule:path,
+        $fixslice_decrypt:path,
+        $fixslice_encrypt:path,
+        $doc:expr $(,)?
+    ) => {
+        #[doc=$doc]
+        #[doc = "block cipher"]
+        #[derive(Clone)]
+        pub struct $name {
+            keys: $fixslice_keys,
+        }
+
+        impl $name {
+            #[inline(always)]
+            pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> {
+                $name_back_enc(self)
+            }
+
+            #[inline(always)]
+            pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> {
+                $name_back_dec(self)
+            }
+        }
+
+        impl KeySizeUser for $name {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name {
+            #[inline]
+            fn new(key: &Key<Self>) -> Self {
+                Self {
+                    keys: $fixslice_key_schedule(key.as_ref()),
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name {
+            type BlockSize = U16;
+        }
+
+        impl BlockCipher for $name {}
+
+        impl BlockEncrypt for $name {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_enc_backend())
+            }
+        }
+
+        impl BlockDecrypt for $name {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_dec_backend())
+            }
+        }
+
+        impl From<$name_enc> for $name {
+            #[inline]
+            fn from(enc: $name_enc) -> $name {
+                enc.inner
+            }
+        }
+
+        impl From<&$name_enc> for $name {
+            #[inline]
+            fn from(enc: &$name_enc) -> $name {
+                enc.inner.clone()
+            }
+        }
+
+        impl fmt::Debug for $name {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name))
+            }
+        }
+
+        impl Drop for $name {
+            #[inline]
+            fn drop(&mut self) {
+                #[cfg(feature = "zeroize")]
+                zeroize::Zeroize::zeroize(&mut self.keys);
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (encrypt-only)"]
+        #[derive(Clone)]
+        pub struct $name_enc {
+            inner: $name,
+        }
+
+        impl $name_enc {
+            #[inline(always)]
+            pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> {
+                self.inner.get_enc_backend()
+            }
+        }
+
+        impl BlockCipher for $name_enc {}
+
+        impl KeySizeUser for $name_enc {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_enc {
+            #[inline(always)]
+            fn new(key: &Key<Self>) -> Self {
+                let inner = $name::new(key);
+                Self { inner }
+            }
+        }
+
+        impl BlockSizeUser for $name_enc {
+            type BlockSize = U16;
+        }
+
+        impl BlockEncrypt for $name_enc {
+            fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_enc_backend())
+            }
+        }
+
+        impl fmt::Debug for $name_enc {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_enc), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_enc {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_enc))
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_enc {}
+
+        #[doc=$doc]
+        #[doc = "block cipher (decrypt-only)"]
+        #[derive(Clone)]
+        pub struct $name_dec {
+            inner: $name,
+        }
+
+        impl $name_dec {
+            #[inline(always)]
+            pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> {
+                self.inner.get_dec_backend()
+            }
+        }
+
+        impl BlockCipher for $name_dec {}
+
+        impl KeySizeUser for $name_dec {
+            type KeySize = $key_size;
+        }
+
+        impl KeyInit for $name_dec {
+            #[inline(always)]
+            fn new(key: &Key<Self>) -> Self {
+                let inner = $name::new(key);
+                Self { inner }
+            }
+        }
+
+        impl From<$name_enc> for $name_dec {
+            #[inline]
+            fn from(enc: $name_enc) -> $name_dec {
+                Self { inner: enc.inner }
+            }
+        }
+
+        impl From<&$name_enc> for $name_dec {
+            #[inline]
+            fn from(enc: &$name_enc) -> $name_dec {
+                Self {
+                    inner: enc.inner.clone(),
+                }
+            }
+        }
+
+        impl BlockSizeUser for $name_dec {
+            type BlockSize = U16;
+        }
+
+        impl BlockDecrypt for $name_dec {
+            fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = U16>) {
+                f.call(&mut self.get_dec_backend());
+            }
+        }
+
+        impl fmt::Debug for $name_dec {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+                f.write_str(concat!(stringify!($name_dec), " { .. }"))
+            }
+        }
+
+        impl AlgorithmName for $name_dec {
+            fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                f.write_str(stringify!($name_dec))
+            }
+        }
+
+        #[cfg(feature = "zeroize")]
+        impl zeroize::ZeroizeOnDrop for $name_dec {}
+
+        pub(crate) struct $name_back_enc<'a>(&'a $name);
+
+        impl<'a> BlockSizeUser for $name_back_enc<'a> {
+            type BlockSize = U16;
+        }
+
+        impl<'a> ParBlocksSizeUser for $name_back_enc<'a> {
+            type ParBlocksSize = FixsliceBlocks;
+        }
+
+        impl<'a> BlockBackend for $name_back_enc<'a> {
+            #[inline(always)]
+            fn proc_block(&mut self, mut block: InOut<'_, '_, Block>) {
+                let mut blocks = BatchBlocks::default();
+                blocks[0] = block.clone_in().into();
+                let res = $fixslice_encrypt(&self.0.keys, &blocks);
+                *block.get_out() = res[0].into();
+            }
+
+            #[inline(always)]
+            fn proc_par_blocks(&mut self, mut blocks: InOut<'_, '_, BatchBlocks>) {
+                let res = $fixslice_encrypt(&self.0.keys, blocks.get_in());
+                *blocks.get_out() = res;
+            }
+        }
+
+        pub(crate) struct $name_back_dec<'a>(&'a $name);
+
+        impl<'a> BlockSizeUser for $name_back_dec<'a> {
+            type BlockSize = U16;
+        }
+
+        impl<'a> ParBlocksSizeUser for $name_back_dec<'a> {
+            type ParBlocksSize = FixsliceBlocks;
+        }
+
+        impl<'a> BlockBackend for $name_back_dec<'a> {
+            #[inline(always)]
+            fn proc_block(&mut self, mut block: InOut<'_, '_, Block>) {
+                let mut blocks = BatchBlocks::default();
+                blocks[0] = block.clone_in();
+                let res = $fixslice_decrypt(&self.0.keys, &blocks);
+                *block.get_out() = res[0];
+            }
+
+            #[inline(always)]
+            fn proc_par_blocks(&mut self, mut blocks: InOut<'_, '_, BatchBlocks>) {
+                let res = $fixslice_decrypt(&self.0.keys, blocks.get_in());
+                *blocks.get_out() = res;
+            }
+        }
+    };
+}
+
+define_aes_impl!(
+    Aes128,
+    Aes128Enc,
+    Aes128Dec,
+    Aes128BackEnc,
+    Aes128BackDec,
+    U16,
+    FixsliceKeys128,
+    fixslice::aes128_key_schedule,
+    fixslice::aes128_decrypt,
+    fixslice::aes128_encrypt,
+    "AES-128",
+);
+
+define_aes_impl!(
+    Aes192,
+    Aes192Enc,
+    Aes192Dec,
+    Aes192BackEnc,
+    Aes192BackDec,
+    U24,
+    FixsliceKeys192,
+    fixslice::aes192_key_schedule,
+    fixslice::aes192_decrypt,
+    fixslice::aes192_encrypt,
+    "AES-192",
+);
+
+define_aes_impl!(
+    Aes256,
+    Aes256Enc,
+    Aes256Dec,
+    Aes256BackEnc,
+    Aes256BackDec,
+    U32,
+    FixsliceKeys256,
+    fixslice::aes256_key_schedule,
+    fixslice::aes256_decrypt,
+    fixslice::aes256_encrypt,
+    "AES-256",
+);
diff --git a/src/soft/fixslice32.rs b/src/soft/fixslice32.rs
new file mode 100644 (file)
index 0000000..45b674d
--- /dev/null
@@ -0,0 +1,1479 @@
+//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit)
+//! adapted from the C implementation
+//!
+//! All implementations are fully bitsliced and do not rely on any
+//! Look-Up Table (LUT).
+//!
+//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
+//!
+//! # Author (original C code)
+//!
+//! Alexandre Adomnicai, Nanyang Technological University, Singapore
+//! <alexandre.adomnicai@ntu.edu.sg>
+//!
+//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.
+
+#![allow(clippy::unreadable_literal)]
+
+use crate::Block;
+use cipher::{consts::U2, generic_array::GenericArray};
+
+/// AES block batch size for this implementation
+pub(crate) type FixsliceBlocks = U2;
+
+pub(crate) type BatchBlocks = GenericArray<Block, FixsliceBlocks>;
+
+/// AES-128 round keys
+pub(crate) type FixsliceKeys128 = [u32; 88];
+
+/// AES-192 round keys
+pub(crate) type FixsliceKeys192 = [u32; 104];
+
+/// AES-256 round keys
+pub(crate) type FixsliceKeys256 = [u32; 120];
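+
+// Each bitsliced round key occupies 8 `u32` words, so the schedules above hold
+// 11 * 8 = 88 (AES-128), 13 * 8 = 104 (AES-192) and 15 * 8 = 120 (AES-256) words.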
+
+/// 256-bit internal state
+pub(crate) type State = [u32; 8];
+
+/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
+pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 {
+    let mut rkeys = [0u32; 88];
+
+    bitslice(&mut rkeys[..8], key, key);
+
+    let mut rk_off = 0;
+    for rcon in 0..10 {
+        memshift32(&mut rkeys, rk_off);
+        rk_off += 8;
+
+        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
+        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
+
+        if rcon < 8 {
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
+        } else {
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
+        }
+
+        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
+    }
+
+    // Adjust to match fixslicing format
+    #[cfg(aes_compact)]
+    {
+        for i in (8..88).step_by(16) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+        }
+    }
+    #[cfg(not(aes_compact))]
+    {
+        for i in (8..72).step_by(32) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
+            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
+        }
+        inv_shift_rows_1(&mut rkeys[72..80]);
+    }
+
+    // Account for NOTs removed from sub_bytes
+    for i in 1..11 {
+        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
+    }
+
+    rkeys
+}
+
+/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
+pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 {
+    let mut rkeys = [0u32; 104];
+    let mut tmp = [0u32; 8];
+
+    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
+    bitslice(&mut tmp, &key[8..], &key[8..]);
+
+    let mut rcon = 0;
+    let mut rk_off = 8;
+
+    loop {
+        for i in 0..8 {
+            rkeys[rk_off + i] =
+                (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
+        }
+
+        sub_bytes(&mut tmp);
+        sub_bytes_nots(&mut tmp);
+
+        add_round_constant_bit(&mut tmp, rcon);
+        rcon += 1;
+
+        for i in 0..8 {
+            let mut ti = rkeys[rk_off + i];
+            ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1));
+            ti ^= 0xc0c0c0c0 & (ti << 2);
+            tmp[i] = ti;
+        }
+        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
+        rk_off += 8;
+
+        for i in 0..8 {
+            let ui = tmp[i];
+            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4));
+            ti ^= 0x03030303 & (ui >> 6);
+            tmp[i] =
+                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
+        }
+        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
+        rk_off += 8;
+
+        sub_bytes(&mut tmp);
+        sub_bytes_nots(&mut tmp);
+
+        add_round_constant_bit(&mut tmp, rcon);
+        rcon += 1;
+
+        for i in 0..8 {
+            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4))
+                | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
+            ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3));
+            rkeys[rk_off + i] =
+                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
+        }
+        rk_off += 8;
+
+        if rcon >= 8 {
+            break;
+        }
+
+        for i in 0..8 {
+            let ui = rkeys[(rk_off - 8) + i];
+            let mut ti = rkeys[(rk_off - 16) + i];
+            ti ^= 0x30303030 & (ui >> 2);
+            ti ^= 0xc0c0c0c0 & (ti << 2);
+            tmp[i] = ti;
+        }
+    }
+
+    // Adjust to match fixslicing format
+    #[cfg(aes_compact)]
+    {
+        for i in (8..104).step_by(16) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+        }
+    }
+    #[cfg(not(aes_compact))]
+    {
+        for i in (0..96).step_by(32) {
+            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
+            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
+            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
+        }
+    }
+
+    // Account for NOTs removed from sub_bytes
+    for i in 1..13 {
+        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
+    }
+
+    rkeys
+}
+
+/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
+pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 {
+    let mut rkeys = [0u32; 120];
+
+    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
+    bitslice(&mut rkeys[8..16], &key[16..], &key[16..]);
+
+    let mut rk_off = 8;
+
+    let mut rcon = 0;
+    loop {
+        memshift32(&mut rkeys, rk_off);
+        rk_off += 8;
+
+        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
+        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
+
+        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
+        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
+        rcon += 1;
+
+        if rcon == 7 {
+            break;
+        }
+
+        memshift32(&mut rkeys, rk_off);
+        rk_off += 8;
+
+        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
+        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
+
+        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
+    }
+
+    // Adjust to match fixslicing format
+    #[cfg(aes_compact)]
+    {
+        for i in (8..120).step_by(16) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+        }
+    }
+    #[cfg(not(aes_compact))]
+    {
+        for i in (8..104).step_by(32) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
+            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
+        }
+        inv_shift_rows_1(&mut rkeys[104..112]);
+    }
+
+    // Account for NOTs removed from sub_bytes
+    for i in 1..15 {
+        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
+    }
+
+    rkeys
+}
+
+/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
+///
+/// Decrypts two blocks in parallel.
+pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1]);
+
+    add_round_key(&mut state, &rkeys[80..]);
+    inv_sub_bytes(&mut state);
+
+    #[cfg(not(aes_compact))]
+    {
+        inv_shift_rows_2(&mut state);
+    }
+
+    let mut rk_off = 72;
+    loop {
+        #[cfg(aes_compact)]
+        {
+            inv_shift_rows_2(&mut state);
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_1(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        if rk_off == 0 {
+            break;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_0(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        #[cfg(not(aes_compact))]
+        {
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_3(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_2(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+        }
+    }
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
+///
+/// Encrypts two blocks in parallel.
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1]);
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    let mut rk_off = 8;
+    loop {
+        sub_bytes(&mut state);
+        mix_columns_1(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+
+        #[cfg(aes_compact)]
+        {
+            shift_rows_2(&mut state);
+        }
+
+        if rk_off == 80 {
+            break;
+        }
+
+        #[cfg(not(aes_compact))]
+        {
+            sub_bytes(&mut state);
+            mix_columns_2(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+
+            sub_bytes(&mut state);
+            mix_columns_3(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+        }
+
+        sub_bytes(&mut state);
+        mix_columns_0(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+    }
+
+    #[cfg(not(aes_compact))]
+    {
+        shift_rows_2(&mut state);
+    }
+
+    sub_bytes(&mut state);
+    add_round_key(&mut state, &rkeys[80..]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
+///
+/// Decrypts two blocks in parallel.
+pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1]);
+
+    add_round_key(&mut state, &rkeys[96..]);
+    inv_sub_bytes(&mut state);
+
+    let mut rk_off = 88;
+    loop {
+        #[cfg(aes_compact)]
+        {
+            inv_shift_rows_2(&mut state);
+        }
+        #[cfg(not(aes_compact))]
+        {
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_3(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_2(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_1(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        if rk_off == 0 {
+            break;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_0(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+    }
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
+///
+/// Encrypts two blocks in parallel.
+pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1]);
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    let mut rk_off = 8;
+    loop {
+        sub_bytes(&mut state);
+        mix_columns_1(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+
+        #[cfg(aes_compact)]
+        {
+            shift_rows_2(&mut state);
+        }
+        #[cfg(not(aes_compact))]
+        {
+            sub_bytes(&mut state);
+            mix_columns_2(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+
+            sub_bytes(&mut state);
+            mix_columns_3(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+        }
+
+        if rk_off == 96 {
+            break;
+        }
+
+        sub_bytes(&mut state);
+        mix_columns_0(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+    }
+
+    sub_bytes(&mut state);
+    add_round_key(&mut state, &rkeys[96..]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
+///
+/// Decrypts two blocks in parallel.
+pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1]);
+
+    add_round_key(&mut state, &rkeys[112..]);
+    inv_sub_bytes(&mut state);
+
+    #[cfg(not(aes_compact))]
+    {
+        inv_shift_rows_2(&mut state);
+    }
+
+    let mut rk_off = 104;
+    loop {
+        #[cfg(aes_compact)]
+        {
+            inv_shift_rows_2(&mut state);
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_1(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        if rk_off == 0 {
+            break;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_0(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        #[cfg(not(aes_compact))]
+        {
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_3(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_2(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+        }
+    }
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
+///
+/// Encrypts two blocks in parallel.
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1]);
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    let mut rk_off = 8;
+    loop {
+        sub_bytes(&mut state);
+        mix_columns_1(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+
+        #[cfg(aes_compact)]
+        {
+            shift_rows_2(&mut state);
+        }
+
+        if rk_off == 112 {
+            break;
+        }
+
+        #[cfg(not(aes_compact))]
+        {
+            sub_bytes(&mut state);
+            mix_columns_2(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+
+            sub_bytes(&mut state);
+            mix_columns_3(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+        }
+
+        sub_bytes(&mut state);
+        mix_columns_0(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+    }
+
+    #[cfg(not(aes_compact))]
+    {
+        shift_rows_2(&mut state);
+    }
+
+    sub_bytes(&mut state);
+    add_round_key(&mut state, &rkeys[112..]);
+
+    inv_bitslice(&state)
+}
+
+/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true
+/// inverse of 'sub_bytes'.
+fn inv_sub_bytes(state: &mut [u32]) {
+    debug_assert_eq!(state.len(), 8);
+
+    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
+    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)
+
+    let u7 = state[0];
+    let u6 = state[1];
+    let u5 = state[2];
+    let u4 = state[3];
+    let u3 = state[4];
+    let u2 = state[5];
+    let u1 = state[6];
+    let u0 = state[7];
+
+    let t23 = u0 ^ u3;
+    let t8 = u1 ^ t23;
+    let m2 = t23 & t8;
+    let t4 = u4 ^ t8;
+    let t22 = u1 ^ u3;
+    let t2 = u0 ^ u1;
+    let t1 = u3 ^ u4;
+    // t23 -> stack
+    let t9 = u7 ^ t1;
+    // t8 -> stack
+    let m7 = t22 & t9;
+    // t9 -> stack
+    let t24 = u4 ^ u7;
+    // m7 -> stack
+    let t10 = t2 ^ t24;
+    // u4 -> stack
+    let m14 = t2 & t10;
+    let r5 = u6 ^ u7;
+    // m2 -> stack
+    let t3 = t1 ^ r5;
+    // t2 -> stack
+    let t13 = t2 ^ r5;
+    let t19 = t22 ^ r5;
+    // t3 -> stack
+    let t17 = u2 ^ t19;
+    // t4 -> stack
+    let t25 = u2 ^ t1;
+    let r13 = u1 ^ u6;
+    // t25 -> stack
+    let t20 = t24 ^ r13;
+    // t17 -> stack
+    let m9 = t20 & t17;
+    // t20 -> stack
+    let r17 = u2 ^ u5;
+    // t22 -> stack
+    let t6 = t22 ^ r17;
+    // t13 -> stack
+    let m1 = t13 & t6;
+    let y5 = u0 ^ r17;
+    let m4 = t19 & y5;
+    let m5 = m4 ^ m1;
+    let m17 = m5 ^ t24;
+    let r18 = u5 ^ u6;
+    let t27 = t1 ^ r18;
+    let t15 = t10 ^ t27;
+    // t6 -> stack
+    let m11 = t1 & t15;
+    let m15 = m14 ^ m11;
+    let m21 = m17 ^ m15;
+    // t1 -> stack
+    // t4 <- stack
+    let m12 = t4 & t27;
+    let m13 = m12 ^ m11;
+    let t14 = t10 ^ r18;
+    let m3 = t14 ^ m1;
+    // m2 <- stack
+    let m16 = m3 ^ m2;
+    let m20 = m16 ^ m13;
+    // u4 <- stack
+    let r19 = u2 ^ u4;
+    let t16 = r13 ^ r19;
+    // t3 <- stack
+    let t26 = t3 ^ t16;
+    let m6 = t3 & t16;
+    let m8 = t26 ^ m6;
+    // t10 -> stack
+    // m7 <- stack
+    let m18 = m8 ^ m7;
+    let m22 = m18 ^ m13;
+    let m25 = m22 & m20;
+    let m26 = m21 ^ m25;
+    let m10 = m9 ^ m6;
+    let m19 = m10 ^ m15;
+    // t25 <- stack
+    let m23 = m19 ^ t25;
+    let m28 = m23 ^ m25;
+    let m24 = m22 ^ m23;
+    let m30 = m26 & m24;
+    let m39 = m23 ^ m30;
+    let m48 = m39 & y5;
+    let m57 = m39 & t19;
+    // m48 -> stack
+    let m36 = m24 ^ m25;
+    let m31 = m20 & m23;
+    let m27 = m20 ^ m21;
+    let m32 = m27 & m31;
+    let m29 = m28 & m27;
+    let m37 = m21 ^ m29;
+    // m39 -> stack
+    let m42 = m37 ^ m39;
+    let m52 = m42 & t15;
+    // t27 -> stack
+    // t1 <- stack
+    let m61 = m42 & t1;
+    let p0 = m52 ^ m61;
+    let p16 = m57 ^ m61;
+    // m57 -> stack
+    // t20 <- stack
+    let m60 = m37 & t20;
+    // p16 -> stack
+    // t17 <- stack
+    let m51 = m37 & t17;
+    let m33 = m27 ^ m25;
+    let m38 = m32 ^ m33;
+    let m43 = m37 ^ m38;
+    let m49 = m43 & t16;
+    let p6 = m49 ^ m60;
+    let p13 = m49 ^ m51;
+    let m58 = m43 & t3;
+    // t9 <- stack
+    let m50 = m38 & t9;
+    // t22 <- stack
+    let m59 = m38 & t22;
+    // p6 -> stack
+    let p1 = m58 ^ m59;
+    let p7 = p0 ^ p1;
+    let m34 = m21 & m22;
+    let m35 = m24 & m34;
+    let m40 = m35 ^ m36;
+    let m41 = m38 ^ m40;
+    let m45 = m42 ^ m41;
+    // t27 <- stack
+    let m53 = m45 & t27;
+    let p8 = m50 ^ m53;
+    let p23 = p7 ^ p8;
+    // t4 <- stack
+    let m62 = m45 & t4;
+    let p14 = m49 ^ m62;
+    let s6 = p14 ^ p23;
+    // t10 <- stack
+    let m54 = m41 & t10;
+    let p2 = m54 ^ m62;
+    let p22 = p2 ^ p7;
+    let s0 = p13 ^ p22;
+    let p17 = m58 ^ p2;
+    let p15 = m54 ^ m59;
+    // t2 <- stack
+    let m63 = m41 & t2;
+    // m39 <- stack
+    let m44 = m39 ^ m40;
+    // p17 -> stack
+    // t6 <- stack
+    let m46 = m44 & t6;
+    let p5 = m46 ^ m51;
+    // p23 -> stack
+    let p18 = m63 ^ p5;
+    let p24 = p5 ^ p7;
+    // m48 <- stack
+    let p12 = m46 ^ m48;
+    let s3 = p12 ^ p22;
+    // t13 <- stack
+    let m55 = m44 & t13;
+    let p9 = m55 ^ m63;
+    // p16 <- stack
+    let s7 = p9 ^ p16;
+    // t8 <- stack
+    let m47 = m40 & t8;
+    let p3 = m47 ^ m50;
+    let p19 = p2 ^ p3;
+    let s5 = p19 ^ p24;
+    let p11 = p0 ^ p3;
+    let p26 = p9 ^ p11;
+    // t23 <- stack
+    let m56 = m40 & t23;
+    let p4 = m48 ^ m56;
+    // p6 <- stack
+    let p20 = p4 ^ p6;
+    let p29 = p15 ^ p20;
+    let s1 = p26 ^ p29;
+    // m57 <- stack
+    let p10 = m57 ^ p4;
+    let p27 = p10 ^ p18;
+    // p23 <- stack
+    let s4 = p23 ^ p27;
+    let p25 = p6 ^ p10;
+    let p28 = p11 ^ p25;
+    // p17 <- stack
+    let s2 = p17 ^ p28;
+
+    state[0] = s7;
+    state[1] = s6;
+    state[2] = s5;
+    state[3] = s4;
+    state[4] = s3;
+    state[5] = s2;
+    state[6] = s1;
+    state[7] = s0;
+}
+
+/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
+///
+/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
+///
+/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule.
+fn sub_bytes(state: &mut [u32]) {
+    debug_assert_eq!(state.len(), 8);
+
+    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
+    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)
+
+    let u7 = state[0];
+    let u6 = state[1];
+    let u5 = state[2];
+    let u4 = state[3];
+    let u3 = state[4];
+    let u2 = state[5];
+    let u1 = state[6];
+    let u0 = state[7];
+
+    let y14 = u3 ^ u5;
+    let y13 = u0 ^ u6;
+    let y12 = y13 ^ y14;
+    let t1 = u4 ^ y12;
+    let y15 = t1 ^ u5;
+    let t2 = y12 & y15;
+    let y6 = y15 ^ u7;
+    let y20 = t1 ^ u1;
+    // y12 -> stack
+    let y9 = u0 ^ u3;
+    // y20 -> stack
+    let y11 = y20 ^ y9;
+    // y9 -> stack
+    let t12 = y9 & y11;
+    // y6 -> stack
+    let y7 = u7 ^ y11;
+    let y8 = u0 ^ u5;
+    let t0 = u1 ^ u2;
+    let y10 = y15 ^ t0;
+    // y15 -> stack
+    let y17 = y10 ^ y11;
+    // y14 -> stack
+    let t13 = y14 & y17;
+    let t14 = t13 ^ t12;
+    // y17 -> stack
+    let y19 = y10 ^ y8;
+    // y10 -> stack
+    let t15 = y8 & y10;
+    let t16 = t15 ^ t12;
+    let y16 = t0 ^ y11;
+    // y11 -> stack
+    let y21 = y13 ^ y16;
+    // y13 -> stack
+    let t7 = y13 & y16;
+    // y16 -> stack
+    let y18 = u0 ^ y16;
+    let y1 = t0 ^ u7;
+    let y4 = y1 ^ u3;
+    // u7 -> stack
+    let t5 = y4 & u7;
+    let t6 = t5 ^ t2;
+    let t18 = t6 ^ t16;
+    let t22 = t18 ^ y19;
+    let y2 = y1 ^ u0;
+    let t10 = y2 & y7;
+    let t11 = t10 ^ t7;
+    let t20 = t11 ^ t16;
+    let t24 = t20 ^ y18;
+    let y5 = y1 ^ u6;
+    let t8 = y5 & y1;
+    let t9 = t8 ^ t7;
+    let t19 = t9 ^ t14;
+    let t23 = t19 ^ y21;
+    let y3 = y5 ^ y8;
+    // y6 <- stack
+    let t3 = y3 & y6;
+    let t4 = t3 ^ t2;
+    // y20 <- stack
+    let t17 = t4 ^ y20;
+    let t21 = t17 ^ t14;
+    let t26 = t21 & t23;
+    let t27 = t24 ^ t26;
+    let t31 = t22 ^ t26;
+    let t25 = t21 ^ t22;
+    // y4 -> stack
+    let t28 = t25 & t27;
+    let t29 = t28 ^ t22;
+    let z14 = t29 & y2;
+    let z5 = t29 & y7;
+    let t30 = t23 ^ t24;
+    let t32 = t31 & t30;
+    let t33 = t32 ^ t24;
+    let t35 = t27 ^ t33;
+    let t36 = t24 & t35;
+    let t38 = t27 ^ t36;
+    let t39 = t29 & t38;
+    let t40 = t25 ^ t39;
+    let t43 = t29 ^ t40;
+    // y16 <- stack
+    let z3 = t43 & y16;
+    let tc12 = z3 ^ z5;
+    // tc12 -> stack
+    // y13 <- stack
+    let z12 = t43 & y13;
+    let z13 = t40 & y5;
+    let z4 = t40 & y1;
+    let tc6 = z3 ^ z4;
+    let t34 = t23 ^ t33;
+    let t37 = t36 ^ t34;
+    let t41 = t40 ^ t37;
+    // y10 <- stack
+    let z8 = t41 & y10;
+    let z17 = t41 & y8;
+    let t44 = t33 ^ t37;
+    // y15 <- stack
+    let z0 = t44 & y15;
+    // z17 -> stack
+    // y12 <- stack
+    let z9 = t44 & y12;
+    let z10 = t37 & y3;
+    let z1 = t37 & y6;
+    let tc5 = z1 ^ z0;
+    let tc11 = tc6 ^ tc5;
+    // y4 <- stack
+    let z11 = t33 & y4;
+    let t42 = t29 ^ t33;
+    let t45 = t42 ^ t41;
+    // y17 <- stack
+    let z7 = t45 & y17;
+    let tc8 = z7 ^ tc6;
+    // y14 <- stack
+    let z16 = t45 & y14;
+    // y11 <- stack
+    let z6 = t42 & y11;
+    let tc16 = z6 ^ tc8;
+    // z14 -> stack
+    // y9 <- stack
+    let z15 = t42 & y9;
+    let tc20 = z15 ^ tc16;
+    let tc1 = z15 ^ z16;
+    let tc2 = z10 ^ tc1;
+    let tc21 = tc2 ^ z11;
+    let tc3 = z9 ^ tc2;
+    let s0 = tc3 ^ tc16;
+    let s3 = tc3 ^ tc11;
+    let s1 = s3 ^ tc16;
+    let tc13 = z13 ^ tc1;
+    // u7 <- stack
+    let z2 = t33 & u7;
+    let tc4 = z0 ^ z2;
+    let tc7 = z12 ^ tc4;
+    let tc9 = z8 ^ tc7;
+    let tc10 = tc8 ^ tc9;
+    // z14 <- stack
+    let tc17 = z14 ^ tc10;
+    let s5 = tc21 ^ tc17;
+    let tc26 = tc17 ^ tc20;
+    // z17 <- stack
+    let s2 = tc26 ^ z17;
+    // tc12 <- stack
+    let tc14 = tc4 ^ tc12;
+    let tc18 = tc13 ^ tc14;
+    let s6 = tc10 ^ tc18;
+    let s7 = z12 ^ tc18;
+    let s4 = tc14 ^ s3;
+
+    state[0] = s7;
+    state[1] = s6;
+    state[2] = s5;
+    state[3] = s4;
+    state[4] = s3;
+    state[5] = s2;
+    state[6] = s1;
+    state[7] = s0;
+}
+
+/// NOT operations that are omitted in S-box
+#[inline]
+fn sub_bytes_nots(state: &mut [u32]) {
+    debug_assert_eq!(state.len(), 8);
+    state[0] ^= 0xffffffff;
+    state[1] ^= 0xffffffff;
+    state[5] ^= 0xffffffff;
+    state[6] ^= 0xffffffff;
+}
+
+/// Computation of the MixColumns transformation in the fixsliced representation, with different
+/// rotations used according to the round number mod 4.
+///
+/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
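+///
+/// (The generated variants exist because the fixsliced representation skips an
+/// explicit ShiftRows: the rotations baked into each variant account for the
+/// ShiftRows offset the state carries in rounds congruent to 0, 1, 2 and 3 mod 4.)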
+macro_rules! define_mix_columns {
+    (
+        $name:ident,
+        $name_inv:ident,
+        $first_rotate:path,
+        $second_rotate:path
+    ) => {
+        #[rustfmt::skip]
+        fn $name(state: &mut State) {
+            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
+                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
+            );
+            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
+                $first_rotate(a0),
+                $first_rotate(a1),
+                $first_rotate(a2),
+                $first_rotate(a3),
+                $first_rotate(a4),
+                $first_rotate(a5),
+                $first_rotate(a6),
+                $first_rotate(a7),
+            );
+            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
+                a0 ^ b0,
+                a1 ^ b1,
+                a2 ^ b2,
+                a3 ^ b3,
+                a4 ^ b4,
+                a5 ^ b5,
+                a6 ^ b6,
+                a7 ^ b7,
+            );
+            state[0] = b0      ^ c7 ^ $second_rotate(c0);
+            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
+            state[2] = b2 ^ c1      ^ $second_rotate(c2);
+            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
+            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
+            state[5] = b5 ^ c4      ^ $second_rotate(c5);
+            state[6] = b6 ^ c5      ^ $second_rotate(c6);
+            state[7] = b7 ^ c6      ^ $second_rotate(c7);
+        }
+
+        #[rustfmt::skip]
+        fn $name_inv(state: &mut State) {
+            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
+                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
+            );
+            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
+                $first_rotate(a0),
+                $first_rotate(a1),
+                $first_rotate(a2),
+                $first_rotate(a3),
+                $first_rotate(a4),
+                $first_rotate(a5),
+                $first_rotate(a6),
+                $first_rotate(a7),
+            );
+            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
+                a0 ^ b0,
+                a1 ^ b1,
+                a2 ^ b2,
+                a3 ^ b3,
+                a4 ^ b4,
+                a5 ^ b5,
+                a6 ^ b6,
+                a7 ^ b7,
+            );
+            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
+                a0      ^ c7,
+                a1 ^ c0 ^ c7,
+                a2 ^ c1,
+                a3 ^ c2 ^ c7,
+                a4 ^ c3 ^ c7,
+                a5 ^ c4,
+                a6 ^ c5,
+                a7 ^ c6,
+            );
+            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
+                c0      ^ d6,
+                c1      ^ d6 ^ d7,
+                c2 ^ d0      ^ d7,
+                c3 ^ d1 ^ d6,
+                c4 ^ d2 ^ d6 ^ d7,
+                c5 ^ d3      ^ d7,
+                c6 ^ d4,
+                c7 ^ d5,
+            );
+            state[0] = d0 ^ e0 ^ $second_rotate(e0);
+            state[1] = d1 ^ e1 ^ $second_rotate(e1);
+            state[2] = d2 ^ e2 ^ $second_rotate(e2);
+            state[3] = d3 ^ e3 ^ $second_rotate(e3);
+            state[4] = d4 ^ e4 ^ $second_rotate(e4);
+            state[5] = d5 ^ e5 ^ $second_rotate(e5);
+            state[6] = d6 ^ e6 ^ $second_rotate(e6);
+            state[7] = d7 ^ e7 ^ $second_rotate(e7);
+        }
+    }
+}
+
+define_mix_columns!(
+    mix_columns_0,
+    inv_mix_columns_0,
+    rotate_rows_1,
+    rotate_rows_2
+);
+
+define_mix_columns!(
+    mix_columns_1,
+    inv_mix_columns_1,
+    rotate_rows_and_columns_1_1,
+    rotate_rows_and_columns_2_2
+);
+
+#[cfg(not(aes_compact))]
+define_mix_columns!(
+    mix_columns_2,
+    inv_mix_columns_2,
+    rotate_rows_and_columns_1_2,
+    rotate_rows_2
+);
+
+#[cfg(not(aes_compact))]
+define_mix_columns!(
+    mix_columns_3,
+    inv_mix_columns_3,
+    rotate_rows_and_columns_1_3,
+    rotate_rows_and_columns_2_2
+);
+
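+/// Swap the bits of `a` selected by `mask` with the bits `shift` positions to
+/// their left (an in-word delta swap).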
+#[inline]
+fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
+    let t = (*a ^ ((*a) >> shift)) & mask;
+    *a ^= t ^ (t << shift);
+}
+
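+/// Swap the bits of `a` selected by `mask` with the bits of `b` that sit `shift`
+/// positions to their left (a delta swap across two words).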
+#[inline]
+fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
+    let t = (*a ^ ((*b) >> shift)) & mask;
+    *a ^= t;
+    *b ^= t << shift;
+}
+
+/// Applies ShiftRows once on an AES state (or key).
+#[cfg(any(not(aes_compact), feature = "hazmat"))]
+#[inline]
+fn shift_rows_1(state: &mut [u32]) {
+    debug_assert_eq!(state.len(), 8);
+    for x in state.iter_mut() {
+        delta_swap_1(x, 4, 0x0c0f0300);
+        delta_swap_1(x, 2, 0x33003300);
+    }
+}
+
+/// Applies ShiftRows twice on an AES state (or key).
+#[inline]
+fn shift_rows_2(state: &mut [u32]) {
+    debug_assert_eq!(state.len(), 8);
+    for x in state.iter_mut() {
+        delta_swap_1(x, 4, 0x0f000f00);
+    }
+}
+
+/// Applies ShiftRows three times on an AES state (or key).
+#[inline]
+fn shift_rows_3(state: &mut [u32]) {
+    debug_assert_eq!(state.len(), 8);
+    for x in state.iter_mut() {
+        delta_swap_1(x, 4, 0x030f0c00);
+        delta_swap_1(x, 2, 0x33003300);
+    }
+}
+
+#[inline(always)]
+fn inv_shift_rows_1(state: &mut [u32]) {
+    shift_rows_3(state);
+}
+
+#[inline(always)]
+fn inv_shift_rows_2(state: &mut [u32]) {
+    shift_rows_2(state);
+}
+
+#[cfg(not(aes_compact))]
+#[inline(always)]
+fn inv_shift_rows_3(state: &mut [u32]) {
+    shift_rows_1(state);
+}
+
+/// XOR the columns after the S-box during the key schedule round function.
+///
+/// The `idx_xor` parameter refers to the index of the previous round key that is
+/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
+/// respectively).
+///
+/// The `idx_ror` parameter refers to the rotation value, which varies between the
+/// different key schedules.
+fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) {
+    for i in 0..8 {
+        let off_i = offset + i;
+        let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror));
+        rkeys[off_i] =
+            rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6));
+    }
+}
+
+/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state.
+fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
+    debug_assert_eq!(output.len(), 8);
+    debug_assert_eq!(input0.len(), 16);
+    debug_assert_eq!(input1.len(), 16);
+
+    // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an
+    // 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
+    //     b0 c1 c0 r1 r0 p2 p1 p0
+    //
+    // The desired bitsliced data groups first by bit position, then row, column, block:
+    //     p2 p1 p0 r1 r0 c1 c0 b0
+
+    // Interleave the columns on input (note the order of input)
+    //     b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __
+    let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap());
+    let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap());
+    let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap());
+    let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap());
+    let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap());
+    let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap());
+    let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap());
+    let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap());
+
+    // Bit Index Swap 5 <-> 0:
+    //     __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0
+    let m0 = 0x55555555;
+    delta_swap_2(&mut t1, &mut t0, 1, m0);
+    delta_swap_2(&mut t3, &mut t2, 1, m0);
+    delta_swap_2(&mut t5, &mut t4, 1, m0);
+    delta_swap_2(&mut t7, &mut t6, 1, m0);
+
+    // Bit Index Swap 6 <-> 1:
+    //     __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __
+    let m1 = 0x33333333;
+    delta_swap_2(&mut t2, &mut t0, 2, m1);
+    delta_swap_2(&mut t3, &mut t1, 2, m1);
+    delta_swap_2(&mut t6, &mut t4, 2, m1);
+    delta_swap_2(&mut t7, &mut t5, 2, m1);
+
+    // Bit Index Swap 7 <-> 2:
+    //     c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __
+    let m2 = 0x0f0f0f0f;
+    delta_swap_2(&mut t4, &mut t0, 4, m2);
+    delta_swap_2(&mut t5, &mut t1, 4, m2);
+    delta_swap_2(&mut t6, &mut t2, 4, m2);
+    delta_swap_2(&mut t7, &mut t3, 4, m2);
+
+    // Final bitsliced bit index, as desired:
+    //     p2 p1 p0 r1 r0 c1 c0 b0
+    output[0] = t0;
+    output[1] = t1;
+    output[2] = t2;
+    output[3] = t3;
+    output[4] = t4;
+    output[5] = t5;
+    output[6] = t6;
+    output[7] = t7;
+}
+
+/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output.
+fn inv_bitslice(input: &[u32]) -> BatchBlocks {
+    debug_assert_eq!(input.len(), 8);
+
+    // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
+    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
+    //     b0 c1 c0 r1 r0 p2 p1 p0
+    //
+    // The initially bitsliced data groups first by bit position, then row, column, block:
+    //     p2 p1 p0 r1 r0 c1 c0 b0
+
+    let mut t0 = input[0];
+    let mut t1 = input[1];
+    let mut t2 = input[2];
+    let mut t3 = input[3];
+    let mut t4 = input[4];
+    let mut t5 = input[5];
+    let mut t6 = input[6];
+    let mut t7 = input[7];
+
+    // TODO: these bit index swaps are identical to those in 'packing'
+
+    // Bit Index Swap 5 <-> 0:
+    //     __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0
+    let m0 = 0x55555555;
+    delta_swap_2(&mut t1, &mut t0, 1, m0);
+    delta_swap_2(&mut t3, &mut t2, 1, m0);
+    delta_swap_2(&mut t5, &mut t4, 1, m0);
+    delta_swap_2(&mut t7, &mut t6, 1, m0);
+
+    // Bit Index Swap 6 <-> 1:
+    //     __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __
+    let m1 = 0x33333333;
+    delta_swap_2(&mut t2, &mut t0, 2, m1);
+    delta_swap_2(&mut t3, &mut t1, 2, m1);
+    delta_swap_2(&mut t6, &mut t4, 2, m1);
+    delta_swap_2(&mut t7, &mut t5, 2, m1);
+
+    // Bit Index Swap 7 <-> 2:
+    //     p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __
+    let m2 = 0x0f0f0f0f;
+    delta_swap_2(&mut t4, &mut t0, 4, m2);
+    delta_swap_2(&mut t5, &mut t1, 4, m2);
+    delta_swap_2(&mut t6, &mut t2, 4, m2);
+    delta_swap_2(&mut t7, &mut t3, 4, m2);
+
+    let mut output = BatchBlocks::default();
+    // De-interleave the columns on output (note the order of output)
+    //     c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __
+    output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes());
+    output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes());
+    output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes());
+    output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes());
+    output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes());
+    output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes());
+    output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes());
+    output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes());
+
+    // Final AES bit index, as desired:
+    //     b0 c1 c0 r1 r0 p2 p1 p0
+    output
+}
+
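+// Illustrative sketch (not part of the upstream sources): `bitslice` and
+// `inv_bitslice` above are mutual inverses, so packing two arbitrary blocks and
+// unpacking them should restore the original bytes.
+#[cfg(test)]
+mod bitslice_roundtrip {
+    use super::{bitslice, inv_bitslice};
+
+    #[test]
+    fn pack_then_unpack_restores_blocks() {
+        let block0: [u8; 16] = [
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+        ];
+        let block1: [u8; 16] = [
+            0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+            0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+        ];
+
+        let mut state = [0u32; 8];
+        bitslice(&mut state, &block0, &block1);
+        let out = inv_bitslice(&state);
+
+        assert_eq!(out[0].as_slice(), &block0);
+        assert_eq!(out[1].as_slice(), &block1);
+    }
+}
+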
+/// Copy 8 words (32 bytes) within the provided slice to a position 8 words further along.
+fn memshift32(buffer: &mut [u32], src_offset: usize) {
+    debug_assert_eq!(src_offset % 8, 0);
+
+    let dst_offset = src_offset + 8;
+    debug_assert!(dst_offset + 8 <= buffer.len());
+
+    for i in (0..8).rev() {
+        buffer[dst_offset + i] = buffer[src_offset + i];
+    }
+}
+
+/// XOR the round key to the internal state. The round keys are expected to be
+/// pre-computed and to be packed in the fixsliced representation.
+#[inline]
+fn add_round_key(state: &mut State, rkey: &[u32]) {
+    debug_assert_eq!(rkey.len(), 8);
+    for (a, b) in state.iter_mut().zip(rkey) {
+        *a ^= b;
+    }
+}
+
+#[inline(always)]
+fn add_round_constant_bit(state: &mut [u32], bit: usize) {
+    state[bit] ^= 0x0000c000;
+}
+
+#[inline(always)]
+fn ror(x: u32, y: u32) -> u32 {
+    x.rotate_right(y)
+}
+
+/// Rotation distance (in bits) within a bitsliced word: each row step is 8 bit
+/// positions and each column step is 2 (see the bit layout described in `bitslice`).
+#[inline(always)]
+fn ror_distance(rows: u32, cols: u32) -> u32 {
+    (rows << 3) + (cols << 1)
+}
+
+#[inline(always)]
+fn rotate_rows_1(x: u32) -> u32 {
+    ror(x, ror_distance(1, 0))
+}
+
+#[inline(always)]
+fn rotate_rows_2(x: u32) -> u32 {
+    ror(x, ror_distance(2, 0))
+}
+
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_1_1(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) |
+    (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0)
+}
+
+#[cfg(not(aes_compact))]
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_1_2(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) |
+    (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0)
+}
+
+#[cfg(not(aes_compact))]
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_1_3(x: u32) -> u32 {
+    (ror(x, ror_distance(1, 3)) & 0x03030303) |
+    (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc)
+}
+
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_2_2(x: u32) -> u32 {
+    (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) |
+    (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0)
+}
+
+/// Low-level "hazmat" AES functions.
+///
+/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
+/// implementations in this crate, but instead provides raw access to
+/// the AES round function gated under the `hazmat` crate feature.
+#[cfg(feature = "hazmat")]
+pub(crate) mod hazmat {
+    use super::{
+        bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
+        shift_rows_1, sub_bytes, sub_bytes_nots, State,
+    };
+    use crate::{Block, Block8};
+
+    /// XOR the `src` block into the `dst` block in-place.
+    fn xor_in_place(dst: &mut Block, src: &Block) {
+        for (a, b) in dst.iter_mut().zip(src.as_slice()) {
+            *a ^= *b;
+        }
+    }
+
+    /// Perform a bitslice operation, loading a single block.
+    fn bitslice_block(block: &Block) -> State {
+        let mut state = State::default();
+        bitslice(&mut state, block, block);
+        state
+    }
+
+    /// Perform an inverse bitslice operation, extracting a single block.
+    fn inv_bitslice_block(block: &mut Block, state: &State) {
+        let out = inv_bitslice(state);
+        block.copy_from_slice(&out[0]);
+    }
+
+    /// AES cipher (encrypt) round function.
+    #[inline]
+    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
+        let mut state = bitslice_block(block);
+        sub_bytes(&mut state);
+        sub_bytes_nots(&mut state);
+        shift_rows_1(&mut state);
+        mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+        xor_in_place(block, round_key);
+    }
+
+    /// AES cipher (encrypt) round function: parallel version.
+    #[inline]
+    pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+        for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) {
+            let mut state = State::default();
+            bitslice(&mut state, &chunk[0], &chunk[1]);
+            sub_bytes(&mut state);
+            sub_bytes_nots(&mut state);
+            shift_rows_1(&mut state);
+            mix_columns_0(&mut state);
+            let res = inv_bitslice(&state);
+
+            for i in 0..2 {
+                chunk[i] = res[i];
+                xor_in_place(&mut chunk[i], &keys[i]);
+            }
+        }
+    }
+
+    /// AES "equivalent inverse cipher" (decrypt) round function.
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+        let mut state = bitslice_block(block);
+        sub_bytes_nots(&mut state);
+        inv_sub_bytes(&mut state);
+        inv_shift_rows_1(&mut state);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+        xor_in_place(block, round_key);
+    }
+
+    /// AES "equivalent inverse cipher" (decrypt) round function: parallel version.
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+        for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) {
+            let mut state = State::default();
+            bitslice(&mut state, &chunk[0], &chunk[1]);
+            sub_bytes_nots(&mut state);
+            inv_sub_bytes(&mut state);
+            inv_shift_rows_1(&mut state);
+            inv_mix_columns_0(&mut state);
+            let res = inv_bitslice(&state);
+
+            for i in 0..2 {
+                chunk[i] = res[i];
+                xor_in_place(&mut chunk[i], &keys[i]);
+            }
+        }
+    }
+
+    /// AES mix columns function.
+    #[inline]
+    pub(crate) fn mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+
+    /// AES inverse mix columns function.
+    #[inline]
+    pub(crate) fn inv_mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+}
diff --git a/src/soft/fixslice64.rs b/src/soft/fixslice64.rs
new file mode 100644 (file)
index 0000000..09dbcbe
--- /dev/null
@@ -0,0 +1,1534 @@
+//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit)
+//! adapted from the C implementation.
+//!
+//! All implementations are fully bitsliced and do not rely on any
+//! Look-Up Table (LUT).
+//!
+//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
+//!
+//! # Author (original C code)
+//!
+//! Alexandre Adomnicai, Nanyang Technological University, Singapore
+//! <alexandre.adomnicai@ntu.edu.sg>
+//!
+//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.
+
+#![allow(clippy::unreadable_literal)]
+
+use crate::Block;
+use cipher::{consts::U4, generic_array::GenericArray};
+
+/// AES block batch size for this implementation
+pub(crate) type FixsliceBlocks = U4;
+
+pub(crate) type BatchBlocks = GenericArray<Block, FixsliceBlocks>;
+
+/// AES-128 round keys
+pub(crate) type FixsliceKeys128 = [u64; 88];
+
+/// AES-192 round keys
+pub(crate) type FixsliceKeys192 = [u64; 104];
+
+/// AES-256 round keys
+pub(crate) type FixsliceKeys256 = [u64; 120];
+
+/// 512-bit internal state
+pub(crate) type State = [u64; 8];
+
+/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
+pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 {
+    let mut rkeys = [0u64; 88];
+
+    bitslice(&mut rkeys[..8], key, key, key, key);
+
+    let mut rk_off = 0;
+    for rcon in 0..10 {
+        memshift32(&mut rkeys, rk_off);
+        rk_off += 8;
+
+        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
+        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
+
+        if rcon < 8 {
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
+        } else {
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
+            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
+        }
+
+        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
+    }
+
+    // Adjust to match fixslicing format
+    #[cfg(aes_compact)]
+    {
+        for i in (8..88).step_by(16) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+        }
+    }
+    #[cfg(not(aes_compact))]
+    {
+        for i in (8..72).step_by(32) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
+            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
+        }
+        inv_shift_rows_1(&mut rkeys[72..80]);
+    }
+
+    // Account for NOTs removed from sub_bytes
+    for i in 1..11 {
+        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
+    }
+
+    rkeys
+}
+
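+// Illustrative sketch (not part of the upstream sources): the 88-word output holds
+// 11 bitsliced round keys of 8 words each, and round key 0 is simply the bitsliced
+// master key (the fixslicing and S-box NOT adjustments only touch later round keys).
+#[cfg(test)]
+mod aes128_key_schedule_layout {
+    use super::{aes128_key_schedule, bitslice};
+
+    #[test]
+    fn first_round_key_is_the_bitsliced_master_key() {
+        let key: [u8; 16] = [
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+        ];
+
+        let rkeys = aes128_key_schedule(&key);
+        // AES-128 has 10 rounds, hence 11 round keys of 8 bitsliced words each.
+        assert_eq!(rkeys.len(), 11 * 8);
+
+        let mut rk0 = [0u64; 8];
+        bitslice(&mut rk0, &key, &key, &key, &key);
+        assert_eq!(&rkeys[..8], &rk0);
+    }
+}
+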
+/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
+pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 {
+    let mut rkeys = [0u64; 104];
+    let mut tmp = [0u64; 8];
+
+    bitslice(
+        &mut rkeys[..8],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+    );
+    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);
+
+    let mut rcon = 0;
+    let mut rk_off = 8;
+
+    loop {
+        for i in 0..8 {
+            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
+                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
+        }
+
+        sub_bytes(&mut tmp);
+        sub_bytes_nots(&mut tmp);
+
+        add_round_constant_bit(&mut tmp, rcon);
+        rcon += 1;
+
+        for i in 0..8 {
+            let mut ti = rkeys[rk_off + i];
+            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
+            ti ^= 0xf000f000f000f000 & (ti << 4);
+            tmp[i] = ti;
+        }
+        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
+        rk_off += 8;
+
+        for i in 0..8 {
+            let ui = tmp[i];
+            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
+                | (0xff00ff00ff00ff00 & (ui << 8));
+            ti ^= 0x000f000f000f000f & (ui >> 12);
+            tmp[i] = ti
+                ^ (0xfff0fff0fff0fff0 & (ti << 4))
+                ^ (0xff00ff00ff00ff00 & (ti << 8))
+                ^ (0xf000f000f000f000 & (ti << 12));
+        }
+        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
+        rk_off += 8;
+
+        sub_bytes(&mut tmp);
+        sub_bytes_nots(&mut tmp);
+
+        add_round_constant_bit(&mut tmp, rcon);
+        rcon += 1;
+
+        for i in 0..8 {
+            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
+                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
+            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
+            rkeys[rk_off + i] = ti
+                ^ (0xfff0fff0fff0fff0 & (ti << 4))
+                ^ (0xff00ff00ff00ff00 & (ti << 8))
+                ^ (0xf000f000f000f000 & (ti << 12));
+        }
+        rk_off += 8;
+
+        if rcon >= 8 {
+            break;
+        }
+
+        for i in 0..8 {
+            let ui = rkeys[(rk_off - 8) + i];
+            let mut ti = rkeys[(rk_off - 16) + i];
+            ti ^= 0x0f000f000f000f00 & (ui >> 4);
+            ti ^= 0xf000f000f000f000 & (ti << 4);
+            tmp[i] = ti;
+        }
+    }
+
+    // Adjust to match fixslicing format
+    #[cfg(aes_compact)]
+    {
+        for i in (8..104).step_by(16) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+        }
+    }
+    #[cfg(not(aes_compact))]
+    {
+        for i in (0..96).step_by(32) {
+            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
+            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
+            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
+        }
+    }
+
+    // Account for NOTs removed from sub_bytes
+    for i in 1..13 {
+        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
+    }
+
+    rkeys
+}
+
+/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
+pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 {
+    let mut rkeys = [0u64; 120];
+
+    bitslice(
+        &mut rkeys[..8],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+        &key[..16],
+    );
+    bitslice(
+        &mut rkeys[8..16],
+        &key[16..],
+        &key[16..],
+        &key[16..],
+        &key[16..],
+    );
+
+    let mut rk_off = 8;
+
+    let mut rcon = 0;
+    loop {
+        memshift32(&mut rkeys, rk_off);
+        rk_off += 8;
+
+        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
+        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
+
+        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
+        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
+        rcon += 1;
+
+        if rcon == 7 {
+            break;
+        }
+
+        memshift32(&mut rkeys, rk_off);
+        rk_off += 8;
+
+        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
+        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);
+
+        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
+    }
+
+    // Adjust to match fixslicing format
+    #[cfg(aes_compact)]
+    {
+        for i in (8..120).step_by(16) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+        }
+    }
+    #[cfg(not(aes_compact))]
+    {
+        for i in (8..104).step_by(32) {
+            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
+            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
+            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
+        }
+        inv_shift_rows_1(&mut rkeys[104..112]);
+    }
+
+    // Account for NOTs removed from sub_bytes
+    for i in 1..15 {
+        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
+    }
+
+    rkeys
+}
+
+/// Fully-fixsliced AES-128 decryption (InvShiftRows is completely omitted).
+///
+/// Decrypts four blocks in parallel.
+pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+
+    add_round_key(&mut state, &rkeys[80..]);
+    inv_sub_bytes(&mut state);
+
+    #[cfg(not(aes_compact))]
+    {
+        inv_shift_rows_2(&mut state);
+    }
+
+    let mut rk_off = 72;
+    loop {
+        #[cfg(aes_compact)]
+        {
+            inv_shift_rows_2(&mut state);
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_1(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        if rk_off == 0 {
+            break;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_0(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        #[cfg(not(aes_compact))]
+        {
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_3(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_2(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+        }
+    }
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-128 encryption (ShiftRows is completely omitted).
+///
+/// Encrypts four blocks in parallel.
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    let mut rk_off = 8;
+    loop {
+        sub_bytes(&mut state);
+        mix_columns_1(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+
+        #[cfg(aes_compact)]
+        {
+            shift_rows_2(&mut state);
+        }
+
+        if rk_off == 80 {
+            break;
+        }
+
+        #[cfg(not(aes_compact))]
+        {
+            sub_bytes(&mut state);
+            mix_columns_2(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+
+            sub_bytes(&mut state);
+            mix_columns_3(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+        }
+
+        sub_bytes(&mut state);
+        mix_columns_0(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+    }
+
+    #[cfg(not(aes_compact))]
+    {
+        shift_rows_2(&mut state);
+    }
+
+    sub_bytes(&mut state);
+    add_round_key(&mut state, &rkeys[80..]);
+
+    inv_bitslice(&state)
+}
+
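+// Illustrative sketch (not part of the upstream sources): a known-answer check for
+// the pair of functions above, using the FIPS 197 Appendix C.1 AES-128 example
+// (key 000102...0f, plaintext 00112233...ff, ciphertext 69c4e0d8...c55a).
+#[cfg(test)]
+mod aes128_known_answer {
+    use super::{aes128_encrypt, aes128_key_schedule, BatchBlocks};
+
+    #[test]
+    fn fips197_appendix_c1() {
+        let key: [u8; 16] = [
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+        ];
+        let plaintext: [u8; 16] = [
+            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+        ];
+        let expected: [u8; 16] = [
+            0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
+            0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
+        ];
+
+        let rkeys = aes128_key_schedule(&key);
+
+        // The batch API processes four blocks at once; repeat the same block.
+        let mut blocks = BatchBlocks::default();
+        for block in blocks.iter_mut() {
+            block.copy_from_slice(&plaintext);
+        }
+
+        for ciphertext in aes128_encrypt(&rkeys, &blocks).iter() {
+            assert_eq!(ciphertext.as_slice(), &expected);
+        }
+    }
+}
+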
+/// Fully-fixsliced AES-192 decryption (InvShiftRows is completely omitted).
+///
+/// Decrypts four blocks in parallel.
+pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+
+    add_round_key(&mut state, &rkeys[96..]);
+    inv_sub_bytes(&mut state);
+
+    let mut rk_off = 88;
+    loop {
+        #[cfg(aes_compact)]
+        {
+            inv_shift_rows_2(&mut state);
+        }
+        #[cfg(not(aes_compact))]
+        {
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_3(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_2(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_1(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        if rk_off == 0 {
+            break;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_0(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+    }
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-192 encryption (ShiftRows is completely omitted).
+///
+/// Encrypts four blocks in parallel.
+pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    let mut rk_off = 8;
+    loop {
+        sub_bytes(&mut state);
+        mix_columns_1(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+
+        #[cfg(aes_compact)]
+        {
+            shift_rows_2(&mut state);
+        }
+        #[cfg(not(aes_compact))]
+        {
+            sub_bytes(&mut state);
+            mix_columns_2(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+
+            sub_bytes(&mut state);
+            mix_columns_3(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+        }
+
+        if rk_off == 96 {
+            break;
+        }
+
+        sub_bytes(&mut state);
+        mix_columns_0(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+    }
+
+    sub_bytes(&mut state);
+    add_round_key(&mut state, &rkeys[96..]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-256 decryption (InvShiftRows is completely omitted).
+///
+/// Decrypts four blocks in parallel.
+pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+
+    add_round_key(&mut state, &rkeys[112..]);
+    inv_sub_bytes(&mut state);
+
+    #[cfg(not(aes_compact))]
+    {
+        inv_shift_rows_2(&mut state);
+    }
+
+    let mut rk_off = 104;
+    loop {
+        #[cfg(aes_compact)]
+        {
+            inv_shift_rows_2(&mut state);
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_1(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        if rk_off == 0 {
+            break;
+        }
+
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        inv_mix_columns_0(&mut state);
+        inv_sub_bytes(&mut state);
+        rk_off -= 8;
+
+        #[cfg(not(aes_compact))]
+        {
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_3(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            inv_mix_columns_2(&mut state);
+            inv_sub_bytes(&mut state);
+            rk_off -= 8;
+        }
+    }
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    inv_bitslice(&state)
+}
+
+/// Fully-fixsliced AES-256 encryption (ShiftRows is completely omitted).
+///
+/// Encrypts four blocks in parallel.
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
+    let mut state = State::default();
+
+    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);
+
+    add_round_key(&mut state, &rkeys[..8]);
+
+    let mut rk_off = 8;
+    loop {
+        sub_bytes(&mut state);
+        mix_columns_1(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+
+        #[cfg(aes_compact)]
+        {
+            shift_rows_2(&mut state);
+        }
+
+        if rk_off == 112 {
+            break;
+        }
+
+        #[cfg(not(aes_compact))]
+        {
+            sub_bytes(&mut state);
+            mix_columns_2(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+
+            sub_bytes(&mut state);
+            mix_columns_3(&mut state);
+            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+            rk_off += 8;
+        }
+
+        sub_bytes(&mut state);
+        mix_columns_0(&mut state);
+        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+        rk_off += 8;
+    }
+
+    #[cfg(not(aes_compact))]
+    {
+        shift_rows_2(&mut state);
+    }
+
+    sub_bytes(&mut state);
+    add_round_key(&mut state, &rkeys[112..]);
+
+    inv_bitslice(&state)
+}
+
+/// Bitsliced implementation of the inverse AES S-box.
+///
+/// Note that the 4 bitwise NOTs (^= 0xffffffffffffffff) are accounted for here so that it is a
+/// true inverse of 'sub_bytes'.
+fn inv_sub_bytes(state: &mut [u64]) {
+    debug_assert_eq!(state.len(), 8);
+
+    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
+    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)
+
+    let u7 = state[0];
+    let u6 = state[1];
+    let u5 = state[2];
+    let u4 = state[3];
+    let u3 = state[4];
+    let u2 = state[5];
+    let u1 = state[6];
+    let u0 = state[7];
+
+    let t23 = u0 ^ u3;
+    let t8 = u1 ^ t23;
+    let m2 = t23 & t8;
+    let t4 = u4 ^ t8;
+    let t22 = u1 ^ u3;
+    let t2 = u0 ^ u1;
+    let t1 = u3 ^ u4;
+    // t23 -> stack
+    let t9 = u7 ^ t1;
+    // t8 -> stack
+    let m7 = t22 & t9;
+    // t9 -> stack
+    let t24 = u4 ^ u7;
+    // m7 -> stack
+    let t10 = t2 ^ t24;
+    // u4 -> stack
+    let m14 = t2 & t10;
+    let r5 = u6 ^ u7;
+    // m2 -> stack
+    let t3 = t1 ^ r5;
+    // t2 -> stack
+    let t13 = t2 ^ r5;
+    let t19 = t22 ^ r5;
+    // t3 -> stack
+    let t17 = u2 ^ t19;
+    // t4 -> stack
+    let t25 = u2 ^ t1;
+    let r13 = u1 ^ u6;
+    // t25 -> stack
+    let t20 = t24 ^ r13;
+    // t17 -> stack
+    let m9 = t20 & t17;
+    // t20 -> stack
+    let r17 = u2 ^ u5;
+    // t22 -> stack
+    let t6 = t22 ^ r17;
+    // t13 -> stack
+    let m1 = t13 & t6;
+    let y5 = u0 ^ r17;
+    let m4 = t19 & y5;
+    let m5 = m4 ^ m1;
+    let m17 = m5 ^ t24;
+    let r18 = u5 ^ u6;
+    let t27 = t1 ^ r18;
+    let t15 = t10 ^ t27;
+    // t6 -> stack
+    let m11 = t1 & t15;
+    let m15 = m14 ^ m11;
+    let m21 = m17 ^ m15;
+    // t1 -> stack
+    // t4 <- stack
+    let m12 = t4 & t27;
+    let m13 = m12 ^ m11;
+    let t14 = t10 ^ r18;
+    let m3 = t14 ^ m1;
+    // m2 <- stack
+    let m16 = m3 ^ m2;
+    let m20 = m16 ^ m13;
+    // u4 <- stack
+    let r19 = u2 ^ u4;
+    let t16 = r13 ^ r19;
+    // t3 <- stack
+    let t26 = t3 ^ t16;
+    let m6 = t3 & t16;
+    let m8 = t26 ^ m6;
+    // t10 -> stack
+    // m7 <- stack
+    let m18 = m8 ^ m7;
+    let m22 = m18 ^ m13;
+    let m25 = m22 & m20;
+    let m26 = m21 ^ m25;
+    let m10 = m9 ^ m6;
+    let m19 = m10 ^ m15;
+    // t25 <- stack
+    let m23 = m19 ^ t25;
+    let m28 = m23 ^ m25;
+    let m24 = m22 ^ m23;
+    let m30 = m26 & m24;
+    let m39 = m23 ^ m30;
+    let m48 = m39 & y5;
+    let m57 = m39 & t19;
+    // m48 -> stack
+    let m36 = m24 ^ m25;
+    let m31 = m20 & m23;
+    let m27 = m20 ^ m21;
+    let m32 = m27 & m31;
+    let m29 = m28 & m27;
+    let m37 = m21 ^ m29;
+    // m39 -> stack
+    let m42 = m37 ^ m39;
+    let m52 = m42 & t15;
+    // t27 -> stack
+    // t1 <- stack
+    let m61 = m42 & t1;
+    let p0 = m52 ^ m61;
+    let p16 = m57 ^ m61;
+    // m57 -> stack
+    // t20 <- stack
+    let m60 = m37 & t20;
+    // p16 -> stack
+    // t17 <- stack
+    let m51 = m37 & t17;
+    let m33 = m27 ^ m25;
+    let m38 = m32 ^ m33;
+    let m43 = m37 ^ m38;
+    let m49 = m43 & t16;
+    let p6 = m49 ^ m60;
+    let p13 = m49 ^ m51;
+    let m58 = m43 & t3;
+    // t9 <- stack
+    let m50 = m38 & t9;
+    // t22 <- stack
+    let m59 = m38 & t22;
+    // p6 -> stack
+    let p1 = m58 ^ m59;
+    let p7 = p0 ^ p1;
+    let m34 = m21 & m22;
+    let m35 = m24 & m34;
+    let m40 = m35 ^ m36;
+    let m41 = m38 ^ m40;
+    let m45 = m42 ^ m41;
+    // t27 <- stack
+    let m53 = m45 & t27;
+    let p8 = m50 ^ m53;
+    let p23 = p7 ^ p8;
+    // t4 <- stack
+    let m62 = m45 & t4;
+    let p14 = m49 ^ m62;
+    let s6 = p14 ^ p23;
+    // t10 <- stack
+    let m54 = m41 & t10;
+    let p2 = m54 ^ m62;
+    let p22 = p2 ^ p7;
+    let s0 = p13 ^ p22;
+    let p17 = m58 ^ p2;
+    let p15 = m54 ^ m59;
+    // t2 <- stack
+    let m63 = m41 & t2;
+    // m39 <- stack
+    let m44 = m39 ^ m40;
+    // p17 -> stack
+    // t6 <- stack
+    let m46 = m44 & t6;
+    let p5 = m46 ^ m51;
+    // p23 -> stack
+    let p18 = m63 ^ p5;
+    let p24 = p5 ^ p7;
+    // m48 <- stack
+    let p12 = m46 ^ m48;
+    let s3 = p12 ^ p22;
+    // t13 <- stack
+    let m55 = m44 & t13;
+    let p9 = m55 ^ m63;
+    // p16 <- stack
+    let s7 = p9 ^ p16;
+    // t8 <- stack
+    let m47 = m40 & t8;
+    let p3 = m47 ^ m50;
+    let p19 = p2 ^ p3;
+    let s5 = p19 ^ p24;
+    let p11 = p0 ^ p3;
+    let p26 = p9 ^ p11;
+    // t23 <- stack
+    let m56 = m40 & t23;
+    let p4 = m48 ^ m56;
+    // p6 <- stack
+    let p20 = p4 ^ p6;
+    let p29 = p15 ^ p20;
+    let s1 = p26 ^ p29;
+    // m57 <- stack
+    let p10 = m57 ^ p4;
+    let p27 = p10 ^ p18;
+    // p23 <- stack
+    let s4 = p23 ^ p27;
+    let p25 = p6 ^ p10;
+    let p28 = p11 ^ p25;
+    // p17 <- stack
+    let s2 = p17 ^ p28;
+
+    state[0] = s7;
+    state[1] = s6;
+    state[2] = s5;
+    state[3] = s4;
+    state[4] = s3;
+    state[5] = s2;
+    state[6] = s1;
+    state[7] = s0;
+}
+
+/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
+///
+/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
+///
+/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
+fn sub_bytes(state: &mut [u64]) {
+    debug_assert_eq!(state.len(), 8);
+
+    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
+    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)
+
+    let u7 = state[0];
+    let u6 = state[1];
+    let u5 = state[2];
+    let u4 = state[3];
+    let u3 = state[4];
+    let u2 = state[5];
+    let u1 = state[6];
+    let u0 = state[7];
+
+    let y14 = u3 ^ u5;
+    let y13 = u0 ^ u6;
+    let y12 = y13 ^ y14;
+    let t1 = u4 ^ y12;
+    let y15 = t1 ^ u5;
+    let t2 = y12 & y15;
+    let y6 = y15 ^ u7;
+    let y20 = t1 ^ u1;
+    // y12 -> stack
+    let y9 = u0 ^ u3;
+    // y20 -> stack
+    let y11 = y20 ^ y9;
+    // y9 -> stack
+    let t12 = y9 & y11;
+    // y6 -> stack
+    let y7 = u7 ^ y11;
+    let y8 = u0 ^ u5;
+    let t0 = u1 ^ u2;
+    let y10 = y15 ^ t0;
+    // y15 -> stack
+    let y17 = y10 ^ y11;
+    // y14 -> stack
+    let t13 = y14 & y17;
+    let t14 = t13 ^ t12;
+    // y17 -> stack
+    let y19 = y10 ^ y8;
+    // y10 -> stack
+    let t15 = y8 & y10;
+    let t16 = t15 ^ t12;
+    let y16 = t0 ^ y11;
+    // y11 -> stack
+    let y21 = y13 ^ y16;
+    // y13 -> stack
+    let t7 = y13 & y16;
+    // y16 -> stack
+    let y18 = u0 ^ y16;
+    let y1 = t0 ^ u7;
+    let y4 = y1 ^ u3;
+    // u7 -> stack
+    let t5 = y4 & u7;
+    let t6 = t5 ^ t2;
+    let t18 = t6 ^ t16;
+    let t22 = t18 ^ y19;
+    let y2 = y1 ^ u0;
+    let t10 = y2 & y7;
+    let t11 = t10 ^ t7;
+    let t20 = t11 ^ t16;
+    let t24 = t20 ^ y18;
+    let y5 = y1 ^ u6;
+    let t8 = y5 & y1;
+    let t9 = t8 ^ t7;
+    let t19 = t9 ^ t14;
+    let t23 = t19 ^ y21;
+    let y3 = y5 ^ y8;
+    // y6 <- stack
+    let t3 = y3 & y6;
+    let t4 = t3 ^ t2;
+    // y20 <- stack
+    let t17 = t4 ^ y20;
+    let t21 = t17 ^ t14;
+    let t26 = t21 & t23;
+    let t27 = t24 ^ t26;
+    let t31 = t22 ^ t26;
+    let t25 = t21 ^ t22;
+    // y4 -> stack
+    let t28 = t25 & t27;
+    let t29 = t28 ^ t22;
+    let z14 = t29 & y2;
+    let z5 = t29 & y7;
+    let t30 = t23 ^ t24;
+    let t32 = t31 & t30;
+    let t33 = t32 ^ t24;
+    let t35 = t27 ^ t33;
+    let t36 = t24 & t35;
+    let t38 = t27 ^ t36;
+    let t39 = t29 & t38;
+    let t40 = t25 ^ t39;
+    let t43 = t29 ^ t40;
+    // y16 <- stack
+    let z3 = t43 & y16;
+    let tc12 = z3 ^ z5;
+    // tc12 -> stack
+    // y13 <- stack
+    let z12 = t43 & y13;
+    let z13 = t40 & y5;
+    let z4 = t40 & y1;
+    let tc6 = z3 ^ z4;
+    let t34 = t23 ^ t33;
+    let t37 = t36 ^ t34;
+    let t41 = t40 ^ t37;
+    // y10 <- stack
+    let z8 = t41 & y10;
+    let z17 = t41 & y8;
+    let t44 = t33 ^ t37;
+    // y15 <- stack
+    let z0 = t44 & y15;
+    // z17 -> stack
+    // y12 <- stack
+    let z9 = t44 & y12;
+    let z10 = t37 & y3;
+    let z1 = t37 & y6;
+    let tc5 = z1 ^ z0;
+    let tc11 = tc6 ^ tc5;
+    // y4 <- stack
+    let z11 = t33 & y4;
+    let t42 = t29 ^ t33;
+    let t45 = t42 ^ t41;
+    // y17 <- stack
+    let z7 = t45 & y17;
+    let tc8 = z7 ^ tc6;
+    // y14 <- stack
+    let z16 = t45 & y14;
+    // y11 <- stack
+    let z6 = t42 & y11;
+    let tc16 = z6 ^ tc8;
+    // z14 -> stack
+    // y9 <- stack
+    let z15 = t42 & y9;
+    let tc20 = z15 ^ tc16;
+    let tc1 = z15 ^ z16;
+    let tc2 = z10 ^ tc1;
+    let tc21 = tc2 ^ z11;
+    let tc3 = z9 ^ tc2;
+    let s0 = tc3 ^ tc16;
+    let s3 = tc3 ^ tc11;
+    let s1 = s3 ^ tc16;
+    let tc13 = z13 ^ tc1;
+    // u7 <- stack
+    let z2 = t33 & u7;
+    let tc4 = z0 ^ z2;
+    let tc7 = z12 ^ tc4;
+    let tc9 = z8 ^ tc7;
+    let tc10 = tc8 ^ tc9;
+    // z14 <- stack
+    let tc17 = z14 ^ tc10;
+    let s5 = tc21 ^ tc17;
+    let tc26 = tc17 ^ tc20;
+    // z17 <- stack
+    let s2 = tc26 ^ z17;
+    // tc12 <- stack
+    let tc14 = tc4 ^ tc12;
+    let tc18 = tc13 ^ tc14;
+    let s6 = tc10 ^ tc18;
+    let s7 = z12 ^ tc18;
+    let s4 = tc14 ^ s3;
+
+    state[0] = s7;
+    state[1] = s6;
+    state[2] = s5;
+    state[3] = s4;
+    state[4] = s3;
+    state[5] = s2;
+    state[6] = s1;
+    state[7] = s0;
+}
+
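+// Illustrative sketch (not part of the upstream sources): per the note on
+// `inv_sub_bytes`, the two S-box circuits above are mutual inverses, so a round
+// trip over an arbitrary bitsliced state restores the input.
+#[cfg(test)]
+mod sub_bytes_roundtrip {
+    use super::{inv_sub_bytes, sub_bytes};
+
+    #[test]
+    fn sub_bytes_then_inv_sub_bytes_is_identity() {
+        let original: [u64; 8] = [
+            0x0123456789abcdef, 0xfedcba9876543210,
+            0x0f1e2d3c4b5a6978, 0x8796a5b4c3d2e1f0,
+            0x0011223344556677, 0x8899aabbccddeeff,
+            0xa5a55a5aa5a55a5a, 0xffffffff00000000,
+        ];
+
+        let mut state = original;
+        sub_bytes(&mut state);
+        inv_sub_bytes(&mut state);
+        assert_eq!(state, original);
+    }
+}
+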
+/// Bitwise NOT operations omitted from the `sub_bytes` S-box circuit.
+#[inline]
+fn sub_bytes_nots(state: &mut [u64]) {
+    debug_assert_eq!(state.len(), 8);
+    state[0] ^= 0xffffffffffffffff;
+    state[1] ^= 0xffffffffffffffff;
+    state[5] ^= 0xffffffffffffffff;
+    state[6] ^= 0xffffffffffffffff;
+}
+
+/// Computation of the MixColumns transformation in the fixsliced representation, with different
+/// rotations used according to the round number mod 4.
+///
+/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
+macro_rules! define_mix_columns {
+    (
+        $name:ident,
+        $name_inv:ident,
+        $first_rotate:path,
+        $second_rotate:path
+    ) => {
+        #[rustfmt::skip]
+        fn $name(state: &mut State) {
+            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
+                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
+            );
+            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
+                $first_rotate(a0),
+                $first_rotate(a1),
+                $first_rotate(a2),
+                $first_rotate(a3),
+                $first_rotate(a4),
+                $first_rotate(a5),
+                $first_rotate(a6),
+                $first_rotate(a7),
+            );
+            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
+                a0 ^ b0,
+                a1 ^ b1,
+                a2 ^ b2,
+                a3 ^ b3,
+                a4 ^ b4,
+                a5 ^ b5,
+                a6 ^ b6,
+                a7 ^ b7,
+            );
+            state[0] = b0      ^ c7 ^ $second_rotate(c0);
+            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
+            state[2] = b2 ^ c1      ^ $second_rotate(c2);
+            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
+            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
+            state[5] = b5 ^ c4      ^ $second_rotate(c5);
+            state[6] = b6 ^ c5      ^ $second_rotate(c6);
+            state[7] = b7 ^ c6      ^ $second_rotate(c7);
+        }
+
+        #[rustfmt::skip]
+        fn $name_inv(state: &mut State) {
+            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
+                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
+            );
+            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
+                $first_rotate(a0),
+                $first_rotate(a1),
+                $first_rotate(a2),
+                $first_rotate(a3),
+                $first_rotate(a4),
+                $first_rotate(a5),
+                $first_rotate(a6),
+                $first_rotate(a7),
+            );
+            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
+                a0 ^ b0,
+                a1 ^ b1,
+                a2 ^ b2,
+                a3 ^ b3,
+                a4 ^ b4,
+                a5 ^ b5,
+                a6 ^ b6,
+                a7 ^ b7,
+            );
+            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
+                a0      ^ c7,
+                a1 ^ c0 ^ c7,
+                a2 ^ c1,
+                a3 ^ c2 ^ c7,
+                a4 ^ c3 ^ c7,
+                a5 ^ c4,
+                a6 ^ c5,
+                a7 ^ c6,
+            );
+            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
+                c0      ^ d6,
+                c1      ^ d6 ^ d7,
+                c2 ^ d0      ^ d7,
+                c3 ^ d1 ^ d6,
+                c4 ^ d2 ^ d6 ^ d7,
+                c5 ^ d3      ^ d7,
+                c6 ^ d4,
+                c7 ^ d5,
+            );
+            state[0] = d0 ^ e0 ^ $second_rotate(e0);
+            state[1] = d1 ^ e1 ^ $second_rotate(e1);
+            state[2] = d2 ^ e2 ^ $second_rotate(e2);
+            state[3] = d3 ^ e3 ^ $second_rotate(e3);
+            state[4] = d4 ^ e4 ^ $second_rotate(e4);
+            state[5] = d5 ^ e5 ^ $second_rotate(e5);
+            state[6] = d6 ^ e6 ^ $second_rotate(e6);
+            state[7] = d7 ^ e7 ^ $second_rotate(e7);
+        }
+    }
+}
+
+define_mix_columns!(
+    mix_columns_0,
+    inv_mix_columns_0,
+    rotate_rows_1,
+    rotate_rows_2
+);
+
+define_mix_columns!(
+    mix_columns_1,
+    inv_mix_columns_1,
+    rotate_rows_and_columns_1_1,
+    rotate_rows_and_columns_2_2
+);
+
+#[cfg(not(aes_compact))]
+define_mix_columns!(
+    mix_columns_2,
+    inv_mix_columns_2,
+    rotate_rows_and_columns_1_2,
+    rotate_rows_2
+);
+
+#[cfg(not(aes_compact))]
+define_mix_columns!(
+    mix_columns_3,
+    inv_mix_columns_3,
+    rotate_rows_and_columns_1_3,
+    rotate_rows_and_columns_2_2
+);
+
+/// Delta swap: exchange the bits of `a` selected by `mask` with the bits `shift`
+/// positions above them.
+#[inline]
+fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
+    let t = (*a ^ ((*a) >> shift)) & mask;
+    *a ^= t ^ (t << shift);
+}
+
+/// Delta swap between two words: exchange the bits of `a` selected by `mask` with
+/// the bits of `b` located `shift` positions above.
+#[inline]
+fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
+    let t = (*a ^ ((*b) >> shift)) & mask;
+    *a ^= t;
+    *b ^= t << shift;
+}
+
+/// Applies ShiftRows once on an AES state (or key).
+#[cfg(any(not(aes_compact), feature = "hazmat"))]
+#[inline]
+fn shift_rows_1(state: &mut [u64]) {
+    debug_assert_eq!(state.len(), 8);
+    for x in state.iter_mut() {
+        delta_swap_1(x, 8, 0x00f000ff000f0000);
+        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
+    }
+}
+
+/// Applies ShiftRows twice on an AES state (or key).
+#[inline]
+fn shift_rows_2(state: &mut [u64]) {
+    debug_assert_eq!(state.len(), 8);
+    for x in state.iter_mut() {
+        delta_swap_1(x, 8, 0x00ff000000ff0000);
+    }
+}
+
+/// Applies ShiftRows three times on an AES state (or key).
+#[inline]
+fn shift_rows_3(state: &mut [u64]) {
+    debug_assert_eq!(state.len(), 8);
+    for x in state.iter_mut() {
+        delta_swap_1(x, 8, 0x000f00ff00f00000);
+        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
+    }
+}
+
+#[inline(always)]
+fn inv_shift_rows_1(state: &mut [u64]) {
+    shift_rows_3(state);
+}
+
+#[inline(always)]
+fn inv_shift_rows_2(state: &mut [u64]) {
+    shift_rows_2(state);
+}
+
+#[cfg(not(aes_compact))]
+#[inline(always)]
+fn inv_shift_rows_3(state: &mut [u64]) {
+    shift_rows_1(state);
+}
+
+/// XOR the columns after the S-box during the key schedule round function.
+///
+/// The `idx_xor` parameter refers to the index of the previous round key that is
+/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
+/// respectively).
+///
+/// The `idx_ror` parameter refers to the rotation value, which varies between the
+/// different key schedules.
+fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
+    for i in 0..8 {
+        let off_i = offset + i;
+        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
+        rkeys[off_i] = rk
+            ^ (0xfff0fff0fff0fff0 & (rk << 4))
+            ^ (0xff00ff00ff00ff00 & (rk << 8))
+            ^ (0xf000f000f000f000 & (rk << 12));
+    }
+}
+
+/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
+fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
+    debug_assert_eq!(output.len(), 8);
+    debug_assert_eq!(input0.len(), 16);
+    debug_assert_eq!(input1.len(), 16);
+    debug_assert_eq!(input2.len(), 16);
+    debug_assert_eq!(input3.len(), 16);
+
+    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
+    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
+    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //
+    // The desired bitsliced data groups first by bit position, then row, column, block:
+    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+
+    #[rustfmt::skip]
+    fn read_reordered(input: &[u8]) -> u64 {
+        (u64::from(input[0x0])        ) |
+        (u64::from(input[0x1]) << 0x10) |
+        (u64::from(input[0x2]) << 0x20) |
+        (u64::from(input[0x3]) << 0x30) |
+        (u64::from(input[0x8]) << 0x08) |
+        (u64::from(input[0x9]) << 0x18) |
+        (u64::from(input[0xa]) << 0x28) |
+        (u64::from(input[0xb]) << 0x38)
+    }
+
+    // Reorder each block's bytes on input
+    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
+    // Reorder by relabeling (note the order of input)
+    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
+    let mut t0 = read_reordered(&input0[0x00..0x0c]);
+    let mut t4 = read_reordered(&input0[0x04..0x10]);
+    let mut t1 = read_reordered(&input1[0x00..0x0c]);
+    let mut t5 = read_reordered(&input1[0x04..0x10]);
+    let mut t2 = read_reordered(&input2[0x00..0x0c]);
+    let mut t6 = read_reordered(&input2[0x04..0x10]);
+    let mut t3 = read_reordered(&input3[0x00..0x0c]);
+    let mut t7 = read_reordered(&input3[0x04..0x10]);
+
+    // Bit Index Swap 6 <-> 0:
+    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
+    let m0 = 0x5555555555555555;
+    delta_swap_2(&mut t1, &mut t0, 1, m0);
+    delta_swap_2(&mut t3, &mut t2, 1, m0);
+    delta_swap_2(&mut t5, &mut t4, 1, m0);
+    delta_swap_2(&mut t7, &mut t6, 1, m0);
+
+    // Bit Index Swap 7 <-> 1:
+    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
+    let m1 = 0x3333333333333333;
+    delta_swap_2(&mut t2, &mut t0, 2, m1);
+    delta_swap_2(&mut t3, &mut t1, 2, m1);
+    delta_swap_2(&mut t6, &mut t4, 2, m1);
+    delta_swap_2(&mut t7, &mut t5, 2, m1);
+
+    // Bit Index Swap 8 <-> 2:
+    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
+    let m2 = 0x0f0f0f0f0f0f0f0f;
+    delta_swap_2(&mut t4, &mut t0, 4, m2);
+    delta_swap_2(&mut t5, &mut t1, 4, m2);
+    delta_swap_2(&mut t6, &mut t2, 4, m2);
+    delta_swap_2(&mut t7, &mut t3, 4, m2);
+
+    // Final bitsliced bit index, as desired:
+    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+    output[0] = t0;
+    output[1] = t1;
+    output[2] = t2;
+    output[3] = t3;
+    output[4] = t4;
+    output[5] = t5;
+    output[6] = t6;
+    output[7] = t7;
+}
+
+/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
+fn inv_bitslice(input: &[u64]) -> BatchBlocks {
+    debug_assert_eq!(input.len(), 8);
+
+    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
+    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
+    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
+    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    //
+    // The initially bitsliced data groups first by bit position, then row, column, block:
+    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
+
+    let mut t0 = input[0];
+    let mut t1 = input[1];
+    let mut t2 = input[2];
+    let mut t3 = input[3];
+    let mut t4 = input[4];
+    let mut t5 = input[5];
+    let mut t6 = input[6];
+    let mut t7 = input[7];
+
+    // TODO: these bit index swaps are identical to those in 'packing'
+
+    // Bit Index Swap 6 <-> 0:
+    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
+    let m0 = 0x5555555555555555;
+    delta_swap_2(&mut t1, &mut t0, 1, m0);
+    delta_swap_2(&mut t3, &mut t2, 1, m0);
+    delta_swap_2(&mut t5, &mut t4, 1, m0);
+    delta_swap_2(&mut t7, &mut t6, 1, m0);
+
+    // Bit Index Swap 7 <-> 1:
+    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
+    let m1 = 0x3333333333333333;
+    delta_swap_2(&mut t2, &mut t0, 2, m1);
+    delta_swap_2(&mut t3, &mut t1, 2, m1);
+    delta_swap_2(&mut t6, &mut t4, 2, m1);
+    delta_swap_2(&mut t7, &mut t5, 2, m1);
+
+    // Bit Index Swap 8 <-> 2:
+    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
+    let m2 = 0x0f0f0f0f0f0f0f0f;
+    delta_swap_2(&mut t4, &mut t0, 4, m2);
+    delta_swap_2(&mut t5, &mut t1, 4, m2);
+    delta_swap_2(&mut t6, &mut t2, 4, m2);
+    delta_swap_2(&mut t7, &mut t3, 4, m2);
+
+    #[rustfmt::skip]
+    fn write_reordered(columns: u64, output: &mut [u8]) {
+        output[0x0] = (columns        ) as u8;
+        output[0x1] = (columns >> 0x10) as u8;
+        output[0x2] = (columns >> 0x20) as u8;
+        output[0x3] = (columns >> 0x30) as u8;
+        output[0x8] = (columns >> 0x08) as u8;
+        output[0x9] = (columns >> 0x18) as u8;
+        output[0xa] = (columns >> 0x28) as u8;
+        output[0xb] = (columns >> 0x38) as u8;
+    }
+
+    let mut output = BatchBlocks::default();
+    // Reorder by relabeling (note the order of output)
+    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
+    // Reorder each block's bytes on output
+    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
+    write_reordered(t0, &mut output[0][0x00..0x0c]);
+    write_reordered(t4, &mut output[0][0x04..0x10]);
+    write_reordered(t1, &mut output[1][0x00..0x0c]);
+    write_reordered(t5, &mut output[1][0x04..0x10]);
+    write_reordered(t2, &mut output[2][0x00..0x0c]);
+    write_reordered(t6, &mut output[2][0x04..0x10]);
+    write_reordered(t3, &mut output[3][0x00..0x0c]);
+    write_reordered(t7, &mut output[3][0x04..0x10]);
+
+    // Final AES bit index, as desired:
+    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
+    output
+}
+
+/// Copy 8 words (64 bytes) within the provided slice to a position 8 words further along.
+fn memshift32(buffer: &mut [u64], src_offset: usize) {
+    debug_assert_eq!(src_offset % 8, 0);
+
+    let dst_offset = src_offset + 8;
+    debug_assert!(dst_offset + 8 <= buffer.len());
+
+    for i in (0..8).rev() {
+        buffer[dst_offset + i] = buffer[src_offset + i];
+    }
+}
+
+/// XOR the round key to the internal state. The round keys are expected to be
+/// pre-computed and to be packed in the fixsliced representation.
+#[inline]
+fn add_round_key(state: &mut State, rkey: &[u64]) {
+    debug_assert_eq!(rkey.len(), 8);
+    for (a, b) in state.iter_mut().zip(rkey) {
+        *a ^= b;
+    }
+}
+
+#[inline(always)]
+fn add_round_constant_bit(state: &mut [u64], bit: usize) {
+    state[bit] ^= 0x00000000f0000000;
+}
+
+#[inline(always)]
+fn ror(x: u64, y: u32) -> u64 {
+    x.rotate_right(y)
+}
+
+/// Rotation distance (in bits) within a bitsliced word: each row step is 16 bit
+/// positions and each column step is 4 (see the bit layout described in `bitslice`).
+#[inline(always)]
+fn ror_distance(rows: u32, cols: u32) -> u32 {
+    (rows << 4) + (cols << 2)
+}
+
+#[inline(always)]
+fn rotate_rows_1(x: u64) -> u64 {
+    ror(x, ror_distance(1, 0))
+}
+
+#[inline(always)]
+fn rotate_rows_2(x: u64) -> u64 {
+    ror(x, ror_distance(2, 0))
+}
+
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
+    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
+    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
+}
+
+#[cfg(not(aes_compact))]
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
+    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
+    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
+}
+
+#[cfg(not(aes_compact))]
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
+    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
+    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
+}
+
+#[inline(always)]
+#[rustfmt::skip]
+fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
+    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
+    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
+}
+
+/// Low-level "hazmat" AES functions.
+///
+/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
+/// implementations in this crate, but instead provides raw access to
+/// the AES round function gated under the `hazmat` crate feature.
+#[cfg(feature = "hazmat")]
+pub(crate) mod hazmat {
+    use super::{
+        bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
+        shift_rows_1, sub_bytes, sub_bytes_nots, State,
+    };
+    use crate::{Block, Block8};
+
+    /// XOR the `src` block into the `dst` block in-place.
+    fn xor_in_place(dst: &mut Block, src: &Block) {
+        for (a, b) in dst.iter_mut().zip(src.as_slice()) {
+            *a ^= *b;
+        }
+    }
+
+    /// Perform a bitslice operation, loading a single block.
+    fn bitslice_block(block: &Block) -> State {
+        let mut state = State::default();
+        bitslice(&mut state, block, block, block, block);
+        state
+    }
+
+    /// Perform an inverse bitslice operation, extracting a single block.
+    fn inv_bitslice_block(block: &mut Block, state: &State) {
+        block.copy_from_slice(&inv_bitslice(state)[0]);
+    }
+
+    /// AES cipher (encrypt) round function.
+    #[inline]
+    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
+        let mut state = bitslice_block(block);
+        sub_bytes(&mut state);
+        sub_bytes_nots(&mut state);
+        shift_rows_1(&mut state);
+        mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+        xor_in_place(block, round_key);
+    }
+
+    /// AES cipher (encrypt) round function: parallel version.
+    #[inline]
+    pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
+            let mut state = State::default();
+            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
+            sub_bytes(&mut state);
+            sub_bytes_nots(&mut state);
+            shift_rows_1(&mut state);
+            mix_columns_0(&mut state);
+            let res = inv_bitslice(&state);
+
+            for i in 0..4 {
+                chunk[i] = res[i];
+                xor_in_place(&mut chunk[i], &keys[i]);
+            }
+        }
+    }
+
+    /// AES "equivalent inverse cipher" (decrypt) round function.
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+        let mut state = State::default();
+        bitslice(&mut state, block, block, block, block);
+        sub_bytes_nots(&mut state);
+        inv_sub_bytes(&mut state);
+        inv_shift_rows_1(&mut state);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+        xor_in_place(block, round_key);
+    }
+
+    /// AES "equivalent inverse cipher" (decrypt) round function: parallel version.
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
+            let mut state = State::default();
+            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
+            sub_bytes_nots(&mut state);
+            inv_sub_bytes(&mut state);
+            inv_shift_rows_1(&mut state);
+            inv_mix_columns_0(&mut state);
+            let res = inv_bitslice(&state);
+
+            for i in 0..4 {
+                chunk[i] = res[i];
+                xor_in_place(&mut chunk[i], &keys[i]);
+            }
+        }
+    }
+
+    /// AES mix columns function.
+    #[inline]
+    pub(crate) fn mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+
+    /// AES inverse mix columns function.
+    #[inline]
+    pub(crate) fn inv_mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+}
diff --git a/tests/data/aes128.blb b/tests/data/aes128.blb
new file mode 100644 (file)
index 0000000..0accb99
Binary files /dev/null and b/tests/data/aes128.blb differ
diff --git a/tests/data/aes192.blb b/tests/data/aes192.blb
new file mode 100644 (file)
index 0000000..b5f70fa
Binary files /dev/null and b/tests/data/aes192.blb differ
diff --git a/tests/data/aes256.blb b/tests/data/aes256.blb
new file mode 100644 (file)
index 0000000..2fa2e3a
Binary files /dev/null and b/tests/data/aes256.blb differ
diff --git a/tests/hazmat.rs b/tests/hazmat.rs
new file mode 100644 (file)
index 0000000..ce0e8f8
--- /dev/null
@@ -0,0 +1,155 @@
+//! Tests for low-level "hazmat" AES functions.
+
+// TODO(tarcieri): support for using the hazmat functions with the `soft` backend
+#![cfg(feature = "hazmat")]
+
+use aes::{Block, Block8};
+use hex_literal::hex;
+
+/// Round function test vectors.
+struct RoundTestVector {
+    /// State at start of `round[r]`.
+    start: [u8; 16],
+
+    /// Key schedule value for `round[r]`.
+    k_sch: [u8; 16],
+
+    /// Cipher output.
+    output: [u8; 16],
+}
+
+/// Cipher round function test vectors from FIPS 197 Appendix C.1.
+const CIPHER_ROUND_TEST_VECTORS: &[RoundTestVector] = &[
+    // round 1
+    RoundTestVector {
+        start: hex!("00102030405060708090a0b0c0d0e0f0"),
+        k_sch: hex!("d6aa74fdd2af72fadaa678f1d6ab76fe"),
+        output: hex!("89d810e8855ace682d1843d8cb128fe4"),
+    },
+    // round 2
+    RoundTestVector {
+        start: hex!("89d810e8855ace682d1843d8cb128fe4"),
+        k_sch: hex!("b692cf0b643dbdf1be9bc5006830b3fe"),
+        output: hex!("4915598f55e5d7a0daca94fa1f0a63f7"),
+    },
+    // round 3
+    RoundTestVector {
+        start: hex!("4915598f55e5d7a0daca94fa1f0a63f7"),
+        k_sch: hex!("b6ff744ed2c2c9bf6c590cbf0469bf41"),
+        output: hex!("fa636a2825b339c940668a3157244d17"),
+    },
+    // round 4
+    RoundTestVector {
+        start: hex!("fa636a2825b339c940668a3157244d17"),
+        k_sch: hex!("47f7f7bc95353e03f96c32bcfd058dfd"),
+        output: hex!("247240236966b3fa6ed2753288425b6c"),
+    },
+];
+
+/// Equivalent Inverse Cipher round function test vectors from FIPS 197 Appendix C.1.
+const EQUIV_INV_CIPHER_ROUND_TEST_VECTORS: &[RoundTestVector] = &[
+    // round 1
+    RoundTestVector {
+        start: hex!("7ad5fda789ef4e272bca100b3d9ff59f"),
+        k_sch: hex!("13aa29be9c8faff6f770f58000f7bf03"),
+        output: hex!("54d990a16ba09ab596bbf40ea111702f"),
+    },
+    // round 2
+    RoundTestVector {
+        start: hex!("54d990a16ba09ab596bbf40ea111702f"),
+        k_sch: hex!("1362a4638f2586486bff5a76f7874a83"),
+        output: hex!("3e1c22c0b6fcbf768da85067f6170495"),
+    },
+    // round 3
+    RoundTestVector {
+        start: hex!("3e1c22c0b6fcbf768da85067f6170495"),
+        k_sch: hex!("8d82fc749c47222be4dadc3e9c7810f5"),
+        output: hex!("b458124c68b68a014b99f82e5f15554c"),
+    },
+    // round 4
+    RoundTestVector {
+        start: hex!("b458124c68b68a014b99f82e5f15554c"),
+        k_sch: hex!("72e3098d11c5de5f789dfe1578a2cccb"),
+        output: hex!("e8dab6901477d4653ff7f5e2e747dd4f"),
+    },
+];
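+
+// Note: the `k_sch` values above are the `ik_sch` entries from the FIPS 197
+// Appendix C.1 equivalent inverse cipher trace, i.e. the forward round keys
+// passed through InvMixColumns (FIPS 197, Section 5.3.5), which is the form
+// `equiv_inv_cipher_round` expects.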
+
+#[test]
+fn cipher_round_fips197_vectors() {
+    for vector in CIPHER_ROUND_TEST_VECTORS {
+        let mut block = Block::from(vector.start);
+        aes::hazmat::cipher_round(&mut block, &vector.k_sch.into());
+        assert_eq!(block.as_slice(), &vector.output);
+    }
+}
+
+#[test]
+fn cipher_round_par_fips197_vectors() {
+    let mut blocks = Block8::default();
+    let mut round_keys = Block8::default();
+
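+    // Each FIPS 197 vector is written to both halves of the eight-block buffer,
+    // so all eight lanes of the parallel round are checked against known output.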
+    for i in 0..4 {
+        let vector = &CIPHER_ROUND_TEST_VECTORS[i];
+
+        blocks[i] = Block::from(vector.start);
+        blocks[i + 4] = Block::from(vector.start);
+
+        round_keys[i] = Block::from(vector.k_sch);
+        round_keys[i + 4] = Block::from(vector.k_sch);
+    }
+
+    aes::hazmat::cipher_round_par(&mut blocks, &round_keys);
+
+    for i in 0..4 {
+        let vector = &CIPHER_ROUND_TEST_VECTORS[i];
+        assert_eq!(blocks[i].as_slice(), &vector.output);
+        assert_eq!(blocks[i + 4].as_slice(), &vector.output);
+    }
+}
+
+#[test]
+fn equiv_inv_cipher_round_fips197_vectors() {
+    for vector in EQUIV_INV_CIPHER_ROUND_TEST_VECTORS {
+        let mut block = Block::from(vector.start);
+        aes::hazmat::equiv_inv_cipher_round(&mut block, &vector.k_sch.into());
+        assert_eq!(block.as_slice(), &vector.output);
+    }
+}
+
+#[test]
+fn equiv_inv_cipher_round_par_fips197_vectors() {
+    let mut blocks = Block8::default();
+    let mut round_keys = Block8::default();
+
+    for i in 0..4 {
+        let vector = &EQUIV_INV_CIPHER_ROUND_TEST_VECTORS[i];
+
+        blocks[i] = Block::from(vector.start);
+        blocks[i + 4] = Block::from(vector.start);
+
+        round_keys[i] = Block::from(vector.k_sch);
+        round_keys[i + 4] = Block::from(vector.k_sch);
+    }
+
+    aes::hazmat::equiv_inv_cipher_round_par(&mut blocks, &round_keys);
+
+    for i in 0..4 {
+        let vector = &EQUIV_INV_CIPHER_ROUND_TEST_VECTORS[i];
+        assert_eq!(blocks[i].as_slice(), &vector.output);
+        assert_eq!(blocks[i + 4].as_slice(), &vector.output);
+    }
+}
+
+#[test]
+fn mix_columns_fips197_vector() {
+    let mut block = Block::from(hex!("6353e08c0960e104cd70b751bacad0e7"));
+    aes::hazmat::mix_columns(&mut block);
+    assert_eq!(block.as_slice(), &hex!("5f72641557f5bc92f7be3b291db9f91a"))
+}
+
+#[test]
+fn inv_mix_columns_fips197_vector() {
+    let mut block = Block::from(hex!("bd6e7c3df2b5779e0b61216e8b10b689"));
+    aes::hazmat::inv_mix_columns(&mut block);
+    assert_eq!(block.as_slice(), &hex!("4773b91ff72f354361cb018ea1e6cf2c"))
+}
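+
+// Extra sanity check, not part of the upstream FIPS 197 vectors: assuming only
+// the public `aes::hazmat::{mix_columns, inv_mix_columns}` API exercised above,
+// verify that the two transforms are mutual inverses on an arbitrary block.
+#[test]
+fn mix_columns_round_trip() {
+    let mut block = Block::from(hex!("000102030405060708090a0b0c0d0e0f"));
+    let original = block.clone();
+    aes::hazmat::mix_columns(&mut block);
+    aes::hazmat::inv_mix_columns(&mut block);
+    assert_eq!(block, original);
+}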
diff --git a/tests/mod.rs b/tests/mod.rs
new file mode 100644 (file)
index 0000000..4164e4f
--- /dev/null
@@ -0,0 +1,6 @@
+//! Test vectors are from NESSIE:
+//! <https://www.cosic.esat.kuleuven.be/nessie/testvectors/>
+
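+// Each invocation below is driven by the `cipher` crate's `block_cipher_test!`
+// macro; as used here it reads the matching blobby-encoded vectors from
+// `tests/data/<name>.blb` (added in this commit) and checks both encryption and
+// decryption for every key/plaintext/ciphertext entry.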
+cipher::block_cipher_test!(aes128_test, "aes128", aes::Aes128);
+cipher::block_cipher_test!(aes192_test, "aes192", aes::Aes192);
+cipher::block_cipher_test!(aes256_test, "aes256", aes::Aes256);