sha3: make size/speed optimization decision configurable
authorDenys Vlasenko <vda.linux@googlemail.com>
Tue, 15 Jan 2013 00:12:26 +0000 (01:12 +0100)
committerDenys Vlasenko <vda.linux@googlemail.com>
Tue, 15 Jan 2013 00:12:26 +0000 (01:12 +0100)
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
libbb/Config.src
libbb/hash_md5_sha.c

index ee1b66a..19021fe 100644 (file)
@@ -28,6 +28,16 @@ config MD5_SMALL
          2                   3.0                5088
          3 (smallest)        5.1                4912
 
+config SHA3_SMALL
+       int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
+       default 1
+       range 0 1
+       help
+         Trade binary size versus speed for the sha3sum algorithm.
+         SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
+         64-bit x86: +270 bytes of code, 45% faster
+         32-bit x86: +450 bytes of code, 75% faster
+
 config FEATURE_FAST_TOP
        bool "Faster /proc scanning code (+100 bytes)"
        default y
index 06a2400..643cf20 100644 (file)
@@ -918,6 +918,16 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
  * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
  */
 
+#if CONFIG_SHA3_SMALL < 0
+# define SHA3_SMALL 0
+#elif CONFIG_SHA3_SMALL > 1
+# define SHA3_SMALL 1
+#else
+# define SHA3_SMALL CONFIG_SHA3_SMALL
+#endif
+
+#define ARCH_IS_64BIT (sizeof(long) >= sizeof(uint64_t))
+
 enum {
        cKeccakR_SizeInBytes = 576 / 8,
        cKeccakNumberOfRounds = 24,
@@ -967,8 +977,6 @@ static const uint8_t KeccakF_Mod5[10] = {
 static void KeccakF(uint64_t *state)
 {
        uint8_t x, y;
-       uint64_t temp;
-       uint64_t BC[5];
        int round;
 
        if (BB_BIG_ENDIAN) {
@@ -979,30 +987,61 @@ static void KeccakF(uint64_t *state)
 
        for (round = 0; round < cKeccakNumberOfRounds; ++round) {
                /* Theta */
-               for (x = 0; x < 5; ++x) {
-                       BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
-                               state[15 + x] ^ state[20 + x];
-               }
-               for (x = 0; x < 5; ++x) {
-                       temp = BC[KeccakF_Mod5[x + 4]] ^
-                               rotl64(BC[KeccakF_Mod5[x + 1]], 1);
-
-                       for (y = 0; y <= 20; y += 5) {
-                               state[y + x] ^= temp;
+               {
+                       uint64_t BC[5];
+                       for (x = 0; x < 5; ++x) {
+                               BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
+                                       state[15 + x] ^ state[20 + x];
+                       }
+                       for (x = 0; x < 5; ++x) {
+                               uint64_t temp = BC[KeccakF_Mod5[x + 4]] ^
+                                       rotl64(BC[KeccakF_Mod5[x + 1]], 1);
+                               if (SHA3_SMALL && !ARCH_IS_64BIT) {
+                                       for (y = 0; y <= 20; y += 5)
+                                               state[y + x] ^= temp;
+                               } else {
+                                       /* on 64-bit arch, this is actually smaller too */
+                                       state[0 + x] ^= temp;
+                                       state[5 + x] ^= temp;
+                                       state[10 + x] ^= temp;
+                                       state[15 + x] ^= temp;
+                                       state[20 + x] ^= temp;
+                               }
                        }
                }
 
                /* Rho Pi */
-               temp = state[1];
-               for (x = 0; x < 24; ++x) {
-                       BC[0] = state[KeccakF_PiLane[x]];
-                       state[KeccakF_PiLane[x]] =
-                           rotl64(temp, KeccakF_RotationConstants[x]);
-                       temp = BC[0];
+               if (SHA3_SMALL) {
+                       uint64_t t1 = state[1];
+                       for (x = 0; x < 24; ++x) {
+                               uint64_t t0 = state[KeccakF_PiLane[x]];
+                               state[KeccakF_PiLane[x]] = rotl64(t1, KeccakF_RotationConstants[x]);
+                               t1 = t0;
+                       }
+               } else {
+                       /* Especially large benefit for 32-bit arch:
+                        * 64-bit rotations by non-constant usually are SLOW on those.
+                        * We resort to unrolling here.
+                        * This optimizes out KeccakF_PiLane[] and KeccakF_RotationConstants[],
+                        * but generates 300-500 more bytes of code.
+                        */
+                       uint64_t t0;
+                       uint64_t t1 = state[1];
+#define RhoPi_twice(x) \
+                       t0 = state[KeccakF_PiLane[x  ]]; state[KeccakF_PiLane[x  ]] = rotl64(t1, KeccakF_RotationConstants[x  ]); \
+                       t1 = state[KeccakF_PiLane[x+1]]; state[KeccakF_PiLane[x+1]] = rotl64(t0, KeccakF_RotationConstants[x+1]);
+                       RhoPi_twice(0); RhoPi_twice(2);
+                       RhoPi_twice(4); RhoPi_twice(6);
+                       RhoPi_twice(8); RhoPi_twice(10);
+                       RhoPi_twice(12); RhoPi_twice(14);
+                       RhoPi_twice(16); RhoPi_twice(18);
+                       RhoPi_twice(20); RhoPi_twice(22);
+#undef RhoPi_twice
                }
 
                /* Chi */
-               for (y = 0; y < 25; y += 5) {
+               for (y = 0; y <= 20; y += 5) {
+                       uint64_t BC[5];
                        BC[0] = state[y + 0];
                        BC[1] = state[y + 1];
                        BC[2] = state[y + 2];