src/third_party/boringssl/src/crypto/sha/sha512.c

   1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
   2  * All rights reserved.
   3  *
   4  * This package is an SSL implementation written
   5  * by Eric Young (eay@cryptsoft.com).
   6  * The implementation was written so as to conform with Netscapes SSL.
   7  *
   8  * This library is free for commercial and non-commercial use as long as
   9  * the following conditions are aheared to.  The following conditions
  10  * apply to all code found in this distribution, be it the RC4, RSA,
  11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
  12  * included with this distribution is covered by the same copyright terms
  13  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  14  *
  15  * Copyright remains Eric Young's, and as such any Copyright notices in
  16  * the code are not to be removed.
  17  * If this package is used in a product, Eric Young should be given attribution
  18  * as the author of the parts of the library used.
  19  * This can be in the form of a textual message at program startup or
  20  * in documentation (online or textual) provided with the package.
  21  *
  22  * Redistribution and use in source and binary forms, with or without
  23  * modification, are permitted provided that the following conditions
  24  * are met:
  25  * 1. Redistributions of source code must retain the copyright
  26  *    notice, this list of conditions and the following disclaimer.
  27  * 2. Redistributions in binary form must reproduce the above copyright
  28  *    notice, this list of conditions and the following disclaimer in the
  29  *    documentation and/or other materials provided with the distribution.
  30  * 3. All advertising materials mentioning features or use of this software
  31  *    must display the following acknowledgement:
  32  *    "This product includes cryptographic software written by
  33  *     Eric Young (eay@cryptsoft.com)"
  34  *    The word 'cryptographic' can be left out if the rouines from the library
  35  *    being used are not cryptographic related :-).
  36  * 4. If you include any Windows specific code (or a derivative thereof) from
  37  *    the apps directory (application code) you must include an acknowledgement:
  38  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  39  *
  40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  50  * SUCH DAMAGE.
  51  *
  52  * The licence and distribution terms for any publically available version or
  53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
  54  * copied and put under another distribution licence
  55  * [including the GNU Public Licence.] */
  56
  57 #include <openssl/sha.h>
  58
  59 #include <string.h>
  60
  61 #include <openssl/mem.h>
  62
  63 #include "../internal.h"
  64
  65
  66 /* IMPLEMENTATION NOTES.
  67  *
  68  * As you might have noticed 32-bit hash algorithms:
  69  *
  70  * - permit SHA_LONG to be wider than 32-bit (case on CRAY);
  71  * - optimized versions implement two transform functions: one operating
  72  *   on [aligned] data in host byte order and one - on data in input
  73  *   stream byte order;
  74  * - share common byte-order neutral collector and padding function
  75  *   implementations, ../md32_common.h;
  76  *
  77  * None of the above applies to this SHA-512 implementation. Reasons
  78  * [in reverse order] are:
  79  *
  80  * - it's the only 64-bit hash algorithm at the time of this writing,
  81  *   there is no need for common collector/padding implementation [yet];
  82  * - by supporting only one transform function [which operates on
  83  *   *aligned* data in input stream byte order, big-endian in this case]
  84  *   we minimize burden of maintenance in two ways: a) collector/padding
  85  *   function is simpler; b) only one transform function to stare at;
  86  * - SHA_LONG64 is required to be exactly 64-bit in order to be able to
  87  *   apply a number of optimizations to mitigate potential performance
  88  *   penalties caused by previous design decision; */
  89
  90 #if !defined(OPENSSL_NO_ASM) && \
  91     (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
  92 #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
  93 #define SHA512_ASM
  94 #endif
  95
  96 int SHA384_Init(SHA512_CTX *sha) {
  97   sha->h[0] = OPENSSL_U64(0xcbbb9d5dc1059ed8);
  98   sha->h[1] = OPENSSL_U64(0x629a292a367cd507);
  99   sha->h[2] = OPENSSL_U64(0x9159015a3070dd17);
 100   sha->h[3] = OPENSSL_U64(0x152fecd8f70e5939);
 101   sha->h[4] = OPENSSL_U64(0x67332667ffc00b31);
 102   sha->h[5] = OPENSSL_U64(0x8eb44a8768581511);
 103   sha->h[6] = OPENSSL_U64(0xdb0c2e0d64f98fa7);
 104   sha->h[7] = OPENSSL_U64(0x47b5481dbefa4fa4);
 105
 106   sha->Nl = 0;
 107   sha->Nh = 0;
 108   sha->num = 0;
 109   sha->md_len = SHA384_DIGEST_LENGTH;
 110   return 1;
 111 }
 112
 113
 114 int SHA512_Init(SHA512_CTX *sha) {
 115   sha->h[0] = OPENSSL_U64(0x6a09e667f3bcc908);
 116   sha->h[1] = OPENSSL_U64(0xbb67ae8584caa73b);
 117   sha->h[2] = OPENSSL_U64(0x3c6ef372fe94f82b);
 118   sha->h[3] = OPENSSL_U64(0xa54ff53a5f1d36f1);
 119   sha->h[4] = OPENSSL_U64(0x510e527fade682d1);
 120   sha->h[5] = OPENSSL_U64(0x9b05688c2b3e6c1f);
 121   sha->h[6] = OPENSSL_U64(0x1f83d9abfb41bd6b);
 122   sha->h[7] = OPENSSL_U64(0x5be0cd19137e2179);
 123
 124   sha->Nl = 0;
 125   sha->Nh = 0;
 126   sha->num = 0;
 127   sha->md_len = SHA512_DIGEST_LENGTH;
 128   return 1;
 129 }
 130
 131 uint8_t *SHA384(const uint8_t *data, size_t len, uint8_t *out) {
 132   SHA512_CTX ctx;
 133   static uint8_t buf[SHA384_DIGEST_LENGTH];
 134
 135   /* TODO(fork): remove this static buffer. */
 136   if (out == NULL) {
 137     out = buf;
 138   }
 139
 140   SHA384_Init(&ctx);
 141   SHA512_Update(&ctx, data, len);
 142   SHA512_Final(out, &ctx);
 143   OPENSSL_cleanse(&ctx, sizeof(ctx));
 144   return out;
 145 }
 146
 147 uint8_t *SHA512(const uint8_t *data, size_t len, uint8_t *out) {
 148   SHA512_CTX ctx;
 149   static uint8_t buf[SHA512_DIGEST_LENGTH];
 150
 151   /* TODO(fork): remove this static buffer. */
 152   if (out == NULL) {
 153     out = buf;
 154   }
 155   SHA512_Init(&ctx);
 156   SHA512_Update(&ctx, data, len);
 157   SHA512_Final(out, &ctx);
 158   OPENSSL_cleanse(&ctx, sizeof(ctx));
 159   return out;
 160 }
 161
 162 #if !defined(SHA512_ASM)
 163 static
 164 #endif
 165 void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
 166
 167
 168 int SHA384_Final(unsigned char *md, SHA512_CTX *sha) {
 169   return SHA512_Final(md, sha);
 170 }
 171
 172 int SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) {
 173   return SHA512_Update(sha, data, len);
 174 }
 175
 176 void SHA512_Transform(SHA512_CTX *c, const unsigned char *data) {
 177 #ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
 178   if ((size_t)data % sizeof(c->u.d[0]) != 0) {
 179     memcpy(c->u.p, data, sizeof(c->u.p));
 180     data = c->u.p;
 181   }
 182 #endif
 183   sha512_block_data_order(c, data, 1);
 184 }
 185
 186 int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
 187   uint64_t l;
 188   uint8_t *p = c->u.p;
 189   const uint8_t *data = (const uint8_t *)in_data;
 190
 191   if (len == 0)
 192     return 1;
 193
 194   l = (c->Nl + (((uint64_t)len) << 3)) & OPENSSL_U64(0xffffffffffffffff);
 195   if (l < c->Nl) {
 196     c->Nh++;
 197   }
 198   if (sizeof(len) >= 8) {
 199     c->Nh += (((uint64_t)len) >> 61);
 200   }
 201   c->Nl = l;
 202
 203   if (c->num != 0) {
 204     size_t n = sizeof(c->u) - c->num;
 205
 206     if (len < n) {
 207       memcpy(p + c->num, data, len);
 208       c->num += (unsigned int)len;
 209       return 1;
 210     } else {
 211       memcpy(p + c->num, data, n), c->num = 0;
 212       len -= n;
 213       data += n;
 214       sha512_block_data_order(c, p, 1);
 215     }
 216   }
 217
 218   if (len >= sizeof(c->u)) {
 219 #ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
 220     if ((size_t)data % sizeof(c->u.d[0]) != 0)
 221       while (len >= sizeof(c->u))
 222         memcpy(p, data, sizeof(c->u)), sha512_block_data_order(c, p, 1),
 223             len -= sizeof(c->u), data += sizeof(c->u);
 224     else
 225 #endif
 226       sha512_block_data_order(c, data, len / sizeof(c->u)), data += len,
 227           len %= sizeof(c->u), data -= len;
 228   }
 229
 230   if (len != 0) {
 231     memcpy(p, data, len);
 232     c->num = (int)len;
 233   }
 234
 235   return 1;
 236 }
 237
 238 int SHA512_Final(unsigned char *md, SHA512_CTX *sha) {
 239   uint8_t *p = (uint8_t *)sha->u.p;
 240   size_t n = sha->num;
 241
 242   p[n] = 0x80; /* There always is a room for one */
 243   n++;
 244   if (n > (sizeof(sha->u) - 16)) {
 245     memset(p + n, 0, sizeof(sha->u) - n);
 246     n = 0;
 247     sha512_block_data_order(sha, p, 1);
 248   }
 249
 250   memset(p + n, 0, sizeof(sha->u) - 16 - n);
 251   p[sizeof(sha->u) - 1] = (uint8_t)(sha->Nl);
 252   p[sizeof(sha->u) - 2] = (uint8_t)(sha->Nl >> 8);
 253   p[sizeof(sha->u) - 3] = (uint8_t)(sha->Nl >> 16);
 254   p[sizeof(sha->u) - 4] = (uint8_t)(sha->Nl >> 24);
 255   p[sizeof(sha->u) - 5] = (uint8_t)(sha->Nl >> 32);
 256   p[sizeof(sha->u) - 6] = (uint8_t)(sha->Nl >> 40);
 257   p[sizeof(sha->u) - 7] = (uint8_t)(sha->Nl >> 48);
 258   p[sizeof(sha->u) - 8] = (uint8_t)(sha->Nl >> 56);
 259   p[sizeof(sha->u) - 9] = (uint8_t)(sha->Nh);
 260   p[sizeof(sha->u) - 10] = (uint8_t)(sha->Nh >> 8);
 261   p[sizeof(sha->u) - 11] = (uint8_t)(sha->Nh >> 16);
 262   p[sizeof(sha->u) - 12] = (uint8_t)(sha->Nh >> 24);
 263   p[sizeof(sha->u) - 13] = (uint8_t)(sha->Nh >> 32);
 264   p[sizeof(sha->u) - 14] = (uint8_t)(sha->Nh >> 40);
 265   p[sizeof(sha->u) - 15] = (uint8_t)(sha->Nh >> 48);
 266   p[sizeof(sha->u) - 16] = (uint8_t)(sha->Nh >> 56);
 267
 268   sha512_block_data_order(sha, p, 1);
 269
 270   if (md == 0) {
 271     return 0;
 272   }
 273
 274   switch (sha->md_len) {
 275     /* Let compiler decide if it's appropriate to unroll... */
 276     case SHA384_DIGEST_LENGTH:
 277       for (n = 0; n < SHA384_DIGEST_LENGTH / 8; n++) {
 278         uint64_t t = sha->h[n];
 279
 280         *(md++) = (uint8_t)(t >> 56);
 281         *(md++) = (uint8_t)(t >> 48);
 282         *(md++) = (uint8_t)(t >> 40);
 283         *(md++) = (uint8_t)(t >> 32);
 284         *(md++) = (uint8_t)(t >> 24);
 285         *(md++) = (uint8_t)(t >> 16);
 286         *(md++) = (uint8_t)(t >> 8);
 287         *(md++) = (uint8_t)(t);
 288       }
 289       break;
 290     case SHA512_DIGEST_LENGTH:
 291       for (n = 0; n < SHA512_DIGEST_LENGTH / 8; n++) {
 292         uint64_t t = sha->h[n];
 293
 294         *(md++) = (uint8_t)(t >> 56);
 295         *(md++) = (uint8_t)(t >> 48);
 296         *(md++) = (uint8_t)(t >> 40);
 297         *(md++) = (uint8_t)(t >> 32);
 298         *(md++) = (uint8_t)(t >> 24);
 299         *(md++) = (uint8_t)(t >> 16);
 300         *(md++) = (uint8_t)(t >> 8);
 301         *(md++) = (uint8_t)(t);
 302       }
 303       break;
 304     /* ... as well as make sure md_len is not abused. */
 305     default:
 306       return 0;
 307   }
 308
 309   return 1;
 310 }
 311
 312 #ifndef SHA512_ASM
 313 static const uint64_t K512[80] = {
 314     0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
 315     0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
 316     0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
 317     0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
 318     0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
 319     0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
 320     0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
 321     0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
 322     0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
 323     0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
 324     0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
 325     0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
 326     0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
 327     0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
 328     0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
 329     0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
 330     0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
 331     0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
 332     0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
 333     0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
 334     0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
 335     0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
 336     0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
 337     0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
 338     0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
 339     0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
 340     0x5fcb6fab3ad6faec, 0x6c44198c4a475817};
 341
 342 #if defined(__GNUC__) && __GNUC__ >= 2 && !defined(OPENSSL_NO_ASM)
 343 #if defined(__x86_64) || defined(__x86_64__)
 344 #define ROTR(a, n)                                         \
 345   ({                                                       \
 346     uint64_t ret;                                          \
 347     asm("rorq %1,%0" : "=r"(ret) : "J"(n), "0"(a) : "cc"); \
 348     ret;                                                   \
 349   })
 350 #define PULL64(x)                               \
 351   ({                                            \
 352     uint64_t ret = *((const uint64_t *)(&(x))); \
 353     asm("bswapq %0" : "=r"(ret) : "0"(ret));    \
 354     ret;                                        \
 355   })
 356 #elif(defined(__i386) || defined(__i386__))
 357 #define PULL64(x)                                                         \
 358   ({                                                                      \
 359     const unsigned int *p = (const unsigned int *)(&(x));                 \
 360     unsigned int hi = p[0], lo = p[1];                                    \
 361     asm("bswapl %0; bswapl %1;" : "=r"(lo), "=r"(hi) : "0"(lo), "1"(hi)); \
 362     ((uint64_t)hi) << 32 | lo;                                            \
 363   })
 364 #elif(defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
 365 #define ROTR(a, n)                                       \
 366   ({                                                     \
 367     uint64_t ret;                                        \
 368     asm("rotrdi %0,%1,%2" : "=r"(ret) : "r"(a), "K"(n)); \
 369     ret;                                                 \
 370   })
 371 #elif defined(__aarch64__)
 372 #define ROTR(a, n)                                    \
 373   ({                                                  \
 374     uint64_t ret;                                     \
 375     asm("ror %0,%1,%2" : "=r"(ret) : "r"(a), "I"(n)); \
 376     ret;                                              \
 377   })
 378 #if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
 379     __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 380 #define PULL64(x)                                                       \
 381   ({                                                                    \
 382     uint64_t ret;                                                       \
 383     asm("rev    %0,%1" : "=r"(ret) : "r"(*((const uint64_t *)(&(x))))); \
 384     ret;                                                                \
 385   })
 386 #endif
 387 #endif
 388 #elif defined(_MSC_VER)
 389 #if defined(_WIN64) /* applies to both IA-64 and AMD64 */
 390 #pragma intrinsic(_rotr64)
 391 #define ROTR(a, n) _rotr64((a), n)
 392 #endif
 393 #if defined(_M_IX86) && !defined(OPENSSL_NO_ASM)
 394 static uint64_t __fastcall __pull64be(const void *x) {
 395   _asm mov edx, [ecx + 0]
 396   _asm mov eax, [ecx + 4]
 397   _asm bswap edx
 398   _asm bswap eax
 399 }
 400 #define PULL64(x) __pull64be(&(x))
 401 #if _MSC_VER <= 1200
 402 #pragma inline_depth(0)
 403 #endif
 404 #endif
 405 #endif
 406
 407 #ifndef PULL64
 408 #define B(x, j) \
 409   (((uint64_t)(*(((const unsigned char *)(&x)) + j))) << ((7 - j) * 8))
 410 #define PULL64(x)                                                        \
 411   (B(x, 0) | B(x, 1) | B(x, 2) | B(x, 3) | B(x, 4) | B(x, 5) | B(x, 6) | \
 412    B(x, 7))
 413 #endif
 414
 415 #ifndef ROTR
 416 #define ROTR(x, s) (((x) >> s) | (x) << (64 - s))
 417 #endif
 418
 419 #define Sigma0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
 420 #define Sigma1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
 421 #define sigma0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ ((x) >> 7))
 422 #define sigma1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ ((x) >> 6))
 423
 424 #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
 425 #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
 426
 427
 428 #if defined(__i386) || defined(__i386__) || defined(_M_IX86)
 429 /*
 430  * This code should give better results on 32-bit CPU with less than
 431  * ~24 registers, both size and performance wise...
 432  */
 433 static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
 434                                     size_t num) {
 435   const uint64_t *W = in;
 436   uint64_t A, E, T;
 437   uint64_t X[9 + 80], *F;
 438   int i;
 439
 440   while (num--) {
 441     F = X + 80;
 442     A = ctx->h[0];
 443     F[1] = ctx->h[1];
 444     F[2] = ctx->h[2];
 445     F[3] = ctx->h[3];
 446     E = ctx->h[4];
 447     F[5] = ctx->h[5];
 448     F[6] = ctx->h[6];
 449     F[7] = ctx->h[7];
 450
 451     for (i = 0; i < 16; i++, F--) {
 452       T = PULL64(W[i]);
 453       F[0] = A;
 454       F[4] = E;
 455       F[8] = T;
 456       T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
 457       E = F[3] + T;
 458       A = T + Sigma0(A) + Maj(A, F[1], F[2]);
 459     }
 460
 461     for (; i < 80; i++, F--) {
 462       T = sigma0(F[8 + 16 - 1]);
 463       T += sigma1(F[8 + 16 - 14]);
 464       T += F[8 + 16] + F[8 + 16 - 9];
 465
 466       F[0] = A;
 467       F[4] = E;
 468       F[8] = T;
 469       T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
 470       E = F[3] + T;
 471       A = T + Sigma0(A) + Maj(A, F[1], F[2]);
 472     }
 473
 474     ctx->h[0] += A;
 475     ctx->h[1] += F[1];
 476     ctx->h[2] += F[2];
 477     ctx->h[3] += F[3];
 478     ctx->h[4] += E;
 479     ctx->h[5] += F[5];
 480     ctx->h[6] += F[6];
 481     ctx->h[7] += F[7];
 482
 483     W += 16;
 484   }
 485 }
 486
 487 #else
 488
 489 #define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
 490   do {                                           \
 491     T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \
 492     h = Sigma0(a) + Maj(a, b, c);                \
 493     d += T1;                                     \
 494     h += T1;                                     \
 495   } while (0)
 496
 497 #define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X)   \
 498   do {                                                 \
 499     s0 = X[(j + 1) & 0x0f];                            \
 500     s0 = sigma0(s0);                                   \
 501     s1 = X[(j + 14) & 0x0f];                           \
 502     s1 = sigma1(s1);                                   \
 503     T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \
 504     ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
 505   } while (0)
 506
 507 static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
 508                                     size_t num) {
 509   const uint64_t *W = in;
 510   uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
 511   uint64_t X[16];
 512   int i;
 513
 514   while (num--) {
 515
 516     a = ctx->h[0];
 517     b = ctx->h[1];
 518     c = ctx->h[2];
 519     d = ctx->h[3];
 520     e = ctx->h[4];
 521     f = ctx->h[5];
 522     g = ctx->h[6];
 523     h = ctx->h[7];
 524
 525     T1 = X[0] = PULL64(W[0]);
 526     ROUND_00_15(0, a, b, c, d, e, f, g, h);
 527     T1 = X[1] = PULL64(W[1]);
 528     ROUND_00_15(1, h, a, b, c, d, e, f, g);
 529     T1 = X[2] = PULL64(W[2]);
 530     ROUND_00_15(2, g, h, a, b, c, d, e, f);
 531     T1 = X[3] = PULL64(W[3]);
 532     ROUND_00_15(3, f, g, h, a, b, c, d, e);
 533     T1 = X[4] = PULL64(W[4]);
 534     ROUND_00_15(4, e, f, g, h, a, b, c, d);
 535     T1 = X[5] = PULL64(W[5]);
 536     ROUND_00_15(5, d, e, f, g, h, a, b, c);
 537     T1 = X[6] = PULL64(W[6]);
 538     ROUND_00_15(6, c, d, e, f, g, h, a, b);
 539     T1 = X[7] = PULL64(W[7]);
 540     ROUND_00_15(7, b, c, d, e, f, g, h, a);
 541     T1 = X[8] = PULL64(W[8]);
 542     ROUND_00_15(8, a, b, c, d, e, f, g, h);
 543     T1 = X[9] = PULL64(W[9]);
 544     ROUND_00_15(9, h, a, b, c, d, e, f, g);
 545     T1 = X[10] = PULL64(W[10]);
 546     ROUND_00_15(10, g, h, a, b, c, d, e, f);
 547     T1 = X[11] = PULL64(W[11]);
 548     ROUND_00_15(11, f, g, h, a, b, c, d, e);
 549     T1 = X[12] = PULL64(W[12]);
 550     ROUND_00_15(12, e, f, g, h, a, b, c, d);
 551     T1 = X[13] = PULL64(W[13]);
 552     ROUND_00_15(13, d, e, f, g, h, a, b, c);
 553     T1 = X[14] = PULL64(W[14]);
 554     ROUND_00_15(14, c, d, e, f, g, h, a, b);
 555     T1 = X[15] = PULL64(W[15]);
 556     ROUND_00_15(15, b, c, d, e, f, g, h, a);
 557
 558     for (i = 16; i < 80; i += 16) {
 559       ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X);
 560       ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X);
 561       ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X);
 562       ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X);
 563       ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X);
 564       ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X);
 565       ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X);
 566       ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X);
 567       ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X);
 568       ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X);
 569       ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X);
 570       ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X);
 571       ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X);
 572       ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X);
 573       ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X);
 574       ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
 575     }
 576
 577     ctx->h[0] += a;
 578     ctx->h[1] += b;
 579     ctx->h[2] += c;
 580     ctx->h[3] += d;
 581     ctx->h[4] += e;
 582     ctx->h[5] += f;
 583     ctx->h[6] += g;
 584     ctx->h[7] += h;
 585
 586     W += 16;
 587   }
 588 }
 589
 590 #endif
 591
 592 #endif /* SHA512_ASM */