Replaced tinyDTLS ecc with micro-ecc
author Randeep Singh <randeep.s@samsung.com>
Fri, 13 Feb 2015 06:07:54 +0000 (15:07 +0900)
committer Patrick Lankswert <patrick.lankswert@intel.com>
Mon, 23 Feb 2015 16:57:19 +0000 (16:57 +0000)
Please refer to https://github.com/kmackay/micro-ecc at
commit 98e1f208cfde43f65b6263dde6fca1a194bb6543 for the
micro-ecc source code used in this implementation.

Change-Id: Icacd21ff24a7afce524bf4b0f4b3429541161269
Signed-off-by: Randeep Singh <randeep.s@samsung.com>
Reviewed-on: https://gerrit.iotivity.org/gerrit/340
Reviewed-by: Seung-Woo Lee <sw0524.lee@samsung.com>
Tested-by: jenkins-iotivity <jenkins-iotivity@opendaylight.org>
Reviewed-by: Sachin Agrawal <sachin.agrawal@intel.com>
Reviewed-by: Patrick Lankswert <patrick.lankswert@intel.com>
18 files changed:
extlibs/tinydtls/crypto.c [changed mode: 0644->0755]
extlibs/tinydtls/ecc/LICENSE.txt [new file with mode: 0755]
extlibs/tinydtls/ecc/Makefile.contiki [changed mode: 0644->0755]
extlibs/tinydtls/ecc/Makefile.ecc [changed mode: 0644->0755]
extlibs/tinydtls/ecc/Makefile.in [changed mode: 0644->0755]
extlibs/tinydtls/ecc/README.md [new file with mode: 0755]
extlibs/tinydtls/ecc/asm_arm.inc [new file with mode: 0755]
extlibs/tinydtls/ecc/asm_avr.inc [new file with mode: 0755]
extlibs/tinydtls/ecc/ecc.c [changed mode: 0644->0755]
extlibs/tinydtls/ecc/ecc.h [changed mode: 0644->0755]
extlibs/tinydtls/ecc/test/ecc_test/ecc_test.ino [new file with mode: 0755]
extlibs/tinydtls/ecc/test/emk_rules.py [new file with mode: 0755]
extlibs/tinydtls/ecc/test/test_ecdh.c [new file with mode: 0755]
extlibs/tinydtls/ecc/test/test_ecdsa.c [new file with mode: 0755]
extlibs/tinydtls/ecc/test_helper.c [deleted file]
extlibs/tinydtls/ecc/test_helper.h [deleted file]
extlibs/tinydtls/ecc/testecc.c [deleted file]
extlibs/tinydtls/ecc/testfield.c [deleted file]

old mode 100644 (file)
new mode 100755 (executable)
index 8012501..0113342
@@ -3,6 +3,18 @@
  * Copyright (C) 2011--2012 Olaf Bergmann <bergmann@tzi.org>
  * Copyright (C) 2013 Hauke Mehrtens <hauke@hauke-m.de>
  *
+ *
+ * Modified to use micro-ecc for the ECC primitives.
+ *
+ * The following functions were removed:
+ *   - dtls_ec_key_to_uint32
+ *   - dtls_ec_key_from_uint32
+ * The following functions were modified:
+ *   - dtls_ecdh_pre_master_secret
+ *   - dtls_ecdsa_generate_key
+ *   - dtls_ecdsa_create_sig_hash
+ *   - dtls_ecdsa_verify_sig_hash
+ *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
  * files (the "Software"), to deal in the Software without
@@ -136,7 +148,7 @@ dtls_handshake_parameters_t *dtls_handshake_new()
     /* initialize the handshake hash wrt. the hard-coded DTLS version */
     dtls_debug("DTLSv12: initialize HASH_SHA256\n");
     /* TLS 1.2:  PRF(secret, label, seed) = P_<hash>(secret, label + seed) */
-    /* FIXME: we use the default SHA256 here, might need to support other 
+    /* FIXME: we use the default SHA256 here, might need to support other
               hash functions as well */
     dtls_hash_init(&handshake->hs_state.hs_hash);
   }
@@ -230,11 +242,11 @@ dtls_p_hash(dtls_hashfunc_t h,
 
   dtls_hmac_init(hmac_p, key, keylen);
   dtls_hmac_update(hmac_p, A, dlen);
-  
+
   HMAC_UPDATE_SEED(hmac_p, label, labellen);
   HMAC_UPDATE_SEED(hmac_p, random1, random1len);
   HMAC_UPDATE_SEED(hmac_p, random2, random2len);
-  
+
   dtls_hmac_finalize(hmac_p, tmp);
   memcpy(buf, tmp, buflen - len);
 
@@ -245,7 +257,7 @@ dtls_p_hash(dtls_hashfunc_t h,
   return buflen;
 }
 
-size_t 
+size_t
 dtls_prf(const unsigned char *key, size_t keylen,
         const unsigned char *label, size_t labellen,
         const unsigned char *random1, size_t random1len,
@@ -254,16 +266,16 @@ dtls_prf(const unsigned char *key, size_t keylen,
 
   /* Clear the result buffer */
   memset(buf, 0, buflen);
-  return dtls_p_hash(HASH_SHA256, 
-                    key, keylen, 
-                    label, labellen, 
+  return dtls_p_hash(HASH_SHA256,
+                    key, keylen,
+                    label, labellen,
                     random1, random1len,
                     random2, random2len,
                     buf, buflen);
 }
 
 void
-dtls_mac(dtls_hmac_context_t *hmac_ctx, 
+dtls_mac(dtls_hmac_context_t *hmac_ctx,
         const unsigned char *record,
         const unsigned char *packet, size_t length,
         unsigned char *buf) {
@@ -275,23 +287,23 @@ dtls_mac(dtls_hmac_context_t *hmac_ctx,
   dtls_hmac_update(hmac_ctx, record, sizeof(uint8) + sizeof(uint16));
   dtls_hmac_update(hmac_ctx, L, sizeof(uint16));
   dtls_hmac_update(hmac_ctx, packet, length);
-  
+
   dtls_hmac_finalize(hmac_ctx, buf);
 }
 
 static size_t
 dtls_ccm_encrypt(aes128_ccm_t *ccm_ctx, const unsigned char *src, size_t srclen,
-                unsigned char *buf, 
+                unsigned char *buf,
                 unsigned char *nounce,
                 const unsigned char *aad, size_t la) {
   long int len;
 
   assert(ccm_ctx);
 
-  len = dtls_ccm_encrypt_message(&ccm_ctx->ctx, 8 /* M */, 
+  len = dtls_ccm_encrypt_message(&ccm_ctx->ctx, 8 /* M */,
                                 max(2, 15 - DTLS_CCM_NONCE_SIZE),
                                 nounce,
-                                buf, srclen, 
+                                buf, srclen,
                                 aad, la);
   return len;
 }
@@ -305,10 +317,10 @@ dtls_ccm_decrypt(aes128_ccm_t *ccm_ctx, const unsigned char *src,
 
   assert(ccm_ctx);
 
-  len = dtls_ccm_decrypt_message(&ccm_ctx->ctx, 8 /* M */, 
+  len = dtls_ccm_decrypt_message(&ccm_ctx->ctx, 8 /* M */,
                                 max(2, 15 - DTLS_CCM_NONCE_SIZE),
                                 nounce,
-                                buf, srclen, 
+                                buf, srclen,
                                 aad, la);
   return len;
 }
@@ -331,7 +343,7 @@ dtls_psk_pre_master_secret(unsigned char *key, size_t keylen,
 
   memcpy(p, result, sizeof(uint16));
   p += sizeof(uint16);
-  
+
   memcpy(p, key, keylen);
 
   return 2 * (sizeof(uint16) + keylen);
@@ -339,31 +351,12 @@ dtls_psk_pre_master_secret(unsigned char *key, size_t keylen,
 #endif /* DTLS_PSK */
 
 #ifdef DTLS_ECC
-static void dtls_ec_key_to_uint32(const unsigned char *key, size_t key_size,
-                                 uint32_t *result) {
-  int i;
-
-  for (i = (key_size / sizeof(uint32_t)) - 1; i >= 0 ; i--) {
-    *result = dtls_uint32_to_int(&key[i * sizeof(uint32_t)]);
-    result++;
-  }
-}
-
-static void dtls_ec_key_from_uint32(const uint32_t *key, size_t key_size,
-                                   unsigned char *result) {
-  int i;
-
-  for (i = (key_size / sizeof(uint32_t)) - 1; i >= 0 ; i--) {
-    dtls_int_to_uint32(result, key[i]);
-    result += 4;
-  }
-}
 
 int dtls_ec_key_from_uint32_asn1(const uint32_t *key, size_t key_size,
                                 unsigned char *buf) {
   int i;
   unsigned char *buf_orig = buf;
-  int first = 1; 
+  int first = 1;
 
   for (i = (key_size / sizeof(uint32_t)) - 1; i >= 0 ; i--) {
     if (key[i] == 0)
@@ -373,7 +366,7 @@ int dtls_ec_key_from_uint32_asn1(const uint32_t *key, size_t key_size,
       *buf = 0;
       buf++;
       dtls_int_to_uint32(buf, key[i]);
-      buf += 4;      
+      buf += 4;
     } else if (first && !(key[i] & 0xFF800000)) {
       buf[0] = (key[i] >> 16) & 0xff;
       buf[1] = (key[i] >> 8) & 0xff;
@@ -401,23 +394,20 @@ int dtls_ecdh_pre_master_secret(unsigned char *priv_key,
                                    size_t key_size,
                                    unsigned char *result,
                                    size_t result_len) {
-  uint32_t priv[8];
-  uint32_t pub_x[8];
-  uint32_t pub_y[8];
-  uint32_t result_x[8];
-  uint32_t result_y[8];
+
+  uint8_t publicKey[64];
+  uint8_t privateKey[32];
 
   if (result_len < key_size) {
     return -1;
   }
 
-  dtls_ec_key_to_uint32(priv_key, key_size, priv);
-  dtls_ec_key_to_uint32(pub_key_x, key_size, pub_x);
-  dtls_ec_key_to_uint32(pub_key_y, key_size, pub_y);
 
-  ecc_ecdh(pub_x, pub_y, priv, result_x, result_y);
+  /* Every copy below moves exactly 32 bytes, and uECC_shared_secret() is
+   * handed fixed 64/32-byte buffers, so any other key size would read or
+   * write past the caller's buffers. */
+  if (key_size != sizeof(privateKey)) {
+    return -1;
+  }
+
+  /* micro-ecc takes the peer's public point as X followed by Y in one
+   * flat buffer. */
+  memcpy(publicKey, pub_key_x, 32);
+  memcpy(publicKey + 32, pub_key_y, 32);
+  memcpy(privateKey, priv_key, 32);
+
+  /* uECC_shared_secret() is documented to return 0 on failure; do not report
+   * a secret length for a secret that was never produced. */
+  if (!uECC_shared_secret(publicKey, privateKey, result)) {
+    return -1;
+  }
 
-  dtls_ec_key_from_uint32(result_x, key_size, result);
   return key_size;
 }
 
@@ -426,19 +416,15 @@ dtls_ecdsa_generate_key(unsigned char *priv_key,
                        unsigned char *pub_key_x,
                        unsigned char *pub_key_y,
                        size_t key_size) {
-  uint32_t priv[8];
-  uint32_t pub_x[8];
-  uint32_t pub_y[8];
 
-  do {
-    dtls_prng((unsigned char *)priv, key_size);
-  } while (!ecc_is_valid_key(priv));
+  uint8_t publicKey[64];
+  uint8_t privateKey[32];
 
-  ecc_gen_pub_key(priv, pub_x, pub_y);
+  uECC_make_key(publicKey, privateKey); /* NOTE(review): the return value is
+                                         * discarded here; if key generation
+                                         * can fail, the copies below export
+                                         * uninitialized stack bytes - confirm
+                                         * and handle. */
+  memcpy(pub_key_x, publicKey, 32);      /* bytes 0..31 of publicKey -> X */
+  memcpy(pub_key_y, publicKey + 32, 32); /* bytes 32..63 of publicKey -> Y */
+  memcpy(priv_key, privateKey, 32);
 
-  dtls_ec_key_from_uint32(priv, key_size, priv_key);
-  dtls_ec_key_from_uint32(pub_x, key_size, pub_key_x);
-  dtls_ec_key_from_uint32(pub_y, key_size, pub_key_y);
 }
 
 /* rfc4492#section-5.4 */
@@ -447,16 +433,15 @@ dtls_ecdsa_create_sig_hash(const unsigned char *priv_key, size_t key_size,
                           const unsigned char *sign_hash, size_t sign_hash_size,
                           uint32_t point_r[9], uint32_t point_s[9]) {
   int ret;
-  uint32_t priv[8];
-  uint32_t hash[8];
-  uint32_t rand[8];
-  
-  dtls_ec_key_to_uint32(priv_key, key_size, priv);
-  dtls_ec_key_to_uint32(sign_hash, sign_hash_size, hash);
-  do {
-    dtls_prng((unsigned char *)rand, key_size);
-    ret = ecc_ecdsa_sign(priv, hash, rand, point_r, point_s);
-  } while (ret);
+
+  /* micro-ecc works on flat byte buffers: a 32-byte private key, a 32-byte
+   * digest, and a 64-byte signature laid out as r followed by s. */
+  uint8_t privateKey[32];
+  uint8_t hashValue[32];
+  uint8_t sign[64];
+
+  /* Load the caller's key and digest before signing; without these copies
+   * uECC_sign() would sign uninitialized stack memory. */
+  memcpy(privateKey, priv_key, sizeof(privateKey));
+  memcpy(hashValue, sign_hash, sizeof(hashValue));
+
+  /* NOTE(review): uECC_sign()'s result is not checked because this function
+   * has no way to report failure - confirm that is acceptable to callers. */
+  uECC_sign(privateKey, hashValue, sign);
+  memcpy(point_r, sign, 32);
+  memcpy(point_s, sign + 32, 32);
 }
 
 void
@@ -473,7 +458,7 @@ dtls_ecdsa_create_sig(const unsigned char *priv_key, size_t key_size,
   dtls_hash_update(&data, server_random, server_random_size);
   dtls_hash_update(&data, keyx_params, keyx_params_size);
   dtls_hash_finalize(sha256hash, &data);
-  
+
   dtls_ecdsa_create_sig_hash(priv_key, key_size, sha256hash,
                             sizeof(sha256hash), point_r, point_s);
 }
@@ -484,19 +469,14 @@ dtls_ecdsa_verify_sig_hash(const unsigned char *pub_key_x,
                           const unsigned char *pub_key_y, size_t key_size,
                           const unsigned char *sign_hash, size_t sign_hash_size,
                           unsigned char *result_r, unsigned char *result_s) {
-  uint32_t pub_x[8];
-  uint32_t pub_y[8];
-  uint32_t hash[8];
-  uint32_t point_r[8];
-  uint32_t point_s[8];
-
-  dtls_ec_key_to_uint32(pub_key_x, key_size, pub_x);
-  dtls_ec_key_to_uint32(pub_key_y, key_size, pub_y);
-  dtls_ec_key_to_uint32(result_r, key_size, point_r);
-  dtls_ec_key_to_uint32(result_s, key_size, point_s);
-  dtls_ec_key_to_uint32(sign_hash, sign_hash_size, hash);
-
-  return ecc_ecdsa_validate(pub_x, pub_y, hash, point_r, point_s);
+
+  /* micro-ecc takes the public key as X followed by Y and the signature as
+   * r followed by s, each in one flat 64-byte buffer, plus a 32-byte digest. */
+  uint8_t publicKey[64];
+  uint8_t hashValue[32];
+  uint8_t sign[64];
+
+  memcpy(publicKey, pub_key_x, 32);
+  memcpy(publicKey + 32, pub_key_y, 32);
+  /* The digest and the received signature have to be loaded as well; without
+   * these copies uECC_verify() would check uninitialized stack memory. */
+  memcpy(hashValue, sign_hash, sizeof(hashValue));
+  memcpy(sign, result_r, 32);
+  memcpy(sign + 32, result_s, 32);
+
+  /* NOTE(review): uECC_verify() is documented to return 1 for a valid
+   * signature and 0 otherwise; confirm callers do not still rely on the
+   * return convention of the ecc_ecdsa_validate() call this replaces. */
+  return uECC_verify(publicKey, hashValue, sign);
 }
 
 int
@@ -508,7 +488,7 @@ dtls_ecdsa_verify_sig(const unsigned char *pub_key_x,
                      unsigned char *result_r, unsigned char *result_s) {
   dtls_hash_ctx data;
   unsigned char sha256hash[DTLS_HMAC_DIGEST_SIZE];
-  
+
   dtls_hash_init(&data);
   dtls_hash_update(&data, client_random, client_random_size);
   dtls_hash_update(&data, server_random, server_random_size);
@@ -520,7 +500,7 @@ dtls_ecdsa_verify_sig(const unsigned char *pub_key_x,
 }
 #endif /* DTLS_ECC */
 
-int 
+int
 dtls_encrypt(const unsigned char *src, size_t length,
             unsigned char *buf,
             unsigned char *nounce,
@@ -546,7 +526,7 @@ error:
   return ret;
 }
 
-int 
+int
 dtls_decrypt(const unsigned char *src, size_t length,
             unsigned char *buf,
             unsigned char *nounce,
diff --git a/extlibs/tinydtls/ecc/LICENSE.txt b/extlibs/tinydtls/ecc/LICENSE.txt
new file mode 100755 (executable)
index 0000000..ab099ae
--- /dev/null
@@ -0,0 +1,21 @@
+Copyright (c) 2014, Kenneth MacKay
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
old mode 100644 (file)
new mode 100755 (executable)
old mode 100644 (file)
new mode 100755 (executable)
old mode 100644 (file)
new mode 100755 (executable)
index 4631cbb..2a38603
@@ -35,18 +35,19 @@ abs_builddir = @abs_builddir@
 top_builddir = @top_builddir@
 top_srcdir:= @top_srcdir@
 
-ECC_SOURCES:= ecc.c testecc.c testfield.c test_helper.c
-ECC_HEADERS:= ecc.h test_helper.h
-FILES:=Makefile.in Makefile.contiki $(ECC_SOURCES) $(ECC_HEADERS) 
+
+ECC_SOURCES:= ecc.c test/test_ecdh.c test/test_ecdsa.c
+ECC_HEADERS:= ecc.h
+FILES:=Makefile.in Makefile.contiki $(ECC_SOURCES) $(ECC_HEADERS)
 DISTDIR=$(top_builddir)/@PACKAGE_TARNAME@-@PACKAGE_VERSION@
 
 ifeq ("@WITH_CONTIKI@", "1")
 include Makefile.contiki
 else
 ECC_OBJECTS:= $(patsubst %.c, %.o, $(ECC_SOURCES)) ecc_test.o
-PROGRAMS:= testecc testfield
+PROGRAMS:= test_ecdh test_ecdsa
 CPPFLAGS=@CPPFLAGS@
-CFLAGS=-Wall -std=c99 -pedantic @CFLAGS@ -DTEST_INCLUDE
+CFLAGS=-Wall -std=c99 @CFLAGS@ -DTEST_INCLUDE
 LDLIBS=@LIBS@
 
 .PHONY: all dirs clean install distclean .gitignore doc
@@ -59,11 +60,13 @@ all: $(PROGRAMS)
 ecc_test.o:    ecc.c ecc.h
        $(CC) $(CFLAGS) $(CPPFLAGS)  -c -o $@ $<
 
-testecc: ecc_test.o test_helper.o
+test_ecdh: ecc.c test/test_ecdh.c
+       $(CC) $(CFLAGS) $(CPPFLAGS) -o test_ecdh ecc.c test/test_ecdh.c
 
-testfield: ecc_test.o test_helper.o
+test_ecdsa:ecc.c test/test_ecdsa.c
+       $(CC) $(CFLAGS) $(CPPFLAGS) -o test_ecdsa ecc.c test/test_ecdsa.c
 
-check: 
+check:
        echo DISTDIR: $(DISTDIR)
        echo top_builddir: $(top_builddir)
 
diff --git a/extlibs/tinydtls/ecc/README.md b/extlibs/tinydtls/ecc/README.md
new file mode 100755 (executable)
index 0000000..24f1231
--- /dev/null
@@ -0,0 +1,208 @@
+micro-ecc
+==========
+
+A small and fast ECDH and ECDSA implementation for 8-bit, 32-bit, and 64-bit processors.
+
+The old version of micro-ecc can be found in the "old" branch.
+
+Features
+--------
+
+ * Resistant to known side-channel attacks.
+ * Written in C, with optional GCC inline assembly for AVR, ARM and Thumb platforms.
+ * Supports 8, 32, and 64-bit architectures.
+ * Small code size.
+ * No dynamic memory allocation.
+ * Support for 4 standard curves: secp160r1, secp192r1, secp256r1, and secp256k1.
+ * BSD 2-clause license.
+
+Usage Notes
+-----------
+### Point Representation ###
+Compressed points are represented in the standard format as defined in http://www.secg.org/collateral/sec1_final.pdf; uncompressed points are represented in standard format, but without the `0x04` prefix. `uECC_make_key()`, `uECC_shared_secret()`, `uECC_sign()`, and `uECC_verify()` only handle uncompressed points; you can use `uECC_compress()` and `uECC_decompress()` to convert between compressed and uncompressed point representations.
+
+Private keys are represented in the standard format.
+
+### Using the Code ###
+
+I recommend just copying (or symlinking) uECC.h, uECC.c, and the appropriate asm\_&lt;arch&gt;\_.inc (if any) into your project. Then just `#include "uECC.h"` to use the micro-ecc functions.
+
+For use with Arduino, you can just create a symlink to the `uECC` directory in your Arduino `libraries` directory. You can then use uECC just like any other Arduino library (uECC should show up in the **Sketch**=>**Import Library** submenu).
+
+See uECC.h for documentation for each function.
+
+### Compilation Notes ###
+
+ * Should compile with any C/C++ compiler that supports stdint.h (this includes Visual Studio 2013).
+ * If you want to change the defaults for `uECC_CURVE` and `uECC_ASM`, you must change them in your Makefile or similar so that uECC.c is compiled with the desired values (i.e., compile uECC.c with `-DuECC_CURVE=uECC_secp256r1` or similar).
+ * When compiling for a Thumb-1 platform with inline assembly enabled (i.e., `uECC_ASM` is defined to `uECC_asm_small` or `uECC_asm_fast`), you must use the `-fomit-frame-pointer` GCC option (this is enabled by default when compiling with `-O1` or higher).
+ * When compiling for an ARM/Thumb-2 platform with fast inline assembly enabled (i.e., `uECC_ASM` is defined to `uECC_asm_fast`), you must use the `-fomit-frame-pointer` GCC option (this is enabled by default when compiling with `-O1` or higher).
+ * When compiling for AVR with inline assembly enabled, you must have optimizations enabled (compile with `-O1` or higher).
+ * When building for Windows, you will need to link in the `advapi32.lib` system library.
+
+ARM Performance
+---------------
+
+All tests were built using gcc 4.8.2 with `-O3`, and were run on a Raspberry Pi B+. `uECC_ASM` was defined to `uECC_asm_fast` and `ECC_SQUARE_FUNC` was defined to `1` in all cases. All times are in milliseconds.
+
+<table>
+       <tr>
+               <th></th>
+               <th>secp160r1</th>
+               <th>secp192r1</th>
+               <th>secp256r1</th>
+               <th>secp256k1</th>
+       </tr>
+       <tr>
+               <td><em>ECDH:</em></td>
+               <td>2.3</td>
+               <td>2.7</td>
+               <td>7.9</td>
+               <td>6.5</td>
+       </tr>
+       <tr>
+               <td><em>ECDSA sign:</em></td>
+               <td>2.8</td>
+               <td>3.1</td>
+               <td>8.6</td>
+               <td>7.2</td>
+       </tr>
+       <tr>
+               <td><em>ECDSA verify:</em></td>
+               <td>2.7</td>
+               <td>3.2</td>
+               <td>9.2</td>
+               <td>7.0</td>
+       </tr>
+</table>
+
+AVR Performance
+---------------
+
+All tests were built using avr-gcc 4.8.1 with `-Os`, and were run on a 16 MHz ATmega256RFR2. Code size refers to the space used by micro-ecc code and data.
+
+#### ECDH (fast) ####
+
+In these tests, `uECC_ASM` was defined to `uECC_asm_fast` and `ECC_SQUARE_FUNC` was defined to `1` in all cases.
+
+<table>
+       <tr>
+               <th></th>
+               <th>secp160r1</th>
+               <th>secp192r1</th>
+               <th>secp256r1</th>
+               <th>secp256k1</th>
+       </tr>
+       <tr>
+               <td><em>ECDH time (ms):</em></td>
+               <td>470</td>
+               <td>810</td>
+               <td>2220</td>
+               <td>1615</td>
+       </tr>
+       <tr>
+               <td><em>Code size (bytes):</em></td>
+               <td>10768</td>
+               <td>13112</td>
+               <td>20886</td>
+               <td>21126</td>
+       </tr>
+</table>
+
+#### ECDH (small) ####
+
+In these tests, `uECC_ASM` was defined to `uECC_asm_small` and `ECC_SQUARE_FUNC` was defined to `0` in all cases.
+
+<table>
+       <tr>
+               <th></th>
+               <th>secp160r1</th>
+               <th>secp192r1</th>
+               <th>secp256r1</th>
+               <th>secp256k1</th>
+       </tr>
+       <tr>
+               <td><em>ECDH time (ms):</em></td>
+               <td>1250</td>
+               <td>1810</td>
+               <td>4790</td>
+               <td>4700</td>
+       </tr>
+       <tr>
+               <td><em>Code size (bytes):</em></td>
+               <td>3244</td>
+               <td>3400</td>
+               <td>5274</td>
+               <td>3426</td>
+       </tr>
+</table>
+
+#### ECDSA (fast) ####
+
+In these tests, `uECC_ASM` was defined to `uECC_asm_fast` and `ECC_SQUARE_FUNC` was defined to `1` in all cases.
+
+<table>
+       <tr>
+               <th></th>
+               <th>secp160r1</th>
+               <th>secp192r1</th>
+               <th>secp256r1</th>
+               <th>secp256k1</th>
+       </tr>
+       <tr>
+               <td><em>ECDSA sign time (ms):</em></td>
+               <td>555</td>
+               <td>902</td>
+               <td>2386</td>
+               <td>1773</td>
+       </tr>
+       <tr>
+               <td><em>ECDSA verify time (ms):</em></td>
+               <td>590</td>
+               <td>990</td>
+               <td>2650</td>
+               <td>1800</td>
+       </tr>
+       <tr>
+               <td><em>Code size (bytes):</em></td>
+               <td>13246</td>
+               <td>14798</td>
+               <td>22594</td>
+               <td>22826</td>
+       </tr>
+</table>
+
+#### ECDSA (small) ####
+
+In these tests, `uECC_ASM` was defined to `uECC_asm_small` and `ECC_SQUARE_FUNC` was defined to `0` in all cases.
+
+<table>
+       <tr>
+               <th></th>
+               <th>secp160r1</th>
+               <th>secp192r1</th>
+               <th>secp256r1</th>
+               <th>secp256k1</th>
+       </tr>
+       <tr>
+               <td><em>ECDSA sign time (ms):</em></td>
+               <td>1359</td>
+               <td>1931</td>
+               <td>4998</td>
+               <td>4904</td>
+       </tr>
+       <tr>
+               <td><em>ECDSA verify time (ms):</em></td>
+               <td>1515</td>
+               <td>2160</td>
+               <td>5700</td>
+               <td>5220</td>
+       </tr>
+       <tr>
+               <td><em>Code size (bytes):</em></td>
+               <td>5690</td>
+               <td>5054</td>
+               <td>6980</td>
+               <td>5080</td>
+       </tr>
+</table>
diff --git a/extlibs/tinydtls/ecc/asm_arm.inc b/extlibs/tinydtls/ecc/asm_arm.inc
new file mode 100755 (executable)
index 0000000..f181b17
--- /dev/null
@@ -0,0 +1,1905 @@
+#define DEC_5 4
+#define DEC_6 5
+#define DEC_8 7
+
+#define DEC(N) uECC_CONCAT(DEC_, N)
+
+#define REPEAT_1(stuff) stuff
+#define REPEAT_2(stuff) REPEAT_1(stuff) stuff
+#define REPEAT_3(stuff) REPEAT_2(stuff) stuff
+#define REPEAT_4(stuff) REPEAT_3(stuff) stuff
+#define REPEAT_5(stuff) REPEAT_4(stuff) stuff
+#define REPEAT_6(stuff) REPEAT_5(stuff) stuff
+#define REPEAT_7(stuff) REPEAT_6(stuff) stuff
+#define REPEAT_8(stuff) REPEAT_7(stuff) stuff
+
+#define REPEAT(N, stuff) uECC_CONCAT(REPEAT_, N)(stuff)
+
+#define STR2(thing) #thing
+#define STR(thing) STR2(thing)
+
+#if (uECC_ASM == uECC_asm_fast)
+
+static uint32_t vli_add(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{
+    uint32_t l_carry = 0;
+    uint32_t l_left;
+    uint32_t l_right;
+    /* Multi-word addition: p_result = p_left + p_right over uECC_WORDS 32-bit
+     * words. All three arrays are walked upward from the pointers passed in,
+     * because every ldmia/stmia below post-increments its pointer operand.
+     * Returns the carry out of the last word (0 or 1).
+     * l_left and l_right exist only as scratch registers for the asm. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
+        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
+        "adds %[left], %[right] \n\t"     /* Add first word. */
+        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
+        
+        /* Now we just do the remaining words with the carry bit (using ADC).
+         * REPEAT(DEC(uECC_WORDS), ...) pastes the load/adcs/store sequence
+         * uECC_WORDS - 1 times; DEC is only defined for 5, 6 and 8 words. */
+        REPEAT(DEC(uECC_WORDS), "ldmia %[lptr]!, {%[left]} \n\t"
+            "ldmia %[rptr]!, {%[right]} \n\t"
+            "adcs %[left], %[right] \n\t"
+            "stmia %[dptr]!, {%[left]} \n\t")
+        
+        "adcs %[carry], %[carry] \n\t" /* l_carry starts at 0, so 0 + 0 + C leaves just the carry flag in l_carry. */
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* restore the assembler syntax mode on non-Thumb-2 builds */
+    #endif
+    #if (uECC_PLATFORM == uECC_arm_thumb) /* "l" constraint: presumably low registers only, per GCC's ARM constraints - confirm */
+        : [dptr] "+l" (p_result), [lptr] "+l" (p_left), [rptr] "+l" (p_right),
+          [carry] "+l" (l_carry), [left] "=l" (l_left), [right] "=l" (l_right)
+    #else
+        : [dptr] "+r" (p_result), [lptr] "+r" (p_left), [rptr] "+r" (p_right),
+          [carry] "+r" (l_carry), [left] "=r" (l_left), [right] "=r" (l_right)
+    #endif
+        :
+        : "cc", "memory" /* flags are modified and memory is written through dptr */
+    );
+    return l_carry;
+}
+#define asm_add 1
+
+static uint32_t vli_sub(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{
+    uint32_t l_carry = 0;
+    uint32_t l_left;
+    uint32_t l_right;
+    /* Multi-word subtraction: p_result = p_left - p_right over uECC_WORDS
+     * 32-bit words. All three arrays are walked upward from the pointers
+     * passed in, because every ldmia/stmia below post-increments its pointer.
+     * Returns 1 when the subtraction borrowed out of the last word, else 0.
+     * l_left and l_right exist only as scratch registers for the asm. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
+        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
+        "subs %[left], %[right] \n\t"     /* Subtract. */
+        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
+        
+        /* Now we just do the remaining words with the carry bit (using SBC).
+         * REPEAT(DEC(uECC_WORDS), ...) pastes the load/sbcs/store sequence
+         * uECC_WORDS - 1 times; DEC is only defined for 5, 6 and 8 words. */
+        REPEAT(DEC(uECC_WORDS), "ldmia %[lptr]!, {%[left]} \n\t"
+            "ldmia %[rptr]!, {%[right]} \n\t"
+            "sbcs %[left], %[right] \n\t"
+            "stmia %[dptr]!, {%[left]} \n\t")
+            
+        "adcs %[carry], %[carry] \n\t" /* l_carry starts at 0, so 0 + 0 + C leaves just the carry flag in l_carry. */
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* restore the assembler syntax mode on non-Thumb-2 builds */
+    #endif
+    #if (uECC_PLATFORM == uECC_arm_thumb) /* "l" constraint: presumably low registers only, per GCC's ARM constraints - confirm */
+        : [dptr] "+l" (p_result), [lptr] "+l" (p_left), [rptr] "+l" (p_right),
+          [carry] "+l" (l_carry), [left] "=l" (l_left), [right] "=l" (l_right)
+    #else
+        : [dptr] "+r" (p_result), [lptr] "+r" (p_left), [rptr] "+r" (p_right),
+          [carry] "+r" (l_carry), [left] "=r" (l_left), [right] "=r" (l_right)
+    #endif
+        :
+        : "cc", "memory" /* flags are modified and memory is written through dptr */
+    );
+    return !l_carry; /* On ARM the carry flag is set when a subtraction did NOT borrow, so invert it to report the borrow. */
+}
+#define asm_sub 1
+
+#if (uECC_PLATFORM != uECC_arm_thumb)
+#if (uECC_WORDS == 5)
+static void vli_mult(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    register uint32_t *r2 __asm__("r2") = p_right;
+    /* Fully unrolled 5-word by 5-word multiply (this variant is compiled only
+     * when uECC_WORDS == 5): writes the 10-word product of p_left and p_right
+     * to p_result. The three pointers are pinned to r0-r2 because the asm
+     * below names those registers directly.
+     * It runs in two passes. The first multiplies left[0..1] by right[3..4]
+     * and parks that partial product in result[3..6]. The second walks the
+     * result from word 0 upward, summing the remaining products for each word
+     * and adding in the parked word (the "ldr ..., [r0]" loads). */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "add r0, 12 \n\t"
+        "add r2, 12 \n\t"
+        "ldmia r1!, {r3,r4} \n\t"
+        "ldmia r2!, {r6,r7} \n\t"
+        /* Pass 1, result[3]: low word of left[0] * right[3]; high word stays in r12. */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 1, result[4]: left[0] * right[4] + left[1] * right[3]. */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 1, result[5..6]: left[1] * right[4] plus the carried words. */
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adc r10, r14 \n\t"
+        "stmia r0!, {r9, r10} \n\t"
+        /* Rewind r0 to result[0] and r2 to right[0]; load right[0..2] and left[2]. */
+        "sub r0, 28 \n\t"
+        "sub r2, 20 \n\t"
+        "ldmia r2!, {r6,r7,r8} \n\t"
+        "ldmia r1!, {r5} \n\t"
+        /* Pass 2, result[0]: low word of left[0] * right[0]. */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, result[1]: left[0] * right[1] + left[1] * right[0]. */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 2, result[2]: left[0]*right[2] + left[1]*right[1] + left[2]*right[0]. */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r5, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 2, result[3]: r3 is reloaded with left[3]; the word parked in result[3] is added in. */
+        "ldmia r1!, {r3} \n\t"
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r4, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r5, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r3, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 2, result[4]: r4 is reloaded with left[4]; the word parked in result[4] is added in. */
+        "ldmia r1!, {r4} \n\t"
+        "mov r14, #0 \n\t"
+        "umull r9, r10, r5, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r3, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r4, r6 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "ldr r9, [r0] \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, #0 \n\t"
+        "adc r14, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, result[5]: r6 is reloaded with right[3]; the word parked in result[5] is added in. */
+        "ldmia r2!, {r6} \n\t"
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r5, r6 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r3, r8 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r4, r7 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "ldr r10, [r0] \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, #0 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 2, result[6]: r7 is reloaded with right[4]; the word parked in result[6] is added in. */
+        "ldmia r2!, {r7} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r5, r7 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r6 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r4, r8 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "ldr r11, [r0] \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r14} \n\t"
+        /* Pass 2, result[7]: left[3] * right[4] + left[4] * right[3]. */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 2, result[8..9]: left[4] * right[4] plus the carried words. */
+        "umull r14, r9, r4, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adc r11, r9 \n\t"
+        "stmia r0!, {r10, r11} \n\t"
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* restore the assembler syntax mode on non-Thumb-2 builds */
+    #endif
+        : "+r" (r0), "+r" (r1), "+r" (r2)
+        :
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory" /* every scratch register named in the asm above */
+    );
+}
+#define asm_mult 1
+#endif /* (uECC_WORDS == 5) */
+/*
+ * vli_mult, 6-word (192-bit) variant:
+ *     p_result[0..11] = p_left[0..5] * p_right[0..5]
+ * Words are 32 bits wide and word 0 is the least significant (carries are
+ * propagated toward higher indices). In the notes below A = p_left and
+ * B = p_right.
+ *
+ * The product is accumulated one result column at a time (a column is the
+ * set of partial products A[i]*B[j] sharing the same i+j), in two passes:
+ *   pass 1: A[0..2] * B[3..5]              -> p_result[3..8]
+ *   pass 2: every remaining partial product, columns 0..11; for columns
+ *           3..8 the word stored by pass 1 is read back and added in.
+ * The running column sum lives in three registers that rotate through
+ * r9, r10, r11, r12 and r14; r3-r5 hold words of A and r6-r8 words of B.
+ *
+ * Words of p_result are stored before all words of the inputs have been
+ * loaded, so p_result must not overlap p_left or p_right.
+ */
+#if (uECC_WORDS == 6)
+static void vli_mult(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    register uint32_t *r2 __asm__("r2") = p_right;
+    /* The pointers are pinned to r0-r2 because the asm text below names those registers literally. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "add r0, 12 \n\t" /* r0 -> p_result[3] */
+        "add r2, 12 \n\t" /* r2 -> p_right[3] */
+        "ldmia r1!, {r3,r4,r5} \n\t" /* r3-r5 = A[0..2] */
+        "ldmia r2!, {r6,r7,r8} \n\t" /* r6-r8 = B[3..5] */
+        /* Pass 1, column 3: A0*B3 */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 1, column 4: A0*B4 + A1*B3 */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 1, column 5: A0*B5 + A1*B4 + A2*B3 */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r5, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 1, column 6: A1*B5 + A2*B4 */
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r4, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r5, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 1, columns 7-8: A2*B5 plus the carried-in words */
+        "umull r9, r10, r5, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adc r12, r10 \n\t"
+        "stmia r0!, {r11, r12} \n\t"
+        /* Rewind: r0 -> p_result[0], r2 -> p_right[0]; then r6-r8 = B[0..2] */
+        "sub r0, 36 \n\t"
+        "sub r2, 24 \n\t"
+        "ldmia r2!, {r6,r7,r8} \n\t"
+        /* Pass 2, column 0: A0*B0 */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, column 1: A0*B1 + A1*B0 */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 2, column 2: A0*B2 + A1*B1 + A2*B0 */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r5, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 2, column 3: r3 = A[3]; A1*B2 + A2*B1 + A3*B0 + word stored by pass 1 */
+        "ldmia r1!, {r3} \n\t"
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r4, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r5, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r3, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 2, column 4: r4 = A[4]; A2*B2 + A3*B1 + A4*B0 + word stored by pass 1 */
+        "ldmia r1!, {r4} \n\t"
+        "mov r14, #0 \n\t"
+        "umull r9, r10, r5, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r3, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r4, r6 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "ldr r9, [r0] \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, #0 \n\t"
+        "adc r14, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, column 5: r5 = A[5]; A3*B2 + A4*B1 + A5*B0 + word stored by pass 1 */
+        "ldmia r1!, {r5} \n\t"
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r3, r8 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r4, r7 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r5, r6 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "ldr r10, [r0] \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, #0 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 2, column 6: r6 = B[3]; A3*B3 + A4*B2 + A5*B1 + word stored by pass 1 */
+        "ldmia r2!, {r6} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r3, r6 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r4, r8 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r5, r7 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "ldr r11, [r0] \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r14} \n\t"
+        /* Pass 2, column 7: r7 = B[4]; A3*B4 + A4*B3 + A5*B2 + word stored by pass 1 */
+        "ldmia r2!, {r7} \n\t"
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r5, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "ldr r12, [r0] \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, #0 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 2, column 8: r8 = B[5]; A3*B5 + A4*B4 + A5*B3 + word stored by pass 1 */
+        "ldmia r2!, {r8} \n\t"
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r3, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r4, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r5, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 2, column 9: A4*B5 + A5*B4 */
+        "mov r14, #0 \n\t"
+        "umull r9, r10, r4, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r5, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, columns 10-11: A5*B5 plus the carried-in words */
+        "umull r10, r11, r5, r8 \n\t"
+        "adds r12, r10 \n\t"
+        "adc r14, r11 \n\t"
+        "stmia r0!, {r12, r14} \n\t"
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* leave the assembler in divided syntax on non-Thumb-2 builds */
+    #endif
+        : "+r" (r0), "+r" (r1), "+r" (r2)
+        :
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+    );
+}
+#define asm_mult 1
+#endif /* (uECC_WORDS == 6) */
+/*
+ * vli_mult, 8-word (256-bit) variant:
+ *     p_result[0..15] = p_left[0..7] * p_right[0..7]
+ * Words are 32 bits wide and word 0 is the least significant (carries are
+ * propagated toward higher indices). In the notes below A = p_left and
+ * B = p_right.
+ *
+ * The product is accumulated one result column at a time (a column is the
+ * set of partial products A[i]*B[j] sharing the same i+j), in three passes:
+ *   pass 1: A[0..1] * B[6..7]                      -> p_result[6..9]
+ *   pass 2: the rest of A[0..4] * B[3..7]          -> p_result[3..12]
+ *   pass 3: A[0..4] * B[0..2] and A[5..7] * B[0..7], columns 0..15
+ * Whenever a later pass reaches a column an earlier pass already stored,
+ * the stored word is read back and added into the column sum.
+ * The running column sum lives in three registers that rotate through
+ * r9, r10, r11, r12 and r14; r3-r5 hold words of A and r6-r8 words of B.
+ *
+ * Words of p_result are stored before all words of the inputs have been
+ * loaded, so p_result must not overlap p_left or p_right.
+ */
+#if (uECC_WORDS == 8)
+static void vli_mult(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    register uint32_t *r2 __asm__("r2") = p_right;
+    /* The pointers are pinned to r0-r2 because the asm text below names those registers literally. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "add r0, 24 \n\t" /* r0 -> p_result[6] */
+        "add r2, 24 \n\t" /* r2 -> p_right[6] */
+        "ldmia r1!, {r3,r4} \n\t" /* r3,r4 = A[0..1] */
+        "ldmia r2!, {r6,r7} \n\t" /* r6,r7 = B[6..7] */
+        /* Pass 1, column 6: A0*B6 */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 1, column 7: A0*B7 + A1*B6 */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 1, columns 8-9: A1*B7 plus the carried-in words */
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adc r10, r14 \n\t"
+        "stmia r0!, {r9, r10} \n\t"
+        /* Rewind: r0 -> p_result[3], r2 -> p_right[3]; then r6-r8 = B[3..5], r5 = A[2] */
+        "sub r0, 28 \n\t"
+        "sub r2, 20 \n\t"
+        "ldmia r2!, {r6,r7,r8} \n\t"
+        "ldmia r1!, {r5} \n\t"
+        /* Pass 2, column 3: A0*B3 */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, column 4: A0*B4 + A1*B3 */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 2, column 5: A0*B5 + A1*B4 + A2*B3 */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r5, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 2, column 6: r3 = A[3]; A1*B5 + A2*B4 + A3*B3 + word stored by pass 1 */
+        "ldmia r1!, {r3} \n\t"
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r4, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r5, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r3, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 2, column 7: r4 = A[4]; A2*B5 + A3*B4 + A4*B3 + word stored by pass 1 */
+        "ldmia r1!, {r4} \n\t"
+        "mov r14, #0 \n\t"
+        "umull r9, r10, r5, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r3, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r4, r6 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "ldr r9, [r0] \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, #0 \n\t"
+        "adc r14, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 2, column 8: r6 = B[6]; A2*B6 + A3*B5 + A4*B4 + word stored by pass 1 */
+        "ldmia r2!, {r6} \n\t"
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r5, r6 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r3, r8 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r4, r7 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "ldr r10, [r0] \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, #0 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 2, column 9: r7 = B[7]; A2*B7 + A3*B6 + A4*B5 + word stored by pass 1 */
+        "ldmia r2!, {r7} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r5, r7 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r6 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r4, r8 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "ldr r11, [r0] \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r14} \n\t"
+        /* Pass 2, column 10: A3*B7 + A4*B6 */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 2, columns 11-12: A4*B7 plus the carried-in words */
+        "umull r14, r9, r4, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adc r11, r9 \n\t"
+        "stmia r0!, {r10, r11} \n\t"
+        /* Rewind all three pointers to word 0; then r3-r5 = A[0..2], r6-r8 = B[0..2] */
+        "sub r0, 52 \n\t"
+        "sub r1, 20 \n\t"
+        "sub r2, 32 \n\t"
+        "ldmia r1!, {r3,r4,r5} \n\t"
+        "ldmia r2!, {r6,r7,r8} \n\t"
+        /* Pass 3, column 0: A0*B0 */
+        "umull r11, r12, r3, r6 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 3, column 1: A0*B1 + A1*B0 */
+        "mov r10, #0 \n\t"
+        "umull r11, r9, r3, r7 \n\t"
+        "adds r12, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r11, r14, r4, r6 \n\t"
+        "adds r12, r11 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 3, column 2: A0*B2 + A1*B1 + A2*B0 */
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r3, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r5, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 3, column 3: r3 = A[3]; A1*B2 + A2*B1 + A3*B0 + previously stored word */
+        "ldmia r1!, {r3} \n\t"
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r4, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r5, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r3, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 3, column 4: r4 = A[4]; A2*B2 + A3*B1 + A4*B0 + previously stored word */
+        "ldmia r1!, {r4} \n\t"
+        "mov r14, #0 \n\t"
+        "umull r9, r10, r5, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r3, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r4, r6 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "ldr r9, [r0] \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, #0 \n\t"
+        "adc r14, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 3, column 5: r5 = A[5]; A3*B2 + A4*B1 + A5*B0 + previously stored word */
+        "ldmia r1!, {r5} \n\t"
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r3, r8 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r4, r7 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r5, r6 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "ldr r10, [r0] \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, #0 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 3, column 6: r3 = A[6]; A4*B2 + A5*B1 + A6*B0 + previously stored word */
+        "ldmia r1!, {r3} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r4, r8 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r5, r7 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r6 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "ldr r11, [r0] \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r14} \n\t"
+        /* Pass 3, column 7: r4 = A[7]; A5*B2 + A6*B1 + A7*B0 + previously stored word */
+        "ldmia r1!, {r4} \n\t"
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r5, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r3, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "ldr r12, [r0] \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, #0 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 3, column 8: r6 = B[3]; A5*B3 + A6*B2 + A7*B1 + previously stored word */
+        "ldmia r2!, {r6} \n\t"
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r5, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r3, r8 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r4, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 3, column 9: r7 = B[4]; A5*B4 + A6*B3 + A7*B2 + previously stored word */
+        "ldmia r2!, {r7} \n\t"
+        "mov r14, #0 \n\t"
+        "umull r9, r10, r5, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r3, r6 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "umull r9, r10, r4, r8 \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r14, #0 \n\t"
+        "ldr r9, [r0] \n\t"
+        "adds r11, r9 \n\t"
+        "adcs r12, #0 \n\t"
+        "adc r14, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Pass 3, column 10: r8 = B[5]; A5*B5 + A6*B4 + A7*B3 + previously stored word */
+        "ldmia r2!, {r8} \n\t"
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r5, r8 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r3, r7 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "umull r10, r11, r4, r6 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "ldr r10, [r0] \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r14, #0 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Pass 3, column 11: r6 = B[6]; A5*B6 + A6*B5 + A7*B4 + previously stored word */
+        "ldmia r2!, {r6} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r5, r6 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r8 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r4, r7 \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "ldr r11, [r0] \n\t"
+        "adds r14, r11 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r14} \n\t"
+        /* Pass 3, column 12: r7 = B[7]; A5*B7 + A6*B6 + A7*B5 + previously stored word */
+        "ldmia r2!, {r7} \n\t"
+        "mov r11, #0 \n\t"
+        "umull r12, r14, r5, r7 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r3, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "umull r12, r14, r4, r8 \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, r14 \n\t"
+        "adc r11, #0 \n\t"
+        "ldr r12, [r0] \n\t"
+        "adds r9, r12 \n\t"
+        "adcs r10, #0 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+        /* Pass 3, column 13: A6*B7 + A7*B6 */
+        "mov r12, #0 \n\t"
+        "umull r14, r9, r3, r7 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r14, r9, r4, r6 \n\t"
+        "adds r10, r14 \n\t"
+        "adcs r11, r9 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r10} \n\t"
+        /* Pass 3, columns 14-15: A7*B7 plus the carried-in words */
+        "umull r9, r10, r4, r7 \n\t"
+        "adds r11, r9 \n\t"
+        "adc r12, r10 \n\t"
+        "stmia r0!, {r11, r12} \n\t"
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* leave the assembler in divided syntax on non-Thumb-2 builds */
+    #endif
+        : "+r" (r0), "+r" (r1), "+r" (r2)
+        :
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+    );
+}
+#define asm_mult 1
+#endif /* (uECC_WORDS == 8) */
+/*
+ * vli_square, 5-word (160-bit) variant:
+ *     p_result[0..9] = p_left[0..4] * p_left[0..4]
+ * Words are 32 bits wide and word 0 is the least significant (carries are
+ * propagated toward higher indices). In the notes below A = p_left.
+ *
+ * All five input words are loaded into r2-r6 before the first store; after
+ * that r1 is reused as a scratch register. The result is produced one
+ * column at a time: each cross product A[i]*A[j] with i < j is computed
+ * once and doubled, and the diagonal term A[k]*A[k] is added where the
+ * column has one. The carry out of each column is held in two registers
+ * and added into the next column.
+ *
+ * r7 appears in the clobber list although the code below does not use it.
+ */
+#if (uECC_WORDS == 5)
+static void vli_square(uint32_t *p_result, uint32_t *p_left)
+{
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    /* The pointers are pinned to r0/r1 because the asm text below names those registers literally. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "ldmia r1!, {r2,r3,r4,r5,r6} \n\t" /* r2-r6 = A[0..4] */
+        /* Column 0: A0^2 */
+        "umull r11, r12, r2, r2 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Column 1: 2*A0*A1 (the same product is added in twice) */
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r2, r3 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r8, r11, #0 \n\t" /* r8 = high word of the product + carry */
+        "adc r9, #0 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r8, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Column 2: 2*A0*A2 + A1^2 */
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r2, r4 \n\t"
+        "adds r11, r11 \n\t"
+        "adcs r12, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r3 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 3: 2*(A0*A3 + A1*A2), then add the carry from column 2 (r9, r10) */
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r2, r5 \n\t"
+        "umull r1, r14, r3, r4 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r14 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 4: 2*(A0*A4 + A1*A3) + A2^2, then add the carry from column 3 (r11, r12) */
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r2, r6 \n\t"
+        "umull r1, r14, r3, r5 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "umull r1, r14, r4, r4 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 5: 2*(A1*A4 + A2*A3), then add the carry from column 4 (r9, r10) */
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r3, r6 \n\t"
+        "umull r1, r14, r4, r5 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r14 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 6: 2*A2*A4 + A3^2, accumulated onto the carry from column 5 (r11, r12) */
+        "mov r8, #0 \n\t"
+        "umull r1, r10, r4, r6 \n\t"
+        "adds r1, r1 \n\t"
+        "adcs r10, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "adds r11, r1 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "umull r1, r10, r5, r5 \n\t"
+        "adds r11, r1 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Column 7: 2*A3*A4, accumulated onto the carry from column 6 (r12, r8) */
+        "mov r11, #0 \n\t"
+        "umull r1, r10, r5, r6 \n\t"
+        "adds r1, r1 \n\t"
+        "adcs r10, r10 \n\t"
+        "adc r11, #0 \n\t"
+        "adds r12, r1 \n\t"
+        "adcs r8, r10 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Columns 8-9: A4^2 plus the carried-in words */
+        "umull r1, r10, r6, r6 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r10 \n\t"
+        "stmia r0!, {r8, r11} \n\t"
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* leave the assembler in divided syntax on non-Thumb-2 builds */
+    #endif
+        : "+r" (r0), "+r" (r1)
+        :
+        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+    );
+}
+#define asm_square 1
+#endif /* (uECC_WORDS == 5) */
+/*
+ * vli_square, 6-word (192-bit) variant:
+ *     p_result[0..11] = p_left[0..5] * p_left[0..5]
+ * Words are 32 bits wide and word 0 is the least significant (carries are
+ * propagated toward higher indices). In the notes below A = p_left.
+ *
+ * All six input words are loaded into r2-r7 before the first store; after
+ * that r1 is reused as a scratch register. The result is produced one
+ * column at a time: each cross product A[i]*A[j] with i < j is computed
+ * once and doubled, and the diagonal term A[k]*A[k] is added where the
+ * column has one. The carry out of each column is held in two registers
+ * and added into the next column.
+ */
+#if (uECC_WORDS == 6)
+static void vli_square(uint32_t *p_result, uint32_t *p_left)
+{
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    /* The pointers are pinned to r0/r1 because the asm text below names those registers literally. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t" /* r2-r7 = A[0..5] */
+        /* Column 0: A0^2 */
+        "umull r11, r12, r2, r2 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Column 1: 2*A0*A1 (the same product is added in twice) */
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r2, r3 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r8, r11, #0 \n\t" /* r8 = high word of the product + carry */
+        "adc r9, #0 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r8, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Column 2: 2*A0*A2 + A1^2 */
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r2, r4 \n\t"
+        "adds r11, r11 \n\t"
+        "adcs r12, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r3 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 3: 2*(A0*A3 + A1*A2), then add the carry from column 2 (r9, r10) */
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r2, r5 \n\t"
+        "umull r1, r14, r3, r4 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r14 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 4: 2*(A0*A4 + A1*A3) + A2^2, then add the carry from column 3 (r11, r12) */
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r2, r6 \n\t"
+        "umull r1, r14, r3, r5 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "umull r1, r14, r4, r4 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 5: 2*(A0*A5 + A1*A4 + A2*A3), then add the carry from column 4 (r9, r10) */
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r2, r7 \n\t"
+        "umull r1, r14, r3, r6 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r14 \n\t"
+        "adc r12, #0 \n\t"
+        "umull r1, r14, r4, r5 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r14 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 6: 2*(A1*A5 + A2*A4) + A3^2, then add the carry from column 5 (r11, r12) */
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r3, r7 \n\t"
+        "umull r1, r14, r4, r6 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "umull r1, r14, r5, r5 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r9, r14 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 7: 2*(A2*A5 + A3*A4), then add the carry from column 6 (r9, r10) */
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r4, r7 \n\t"
+        "umull r1, r14, r5, r6 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r14 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+        /* Column 8: 2*A3*A5 + A4^2, accumulated onto the carry from column 7 (r11, r12) */
+        "mov r8, #0 \n\t"
+        "umull r1, r10, r5, r7 \n\t"
+        "adds r1, r1 \n\t"
+        "adcs r10, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "adds r11, r1 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "umull r1, r10, r6, r6 \n\t"
+        "adds r11, r1 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+        /* Column 9: 2*A4*A5, accumulated onto the carry from column 8 (r12, r8) */
+        "mov r11, #0 \n\t"
+        "umull r1, r10, r6, r7 \n\t"
+        "adds r1, r1 \n\t"
+        "adcs r10, r10 \n\t"
+        "adc r11, #0 \n\t"
+        "adds r12, r1 \n\t"
+        "adcs r8, r10 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+        /* Columns 10-11: A5^2 plus the carried-in words */
+        "umull r1, r10, r7, r7 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r10 \n\t"
+        "stmia r0!, {r8, r11} \n\t"
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t" /* leave the assembler in divided syntax on non-Thumb-2 builds */
+    #endif
+        : "+r" (r0), "+r" (r1)
+        :
+        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+    );
+}
+#define asm_square 1
+#endif /* (uECC_WORDS == 6) */
+
+#if (uECC_WORDS == 8)
+static void vli_square(uint32_t *p_result, uint32_t *p_left)
+{
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "ldmia r1!, {r2, r3} \n\t"
+        "add r1, 16 \n\t"
+        "ldmia r1!, {r5, r6} \n\t"
+        "add r0, 24 \n\t"
+
+        "umull r8, r9, r2, r5 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "umull r12, r10, r2, r6 \n\t"
+        "adds r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r9} \n\t"
+
+        "umull r8, r9, r3, r6 \n\t"
+        "adds r10, r8 \n\t"
+        "adc r11, r9, #0 \n\t"
+        "stmia r0!, {r10, r11} \n\t"
+
+        "sub r0, 40 \n\t"
+        "sub r1, 32 \n\t"
+        "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t"
+
+        "umull r11, r12, r2, r2 \n\t"
+        "stmia r0!, {r11} \n\t"
+
+        "mov r9, #0 \n\t"
+        "umull r10, r11, r2, r3 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r8, r11, #0 \n\t"
+        "adc r9, #0 \n\t"
+        "adds r12, r10 \n\t"
+        "adcs r8, r11 \n\t"
+        "adc r9, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+
+        "mov r10, #0 \n\t"
+        "umull r11, r12, r2, r4 \n\t"
+        "adds r11, r11 \n\t"
+        "adcs r12, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "umull r11, r12, r3, r3 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r2, r5 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r3, r4 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r2, r6 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r3, r5 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r4, r4 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r2, r7 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r3, r6 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r4, r5 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "ldmia r1!, {r2} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r3, r7 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r4, r6 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r8, r14 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r5, r5 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r3, r2 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r4, r7 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r5, r6 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r8, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "ldmia r1!, {r3} \n\t"
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r4, r2 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r5, r7 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r8, r14 \n\t"
+        "adcs r9, #0 \n\t"
+        "adc r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r6, r6 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r4, r3 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r5, r2 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r6, r7 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "ldr r14, [r0] \n\t"
+        "adds r8, r14 \n\t"
+        "adcs r11, #0 \n\t"
+        "adc r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r10, #0 \n\t"
+        "umull r8, r9, r5, r3 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r6, r2 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r9, r9 \n\t"
+        "adc r10, r10 \n\t"
+        "mov r14, r9 \n\t"
+        "umlal r8, r9, r7, r7 \n\t"
+        "cmp r14, r9 \n\t"
+        "it hi \n\t"
+        "adchi r10, #0 \n\t"
+        "adds r8, r11 \n\t"
+        "adcs r9, r12 \n\t"
+        "adc r10, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r12, #0 \n\t"
+        "umull r8, r11, r6, r3 \n\t"
+        "mov r14, r11 \n\t"
+        "umlal r8, r11, r7, r2 \n\t"
+        "cmp r14, r11 \n\t"
+        "it hi \n\t"
+        "adchi r12, #0 \n\t"
+        "adds r8, r8 \n\t"
+        "adcs r11, r11 \n\t"
+        "adc r12, r12 \n\t"
+        "adds r8, r9 \n\t"
+        "adcs r11, r10 \n\t"
+        "adc r12, #0 \n\t"
+        "stmia r0!, {r8} \n\t"
+
+        "mov r8, #0 \n\t"
+        "umull r1, r10, r7, r3 \n\t"
+        "adds r1, r1 \n\t"
+        "adcs r10, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "adds r11, r1 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "umull r1, r10, r2, r2 \n\t"
+        "adds r11, r1 \n\t"
+        "adcs r12, r10 \n\t"
+        "adc r8, #0 \n\t"
+        "stmia r0!, {r11} \n\t"
+
+        "mov r11, #0 \n\t"
+        "umull r1, r10, r2, r3 \n\t"
+        "adds r1, r1 \n\t"
+        "adcs r10, r10 \n\t"
+        "adc r11, #0 \n\t"
+        "adds r12, r1 \n\t"
+        "adcs r8, r10 \n\t"
+        "adc r11, #0 \n\t"
+        "stmia r0!, {r12} \n\t"
+
+        "umull r1, r10, r3, r3 \n\t"
+        "adds r8, r1 \n\t"
+        "adcs r11, r10 \n\t"
+        "stmia r0!, {r8, r11} \n\t"
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t"
+    #endif
+        : "+r" (r0), "+r" (r1)
+        :
+        : "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+    );
+}
+#define asm_square 1
+#endif /* (uECC_WORDS == 8) */
+
+#endif /* (uECC_PLATFORM != uECC_arm_thumb) */
+
+#endif /* (uECC_ASM == uECC_asm_fast) */
+
+#if !asm_add /* Looped version: compiled only when asm_add is not already defined non-zero. */
+static uint32_t vli_add(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{ /* p_result = p_left + p_right over uECC_WORDS 32-bit words, lowest address first; returns the final carry (0 or 1). */
+    uint32_t l_counter = uECC_WORDS;
+    uint32_t l_carry = 0; /* carry = 0 initially */
+    uint32_t l_left;
+    uint32_t l_right;
+    /* The carry is parked in l_carry between iterations because "subs" below overwrites the flags. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "1: \n\t"
+        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word (post-increments p_left). */
+        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word (post-increments p_right). */
+        "lsrs %[carry], #1 \n\t"          /* Move bit 0 of l_carry into the carry flag (l_carry = 0 after this). */
+        "adcs %[left], %[right] \n\t"     /* Add with carry. */
+        "adcs %[carry], %[carry] \n\t"    /* l_carry = 0 + 0 + C, i.e. the carry out of this word. */
+        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word (post-increments p_result). */
+        "subs %[ctr], #1 \n\t"            /* Decrement the remaining-word counter. */
+        "bne 1b \n\t"                     /* Loop until the counter reaches 0. */
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t"
+    #endif
+    #if (uECC_PLATFORM == uECC_arm_thumb) /* Thumb build: "l" keeps every operand in a low register. */
+        : [dptr] "+l" (p_result), [lptr] "+l" (p_left), [rptr] "+l" (p_right),
+          [ctr] "+l" (l_counter), [carry] "+l" (l_carry), [left] "=l" (l_left), [right] "=l" (l_right)
+    #else
+        : [dptr] "+r" (p_result), [lptr] "+r" (p_left), [rptr] "+r" (p_right),
+          [ctr] "+r" (l_counter), [carry] "+r" (l_carry), [left] "=r" (l_left), [right] "=r" (l_right)
+    #endif
+        : /* no input-only operands: everything the asm reads is also written */
+        : "cc", "memory"
+    );
+    return l_carry;
+}
+#define asm_add 1 /* records that a vli_add definition now exists */
+#endif
+
+#if !asm_sub /* Looped version: compiled only when asm_sub is not already defined non-zero. */
+static uint32_t vli_sub(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{ /* p_result = p_left - p_right over uECC_WORDS 32-bit words, lowest address first; returns 1 if the final word borrowed, else 0. */
+    uint32_t l_counter = uECC_WORDS;
+    uint32_t l_carry = 1; /* carry = 1 initially (means don't borrow) */
+    uint32_t l_left;
+    uint32_t l_right;
+    /* The carry is parked in l_carry between iterations because "subs" below overwrites the flags. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "1: \n\t"
+        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word (post-increments p_left). */
+        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word (post-increments p_right). */
+        "lsrs %[carry], #1 \n\t"          /* Move bit 0 of l_carry into the carry flag (l_carry = 0 after this). */
+        "sbcs %[left], %[right] \n\t"     /* Subtract with borrow. */
+        "adcs %[carry], %[carry] \n\t"    /* l_carry = 0 + 0 + C, i.e. the carry flag left by sbcs. */
+        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word (post-increments p_result). */
+        "subs %[ctr], #1 \n\t"            /* Decrement the remaining-word counter. */
+        "bne 1b \n\t"                     /* Loop until the counter reaches 0. */
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t"
+    #endif
+    #if (uECC_PLATFORM == uECC_arm_thumb) /* Thumb build: "l" keeps every operand in a low register. */
+        : [dptr] "+l" (p_result), [lptr] "+l" (p_left), [rptr] "+l" (p_right),
+          [ctr] "+l" (l_counter), [carry] "+l" (l_carry), [left] "=l" (l_left), [right] "=l" (l_right)
+    #else
+        : [dptr] "+r" (p_result), [lptr] "+r" (p_left), [rptr] "+r" (p_right),
+          [ctr] "+r" (l_counter), [carry] "+r" (l_carry), [left] "=r" (l_left), [right] "=r" (l_right)
+    #endif
+        : /* no input-only operands: everything the asm reads is also written */
+        : "cc", "memory"
+    );
+    return !l_carry; /* carry set means "no borrow" here, so invert it to report the borrow */
+}
+#define asm_sub 1 /* records that a vli_sub definition now exists */
+#endif
+
+#if !asm_mult /* Looped version: compiled only when asm_mult is not already defined non-zero. */
+static void vli_mult(uint32_t *p_result, uint32_t *p_left, uint32_t *p_right)
+{ /* Writes the 2 * uECC_WORDS word product of p_left and p_right (uECC_WORDS words each) to p_result. */
+#if (uECC_PLATFORM != uECC_arm_thumb)
+    uint32_t c0 = 0;
+    uint32_t c1 = 0;
+    uint32_t c2 = 0;
+    uint32_t k = 0;
+    uint32_t i;
+    uint32_t t0, t1;
+    /* Column-wise product: for each result word k, sum p_left[i] * p_right[k-i] into the three-word accumulator c2:c1:c0. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        /* k and i are byte offsets (word index times 4), so they are used directly as ldr/str offsets. */
+        "1: \n\t" /* outer loop (k < uECC_WORDS) */
+        "movs %[i], #0 \n\t" /* i = 0 */
+        "b 3f \n\t"
+        
+        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
+        "movs %[i], %[k] \n\t"      /* i = k */
+        "subs %[i], %[eccdm1] \n\t" /* i = k - (uECC_WORDS - 1) (times 4) */
+        
+        "3: \n\t" /* inner loop */
+        "subs %[t0], %[k], %[i] \n\t" /* t0 = k-i */
+        
+        "ldr %[t1], [%[right], %[t0]] \n\t" /* t1 = p_right[k-i] */
+        "ldr %[t0], [%[left], %[i]] \n\t"   /* t0 = p_left[i] */
+        
+        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = p_left[i] * p_right[k-i] (t0 = low word, t1 = high word) */
+        
+        "adds %[c0], %[t0] \n\t" /* add low word to c0 */
+        "adcs %[c1], %[t1] \n\t" /* add high word to c1, including carry */
+        "adcs %[c2], #0 \n\t"    /* add carry to c2 */
+
+        "adds %[i], #4 \n\t"     /* i += 4 */
+        "cmp %[i], %[eccd] \n\t" /* i < uECC_WORDS (times 4)? */
+        "bge 4f \n\t" /* if not, exit the loop */
+        "cmp %[i], %[k] \n\t"    /* i <= k? */
+        "ble 3b \n\t" /* if so, continue looping */
+        
+        "4: \n\t" /* end inner loop */
+        
+        "str %[c0], [%[result], %[k]] \n\t" /* p_result[k] = c0 */
+        "mov %[c0], %[c1] \n\t"     /* c0 = c1 */
+        "mov %[c1], %[c2] \n\t"     /* c1 = c2 */
+        "movs %[c2], #0 \n\t"       /* c2 = 0 */
+        "adds %[k], #4 \n\t"        /* k += 4 */
+        "cmp %[k], %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
+        "blt 1b \n\t" /* if so, loop back, start with i = 0 */
+        "cmp %[k], %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
+        "blt 2b \n\t" /* if so, loop back, start with i = (k+1) - uECC_WORDS */
+        /* end outer loop */
+        
+        "str %[c0], [%[result], %[k]] \n\t" /* p_result[uECC_WORDS * 2 - 1] = c0 */
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t"
+    #endif
+        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2), [k] "+r" (k), [i] "=&r" (i), [t0] "=&r" (t0), [t1] "=&r" (t1)
+        : [result] "r" (p_result), [left] "r" (p_left), [right] "r" (p_right),
+          [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4), [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
+        : "cc", "memory"
+    );
+    
+#else /* Thumb-1 */
+
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    register uint32_t *r2 __asm__("r2") = p_right;
+    /* Same column-wise scheme with fixed registers: r3:r4:r5 = c0:c1:c2, r6 = k, r7 = i; each 32x32 product is built from four 16x16 "muls". */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "movs r3, #0 \n\t" /* c0 = 0 */
+        "movs r4, #0 \n\t" /* c1 = 0 */
+        "movs r5, #0 \n\t" /* c2 = 0 */
+        "movs r6, #0 \n\t" /* k = 0 */
+        /* NOTE(review): r0 is only an input operand yet is used as scratch below; it is reloaded from the stack before the asm ends. */
+        "push {r0} \n\t" /* keep p_result on the stack */
+        
+        "1: \n\t" /* outer loop (k < uECC_WORDS) */
+        "movs r7, #0 \n\t" /* r7 = i = 0 */
+        "b 3f \n\t"
+        
+        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
+        "movs r7, r6 \n\t"        /* r7 = k */
+        "subs r7, %[eccdm1] \n\t" /* r7 = i = k - (uECC_WORDS - 1) (times 4) */
+        
+        "3: \n\t" /* inner loop */
+        "push {r3, r4, r5, r6} \n\t" /* push things, r3 (c0) is at the top of stack. */
+        "subs r0, r6, r7 \n\t"       /* r0 = k-i */
+        
+        "ldr r4, [r2, r0] \n\t" /* r4 = p_right[k-i] */
+        "ldr r0, [r1, r7] \n\t" /* r0 = p_left[i] */
+        
+        "lsrs r3, r0, #16 \n\t" /* r3 = a1 */
+        "uxth r0, r0 \n\t"      /* r0 = a0 */
+        
+        "lsrs r5, r4, #16 \n\t" /* r5 = b1 */
+        "uxth r4, r4 \n\t"      /* r4 = b0 */
+        
+        "movs r6, r3 \n\t"     /* r6 = a1 */
+        "muls r6, r5, r6 \n\t" /* r6 = a1*b1 */
+        "muls r3, r4, r3 \n\t" /* r3 = b0*a1 */
+        "muls r5, r0, r5 \n\t" /* r5 = a0*b1 */
+        "muls r0, r4, r0 \n\t" /* r0 = a0*b0 */
+        
+        "movs r4, #0 \n\t"  /* r4 = 0 */
+        "adds r3, r5 \n\t"  /* r3 = b0*a1 + a0*b1 */
+        "adcs r4, r4 \n\t"  /* r4 = carry */
+        "lsls r4, #16 \n\t" /* r4 = carry << 16 */
+        "adds r6, r4 \n\t"  /* r6 = a1*b1 + carry */
+        
+        "lsls r4, r3, #16 \n\t" /* r4 = (b0*a1 + a0*b1) << 16 */
+        "lsrs r3, #16 \n\t"     /* r3 = (b0*a1 + a0*b1) >> 16 */
+        "adds r0, r4 \n\t"      /* r0 = low word = a0*b0 + ((b0*a1 + a0*b1) << 16) */
+        "adcs r6, r3 \n\t"      /* r6 = high word = a1*b1 + carry + ((b0*a1 + a0*b1) >> 16) */
+        
+        "pop {r3, r4, r5} \n\t" /* r3 = c0, r4 = c1, r5 = c2 */
+        "adds r3, r0 \n\t"      /* add low word to c0 */
+        "adcs r4, r6 \n\t"      /* add high word to c1, including carry */
+        "movs r0, #0 \n\t"      /* r0 = 0 (does not affect carry bit) */
+        "adcs r5, r0 \n\t"      /* add carry to c2 */
+        
+        "pop {r6} \n\t" /* r6 = k */
+
+        "adds r7, #4 \n\t"     /* i += 4 */
+        "cmp r7, %[eccd] \n\t" /* i < uECC_WORDS (times 4)? */
+        "bge 4f \n\t" /* if not, exit the loop */
+        "cmp r7, r6 \n\t"      /* i <= k? */
+        "ble 3b \n\t" /* if so, continue looping */
+        
+        "4: \n\t" /* end inner loop */
+        
+        "ldr r0, [sp, #0] \n\t" /* r0 = p_result */
+        
+        "str r3, [r0, r6] \n\t"   /* p_result[k] = c0 */
+        "mov r3, r4 \n\t"         /* c0 = c1 */
+        "mov r4, r5 \n\t"         /* c1 = c2 */
+        "movs r5, #0 \n\t"        /* c2 = 0 */
+        "adds r6, #4 \n\t"        /* k += 4 */
+        "cmp r6, %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
+        "blt 1b \n\t" /* if so, loop back, start with i = 0 */
+        "cmp r6, %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
+        "blt 2b \n\t" /* if so, loop back, start with i = (k+1) - uECC_WORDS */
+        /* end outer loop */
+        
+        "str r3, [r0, r6] \n\t" /* p_result[uECC_WORDS * 2 - 1] = c0 */
+        "pop {r0} \n\t"         /* pop p_result off the stack */
+        
+        ".syntax divided \n\t"
+        : /* no output operands; the result is written through memory (see the "memory" clobber) */
+        : [r0] "l" (r0), [r1] "l" (r1), [r2] "l" (r2), [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4), [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
+        : "r3", "r4", "r5", "r6", "r7", "cc", "memory"
+    );
+#endif
+}
+#define asm_mult 1 /* records that a vli_mult definition now exists */
+#endif /* !asm_mult */
+
+#if uECC_SQUARE_FUNC /* vli_square is only built when uECC_SQUARE_FUNC is non-zero ... */
+#if !asm_square /* ... and only when asm_square is not already defined non-zero. */
+static void vli_square(uint32_t *p_result, uint32_t *p_left)
+{ /* Writes the 2 * uECC_WORDS word square of p_left (uECC_WORDS words) to p_result. */
+#if (uECC_PLATFORM != uECC_arm_thumb)
+    uint32_t c0 = 0;
+    uint32_t c1 = 0;
+    uint32_t c2 = 0;
+    uint32_t k = 0;
+    uint32_t i, tt;
+    uint32_t t0, t1;
+    /* Column-wise like a multiply, but only products with i <= k-i are formed; those with i < k-i are doubled to cover the mirrored term. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        /* k, i and tt are byte offsets (word index times 4); c2:c1:c0 is the running column sum. */
+        "1: \n\t" /* outer loop (k < uECC_WORDS) */
+        "movs %[i], #0 \n\t" /* i = 0 */
+        "b 3f \n\t"
+        
+        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
+        "movs %[i], %[k] \n\t"      /* i = k */
+        "subs %[i], %[eccdm1] \n\t" /* i = k - (uECC_WORDS - 1) (times 4) */
+        
+        "3: \n\t" /* inner loop */
+        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
+        
+        "ldr %[t1], [%[left], %[tt]] \n\t" /* t1 = p_left[k-i] */
+        "ldr %[t0], [%[left], %[i]] \n\t"  /* t0 = p_left[i] */
+        
+        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = p_left[i] * p_left[k-i] (t0 = low word, t1 = high word) */
+        
+        "cmp %[i], %[tt] \n\t"   /* (i < k-i) ? */
+        "bge 4f \n\t" /* if i >= k-i, skip */
+        "lsls %[t1], #1 \n\t"    /* high word << 1 */
+        "adc %[c2], #0 \n\t"     /* add carry bit to c2 */
+        "lsls %[t0], #1 \n\t"       /* low word << 1 */
+        "adc %[t1], #0 \n\t"     /* add carry bit to high word */
+        
+        "4: \n\t"
+
+        "adds %[c0], %[t0] \n\t" /* add low word to c0 */
+        "adcs %[c1], %[t1] \n\t" /* add high word to c1, including carry */
+        "adc %[c2], #0 \n\t"     /* add carry to c2 */
+        
+        "adds %[i], #4 \n\t"          /* i += 4 */
+        "cmp %[i], %[k] \n\t"         /* i < k? */
+        "bge 5f \n\t" /* if not, exit the loop */
+        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
+        "cmp %[i], %[tt] \n\t"        /* i <= k-i? */
+        "ble 3b \n\t" /* if so, continue looping */
+        
+        "5: \n\t" /* end inner loop */
+        
+        "str %[c0], [%[result], %[k]] \n\t" /* p_result[k] = c0 */
+        "mov %[c0], %[c1] \n\t"     /* c0 = c1 */
+        "mov %[c1], %[c2] \n\t"     /* c1 = c2 */
+        "movs %[c2], #0 \n\t"       /* c2 = 0 */
+        "adds %[k], #4 \n\t"        /* k += 4 */
+        "cmp %[k], %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
+        "blt 1b \n\t" /* if so, loop back, start with i = 0 */
+        "cmp %[k], %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
+        "blt 2b \n\t" /* if so, loop back, start with i = (k+1) - uECC_WORDS */
+        /* end outer loop */
+        
+        "str %[c0], [%[result], %[k]] \n\t" /* p_result[uECC_WORDS * 2 - 1] = c0 */
+    #if (uECC_PLATFORM != uECC_arm_thumb2)
+        ".syntax divided \n\t"
+    #endif
+        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2), [k] "+r" (k), [i] "=&r" (i), [tt] "=&r" (tt), [t0] "=&r" (t0), [t1] "=&r" (t1)
+        : [result] "r" (p_result), [left] "r" (p_left),
+          [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4), [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
+        : "cc", "memory"
+    );
+    
+#else /* Thumb-1 */
+
+    register uint32_t *r0 __asm__("r0") = p_result;
+    register uint32_t *r1 __asm__("r1") = p_left;
+    /* Same scheme with fixed registers: r2:r3:r4 = c0:c1:c2, r5 = k, r6 = i, r7 = k-i and later the doubling carry. */
+    __asm__ volatile (
+        ".syntax unified \n\t"
+        "movs r2, #0 \n\t" /* c0 = 0 */
+        "movs r3, #0 \n\t" /* c1 = 0 */
+        "movs r4, #0 \n\t" /* c2 = 0 */
+        "movs r5, #0 \n\t" /* k = 0 */
+        /* r0 is used as scratch inside the loop, so p_result is parked on the stack and reloaded when needed. */
+        "push {r0} \n\t" /* keep p_result on the stack */
+        
+        "1: \n\t" /* outer loop (k < uECC_WORDS) */
+        "movs r6, #0 \n\t" /* r6 = i = 0 */
+        "b 3f \n\t"
+        
+        "2: \n\t" /* outer loop (k >= uECC_WORDS) */
+        "movs r6, r5 \n\t"        /* r6 = k */
+        "subs r6, %[eccdm1] \n\t" /* r6 = i = k - (uECC_WORDS - 1) (times 4) */
+        
+        "3: \n\t" /* inner loop */
+        "push {r2, r3, r4, r5} \n\t" /* push things, r2 (c0) is at the top of stack. */
+        "subs r7, r5, r6 \n\t"       /* r7 = k-i */
+        
+        "ldr r3, [r1, r7] \n\t" /* r3 = p_left[k-i] */
+        "ldr r0, [r1, r6] \n\t" /* r0 = p_left[i] */
+        
+        "lsrs r2, r0, #16 \n\t" /* r2 = a1 */
+        "uxth r0, r0 \n\t"      /* r0 = a0 */
+        
+        "lsrs r4, r3, #16 \n\t" /* r4 = b1 */
+        "uxth r3, r3 \n\t"      /* r3 = b0 */
+        
+        "movs r5, r2 \n\t"     /* r5 = a1 */
+        "muls r5, r4, r5 \n\t" /* r5 = a1*b1 */
+        "muls r2, r3, r2 \n\t" /* r2 = b0*a1 */
+        "muls r4, r0, r4 \n\t" /* r4 = a0*b1 */
+        "muls r0, r3, r0 \n\t" /* r0 = a0*b0 */
+        
+        "movs r3, #0 \n\t"  /* r3 = 0 */
+        "adds r2, r4 \n\t"  /* r2 = b0*a1 + a0*b1 */
+        "adcs r3, r3 \n\t"  /* r3 = carry */
+        "lsls r3, #16 \n\t" /* r3 = carry << 16 */
+        "adds r5, r3 \n\t"  /* r5 = a1*b1 + carry */
+        
+        "lsls r3, r2, #16 \n\t" /* r3 = (b0*a1 + a0*b1) << 16 */
+        "lsrs r2, #16 \n\t"     /* r2 = (b0*a1 + a0*b1) >> 16 */
+        "adds r0, r3 \n\t"      /* r0 = low word = a0*b0 + ((b0*a1 + a0*b1) << 16) */
+        "adcs r5, r2 \n\t"      /* r5 = high word = a1*b1 + carry + ((b0*a1 + a0*b1) >> 16) */
+        /* r5:r0 now holds the 64-bit product; it is doubled below when i < k-i. */
+        "movs r3, #0 \n\t"  /* r3 = 0 */
+        "cmp r6, r7 \n\t"   /* (i < k-i) ? */
+        "mov r7, r3 \n\t"   /* r7 = 0 (does not affect condition)*/
+        "bge 4f \n\t" /* if i >= k-i, skip */
+        "lsls r5, #1 \n\t"  /* high word << 1 */
+        "adcs r7, r3 \n\t"  /* r7 = carry bit for c2 */
+        "lsls r0, #1 \n\t"  /* low word << 1 */
+        "adcs r5, r3 \n\t"  /* add carry from shift to high word */
+        
+        "4: \n\t"
+        "pop {r2, r3, r4} \n\t" /* r2 = c0, r3 = c1, r4 = c2 */
+        "adds r2, r0 \n\t"      /* add low word to c0 */
+        "adcs r3, r5 \n\t"      /* add high word to c1, including carry */
+        "movs r0, #0 \n\t"      /* r0 = 0 (does not affect carry bit) */
+        "adcs r4, r0 \n\t"      /* add carry to c2 */
+        "adds r4, r7 \n\t"      /* add carry from doubling (if any) */
+        
+        "pop {r5} \n\t" /* r5 = k */
+        
+        "adds r6, #4 \n\t"     /* i += 4 */
+        "cmp r6, r5 \n\t"      /* i < k? */
+        "bge 5f \n\t" /* if not, exit the loop */
+        "subs r7, r5, r6 \n\t" /* r7 = k-i */
+        "cmp r6, r7 \n\t"      /* i <= k-i? */
+        "ble 3b \n\t" /* if so, continue looping */
+        
+        "5: \n\t" /* end inner loop */
+        
+        "ldr r0, [sp, #0] \n\t" /* r0 = p_result */
+        
+        "str r2, [r0, r5] \n\t"   /* p_result[k] = c0 */
+        "mov r2, r3 \n\t"         /* c0 = c1 */
+        "mov r3, r4 \n\t"         /* c1 = c2 */
+        "movs r4, #0 \n\t"        /* c2 = 0 */
+        "adds r5, #4 \n\t"        /* k += 4 */
+        "cmp r5, %[eccd] \n\t"    /* k < uECC_WORDS (times 4) ? */
+        "blt 1b \n\t" /* if so, loop back, start with i = 0 */
+        "cmp r5, %[eccd2m1] \n\t" /* k < uECC_WORDS * 2 - 1 (times 4) ? */
+        "blt 2b \n\t" /* if so, loop back, start with i = (k+1) - uECC_WORDS */
+        /* end outer loop */
+        
+        "str r2, [r0, r5] \n\t" /* p_result[uECC_WORDS * 2 - 1] = c0 */
+        "pop {r0} \n\t"         /* pop p_result off the stack */
+
+        ".syntax divided \n\t"
+        : [r0] "+l" (r0), [r1] "+l" (r1)
+        : [eccd] "I" (uECC_WORDS * 4), [eccdm1] "I" ((uECC_WORDS-1) * 4), [eccd2m1] "I" ((uECC_WORDS * 2 - 1) * 4)
+        : "r2", "r3", "r4", "r5", "r6", "r7", "cc", "memory"
+    );
+#endif
+}
+#define asm_square 1 /* records that a vli_square definition now exists */
+#endif /* !asm_square */
+#endif /* uECC_SQUARE_FUNC */
diff --git a/extlibs/tinydtls/ecc/asm_avr.inc b/extlibs/tinydtls/ecc/asm_avr.inc
new file mode 100755 (executable)
index 0000000..a945e52
--- /dev/null
@@ -0,0 +1,16276 @@
+#define DEC_20 19 /* DEC_<N> is the literal N - 1; only N = 20, 24 and 32 are provided */
+#define DEC_24 23
+#define DEC_32 31
+/* DEC(N) pastes to DEC_<N>. uECC_CONCAT is defined outside this chunk; presumably it expands N before pasting -- confirm. */
+#define DEC(N) uECC_CONCAT(DEC_, N)
+/* REPEAT_<n>(stuff) expands to `stuff` written n times in a row (n = 1..32); each level is the previous one plus one copy. */
+#define REPEAT_1(stuff) stuff
+#define REPEAT_2(stuff) REPEAT_1(stuff) stuff
+#define REPEAT_3(stuff) REPEAT_2(stuff) stuff
+#define REPEAT_4(stuff) REPEAT_3(stuff) stuff
+#define REPEAT_5(stuff) REPEAT_4(stuff) stuff
+#define REPEAT_6(stuff) REPEAT_5(stuff) stuff
+#define REPEAT_7(stuff) REPEAT_6(stuff) stuff
+#define REPEAT_8(stuff) REPEAT_7(stuff) stuff
+#define REPEAT_9(stuff) REPEAT_8(stuff) stuff
+#define REPEAT_10(stuff) REPEAT_9(stuff) stuff
+#define REPEAT_11(stuff) REPEAT_10(stuff) stuff
+#define REPEAT_12(stuff) REPEAT_11(stuff) stuff
+#define REPEAT_13(stuff) REPEAT_12(stuff) stuff
+#define REPEAT_14(stuff) REPEAT_13(stuff) stuff
+#define REPEAT_15(stuff) REPEAT_14(stuff) stuff
+#define REPEAT_16(stuff) REPEAT_15(stuff) stuff
+#define REPEAT_17(stuff) REPEAT_16(stuff) stuff
+#define REPEAT_18(stuff) REPEAT_17(stuff) stuff
+#define REPEAT_19(stuff) REPEAT_18(stuff) stuff
+#define REPEAT_20(stuff) REPEAT_19(stuff) stuff
+#define REPEAT_21(stuff) REPEAT_20(stuff) stuff
+#define REPEAT_22(stuff) REPEAT_21(stuff) stuff
+#define REPEAT_23(stuff) REPEAT_22(stuff) stuff
+#define REPEAT_24(stuff) REPEAT_23(stuff) stuff
+#define REPEAT_25(stuff) REPEAT_24(stuff) stuff
+#define REPEAT_26(stuff) REPEAT_25(stuff) stuff
+#define REPEAT_27(stuff) REPEAT_26(stuff) stuff
+#define REPEAT_28(stuff) REPEAT_27(stuff) stuff
+#define REPEAT_29(stuff) REPEAT_28(stuff) stuff
+#define REPEAT_30(stuff) REPEAT_29(stuff) stuff
+#define REPEAT_31(stuff) REPEAT_30(stuff) stuff
+#define REPEAT_32(stuff) REPEAT_31(stuff) stuff
+/* REPEAT(N, stuff) selects REPEAT_<N>; the functions below use it to unroll asm string fragments in the preprocessor. */
+#define REPEAT(N, stuff) uECC_CONCAT(REPEAT_, N)(stuff)
+/* STR(thing) stringifies `thing` after macro expansion; the STR2 indirection lets e.g. uECC_BYTES expand to its value first. */
+#define STR2(thing) #thing
+#define STR(thing) STR2(thing)
+
+#if (uECC_ASM == uECC_asm_fast)
+
+static void vli_clear(uint8_t *p_vli)
+{ /* Stores register r1 to the uECC_BYTES consecutive bytes starting at p_vli. */
+    __asm__ volatile (
+        REPEAT(uECC_BYTES, "st %a[ptr]+, r1 \n\t") /* one post-incrementing store per byte, fully unrolled */
+        /* NOTE(review): this clears the buffer only if r1 holds 0 (avr-gcc's zero-register convention) -- confirm for the toolchain in use. */
+        : [ptr] "+e" (p_vli)
+        :
+        : "r0", "cc", "memory" /* NOTE(review): r0 is listed as clobbered although this asm never names it */
+    );
+}
+#define asm_clear 1 /* records that an asm vli_clear definition exists */
+
+static void vli_set(uint8_t *p_dest, const uint8_t *p_src)
+{ /* Copies uECC_BYTES bytes from p_src to p_dest, lowest address first. */
+    __asm__ volatile (
+        REPEAT(uECC_BYTES, "ld r0, %a[sptr]+ \n\t" /* unrolled per byte: load into r0, post-incrementing the source pointer ... */
+            "st %a[dptr]+, r0 \n\t")               /* ... then store it, post-incrementing the destination pointer */
+        : [dptr] "+e" (p_dest), [sptr] "+e" (p_src) /* "+e": both pointers are read and advanced by the asm */
+        :
+        : "r0", "cc", "memory" /* r0 is the per-byte scratch register */
+    );
+}
+#define asm_set 1 /* records that an asm vli_set definition exists */
+
+static void vli_rshift1(uint8_t *p_vli)
+{ /* Shifts the uECC_BYTES-byte value at p_vli right by one bit in place, starting from the byte at the highest address. */
+    __asm__ volatile (
+        "adiw r30, " STR(uECC_BYTES) " \n\t" /* Advance Z (r31:r30, bound to p_vli by "+z") to one past the last byte. */
+        "ld r0, -z \n\t"  /* Load byte. */
+        "lsr r0 \n\t" /* Shift. */
+        "st z, r0 \n\t"  /* Store the first result byte. */
+        /* lsr shifted a 0 into bit 7 and left the dropped bit 0 in the carry flag for the next (lower) byte. */
+        /* Now we just do the remaining bytes with the carry bit (using ROR) */
+        REPEAT(DEC(uECC_BYTES), "ld r0, -z \n\t"
+            "ror r0 \n\t"
+            "st z, r0 \n\t")
+        /* After uECC_BYTES pre-decrements in total, Z is back at the original p_vli. */
+        : "+z" (p_vli)
+        :
+        : "r0", "cc", "memory"
+    );
+}
+#define asm_rshift1 1 /* records that an asm vli_rshift1 definition exists */
+
+/* Computes p_result = p_left + p_right over uECC_BYTES bytes, returning carry (0 or 1). Can modify in place. */
+static uint8_t vli_add(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{ /* Fully unrolled: X walks p_left, Y walks p_right, Z walks p_result, lowest address first. */
+    uint8_t l_carry = 0;
+    uint8_t l_left;
+    uint8_t l_right;
+
+    __asm__ volatile (
+        "ld %[left], x+ \n\t"  /* Load left byte. */
+        "ld %[right], y+ \n\t" /* Load right byte. */
+        "add %[left], %[right] \n\t" /* Add the first byte. */
+        "st z+, %[left] \n\t"  /* Store the first result byte. */
+        
+        /* Now we just do the remaining bytes with the carry bit (using ADC) */
+        REPEAT(DEC(uECC_BYTES), "ld %[left], x+ \n\t"
+            "ld %[right], y+ \n\t"
+            "adc %[left], %[right] \n\t"
+            "st z+, %[left] \n\t")
+        /* l_carry starts at 0, so the adc below leaves exactly the final carry flag in it. */
+        "adc %[carry], %[carry] \n\t"    /* Store carry bit in l_carry. */
+        /* Y is only an input operand ("y") but was advanced above, so it is rewound here, after the carry has been captured. */
+        "sbiw r28, " STR(uECC_BYTES) " \n\t" /* Restore Y */
+
+        : "+z" (p_result), "+x" (p_left),
+            [carry] "+r" (l_carry), [left] "=&r" (l_left), [right] "=&r" (l_right)
+        : "y" (p_right)
+        : "cc", "memory"
+    );
+    return l_carry;
+}
+#define asm_add 1 /* records that an asm vli_add definition exists */
+
+/* Computes p_result = p_left - p_right over uECC_BYTES bytes, returning borrow (0 or 1). Can modify in place. */
+static uint8_t vli_sub(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{ /* Fully unrolled: X walks p_left, Y walks p_right, Z walks p_result, lowest address first. */
+    uint8_t l_borrow = 0;
+    uint8_t l_left;
+    uint8_t l_right;
+
+    __asm__ volatile (
+        "ld %[left], x+ \n\t"  /* Load left byte. */
+        "ld %[right], y+ \n\t" /* Load right byte. */
+        "sub %[left], %[right] \n\t" /* Subtract the first byte. */
+        "st z+, %[left] \n\t"  /* Store the first result byte. */
+        
+        /* Now we just do the remaining bytes with the carry bit (using SBC) */
+        REPEAT(DEC(uECC_BYTES), "ld %[left], x+ \n\t"
+            "ld %[right], y+ \n\t"
+            "sbc %[left], %[right] \n\t"
+            "st z+, %[left] \n\t")
+        /* l_borrow starts at 0, so the adc below leaves exactly the carry flag from the last sbc in it. */
+        "adc %[borrow], %[borrow] \n\t"    /* Store carry bit in l_borrow. */
+        /* Y is only an input operand ("y") but was advanced above, so it is rewound here, after the borrow has been captured. */
+        "sbiw r28, " STR(uECC_BYTES) " \n\t" /* Restore Y */
+
+        : "+z" (p_result), "+x" (p_left),
+            [borrow] "+r" (l_borrow), [left] "=&r" (l_left), [right] "=&r" (l_right)
+        : "y" (p_right)
+        : "cc", "memory"
+    );
+    return l_borrow;
+}
+#define asm_sub 1 /* records that an asm vli_sub definition exists */
+
+#if (uECC_BYTES == 20)
+__attribute((noinline))
+static void vli_mult(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{
+    __asm__ volatile (
+        "adiw r30, 10 \n\t"
+        "adiw r28, 10 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r21, y+ \n\t"
+        "ldi r25, 0 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r24 \n\t"
+        "st z+, r22 \n\t"
+
+        "sbiw r30, 30 \n\t"
+        "sbiw r28, 20 \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r21, y+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r8, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r9, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r10, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r11, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r16, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r17, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r18, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r19, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r20, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r21, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "mul r11, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r24 \n\t"
+        "eor r1, r1 \n\t"
+        : "+x" (p_left), "+y" (p_right), "+z" (p_result)
+        :
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+    );
+}
+#define asm_mult 1 /* set once the inline-assembly routine above is defined; presumably checked elsewhere so a generic C version is not also compiled -- TODO confirm in the including source */
+#elif (uECC_BYTES == 24)
+__attribute((noinline))
+static void vli_mult(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{
+    __asm__ volatile (
+        "adiw r30, 20 \n\t"
+        "adiw r28, 20 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r15, y+ \n\t"
+        "ldi r25, 0 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r24 \n\t"
+        "st z+, r22 \n\t"
+
+        "sbiw r30, 18 \n\t"
+        "sbiw r28, 14 \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r21, y+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r11, x+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r7, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r8, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r10, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r11, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r3, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r4, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r24 \n\t"
+
+        "sbiw r30, 38 \n\t"
+        "sbiw r28, 24 \n\t"
+        "sbiw r26, 14 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r12, y+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r21, y+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r8, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r9, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r10, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r11, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r16, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r17, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r18, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r19, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r20, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r21, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "st z+, r23 \n\t"
+
+        "eor r1, r1 \n\t"
+        : "+x" (p_left), "+y" (p_right), "+z" (p_result)
+        :
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+    );
+}
+#define asm_mult 1 /* Set once this uECC_BYTES branch has defined its inline-assembly vli_mult. NOTE(review): presumably tested by the including file to skip a generic C vli_mult - confirm. */
+#elif (uECC_BYTES == 32)
+__attribute((noinline))
+static void vli_mult(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{
+    __asm__ volatile (
+        "adiw r30, 30 \n\t"
+        "adiw r28, 30 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ldi r25, 0 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r24 \n\t"
+
+        "sbiw r30, 14 \n\t"
+        "sbiw r28, 12 \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r21, y+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r11, x+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "st z+, r23 \n\t"
+
+        "sbiw r30, 34 \n\t"
+        "sbiw r28, 22 \n\t"
+        "sbiw r26, 12 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r12, y+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r21, y+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r8, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r9, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r10, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r11, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r16, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r17, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r18, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r19, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r20, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r21, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r24 \n\t"
+        "st z+, r22 \n\t"
+
+        "sbiw r30, 54 \n\t"
+        "sbiw r28, 32 \n\t"
+        "sbiw r26, 22 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r12, y+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r17, y+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r18, y+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r19, y+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r20, y+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r21, y+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r8, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r9, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r10, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r11, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r8, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r9, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r10, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r11, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r11, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r16, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r17, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r18, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r19, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r20, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r21, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r16, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r17, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r18, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r19, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r20, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r21, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r25 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r25 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r5, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r8, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r19 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r18 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r9, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r11, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r21 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r24 \n\t"
+
+        "eor r1, r1 \n\t"
+        : "+x" (p_left), "+y" (p_right), "+z" (p_result)
+        :
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+    );
+}
+#define asm_mult 1 /* set once the assembly vli_mult above has been emitted; presumably tested by the including C file to skip a generic fallback - TODO confirm */
+#endif /* uECC_BYTES == 32 */
+
+#if uECC_SQUARE_FUNC
+
+#if (uECC_BYTES == 20)
+static void vli_square(uint8_t *p_result, uint8_t *p_left)
+{   /* AVR inline assembly: writes the 40-byte square of the 20-byte value at
+     * p_left to p_result. Both are stored least-significant byte first.
+     *
+     * All 20 input bytes a[0..19] are first loaded into r2..r21 (a[i] lives in
+     * register r(i+2)). The result is then produced one byte at a time, lowest
+     * first: for byte k the products a[i]*a[k-i] with i < k-i are summed, the
+     * sum is doubled (each such product occurs twice in a square), a[k/2]^2 is
+     * added when k is even, and the carry left over from byte k-1 is added.
+     * Bytes 1, 2, 36 and 37 instead double their single cross product directly
+     * in r1:r0 and add it onto the running carry.
+     *
+     * For bytes 3..35, r23 holds the byte being stored and the two carry bytes
+     * alternate between r25:r26 (after odd k) and r24:r22 (after even k).
+     * r27 is kept at zero so that "adc rN, r27" adds only the carry flag.
+     */
+    __asm__ volatile (
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r12, x+ \n\t"
+        "ld r13, x+ \n\t"
+        "ld r14, x+ \n\t"
+        "ld r15, x+ \n\t"
+        "ld r16, x+ \n\t"
+        "ld r17, x+ \n\t"
+        "ld r18, x+ \n\t"
+        "ld r19, x+ \n\t"
+        "ld r20, x+ \n\t"
+        "ld r21, x+ \n\t"
+        "ldi r27, 0 \n\t" /* input fully loaded: r27 (high half of X) is reused as a constant zero, r26 later as a carry byte */
+        /* byte 0: low half of a[0]^2; the high half starts the carry in r22 */
+        "ldi r23, 0 \n\t"
+        "mul r2, r2 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+        /* byte 1: 2*a[0]*a[1] + carry */
+        "ldi r24, 0 \n\t"
+        "mul r2, r3 \n\t"
+        "lsl r0 \n\t"
+        "rol r1 \n\t"
+        "adc r24, r27 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r27 \n\t"
+        "st z+, r22 \n\t"
+        /* byte 2: 2*a[0]*a[2] + a[1]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r4 \n\t"
+        "lsl r0 \n\t"
+        "rol r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r3, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 3: 2*(a[0]*a[3] + a[1]*a[2]) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r6 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r4, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 5: 2*sum(a[i]*a[5-i], i = 0..2) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r7 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 6: 2*sum(a[i]*a[6-i], i = 0..2) + a[3]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r8 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r5, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 7: 2*sum(a[i]*a[7-i], i = 0..3) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r9 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 8: 2*sum(a[i]*a[8-i], i = 0..3) + a[4]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r10 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r6, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 9: 2*sum(a[i]*a[9-i], i = 0..4) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r11 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 10: 2*sum(a[i]*a[10-i], i = 0..4) + a[5]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r6, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r7, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 11: 2*sum(a[i]*a[11-i], i = 0..5) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r7, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 12: 2*sum(a[i]*a[12-i], i = 0..5) + a[6]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r6, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r7, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r8, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 13: 2*sum(a[i]*a[13-i], i = 0..6) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r7, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r8, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 14: 2*sum(a[i]*a[14-i], i = 0..6) + a[7]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r7, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r8, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r9, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 15: 2*sum(a[i]*a[15-i], i = 0..7) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r8, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r9, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 16: 2*sum(a[i]*a[16-i], i = 0..7) + a[8]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r9, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r10, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 17: 2*sum(a[i]*a[17-i], i = 0..8) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r10, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 18: 2*sum(a[i]*a[18-i], i = 0..8) + a[9]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r11, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 19: 2*sum(a[i]*a[19-i], i = 0..9) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 20: 2*sum(a[i]*a[20-i], i = 1..9) + a[10]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r12, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 21: 2*sum(a[i]*a[21-i], i = 2..10) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r4, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r12, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 22: 2*sum(a[i]*a[22-i], i = 3..10) + a[11]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r5, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r12, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r13, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 23: 2*sum(a[i]*a[23-i], i = 4..11) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r6, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r12, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r13, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 24: 2*sum(a[i]*a[24-i], i = 5..11) + a[12]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r7, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r12, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r13, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r14, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 25: 2*sum(a[i]*a[25-i], i = 6..12) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r8, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r12, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r13, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r14, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 26: 2*sum(a[i]*a[26-i], i = 7..12) + a[13]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r9, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r12, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r13, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r14, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r15, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 27: 2*sum(a[i]*a[27-i], i = 8..13) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r10, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r12, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r13, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r14, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r15, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 28: 2*sum(a[i]*a[28-i], i = 9..13) + a[14]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r11, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r12, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r13, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r14, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r15, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r16, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 29: 2*sum(a[i]*a[29-i], i = 10..14) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r12, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r13, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r14, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r15, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r16, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 30: 2*sum(a[i]*a[30-i], i = 11..14) + a[15]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r13, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r14, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r15, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r16, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r17, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 31: 2*sum(a[i]*a[31-i], i = 12..15) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r14, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r15, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r16, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r17, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 32: 2*sum(a[i]*a[32-i], i = 13..15) + a[16]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r15, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r16, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "mul r17, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r18, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 33: 2*sum(a[i]*a[33-i], i = 14..16) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r16, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r17, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "mul r18, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 34: 2*sum(a[i]*a[34-i], i = 15..16) + a[17]^2 + carry */
+        "ldi r22, 0 \n\t"
+        "mul r17, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r18, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r19, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r27 \n\t"
+        "add r23, r25 \n\t"
+        "adc r24, r26 \n\t"
+        "adc r22, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 35: 2*(a[16]*a[19] + a[17]*a[18]) + carry */
+        "ldi r26, 0 \n\t"
+        "mul r18, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r25, r1 \n\t"
+        "mul r19, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "adc r26, r27 \n\t"
+        "lsl r23 \n\t"
+        "rol r25 \n\t"
+        "rol r26 \n\t"
+        "add r23, r24 \n\t"
+        "adc r25, r22 \n\t"
+        "adc r26, r27 \n\t"
+        "st z+, r23 \n\t"
+        /* byte 36: 2*a[17]*a[19] + a[18]^2 + carry (accumulated directly in r25:r26:r23) */
+        "ldi r23, 0 \n\t"
+        "mul r19, r21 \n\t"
+        "lsl r0 \n\t"
+        "rol r1 \n\t"
+        "adc r23, r27 \n\t"
+        "add r25, r0 \n\t"
+        "adc r26, r1 \n\t"
+        "adc r23, r27 \n\t"
+        "mul r20, r20 \n\t"
+        "add r25, r0 \n\t"
+        "adc r26, r1 \n\t"
+        "adc r23, r27 \n\t"
+        "st z+, r25 \n\t"
+        /* byte 37: 2*a[18]*a[19] + carry */
+        "ldi r25, 0 \n\t"
+        "mul r20, r21 \n\t"
+        "lsl r0 \n\t"
+        "rol r1 \n\t"
+        "adc r25, r27 \n\t"
+        "add r26, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r25, r27 \n\t"
+        "st z+, r26 \n\t"
+        /* bytes 38 and 39: a[19]^2 + carry */
+        "mul r21, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r25, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r25 \n\t"
+        "eor r1, r1 \n\t" /* mul leaves a product byte in r1; clear it again because avr-gcc code expects r1 == 0 */
+        : "+x" (p_left), "+z" (p_result) /* read-write operands: the asm advances Z and overwrites X (r26/r27) */
+        :
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "cc", "memory"
+    );
+}
+#define asm_square 1 /* NOTE(review): presumably signals that the inline-asm squaring routine above is available, so no generic fallback is compiled -- confirm at the use site */
+
+#elif (uECC_BYTES == 24)
+
+__attribute((noinline))
+static void vli_square(uint8_t *p_result, uint8_t *p_left)
+{
+    __asm__ volatile (
+        "ldi r25, 0 \n\t"
+        "movw r28, r26 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "adiw r28, 20 \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "adiw r30, 20 \n\t"
+        
+        "ldi r23, 0 \n\t"
+        "mul 2, 12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+        
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+        
+        "ld r12, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ld r13, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+        
+        "ld r2, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r3, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+        
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "mul r3, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r24 \n\t"
+        "st z+, r22 \n\t"
+        
+        "sbiw r26, 4 \n\t"
+        "sbiw r30, 28 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r12, x+ \n\t"
+        "ld r13, x+ \n\t"
+        "ld r14, x+ \n\t"
+        "ld r15, x+ \n\t"
+        "ld r16, x+ \n\t"
+        "ld r17, x+ \n\t"
+        "ld r18, x+ \n\t"
+        "ld r19, x+ \n\t"
+        "ld r20, x+ \n\t"
+        "ld r21, x+ \n\t"
+        
+        "ldi r23, 0 \n\t"
+        "mul r2, r2 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+        
+        "ldi r24, 0 \n\t"
+        "mul r2, r3 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r6 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r4, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r7 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r8 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r5, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r9 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r10 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r6, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r11 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r7, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r8, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r9, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r10, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r11, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ld r2, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r12, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r3, r2 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r2 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r5, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r13, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r4, r3 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r5, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ld r4, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r5, r3 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r6, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r14, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r5, r4 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r6, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ld r5, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r4 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r7, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r15, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r6, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r7, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r7, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r8, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r16, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r8, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r9, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r9, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r10, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r17, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r10, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r11, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r11, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r12, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r18, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r12, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r13, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r13, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r14, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r19, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r14, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r15, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r15, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r16, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r20, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r16, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r17, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r17, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r18, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r20, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r21, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r18, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r19, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r21, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r19, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r20, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r21, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r2, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r20, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r21, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r2, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r22, 0 \n\t"
+        "mul r21, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r2, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r3, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r29, 0 \n\t"
+        "mul r2, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+        
+        "ldi r23, 0 \n\t"
+        "mul r3, r5 \n\t"
+        "add r28, r0 \n\t"
+        "adc r29, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "add r28, r0 \n\t"
+        "adc r29, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r4 \n\t"
+        "add r28, r0 \n\t"
+        "adc r29, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r28 \n\t"
+        
+        "ldi r28, 0 \n\t"
+        "mul r4, r5 \n\t"
+        "add r29, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r28, r25 \n\t"
+        "add r29, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r28, r25 \n\t"
+        "st z+, r29 \n\t"
+        
+        "mul r5, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r28 \n\t"
+        "eor r1, r1 \n\t"
+        : "+x" (p_left), "+z" (p_result)
+        :
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r28", "r29", "cc", "memory"
+    );
+}
+#define asm_square 1 /* set once the assembly routine above is defined for this uECC_BYTES case; NOTE(review): presumably tested elsewhere to skip a generic C square -- confirm */
+
+#elif (uECC_BYTES == 32)
+
+__attribute((noinline))
+static void vli_square(uint8_t *p_result, uint8_t *p_left)
+{
+    __asm__ volatile (
+        "ldi r25, 0 \n\t"
+        "movw r28, r26 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "adiw r28, 20 \n\t"
+        "ld r12, y+ \n\t"
+        "ld r13, y+ \n\t"
+        "ld r14, y+ \n\t"
+        "ld r15, y+ \n\t"
+        "ld r16, y+ \n\t"
+        "ld r17, y+ \n\t"
+        "adiw r30, 20 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul 2, 12 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r12, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r13, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r14, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r15, y+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r16, y+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r17, y+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r12 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r13 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r23, 0 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r2, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r3, r14 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r24, 0 \n\t"
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r2, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r3, r15 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r3, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r4, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r6, r17 \n\t"
+        "add r24, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r24 \n\t"
+
+        "mul r7, r17 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "st z+, r23 \n\t"
+
+        "sbiw r26, 12 \n\t"
+        "sbiw r30, 44 \n\t"
+        "ld r2, x+ \n\t"
+        "ld r3, x+ \n\t"
+        "ld r4, x+ \n\t"
+        "ld r5, x+ \n\t"
+        "ld r6, x+ \n\t"
+        "ld r7, x+ \n\t"
+        "ld r8, x+ \n\t"
+        "ld r9, x+ \n\t"
+        "ld r10, x+ \n\t"
+        "ld r11, x+ \n\t"
+        "ld r12, x+ \n\t"
+        "ld r13, x+ \n\t"
+        "ld r14, x+ \n\t"
+        "ld r15, x+ \n\t"
+        "ld r16, x+ \n\t"
+        "ld r17, x+ \n\t"
+        "ld r18, x+ \n\t"
+        "ld r19, x+ \n\t"
+        "ld r20, x+ \n\t"
+        "ld r21, x+ \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r2, r2 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+
+        "ldi r24, 0 \n\t"
+        "mul r2, r3 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r24, r25 \n\t"
+        "st z+, r22 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r6 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r4, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r7 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r8 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r5, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r9 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r10 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r6, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r11 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r12 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r7, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r14 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r8, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r15 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r16 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r9, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r17 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r18 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r10, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r19 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r2, r20 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r3, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r11, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r2, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r3, r21 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r4, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r12, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r3, r2 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r4, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r3, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r4, r2 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r5, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r13, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r4, r3 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r5, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r4, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r5, r3 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r6, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r14, r14 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r5, r4 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r6, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r5, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r6, r4 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r7, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r15, r15 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r6, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r7, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r6, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r7, r5 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r8, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r16, r16 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r7, r6 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r8, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r7, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r8, r6 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r9, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r10, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r17, r17 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r8, r7 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r9, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r8, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r9, r7 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r10, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r11, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r18, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r9, r8 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r10, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r11, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r9, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r10, r8 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r11, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r12, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r19, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r10, r9 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r11, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r12, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r10, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r11, r9 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r12, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r13, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r20, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r11, r10 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r12, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r13, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r11, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r12, r10 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r13, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r14, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r20, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r21, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r12, r11 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r13, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r14, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r21, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r12, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r13, r11 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r14, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r15, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r20, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r21, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r2, r2 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r13, r12 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r14, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r15, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r21, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r2, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ld r13, x+ \n\t"
+        "ldi r22, 0 \n\t"
+        "mul r14, r12 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r15, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r16, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r20, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r21, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r25 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r3, r3 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r14, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r15, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r16, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r17, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r21, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r2, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r3, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "ld r0, z \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r25 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r15, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r16, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r17, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r18, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r20, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r21, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r4, r4 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r16, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r17, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r18, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r19, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r21, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r2, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r3, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r17, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r18, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r19, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r20, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r21, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r5, r5 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r18, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r19, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r20, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r21, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r2, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r3, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r19, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r20, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r21, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r2, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r6, r6 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r20, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r21, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r2, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r3, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r21, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r2, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r3, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r4, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r7, r7 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r2, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r3, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r4, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r5, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r3, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r4, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r5, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r6, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r8, r8 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r4, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r5, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r6, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r7, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r5, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r6, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r7, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r8, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r9, r9 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r6, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r7, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r8, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r9, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r7, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r8, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "mul r9, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r10, r10 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r8, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r9, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "mul r10, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r22, 0 \n\t"
+        "mul r9, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r24, r1 \n\t"
+        "mul r10, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r24 \n\t"
+        "rol r22 \n\t"
+        "mul r11, r11 \n\t"
+        "add r23, r0 \n\t"
+        "adc r24, r1 \n\t"
+        "adc r22, r25 \n\t"
+        "add r23, r28 \n\t"
+        "adc r24, r29 \n\t"
+        "adc r22, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r29, 0 \n\t"
+        "mul r10, r13 \n\t"
+        "mov r23, r0 \n\t"
+        "mov r28, r1 \n\t"
+        "mul r11, r12 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "adc r29, r25 \n\t"
+        "lsl r23 \n\t"
+        "rol r28 \n\t"
+        "rol r29 \n\t"
+        "add r23, r24 \n\t"
+        "adc r28, r22 \n\t"
+        "adc r29, r25 \n\t"
+        "st z+, r23 \n\t"
+
+        "ldi r23, 0 \n\t"
+        "mul r11, r13 \n\t"
+        "add r28, r0 \n\t"
+        "adc r29, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "add r28, r0 \n\t"
+        "adc r29, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "mul r12, r12 \n\t"
+        "add r28, r0 \n\t"
+        "adc r29, r1 \n\t"
+        "adc r23, r25 \n\t"
+        "st z+, r28 \n\t"
+
+        "ldi r28, 0 \n\t"
+        "mul r12, r13 \n\t"
+        "add r29, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r28, r25 \n\t"
+        "add r29, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "adc r28, r25 \n\t"
+        "st z+, r29 \n\t"
+
+        "mul r13, r13 \n\t"
+        "add r23, r0 \n\t"
+        "adc r28, r1 \n\t"
+        "st z+, r23 \n\t"
+        "st z+, r28 \n\t"
+        "eor r1, r1 \n\t"
+        : "+x" (p_left), "+z" (p_result)
+        :
+        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r28", "r29", "cc", "memory"
+    );
+}
+#define asm_square 1
+
+#endif /* uECC_BYTES == xx */
+#endif /* uECC_SQUARE_FUNC */
+
+static void vli_modSub_fast(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{
+    uint8_t t1, t2;
+    __asm__ volatile (
+        "push r28 \n\t" /* Save Y */
+        "push r29 \n\t"
+        
+        "ld %[t1], x+ \n\t"  /* Load left word. */
+        "ld %[t2], y+ \n\t" /* Load right word. */
+        "sub %[t1], %[t2] \n\t" /* Subtract the first word. */
+        "st z+, %[t1] \n\t"  /* Store the first result word. */
+        
+        /* Now we just do the remaining words with the carry bit (using SBC) */
+        REPEAT(DEC(uECC_BYTES), "ld %[t1], x+ \n\t"
+            "ld %[t2], y+ \n\t"
+            "sbc %[t1], %[t2] \n\t"
+            "st z+, %[t1] \n\t")
+        
+        "brcs 1f \n\t" /* If borrow is set, then we need to add */
+        "rjmp done \n\t" /* otherwise we are done */
+        "1: \n\t"
+        
+        "sbiw r30, " STR(uECC_BYTES) " \n\t" /* make z point at p_result again */
+        "ldi r28, lo8(curve_p) \n\t" /* make y point at curve_p */
+       "ldi r29, hi8(curve_p) \n\t"
+       
+       /* do the addition */
+       "ld %[t1], z \n\t"
+        "ld %[t2], y+ \n\t"
+        "add %[t1], %[t2] \n\t"
+        "st z+, %[t1] \n\t"
+        REPEAT(DEC(uECC_BYTES), "ld %[t1], z \n\t"
+            "ld %[t2], y+ \n\t"
+            "adc %[t1], %[t2] \n\t"
+            "st z+, %[t1] \n\t")
+        
+        "done: \n\t"
+        "pop r29 \n\t" /* Restore Y */
+        "pop r28 \n\t"
+
+        : "+z" (p_result), "+x" (p_left),
+          [t1] "=&r" (t1), [t2] "=&r" (t2)
+        : "y" (p_right)
+        : "cc", "memory"
+    );
+}
+#define asm_modSub_fast 1
+
+#if uECC_CURVE == uECC_secp160r1
+static void vli_mmod_fast(uint8_t *RESTRICT p_result, uint8_t *RESTRICT p_product)
+{
+    uint8_t l_carry = 0;
+    __asm__ volatile (
+        "in r30, __SP_L__ \n\t"
+       "in r31, __SP_H__ \n\t"
+       "sbiw r30, 24 \n\t"
+       "in r0, __SREG__ \n\t"
+       "cli \n\t"
+       "out __SP_H__, r31 \n\t"
+       "out __SREG__, r0 \n\t"
+       "out __SP_L__, r30 \n\t"
+       
+       "adiw r30, 25 \n\t" /* we are shifting by 31 bits, so shift over 4 bytes (+ 1 since z initially points below the stack) */
+        "adiw r26, 40 \n\t" /* end of p_product */
+        "ld r18, -x \n\t"  /* Load word. */
+        "lsr r18 \n\t" /* Shift. */
+        "st -z, r18 \n\t"  /* Store the first result word. */
+
+        /* Now we just do the remaining words with the carry bit (using ROR) */
+        REPEAT(19, "ld r18, -x \n\t"
+            "ror r18 \n\t"
+            "st -z, r18 \n\t")
+
+        "eor r18, r18 \n\t" /* r18 = 0 */
+        "ror r18 \n\t" /* get last bit */
+        "st -z, r18 \n\t" /* store it */
+
+        "sbiw r30, 3 \n\t" /* move z back to point at tmp */
+        /* now we add p_right */
+        "ld r18, x+ \n\t"
+        "st z+, r18 \n\t" /* the first 3 bytes do not need to be added */
+        "ld r18, x+ \n\t"
+        "st z+, r18 \n\t"
+        "ld r18, x+ \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, x+ \n\t"
+        "ld r19, z \n\t"
+        "add r18, r19 \n\t"
+        "st z+, r18 \n\t"
+
+        /* Now we just do the remaining words with the carry bit (using ADC) */
+        REPEAT(16, "ld r18, x+ \n\t"
+            "ld r19, z \n\t"
+            "adc r18, r19 \n\t"
+            "st z+, r18 \n\t")
+
+        /* Propagate over the remaining bytes of p_result */
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+        
+        "sbiw r30, 24 \n\t" /* move z back to point at tmp */
+        "sbiw r26, 40 \n\t" /* move x back to point at p_product */
+        
+        /* add low bytes of tmp to p_product, storing in p_result */
+        "ld r18, z+ \n\t"
+        "ld r19, x+ \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(19, "ld r18, z+ \n\t"
+            "ld r19, x+ \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        "adc %[carry], __zero_reg__ \n\t"    /* Store carry bit (carry flag is cleared). */
+        /* at this point x is at the end of p_product, y is at the end of p_result, z is 20 bytes into tmp */
+        "sbiw r28, 20 \n\t" /* move y back to point at p_result */
+        "adiw r30, 4 \n\t" /* move z to point to the end of tmp */
+        
+        /* do omega_mult again with the 4 relevant bytes */
+        /* z points to the end of tmp, x points to the end of p_product */
+        "ld r18, -z \n\t"  /* Load word. */
+        "lsr r18 \n\t" /* Shift. */
+        "st -x, r18 \n\t"  /* Store the first result word. */
+        
+        "ld r18, -z \n\t"
+        "ror r18 \n\t"
+        "st -x, r18 \n\t"
+        "ld r18, -z \n\t"
+        "ror r18 \n\t"
+        "st -x, r18 \n\t"
+        "ld r18, -z \n\t"
+        "ror r18 \n\t"
+        "st -x, r18 \n\t"
+        
+        "eor r18, r18 \n\t" /* r18 = 0 */
+        "ror r18 \n\t" /* get last bit */
+        "st -x, r18 \n\t" /* store it */
+        
+        "sbiw r26, 3 \n\t" /* move x back to point at beginning */
+        /* now we add a copy of the 4 bytes */
+        "ld r18, z+ \n\t"
+        "st x+, r18 \n\t" /* the first 3 bytes do not need to be added */
+        "ld r18, z+ \n\t"
+        "st x+, r18 \n\t"
+        "ld r18, z+ \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, z+ \n\t"
+        "ld r19, x \n\t"
+        "add r18, r19 \n\t"
+        "st x+, r18 \n\t"
+        
+        /* Propagate over the remaining bytes */
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        /* now z points to the end of tmp, x points to the end of p_product (y still points at p_result) */
+        "sbiw r26, 8 \n\t" /* move x back to point at beginning of actual data */
+        /* add into p_result */
+        "ld r18, x+ \n\t"
+        "ld r19, y \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(7, "ld r18, x+ \n\t"
+            "ld r19, y \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        
+        /* Done adding, now propagate carry bit */
+        REPEAT(12, "ld r18, y \n\t"
+            "adc r18, __zero_reg__ \n\t"
+            "st y+, r18 \n\t")
+        
+        "adc %[carry], __zero_reg__ \n\t"    /* Store carry bit (carry flag is cleared). */
+        "sbiw r28, 20 \n\t" /* move y back to point at p_result */
+        
+        "sbiw r30, 1 \n\t" /* fix stack pointer */
+       "in r0, __SREG__ \n\t"
+       "cli \n\t"
+       "out __SP_H__, r31 \n\t"
+       "out __SREG__, r0 \n\t"
+       "out __SP_L__, r30 \n\t"
+        
+        : "+x" (p_product), [carry] "+r" (l_carry)
+        : "y" (p_result)
+        : "r0", "r18", "r19", "r30", "r31", "cc", "memory"
+    );
+    
+    if(l_carry > 0)
+    {
+        --l_carry;
+        vli_sub(p_result, p_result, curve_p);
+    }
+    if(l_carry > 0)
+    {
+        vli_sub(p_result, p_result, curve_p);
+    }
+    
+    if(vli_cmp(p_result, curve_p) > 0)
+    {
+        vli_sub(p_result, p_result, curve_p);
+    }
+}
+#define asm_mmod_fast 1
+
+#elif (uECC_CURVE == uECC_secp256k1)
+static void vli_mmod_fast(uint8_t *RESTRICT p_result, uint8_t *RESTRICT p_product) /*
+ * Reduce the 64-byte p_product modulo curve_p into the 32-byte p_result (secp256k1 build).
+ *
+ * Steps visible in the code below:
+ *   1. Reserve a 37-byte buffer tmp on the stack.
+ *   2. tmp = (upper 32 bytes of p_product) * 0x1000003D1. The 0x03D1 part is done with mul
+ *      (r25:r24 = 0x03D1) and the 2^32 part by adding the byte read four positions earlier.
+ *      0x1000003D1 is 2^256 minus the secp256k1 prime.
+ *   3. p_result = (lower 32 bytes of p_product) + tmp[0..31]; the carry out goes to l_carry.
+ *   4. The 5 remaining bytes tmp[32..36] are multiplied by 0x1000003D1 again into the first
+ *      10 bytes of p_product (used as scratch) and added into p_result; carry goes to l_carry.
+ *   5. The stack pointer is restored, then the C code subtracts curve_p once per carry and
+ *      once more if p_result is still greater than curve_p.
+ *
+ * p_product is overwritten. vli_sub, vli_cmp and curve_p are defined outside this function.
+ */
+{
+    uint8_t l_carry = 0; /* carries out of the two additions; each adc below adds at most 1 */
+    __asm__ volatile (
+        "in r30, __SP_L__ \n\t" /* z = current stack pointer */
+       "in r31, __SP_H__ \n\t"
+       "sbiw r30, 37 \n\t" /* reserve 37 bytes for tmp */
+       "in r0, __SREG__ \n\t" /* save SREG, then disable interrupts while SP is rewritten */
+       "cli \n\t"
+       "out __SP_H__, r31 \n\t"
+       "out __SREG__, r0 \n\t" /* restore the saved SREG before the low byte is written */
+       "out __SP_L__, r30 \n\t"
+       
+       "adiw r30, 1 \n\t" /* add 1 since z initially points below the stack */
+        "adiw r26, 32 \n\t" /* p_product + uECC_WORDS */
+        "ldi r25, 0x03 \n\t" /* r25:r24 = 0x03D1, the 16-bit multiplier */
+        "ldi r24, 0xD1 \n\t"
+        "ld r18, x+ \n\t" /* r18..r21 hold the four most recently read input bytes */
+        "ld r19, x+ \n\t"
+        "ld r20, x+ \n\t"
+        "ld r21, x+ \n\t"
+        
+        "mul r24, r18 \n\t" /* mul leaves its 16-bit product in r1:r0 */
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t" /* r22 and r23 alternate as low/high byte of the running column sum */
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "mul r24, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        /* now we start adding the 2^32 part as well */
+        "add r23, r18 \n\t" // 28
+        "adc r22, r22 \n\t" /* r22 was just zeroed, so this captures the carry */
+        "ld r18, x+ \n\t" /* r18 is free again: load the next input byte */
+        "mul r24, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r19 \n\t" // 27
+        "adc r23, r23 \n\t"
+        "ld r19, x+ \n\t"
+        "mul r24, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        REPEAT(6, // 26 - 3
+            "add r23, r20 \n\t"
+            "adc r22, r22 \n\t"
+            "ld r20, x+ \n\t"
+            "mul r24, r20 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "mul r25, r19 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "st z+, r23 \n\t"
+            "ldi r23, 0 \n\t"
+            
+            "add r22, r21 \n\t"
+            "adc r23, r23 \n\t"
+            "ld r21, x+ \n\t"
+            "mul r24, r21 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "mul r25, r20 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "st z+, r22 \n\t"
+            "ldi r22, 0 \n\t"
+            
+            "add r23, r18 \n\t"
+            "adc r22, r22 \n\t"
+            "ld r18, x+ \n\t"
+            "mul r24, r18 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "mul r25, r21 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "st z+, r23 \n\t"
+            "ldi r23, 0 \n\t"
+            
+            "add r22, r19 \n\t"
+            "adc r23, r23 \n\t"
+            "ld r19, x+ \n\t"
+            "mul r24, r19 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "mul r25, r18 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "st z+, r22 \n\t"
+            "ldi r22, 0 \n\t")
+
+        "add r23, r20 \n\t" // 2
+        "adc r22, r22 \n\t"
+        "ld r20, x+ \n\t"
+        "mul r24, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r21 \n\t" // 1
+        "adc r23, r23 \n\t"
+        "ld r21, x+ \n\t" /* last input byte; x is now 64 bytes past p_product */
+        "mul r24, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        /* Now finish the carries etc */
+        "add r23, r18 \n\t"
+        "adc r22, r22 \n\t"
+        "mul r25, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r19 \n\t"
+        "adc r23, r23 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r20 \n\t"
+        "adc r22, r22 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r21 \n\t"
+        "adc r23, r23 \n\t"
+        "st z+, r22 \n\t"
+        "st z+, r23 \n\t" /* 37th and last byte of tmp */
+        "eor r1, r1 \n\t" /* make r1 be 0 again */
+        
+        "sbiw r30, 37 \n\t" /* move z back to point at tmp */
+        "subi r26, 64 \n\t" /* move x back to point at p_product */
+        "sbc r27, __zero_reg__ \n\t"
+        
+        /* add low bytes of tmp to p_product, storing in p_result */
+        "ld r18, z+ \n\t"
+        "ld r19, x+ \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(31, "ld r18, z+ \n\t"
+            "ld r19, x+ \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        
+        "adc %[carry], __zero_reg__ \n\t"    /* Store carry bit (carry flag is cleared). */
+        /* at this point x is 32 bytes into p_product, y is at the end of p_result, z is 32 bytes into tmp */
+        "sbiw r28, 32 \n\t" /* move y back to point at p_result */
+
+        /* do omega_mult again with the 5 relevant bytes */
+        /* z points to l_tmp + uECC_WORDS, x is 32 bytes into p_product */
+        "sbiw r26, 32 \n\t" /* shift x back to point into the p_product buffer (we can overwrite it now) */
+        
+        "ld r18, z+ \n\t" /* tmp[32..35]; the fifth byte tmp[36] is loaded further down */
+        "ld r19, z+ \n\t"
+        "ld r20, z+ \n\t"
+        "ld r21, z+ \n\t"
+        
+        "mul r24, r18 \n\t" /* r25:r24 still hold 0x03D1 */
+        "st x+, r0 \n\t"
+        "mov r22, r1 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "mul r24, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st x+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r18 \n\t"
+        "adc r22, r22 \n\t"
+        "ld r18, z+ \n\t" /* fifth byte, tmp[36]; z is now at the end of tmp */
+        "mul r24, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st x+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        /* Now finish the carries etc */
+        "add r22, r19 \n\t"
+        "adc r23, r23 \n\t"
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r20 \n\t"
+        "adc r22, r22 \n\t"
+        "st x+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r21 \n\t"
+        "adc r23, r23 \n\t"
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r18 \n\t"
+        "adc r22, r22 \n\t"
+        "st x+, r23 \n\t"
+        "st x+, r22 \n\t" /* 10th and last scratch byte */
+        "eor r1, r1 \n\t" /* make r1 be 0 again */
+        
+        /* now z points to the end of tmp, x is 10 bytes into p_product (y still points at p_result) */
+        "sbiw r26, 10 \n\t" /* move x back to point at beginning of actual data */
+        /* add into p_result */
+        "ld r18, x+ \n\t"
+        "ld r19, y \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(9, "ld r18, x+ \n\t"
+            "ld r19, y \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        
+        /* Done adding, now propagate carry bit */
+        REPEAT(22, "ld r18, y \n\t"
+            "adc r18, __zero_reg__ \n\t"
+            "st y+, r18 \n\t")
+        
+        "adc %[carry], __zero_reg__ \n\t"    /* Store carry bit (carry flag is cleared). */
+        "sbiw r28, 32 \n\t" /* move y back to point at p_result */
+        
+        "sbiw r30, 1 \n\t" /* fix stack pointer */
+       "in r0, __SREG__ \n\t" /* same save / cli / restore sequence as at the top */
+       "cli \n\t"
+       "out __SP_H__, r31 \n\t"
+       "out __SREG__, r0 \n\t"
+       "out __SP_L__, r30 \n\t"
+        
+        : "+x" (p_product), [carry] "+r" (l_carry) /* x is read-write: the asm moves it through p_product */
+        : "y" (p_result) /* y is declared input-only; the sbiw r28 instructions above return it to p_result */
+        : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc", "memory"
+    );
+    
+    if(l_carry > 0) /* first recorded carry: subtract curve_p once */
+    {
+        --l_carry;
+        vli_sub(p_result, p_result, curve_p);
+    }
+    if(l_carry > 0) /* second recorded carry, if any */
+    {
+        vli_sub(p_result, p_result, curve_p);
+    }
+    
+    if(vli_cmp(p_result, curve_p) > 0) /* final subtraction only when p_result compares greater than curve_p */
+    {
+        vli_sub(p_result, p_result, curve_p);
+    }
+}
+#define asm_mmod_fast 1
+
+#endif /* (uECC_CURVE == uECC_secp256k1) */
+
+#endif /* (uECC_ASM == uECC_asm_fast) */
+
+#if !asm_rshift1
+static void vli_rshift1(uint8_t *p_vli) /*
+ * Shift the uECC_BYTES-byte value at p_vli right by one bit, in place.
+ * The loop starts at the highest address and works down, rotating each byte right through
+ * the carry flag, so the bit dropped from one byte becomes the top bit of the byte below it.
+ * The carry is cleared first, so a zero is shifted into the byte at the highest address.
+ */
+{
+    uint8_t i = uECC_BYTES; /* loop counter, one iteration per byte */
+    __asm__ volatile (
+        "adiw r30, " STR(uECC_BYTES) " \n\t" /* z = one past the last byte */
+        "clc \n\t" /* must follow adiw, which changes the carry flag */
+        
+        "1: \n\t"
+        "ld r0, -z \n\t" /* step down one byte and load it */
+        "ror r0 \n\t" /* carry -> bit 7, bit 0 -> carry */
+        "st z, r0 \n\t"
+        "dec %[i] \n\t" /* dec leaves the carry flag untouched */
+        "brne 1b \n\t"
+
+        : "+z" (p_vli), [i] "+r" (i)
+        : 
+        : "r0", "cc", "memory"
+    );
+}
+#define asm_rshift1 1
+#endif
+
+#if !asm_add
+static uint8_t vli_add(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right) /*
+ * Add the uECC_BYTES-byte values at p_left and p_right byte by byte, starting at the
+ * lowest address, and store the sum at p_result.
+ * Returns the carry out of the last byte (0 or 1).
+ */
+{
+    uint8_t i = uECC_BYTES; /* loop counter */
+    uint8_t l_carry = 0;
+    uint8_t l_left; /* scratch registers for the two bytes being added */
+    uint8_t l_right;
+
+    __asm__ volatile (
+        "clc \n\t" /* no carry into the first byte */
+        
+        "1: \n\t"
+        "ld %[left], x+ \n\t"  /* Load left byte. */
+        "ld %[right], y+ \n\t" /* Load right byte. */
+        "adc %[left], %[right] \n\t" /* Add. */
+        "st z+, %[left] \n\t"  /* Store the result. */
+        "dec %[i] \n\t" /* dec does not change the carry flag, so the chain survives the loop */
+        "brne 1b \n\t"
+        
+        "adc %[carry], %[carry] \n\t"    /* Store carry bit in l_carry. */
+        
+        "sbiw r28, " STR(uECC_BYTES) " \n\t" /* Restore Y */
+
+        : "+z" (p_result), "+x" (p_left), [i] "+r" (i),
+            [carry] "+r" (l_carry), [left] "=&r" (l_left), [right] "=&r" (l_right)
+        : "y" (p_right) /* input-only operand, hence the explicit restore of Y above */
+        : "cc", "memory"
+    );
+    return l_carry;
+}
+#define asm_add 1
+#endif
+
+#if !asm_sub
+static uint8_t vli_sub(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right) /*
+ * Subtract the uECC_BYTES-byte value at p_right from the one at p_left byte by byte,
+ * starting at the lowest address, and store the difference at p_result.
+ * Returns the borrow out of the last byte (0 or 1).
+ */
+{
+    uint8_t i = uECC_BYTES; /* loop counter */
+    uint8_t l_borrow = 0;
+    uint8_t l_left; /* scratch registers for the two bytes being subtracted */
+    uint8_t l_right;
+
+    __asm__ volatile (
+        "clc \n\t" /* no borrow into the first byte */
+        
+        "1: \n\t"
+        "ld %[left], x+ \n\t"  /* Load left byte. */
+        "ld %[right], y+ \n\t" /* Load right byte. */
+        "sbc %[left], %[right] \n\t" /* Subtract. */
+        "st z+, %[left] \n\t"  /* Store the result. */
+        "dec %[i] \n\t" /* dec does not change the carry flag, so the borrow chain survives the loop */
+        "brne 1b \n\t"
+        
+        "adc %[borrow], %[borrow] \n\t"    /* Store carry bit in l_borrow. */
+        
+        "sbiw r28, " STR(uECC_BYTES) " \n\t" /* Restore Y */
+
+        : "+z" (p_result), "+x" (p_left), [i] "+r" (i),
+            [borrow] "+r" (l_borrow), [left] "=&r" (l_left), [right] "=&r" (l_right)
+        : "y" (p_right) /* input-only operand, hence the explicit restore of Y above */
+        : "cc", "memory"
+    );
+    return l_borrow;
+}
+#define asm_sub 1
+#endif
+
+#if !asm_mult
+__attribute((noinline)) /*
+ * Multiply the uECC_BYTES-byte values at p_left and p_right and store the
+ * 2*uECC_BYTES-byte product at p_result.
+ *
+ * The product is built one output byte at a time. For each output byte the inner loop
+ * walks p_left upwards (x) and p_right downwards (y), accumulating the byte products
+ * into the three-byte accumulator %[r0], %[r1], %[r2]. The low accumulator byte is then
+ * stored and the accumulator is shifted down by one byte.
+ * The first outer loop produces output bytes 0 .. uECC_BYTES-2, the second produces the
+ * next uECC_BYTES bytes, and a final store writes the last byte.
+ *
+ * In the asm text a bare r0 or r1 is the hardware register pair written by mul, while
+ * %[r0], %[r1] and %[r2] are the C variables of the same names.
+ */
+static void vli_mult(uint8_t *p_result, uint8_t *p_left, uint8_t *p_right)
+{
+    uint8_t r0 = 0; /* accumulator, low byte */
+    uint8_t r1 = 0; /* accumulator, middle byte */
+    uint8_t r2 = 0; /* accumulator, high byte */
+    
+    uint8_t l_zero = 0; /* a register holding 0; hardware r1 is overwritten by mul inside the loops */
+    
+    uint8_t k, i;
+    
+    __asm__ volatile (
+        "ldi %[k], 1 \n\t" /* k = 1; k < uECC_BYTES; ++k */
+        
+        "1: \n\t"
+        "ldi %[i], 0 \n\t"  /* i=0; i < k; ++i */
+        
+        "add r28, %[k] \n\t" /* pre-add right ptr */
+        "adc r29, %[zero] \n\t"
+        
+        "2: \n\t"
+        "ld r0, x+ \n\t"
+        "ld r1, -y \n\t"
+        "mul r0, r1 \n\t" /* 16-bit product lands in hardware r1:r0 */
+        
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "inc %[i] \n\t"
+        "cp %[i], %[k] \n\t"
+        "brlo 2b \n\t" /* loop if i < k */
+        
+        "sub r26, %[k] \n\t" /* fix up left ptr */
+        "sbc r27, %[zero] \n\t"
+        
+        "st z+, %[r0] \n\t"  /* Store the result. */
+        "mov %[r0], %[r1] \n\t" /* shift the accumulator down one byte */
+        "mov %[r1], %[r2] \n\t"
+        "mov %[r2], %[zero] \n\t"
+        
+        "inc %[k] \n\t"
+        "cpi %[k], " STR(uECC_BYTES) " \n\t"
+        "brlo 1b \n\t" /* loop if k < uECC_BYTES */
+        
+        /* second half */
+        "ldi %[k], " STR(uECC_BYTES) " \n\t" /* k = uECC_BYTES; k > 0; --k */
+        "adiw r28, " STR(uECC_BYTES) " \n\t" /* move right ptr to point at the end of p_right */
+        
+        "1: \n\t" /* labels 1 and 2 are reused; each branch targets the nearest earlier definition */
+        "ldi %[i], 0 \n\t" /* i=0; i < k; ++i */
+        
+        "2: \n\t"
+        "ld r0, x+ \n\t"
+        "ld r1, -y \n\t"
+        "mul r0, r1 \n\t"
+        
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "inc %[i] \n\t"
+        "cp %[i], %[k] \n\t"
+        "brlo 2b \n\t" /* loop if i < k */
+        
+        "add r28, %[k] \n\t" /* fix up right ptr */
+        "adc r29, %[zero] \n\t"
+        
+        "st z+, %[r0] \n\t"  /* Store the result. */
+        "mov %[r0], %[r1] \n\t"
+        "mov %[r1], %[r2] \n\t"
+        "mov %[r2], %[zero] \n\t"
+        
+        "dec %[k] \n\t"
+        "sub r26, %[k] \n\t" /* fix up left ptr (after k is decremented, so next time we start 1 higher) */
+        "sbc r27, %[zero] \n\t"
+        
+        "cpi %[k], 0 \n\t"
+        "brne 1b \n\t" /* loop if k > 0 */
+        
+        "st z+, %[r0] \n\t"  /* Store last result byte. */
+        
+        "eor r1, r1 \n\t" /* fix r1 to be 0 again */
+        
+        "sbiw r28, " STR(uECC_BYTES) " \n\t" /* Restore Y */
+    
+        : "+z" (p_result), "+x" (p_left),
+          [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2), [zero] "+r" (l_zero),
+          [k] "=&a" (k), [i] "=&a" (i)
+        : "y" (p_right) /* input-only operand, hence the explicit restore of Y above */
+        : "r0", "cc", "memory"
+    );
+}
+#define asm_mult 1
+#endif
+
+#if uECC_SQUARE_FUNC
+#if !asm_square
+static void vli_square(uint8_t *p_result, uint8_t *p_left) /*
+ * Square the uECC_BYTES-byte value at p_left and store the 2*uECC_BYTES-byte result at
+ * p_result.
+ *
+ * For each output byte, two pointers into p_left are used: x walks upwards ("left") and
+ * z walks downwards ("right"). Every product of two different bytes is added to the
+ * three-byte accumulator twice; the product of a byte with itself, reached when the two
+ * pointers meet, is added once.
+ * z is also needed for the store, so the result pointer lives in %[result] and is copied
+ * into z and back around each store.
+ *
+ * In the asm text a bare r0 or r1 is the hardware register pair written by mul, while
+ * %[r0], %[r1] and %[r2] are the C variables of the same names.
+ */
+{
+    uint8_t r0 = 0; /* accumulator, low byte */
+    uint8_t r1 = 0; /* accumulator, middle byte */
+    uint8_t r2 = 0; /* accumulator, high byte */
+    
+    uint8_t l_zero = 0; /* a register holding 0; hardware r1 is overwritten by mul inside the loop */
+    
+    uint8_t k;
+    
+    __asm__ volatile (
+        "ldi %[k], 1 \n\t" /* k = 1; k < uECC_BYTES*2; ++k */
+        
+        "1: \n\t"
+        
+        "movw r26, %[orig] \n\t"  /* copy orig ptr to 'left' ptr */
+        "movw r30, %[orig] \n\t"  /* copy orig ptr to 'right' ptr */
+        "cpi %[k], " STR(uECC_BYTES) " \n\t"
+        "brlo 2f \n\t"
+        "breq 2f \n\t"
+        
+        /* when k > uECC_BYTES, we start from (k - uECC_BYTES) on the 'left' ptr */
+        "add r26, %[k] \n\t"
+        "adc r27, %[zero] \n\t"
+        "subi r26, " STR(uECC_BYTES) " \n\t"
+        "sbc r27, %[zero] \n\t"
+        "adiw r30, " STR(uECC_BYTES) " \n\t" /* move right ptr to point at the end */
+        "rjmp 3f \n\t"
+        
+        "2: \n\t" /* when k <= uECC_BYTES, we add k to the 'right' ptr */
+        "add r30, %[k] \n\t" /* pre-add 'right' ptr */
+        "adc r31, %[zero] \n\t"
+        
+        "3: \n\t"
+        "ld r0, x+ \n\t"
+        "cp r26, r30 \n\t" /* if left == right here, then we are done after this mult (and we don't need to double) */
+        "breq 4f \n\t" /* note: only the low bytes of the two pointers are compared */
+        "ld r1, -z \n\t"
+        "mul r0, r1 \n\t"
+        
+        /* add twice since it costs the same as doubling */
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "cpse r26, r30 \n\t" /* if left == right here, then we are done */
+        "rjmp 3b \n\t"
+        "rjmp 5f \n\t" /* skip code for non-doubled mult */
+        
+        "4: \n\t" /* the two pointers meet on one byte: add its square once */
+        "ld r1, -z \n\t"
+        "mul r0, r1 \n\t"
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "5: \n\t"
+        "movw r30, %[result] \n\t"  /* make z point to result */
+        "st z+, %[r0] \n\t"  /* Store the result. */
+        "movw %[result], r30 \n\t"  /* update result ptr*/
+        "mov %[r0], %[r1] \n\t" /* shift the accumulator down one byte */
+        "mov %[r1], %[r2] \n\t"
+        "mov %[r2], %[zero] \n\t"
+        
+        "inc %[k] \n\t"
+        "cpi %[k], %[max] \n\t"
+        "brlo 1b \n\t" /* loop if k < 2*uECC_BYTES (the value of max) */
+        
+        "movw r30, %[result] \n\t"  /* make z point to result */
+        "st z+, %[r0] \n\t"  /* Store last result byte. */
+        
+        "eor r1, r1 \n\t" /* fix r1 to be 0 again */
+    
+        : [result] "+r" (p_result),
+          [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2), [zero] "+r" (l_zero),
+          [k] "=&a" (k)
+        : [orig] "r" (p_left), [max] "M" (2*uECC_BYTES)
+        : "r0", "r26", "r27", "r30", "r31", "cc", "memory" /* x and z are used by name, so they are listed as clobbers */
+    );
+}
+#define asm_square 1
+#endif
+#endif /* uECC_SQUARE_FUNC */
old mode 100644 (file)
new mode 100755 (executable)
index c6c8497..c9bd8c5
-/*
- * Copyright (c) 2009 Chris K Cockrum <ckc@cockrum.net>
- *
- * Copyright (c) 2013 Jens Trillmann <jtrillma@tzi.de>
- * Copyright (c) 2013 Marc Müller-Weinhardt <muewei@tzi.de>
- * Copyright (c) 2013 Lars Schmertmann <lars@tzi.de>
- * Copyright (c) 2013 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *
- * This implementation is based in part on the paper Implementation of an
- * Elliptic Curve Cryptosystem on an 8-bit Microcontroller [0] by
- * Chris K Cockrum <ckc@cockrum.net>.
- *
- * [0]: http://cockrum.net/Implementation_of_ECC_on_an_8-bit_microcontroller.pdf
- *
- * This is a efficient ECC implementation on the secp256r1 curve for 32 Bit CPU
- * architectures. It provides basic operations on the secp256r1 curve and support
- * for ECDH and ECDSA.
- */
-
-//big number functions
+/* Copyright 2014, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
 #include "ecc.h"
-#include <string.h>
 
-static uint32_t add( const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length){
-       uint64_t d = 0; //carry
-       int v = 0;
-       for(v = 0;v<length;v++){
-               //printf("%02x + %02x + %01x = ", x[v], y[v], d);
-               d += (uint64_t) x[v] + (uint64_t) y[v];
-               //printf("%02x\n", d);
-               result[v] = d;
-               d = d>>32; //save carry
-       }
-       
-       return (uint32_t)d;
+#ifndef uECC_PLATFORM /*
+ * Choose uECC_PLATFORM from compiler-predefined macros unless the build already set it.
+ * The uECC_avr, uECC_arm, ... identifiers are not defined in this part of the file;
+ * presumably they come from ecc.h, included above (TODO confirm).
+ */
+    #if __AVR__
+        #define uECC_PLATFORM uECC_avr
+    #elif defined(__thumb2__) || defined(_M_ARMT) /* I think MSVC only supports Thumb-2 targets */
+        #define uECC_PLATFORM uECC_arm_thumb2
+    #elif defined(__thumb__) /* tested after __thumb2__, so only reached when __thumb2__ is not defined */
+        #define uECC_PLATFORM uECC_arm_thumb
+    #elif defined(__arm__) || defined(_M_ARM)
+        #define uECC_PLATFORM uECC_arm
+    #elif defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__I86__)
+        #define uECC_PLATFORM uECC_x86
+    #elif defined(__amd64__) || defined(_M_X64)
+        #define uECC_PLATFORM uECC_x86_64
+    #else /* none of the architecture macros above matched */
+        #define uECC_PLATFORM uECC_arch_other
+    #endif
+#endif
+
+#ifndef uECC_WORD_SIZE /*
+ * Default word size in bytes when the build did not set one:
+ * 1 on AVR, 8 on x86_64, 4 everywhere else.
+ */
+    #if uECC_PLATFORM == uECC_avr
+        #define uECC_WORD_SIZE 1
+    #elif (uECC_PLATFORM == uECC_x86_64)
+        #define uECC_WORD_SIZE 8
+    #else
+        #define uECC_WORD_SIZE 4
+    #endif
+#endif
+
+#if (uECC_CURVE == uECC_secp160r1) && (uECC_WORD_SIZE == 8) /* secp160r1 is never built with 8-byte words: fall back to 4 */
+    #undef uECC_WORD_SIZE
+    #define uECC_WORD_SIZE 4
+    #if (uECC_PLATFORM == uECC_x86_64) /* and treat x86_64 as plain x86 in that case */
+        #undef uECC_PLATFORM
+        #define uECC_PLATFORM uECC_x86
+    #endif
+#endif
+
+#if (uECC_WORD_SIZE != 1) && (uECC_WORD_SIZE != 4) && (uECC_WORD_SIZE != 8) /* only 1, 4 and 8 are accepted */
+    #error "Unsupported value for uECC_WORD_SIZE"
+#endif
+
+#if (uECC_ASM && (uECC_PLATFORM == uECC_avr) && (uECC_WORD_SIZE != 1)) /* AVR asm: override any other word size with 1 and tell the user */
+    #pragma message ("uECC_WORD_SIZE must be 1 when using AVR asm")
+    #undef uECC_WORD_SIZE
+    #define uECC_WORD_SIZE 1
+#endif
+
+#if (uECC_ASM && (uECC_PLATFORM == uECC_arm || uECC_PLATFORM == uECC_arm_thumb) && (uECC_WORD_SIZE != 4)) /* ARM asm: override with 4.
+   NOTE(review): uECC_arm_thumb2 can be selected by the platform detection but is not listed in this
+   condition - confirm whether the ARM asm is also used on Thumb-2 and needs the same override. */
+    #pragma message ("uECC_WORD_SIZE must be 4 when using ARM asm")
+    #undef uECC_WORD_SIZE
+    #define uECC_WORD_SIZE 4
+#endif
+
+#if __STDC_VERSION__ >= 199901L /* RESTRICT expands to the restrict keyword from C99 on, and to nothing otherwise */
+    #define RESTRICT restrict
+#else
+    #define RESTRICT
+#endif
+
+#if defined(__SIZEOF_INT128__) || ((__clang_major__ * 100 + __clang_minor__) >= 302) /* 1 when __SIZEOF_INT128__ is defined or clang is 3.2 or newer */
+    #define SUPPORTS_INT128 1
+#else
+    #define SUPPORTS_INT128 0
+#endif
+
+#define MAX_TRIES 16 /* NOTE(review): the code that uses this limit is outside this part of the file */
+
+#if (uECC_WORD_SIZE == 1)
+
+typedef uint8_t uECC_word_t;
+typedef uint16_t uECC_dword_t;
+typedef uint8_t wordcount_t;
+typedef int8_t swordcount_t;
+typedef int16_t bitcount_t;
+typedef int8_t cmpresult_t;
+
+#define HIGH_BIT_SET 0x80
+#define uECC_WORD_BITS 8
+#define uECC_WORD_BITS_SHIFT 3
+#define uECC_WORD_BITS_MASK 0x07
+
+#define uECC_WORDS_1 20
+#define uECC_WORDS_2 24
+#define uECC_WORDS_3 32
+#define uECC_WORDS_4 32
+
+#define uECC_N_WORDS_1 21
+#define uECC_N_WORDS_2 24
+#define uECC_N_WORDS_3 32
+#define uECC_N_WORDS_4 32
+
+#define Curve_P_1 {0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF}
+#define Curve_P_2 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
+#define Curve_P_3 {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, \
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                   0x01, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}
+#define Curve_P_4 {0x2F, 0xFC, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
+
+#define Curve_B_1 {0x45, 0xFA, 0x65, 0xC5, 0xAD, 0xD4, 0xD4, 0x81, \
+                   0x9F, 0xF8, 0xAC, 0x65, 0x8B, 0x7A, 0xBD, 0x54, \
+                   0xFC, 0xBE, 0x97, 0x1C}
+#define Curve_B_2 {0xB1, 0xB9, 0x46, 0xC1, 0xEC, 0xDE, 0xB8, 0xFE, \
+                   0x49, 0x30, 0x24, 0x72, 0xAB, 0xE9, 0xA7, 0x0F, \
+                   0xE7, 0x80, 0x9C, 0xE5, 0x19, 0x05, 0x21, 0x64}
+#define Curve_B_3 {0x4B, 0x60, 0xD2, 0x27, 0x3E, 0x3C, 0xCE, 0x3B, \
+                   0xF6, 0xB0, 0x53, 0xCC, 0xB0, 0x06, 0x1D, 0x65, \
+                   0xBC, 0x86, 0x98, 0x76, 0x55, 0xBD, 0xEB, 0xB3, \
+                   0xE7, 0x93, 0x3A, 0xAA, 0xD8, 0x35, 0xC6, 0x5A}
+#define Curve_B_4 {0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
+
+#define Curve_G_1 { \
+    {0x82, 0xFC, 0xCB, 0x13, 0xB9, 0x8B, 0xC3, 0x68, \
+        0x89, 0x69, 0x64, 0x46, 0x28, 0x73, 0xF5, 0x8E, \
+        0x68, 0xB5, 0x96, 0x4A}, \
+    {0x32, 0xFB, 0xC5, 0x7A, 0x37, 0x51, 0x23, 0x04, \
+        0x12, 0xC9, 0xDC, 0x59, 0x7D, 0x94, 0x68, 0x31, \
+        0x55, 0x28, 0xA6, 0x23}}
+
+#define Curve_G_2 { \
+    {0x12, 0x10, 0xFF, 0x82, 0xFD, 0x0A, 0xFF, 0xF4, \
+        0x00, 0x88, 0xA1, 0x43, 0xEB, 0x20, 0xBF, 0x7C, \
+        0xF6, 0x90, 0x30, 0xB0, 0x0E, 0xA8, 0x8D, 0x18}, \
+    {0x11, 0x48, 0x79, 0x1E, 0xA1, 0x77, 0xF9, 0x73, \
+        0xD5, 0xCD, 0x24, 0x6B, 0xED, 0x11, 0x10, 0x63, \
+        0x78, 0xDA, 0xC8, 0xFF, 0x95, 0x2B, 0x19, 0x07}}
+
+#define Curve_G_3 { \
+    {0x96, 0xC2, 0x98, 0xD8, 0x45, 0x39, 0xA1, 0xF4, \
+        0xA0, 0x33, 0xEB, 0x2D, 0x81, 0x7D, 0x03, 0x77, \
+        0xF2, 0x40, 0xA4, 0x63, 0xE5, 0xE6, 0xBC, 0xF8, \
+        0x47, 0x42, 0x2C, 0xE1, 0xF2, 0xD1, 0x17, 0x6B}, \
+    {0xF5, 0x51, 0xBF, 0x37, 0x68, 0x40, 0xB6, 0xCB, \
+        0xCE, 0x5E, 0x31, 0x6B, 0x57, 0x33, 0xCE, 0x2B, \
+        0x16, 0x9E, 0x0F, 0x7C, 0x4A, 0xEB, 0xE7, 0x8E, \
+        0x9B, 0x7F, 0x1A, 0xFE, 0xE2, 0x42, 0xE3, 0x4F}}
+
+#define Curve_G_4 { \
+    {0x98, 0x17, 0xF8, 0x16, 0x5B, 0x81, 0xF2, 0x59, \
+        0xD9, 0x28, 0xCE, 0x2D, 0xDB, 0xFC, 0x9B, 0x02, \
+        0x07, 0x0B, 0x87, 0xCE, 0x95, 0x62, 0xA0, 0x55, \
+        0xAC, 0xBB, 0xDC, 0xF9, 0x7E, 0x66, 0xBE, 0x79}, \
+    {0xB8, 0xD4, 0x10, 0xFB, 0x8F, 0xD0, 0x47, 0x9C, \
+        0x19, 0x54, 0x85, 0xA6, 0x48, 0xB4, 0x17, 0xFD, \
+        0xA8, 0x08, 0x11, 0x0E, 0xFC, 0xFB, 0xA4, 0x5D, \
+        0x65, 0xC4, 0xA3, 0x26, 0x77, 0xDA, 0x3A, 0x48}}
+
+#define Curve_N_1 {0x57, 0x22, 0x75, 0xCA, 0xD3, 0xAE, 0x27, 0xF9, \
+                   0xC8, 0xF4, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                   0x00, 0x00, 0x00, 0x00, 0x01}
+#define Curve_N_2 {0x31, 0x28, 0xD2, 0xB4, 0xB1, 0xC9, 0x6B, 0x14, \
+                   0x36, 0xF8, 0xDE, 0x99, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
+#define Curve_N_3 {0x51, 0x25, 0x63, 0xFC, 0xC2, 0xCA, 0xB9, 0xF3, \
+                   0x84, 0x9E, 0x17, 0xA7, 0xAD, 0xFA, 0xE6, 0xBC, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}
+#define Curve_N_4 {0x41, 0x41, 0x36, 0xD0, 0x8C, 0x5E, 0xD2, 0xBF, \
+                   0x3B, 0xA0, 0x48, 0xAF, 0xE6, 0xDC, 0xAE, 0xBA, \
+                   0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, \
+                   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
+
+#elif (uECC_WORD_SIZE == 4)
+
+typedef uint32_t uECC_word_t;
+typedef uint64_t uECC_dword_t;
+typedef unsigned wordcount_t;
+typedef int swordcount_t;
+typedef int bitcount_t;
+typedef int cmpresult_t;
+
+#define HIGH_BIT_SET 0x80000000
+#define uECC_WORD_BITS 32
+#define uECC_WORD_BITS_SHIFT 5
+#define uECC_WORD_BITS_MASK 0x01F
+
+#define uECC_WORDS_1 5
+#define uECC_WORDS_2 6
+#define uECC_WORDS_3 8
+#define uECC_WORDS_4 8
+
+#define uECC_N_WORDS_1 6
+#define uECC_N_WORDS_2 6
+#define uECC_N_WORDS_3 8
+#define uECC_N_WORDS_4 8
+
+#define Curve_P_1 {0x7FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
+#define Curve_P_2 {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
+#define Curve_P_3 {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF}
+#define Curve_P_4 {0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
+
+#define Curve_B_1 {0xC565FA45, 0x81D4D4AD, 0x65ACF89F, 0x54BD7A8B, 0x1C97BEFC}
+#define Curve_B_2 {0xC146B9B1, 0xFEB8DEEC, 0x72243049, 0x0FA7E9AB, 0xE59C80E7, 0x64210519}
+#define Curve_B_3 {0x27D2604B, 0x3BCE3C3E, 0xCC53B0F6, 0x651D06B0, 0x769886BC, 0xB3EBBD55, 0xAA3A93E7, 0x5AC635D8}
+#define Curve_B_4 {0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+
+#define Curve_G_1 { \
+    {0x13CBFC82, 0x68C38BB9, 0x46646989, 0x8EF57328, 0x4A96B568}, \
+    {0x7AC5FB32, 0x04235137, 0x59DCC912, 0x3168947D, 0x23A62855}}
+
+#define Curve_G_2 { \
+    {0x82FF1012, 0xF4FF0AFD, 0x43A18800, 0x7CBF20EB, 0xB03090F6, 0x188DA80E}, \
+    {0x1E794811, 0x73F977A1, 0x6B24CDD5, 0x631011ED, 0xFFC8DA78, 0x07192B95}}
+
+#define Curve_G_3 { \
+    {0xD898C296, 0xF4A13945, 0x2DEB33A0, 0x77037D81, 0x63A440F2, 0xF8BCE6E5, 0xE12C4247, 0x6B17D1F2}, \
+    {0x37BF51F5, 0xCBB64068, 0x6B315ECE, 0x2BCE3357, 0x7C0F9E16, 0x8EE7EB4A, 0xFE1A7F9B, 0x4FE342E2}}
+
+#define Curve_G_4 { \
+    {0x16F81798, 0x59F2815B, 0x2DCE28D9, 0x029BFCDB, 0xCE870B07, 0x55A06295, 0xF9DCBBAC, 0x79BE667E}, \
+    {0xFB10D4B8, 0x9C47D08F, 0xA6855419, 0xFD17B448, 0x0E1108A8, 0x5DA4FBFC, 0x26A3C465, 0x483ADA77}}
+
+#define Curve_N_1 {0xCA752257, 0xF927AED3, 0x0001F4C8, 0x00000000, 0x00000000, 0x00000001}
+#define Curve_N_2 {0xB4D22831, 0x146BC9B1, 0x99DEF836, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
+#define Curve_N_3 {0xFC632551, 0xF3B9CAC2, 0xA7179E84, 0xBCE6FAAD, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF}
+#define Curve_N_4 {0xD0364141, 0xBFD25E8C, 0xAF48A03B, 0xBAAEDCE6, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
+
+#elif (uECC_WORD_SIZE == 8)
+
+/* Integer types used when uECC_WORD_SIZE == 8 (one big-number "word" is 64 bits). */
+typedef uint64_t uECC_word_t;
+/* Double-width word type; only available when SUPPORTS_INT128 is nonzero. */
+#if SUPPORTS_INT128
+typedef unsigned __int128 uECC_dword_t;
+#endif
+/* Unsigned word count / word index. */
+typedef unsigned wordcount_t;
+/* Signed word index, used by loops that count down past zero. */
+typedef int swordcount_t;
+/* Bit index / bit count. */
+typedef int bitcount_t;
+/* Result type of vli_cmp(): 1, 0 or -1. */
+typedef int cmpresult_t;
+
+/* Mask with only the most-significant bit of a 64-bit word set. */
+#define HIGH_BIT_SET 0x8000000000000000ull
+/* Bits per word, the shift that turns a bit index into a word index (log2 of
+   the word width), and the mask that keeps the bit position inside a word. */
+#define uECC_WORD_BITS 64
+#define uECC_WORD_BITS_SHIFT 6
+#define uECC_WORD_BITS_MASK 0x03F
+
+/* Number of 64-bit words in a field element, one macro per curve suffix;
+   uECC_WORDS selects one of these by pasting uECC_CURVE onto the prefix. */
+#define uECC_WORDS_1 3
+#define uECC_WORDS_2 3
+#define uECC_WORDS_3 4
+#define uECC_WORDS_4 4
+
+/* Number of 64-bit words reserved for the curve_n array, per curve suffix. */
+#define uECC_N_WORDS_1 3
+#define uECC_N_WORDS_2 3
+#define uECC_N_WORDS_3 4
+#define uECC_N_WORDS_4 4
+
+/* Curve constants for 64-bit words, one set per curve suffix (_1 .. _4).
+   Each array is stored least-significant word first (vli_cmp() treats the
+   highest index as most significant).
+   Every literal carries an explicit ull suffix so that it has a 64-bit type
+   even on compilers that do not widen unsuffixed hexadecimal constants. */
+
+/* Initializers for curve_p, the modulus used by vli_mmod_fast(). */
+#define Curve_P_1 {0xFFFFFFFF7FFFFFFFull, 0xFFFFFFFFFFFFFFFFull, 0x00000000FFFFFFFFull}
+#define Curve_P_2 {0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull}
+#define Curve_P_3 {0xFFFFFFFFFFFFFFFFull, 0x00000000FFFFFFFFull, 0x0000000000000000ull, 0xFFFFFFFF00000001ull}
+#define Curve_P_4 {0xFFFFFFFEFFFFFC2Full, 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull}
+
+/* Initializers for curve_b. */
+#define Curve_B_1 {0x81D4D4ADC565FA45ull, 0x54BD7A8B65ACF89Full, 0x000000001C97BEFCull}
+#define Curve_B_2 {0xFEB8DEECC146B9B1ull, 0x0FA7E9AB72243049ull, 0x64210519E59C80E7ull}
+#define Curve_B_3 {0x3BCE3C3E27D2604Bull, 0x651D06B0CC53B0F6ull, 0xB3EBBD55769886BCull, 0x5AC635D8AA3A93E7ull}
+#define Curve_B_4 {0x0000000000000007ull, 0x0000000000000000ull, 0x0000000000000000ull, 0x0000000000000000ull}
+
+/* Initializers for curve_G: first row is the x coordinate, second row is y. */
+#define Curve_G_1 { \
+    {0x68C38BB913CBFC82ull, 0x8EF5732846646989ull, 0x000000004A96B568ull}, \
+    {0x042351377AC5FB32ull, 0x3168947D59DCC912ull, 0x0000000023A62855ull}}
+
+#define Curve_G_2 { \
+    {0xF4FF0AFD82FF1012ull, 0x7CBF20EB43A18800ull, 0x188DA80EB03090F6ull}, \
+    {0x73F977A11E794811ull, 0x631011ED6B24CDD5ull, 0x07192B95FFC8DA78ull}}
+
+#define Curve_G_3 { \
+    {0xF4A13945D898C296ull, 0x77037D812DEB33A0ull, 0xF8BCE6E563A440F2ull, 0x6B17D1F2E12C4247ull}, \
+    {0xCBB6406837BF51F5ull, 0x2BCE33576B315ECEull, 0x8EE7EB4A7C0F9E16ull, 0x4FE342E2FE1A7F9Bull}}
+
+#define Curve_G_4 { \
+    {0x59F2815B16F81798ull, 0x029BFCDB2DCE28D9ull, 0x55A06295CE870B07ull, 0x79BE667EF9DCBBACull}, \
+    {0x9C47D08FFB10D4B8ull, 0xFD17B448A6855419ull, 0x5DA4FBFC0E1108A8ull, 0x483ADA7726A3C465ull}}
+
+/* Initializers for curve_n. */
+#define Curve_N_1 {0xF927AED3CA752257ull, 0x000000000001F4C8ull, 0x0000000100000000ull}
+#define Curve_N_2 {0x146BC9B1B4D22831ull, 0xFFFFFFFF99DEF836ull, 0xFFFFFFFFFFFFFFFFull}
+#define Curve_N_3 {0xF3B9CAC2FC632551ull, 0xBCE6FAADA7179E84ull, 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull}
+#define Curve_N_4 {0xBFD25E8CD0364141ull, 0xBAAEDCE6AF48A03Bull, 0xFFFFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFFFFFull}
+
+#endif /* (uECC_WORD_SIZE == 8) */
+
+/* Word counts for the selected curve: uECC_CONCAT (defined elsewhere) combines
+   the prefix with uECC_CURVE to pick one of the per-curve macros. */
+#define uECC_WORDS uECC_CONCAT(uECC_WORDS_, uECC_CURVE)
+#define uECC_N_WORDS uECC_CONCAT(uECC_N_WORDS_, uECC_CURVE)
+
+/* A point given by two coordinates, each a uECC_WORDS-word integer. */
+typedef struct EccPoint
+{
+    uECC_word_t x[uECC_WORDS];
+    uECC_word_t y[uECC_WORDS];
+} EccPoint;
+
+/* Constants of the selected curve, initialized from the Curve_*_<n> macros.
+   curve_p is the modulus used by the *_fast modular routines. */
+static uECC_word_t curve_p[uECC_WORDS] = uECC_CONCAT(Curve_P_, uECC_CURVE);
+static uECC_word_t curve_b[uECC_WORDS] = uECC_CONCAT(Curve_B_, uECC_CURVE);
+static EccPoint curve_G = uECC_CONCAT(Curve_G_, uECC_CURVE);
+static uECC_word_t curve_n[uECC_N_WORDS] = uECC_CONCAT(Curve_N_, uECC_CURVE);
+
+/* Forward declarations of the big-number ("vli") primitives. They are declared
+   before the optional assembly includes below; each C definition further down
+   is guarded by a matching asm_* macro. */
+static void vli_clear(uECC_word_t *p_vli);
+static uECC_word_t vli_isZero(const uECC_word_t *p_vli);
+static uECC_word_t vli_testBit(const uECC_word_t *p_vli, bitcount_t p_bit);
+static bitcount_t vli_numBits(const uECC_word_t *p_vli, wordcount_t p_maxWords);
+static void vli_set(uECC_word_t *p_dest, const uECC_word_t *p_src);
+static cmpresult_t vli_cmp(uECC_word_t *p_left, uECC_word_t *p_right);
+static void vli_rshift1(uECC_word_t *p_vli);
+static uECC_word_t vli_add(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right);
+static uECC_word_t vli_sub(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right);
+static void vli_mult(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right);
+static void vli_modAdd(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right, uECC_word_t *p_mod);
+static void vli_modSub(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right, uECC_word_t *p_mod);
+static void vli_mmod_fast(uECC_word_t *RESTRICT p_result, uECC_word_t *RESTRICT p_product);
+static void vli_modMult_fast(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right);
+static void vli_modInv(uECC_word_t *p_result, uECC_word_t *p_input, uECC_word_t *p_mod);
+/* Dedicated squaring routines exist only when uECC_SQUARE_FUNC is enabled. */
+#if uECC_SQUARE_FUNC
+static void vli_square(uECC_word_t *p_result, uECC_word_t *p_left);
+static void vli_modSquare_fast(uECC_word_t *p_result, uECC_word_t *p_left);
+#endif
+
+#if (defined(_WIN32) || defined(_WIN64))
+/* Windows */
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <wincrypt.h>
+
+/* Default RNG for Windows: fills p_dest with p_size random bytes from the
+   CryptoAPI provider.
+   Returns 1 on success, 0 if the provider could not be acquired or if
+   CryptGenRandom() failed (in which case p_dest must not be used). */
+static int default_RNG(uint8_t *p_dest, unsigned p_size)
+{
+    HCRYPTPROV l_prov;
+    BOOL l_generated;
+    if(!CryptAcquireContext(&l_prov, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT))
+    {
+        return 0;
+    }
+
+    /* Keep the result: reporting success without random data would let
+       callers derive keys from an unfilled buffer. */
+    l_generated = CryptGenRandom(l_prov, p_size, (BYTE *)p_dest);
+    CryptReleaseContext(l_prov, 0);
+
+    return l_generated ? 1 : 0;
+}
+
+#elif defined(unix) || defined(__linux__) || defined(__unix__) || defined(__unix) || \
+    (defined(__APPLE__) && defined(__MACH__)) || defined(uECC_POSIX)
+
+/* Some POSIX-like system with /dev/urandom or /dev/random. */
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#ifndef O_CLOEXEC
+    #define O_CLOEXEC 0
+#endif
+
+/* Default RNG for POSIX-like systems: fills p_dest with p_size bytes read
+   from /dev/urandom, falling back to /dev/random if the former cannot be
+   opened.
+   Returns 1 when all bytes were read, 0 if neither device could be opened or
+   a read failed / reached end-of-file. */
+static int default_RNG(uint8_t *p_dest, unsigned p_size)
+{
+    char *l_ptr = (char *)p_dest;
+    size_t l_left = p_size;
+    int l_fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if(l_fd == -1)
+    {
+        l_fd = open("/dev/random", O_RDONLY | O_CLOEXEC);
+        if(l_fd == -1)
+        {
+            return 0;
+        }
+    }
+
+    while(l_left > 0)
+    {
+        /* read() returns ssize_t; an int could truncate large byte counts. */
+        ssize_t l_read = read(l_fd, l_ptr, l_left);
+        if(l_read < 0 && errno == EINTR)
+        { /* interrupted by a signal before any data was transferred: retry */
+            continue;
+        }
+        if(l_read <= 0)
+        { /* read failed or unexpected end-of-file */
+            close(l_fd);
+            return 0;
+        }
+        l_left -= (size_t)l_read;
+        l_ptr += l_read;
+    }
+
+    close(l_fd);
+    return 1;
+}
+
+#else /* Some other platform */
+
+/* Fallback for platforms without a known entropy source: leaves p_dest
+   untouched and always reports failure (0). */
+static int default_RNG(uint8_t *p_dest, unsigned p_size)
+{
+    (void)p_dest;
+    (void)p_size;
+    return 0;
+}
+
+#endif
+
+/* RNG callback currently in use; starts out as the platform default_RNG. */
+static uECC_RNG_Function g_rng = &default_RNG;
+
+/* Replaces the RNG callback stored in g_rng with p_rng.
+   The pointer is stored as given; no validation is performed here. */
+void uECC_set_rng(uECC_RNG_Function p_rng)
+{
+    g_rng = p_rng;
+}
+
+#ifdef __GNUC__ /* Only support GCC inline asm for now */
+    #if (uECC_ASM && (uECC_PLATFORM == uECC_avr))
+        #include "asm_avr.inc"
+    #endif
+
+    #if (uECC_ASM && (uECC_PLATFORM == uECC_arm || uECC_PLATFORM == uECC_arm_thumb || uECC_PLATFORM == uECC_arm_thumb2))
+        #include "asm_arm.inc"
+    #endif
+#endif
+
+#if !asm_clear
+/* Sets all uECC_WORDS words of p_vli to zero. */
+static void vli_clear(uECC_word_t *p_vli)
+{
+    uECC_word_t *l_end = p_vli + uECC_WORDS;
+    while(p_vli != l_end)
+    {
+        *p_vli++ = 0;
+    }
+}
+#endif
+
+/* Tests whether all uECC_WORDS words of p_vli are zero.
+   Returns 1 if p_vli == 0, 0 otherwise; scanning stops at the first nonzero word. */
+#if !asm_isZero
+static uECC_word_t vli_isZero(const uECC_word_t *p_vli)
+{
+    wordcount_t l_index = 0;
+    while(l_index < uECC_WORDS && p_vli[l_index] == 0)
+    {
+        ++l_index;
+    }
+    return (l_index == uECC_WORDS);
+}
+#endif
+
+/* Tests bit number p_bit of p_vli (bit 0 is the least-significant bit of word 0).
+   Returns a nonzero word if the bit is set, 0 otherwise. */
+#if !asm_testBit
+static uECC_word_t vli_testBit(const uECC_word_t *p_vli, bitcount_t p_bit)
+{
+    const uECC_word_t l_word = p_vli[p_bit >> uECC_WORD_BITS_SHIFT];
+    const uECC_word_t l_mask = (uECC_word_t)1 << (p_bit & uECC_WORD_BITS_MASK);
+    return (l_word & l_mask);
+}
+#endif
+
+/* Counts the words of p_vli that remain after dropping zero words from the
+   most-significant end; only the first p_maxWords words are examined. */
+#if !asm_numBits
+static wordcount_t vli_numDigits(const uECC_word_t *p_vli, wordcount_t p_maxWords)
+{
+    /* Walk down from the top word: most words are expected to be nonzero,
+       so this normally stops almost immediately. */
+    swordcount_t l_top = p_maxWords - 1;
+    while(l_top >= 0 && p_vli[l_top] == 0)
+    {
+        --l_top;
+    }
+
+    return (l_top + 1);
+}
+
+/* Returns the number of bits needed to represent p_vli (0 when the value is
+   zero), looking at the first p_maxWords words only. */
+static bitcount_t vli_numBits(const uECC_word_t *p_vli, wordcount_t p_maxWords)
+{
+    uECC_word_t l_bits = 0;
+    uECC_word_t l_topWord;
+
+    wordcount_t l_numDigits = vli_numDigits(p_vli, p_maxWords);
+    if(l_numDigits == 0)
+    {
+        return 0;
+    }
+
+    /* Count the significant bits of the highest nonzero word. */
+    l_topWord = p_vli[l_numDigits - 1];
+    while(l_topWord)
+    {
+        l_topWord >>= 1;
+        ++l_bits;
+    }
+
+    /* Full lower words contribute uECC_WORD_BITS bits each. */
+    return (((bitcount_t)(l_numDigits - 1) << uECC_WORD_BITS_SHIFT) + l_bits);
+}
+#endif /* !asm_numBits */
+
+/* Copies the uECC_WORDS words of p_src into p_dest, lowest word first. */
+#if !asm_set
+static void vli_set(uECC_word_t *p_dest, const uECC_word_t *p_src)
+{
+    const uECC_word_t *l_end = p_src + uECC_WORDS;
+    while(p_src != l_end)
+    {
+        *p_dest++ = *p_src++;
+    }
+}
+#endif
+
+/* Compares two uECC_WORDS-word values, most-significant word first.
+   Returns 1 if p_left > p_right, -1 if p_left < p_right, 0 if they are equal. */
+#if !asm_cmp
+static cmpresult_t vli_cmp(uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    swordcount_t l_index = uECC_WORDS - 1;
+    while(l_index >= 0)
+    {
+        if(p_left[l_index] != p_right[l_index])
+        {
+            /* The first differing word from the top decides the ordering. */
+            return (p_left[l_index] > p_right[l_index]) ? 1 : -1;
+        }
+        --l_index;
+    }
+    return 0;
+}
+#endif
+
+/* Shifts p_vli right by one bit in place (p_vli = p_vli >> 1); the bit shifted
+   out of word 0 is discarded. */
+#if !asm_rshift1
+static void vli_rshift1(uECC_word_t *p_vli)
+{
+    /* Bit carried down from the word above, already moved to the top bit position. */
+    uECC_word_t l_shiftedIn = 0;
+    swordcount_t l_index;
+
+    for(l_index = uECC_WORDS - 1; l_index >= 0; --l_index)
+    {
+        uECC_word_t l_word = p_vli[l_index];
+        p_vli[l_index] = (l_word >> 1) | l_shiftedIn;
+        l_shiftedIn = l_word << (uECC_WORD_BITS - 1);
+    }
+}
+#endif
+
+/* Adds two uECC_WORDS-word values: p_result = p_left + p_right.
+   Returns the carry out of the top word (0 or 1). p_result may alias an input. */
+#if !asm_add
+static uECC_word_t vli_add(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_carry = 0;
+    wordcount_t l_index;
+    for(l_index = 0; l_index < uECC_WORDS; ++l_index)
+    {
+        const uECC_word_t l_augend = p_left[l_index];
+        const uECC_word_t l_sum = l_augend + p_right[l_index] + l_carry;
+        /* If the sum equals the left operand, the incoming carry is passed on unchanged. */
+        if(l_sum < l_augend)
+        {
+            l_carry = 1;
+        }
+        else if(l_sum > l_augend)
+        {
+            l_carry = 0;
+        }
+        p_result[l_index] = l_sum;
+    }
+    return l_carry;
+}
+#endif
+
+/* Subtracts two uECC_WORDS-word values: p_result = p_left - p_right.
+   Returns the borrow out of the top word (0 or 1). p_result may alias an input. */
+#if !asm_sub
+static uECC_word_t vli_sub(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_borrow = 0;
+    wordcount_t l_index;
+    for(l_index = 0; l_index < uECC_WORDS; ++l_index)
+    {
+        const uECC_word_t l_minuend = p_left[l_index];
+        const uECC_word_t l_diff = l_minuend - p_right[l_index] - l_borrow;
+        /* If the difference equals the left operand, the incoming borrow is passed on unchanged. */
+        if(l_diff > l_minuend)
+        {
+            l_borrow = 1;
+        }
+        else if(l_diff < l_minuend)
+        {
+            l_borrow = 0;
+        }
+        p_result[l_index] = l_diff;
+    }
+    return l_borrow;
+}
+#endif
+
+#if (!asm_mult || !asm_square || uECC_CURVE == uECC_secp256k1)
+/* Multiply-accumulate: adds the double-width product a * b to the three-word
+   accumulator (*r2, *r1, *r0), where *r0 is the least-significant word. */
+static void muladd(uECC_word_t a, uECC_word_t b, uECC_word_t *r0, uECC_word_t *r1, uECC_word_t *r2)
+{
+#if uECC_WORD_SIZE == 8 && !SUPPORTS_INT128
+    /* No 128-bit type available: split both operands into 32-bit halves and
+       combine the four 64-bit partial products by hand. */
+    uint64_t a0 = a & 0xffffffffull;
+    uint64_t a1 = a >> 32;
+    uint64_t b0 = b & 0xffffffffull;
+    uint64_t b1 = b >> 32;
+
+    uint64_t i0 = a0 * b0;
+    uint64_t i1 = a0 * b1;
+    uint64_t i2 = a1 * b0;
+    uint64_t i3 = a1 * b1;
+
+    uint64_t p0, p1;
+
+    /* Sum the middle partial products; a wrap of i2 is worth 2^32 in the high product word. */
+    i2 += (i0 >> 32);
+    i2 += i1;
+    if(i2 < i1)
+    { // overflow
+        i3 += 0x100000000ull;
+    }
+
+    /* p1:p0 is the 128-bit product. */
+    p0 = (i0 & 0xffffffffull) | (i2 << 32);
+    p1 = i3 + (i2 >> 32);
+
+    /* Add the product to the accumulator; (*r0 < p0) detects the carry out of
+       the low word, and the last line derives the carry into *r2. */
+    *r0 += p0;
+    *r1 += (p1 + (*r0 < p0));
+    *r2 += ((*r1 < p1) || (*r1 == p1 && *r0 < p0));
+#else
+    /* A double-width type exists: form the product and the r1:r0 pair in it,
+       add them, and record a wrap-around of that sum as a carry into *r2. */
+    uECC_dword_t p = (uECC_dword_t)a * b;
+    uECC_dword_t r01 = ((uECC_dword_t)(*r1) << uECC_WORD_BITS) | *r0;
+    r01 += p;
+    *r2 += (r01 < p);
+    *r1 = r01 >> uECC_WORD_BITS;
+    *r0 = (uECC_word_t)r01;
+#endif
+}
+/* Lets later code check whether muladd() was compiled in. */
+#define muladd_exists 1
+#endif
+
+#if !asm_mult
+/* Full multiplication: p_result (2 * uECC_WORDS words) = p_left * p_right
+   (uECC_WORDS words each). The product is produced one output word (column)
+   at a time, from least to most significant. */
+static void vli_mult(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    /* Three-word column accumulator: l_acc0 becomes the output word,
+       l_acc1 and l_acc2 hold the carries into the following columns. */
+    uECC_word_t l_acc0 = 0;
+    uECC_word_t l_acc1 = 0;
+    uECC_word_t l_acc2 = 0;
+
+    wordcount_t l_col;
+
+    for(l_col = 0; l_col < uECC_WORDS*2 - 1; ++l_col)
+    {
+        /* Range of p_left indices whose partner index (l_col - l_row) is also valid. */
+        wordcount_t l_row = (l_col < uECC_WORDS) ? 0 : ((l_col + 1) - uECC_WORDS);
+        wordcount_t l_rowEnd = (l_col < uECC_WORDS) ? (l_col + 1) : uECC_WORDS;
+
+        for(; l_row < l_rowEnd; ++l_row)
+        {
+            muladd(p_left[l_row], p_right[l_col - l_row], &l_acc0, &l_acc1, &l_acc2);
+        }
+
+        /* Emit this column and shift the accumulator down by one word. */
+        p_result[l_col] = l_acc0;
+        l_acc0 = l_acc1;
+        l_acc1 = l_acc2;
+        l_acc2 = 0;
+    }
+
+    p_result[uECC_WORDS*2 - 1] = l_acc0;
+}
+#endif
+
+#if uECC_SQUARE_FUNC
+
+#if !asm_square
+/* Doubled multiply-accumulate: adds 2 * a * b to the three-word accumulator
+   (*r2, *r1, *r0), where *r0 is the least-significant word. Used by
+   vli_square() for the off-diagonal terms. */
+static void mul2add(uECC_word_t a, uECC_word_t b, uECC_word_t *r0, uECC_word_t *r1, uECC_word_t *r2)
+{
+#if uECC_WORD_SIZE == 8 && !SUPPORTS_INT128
+    /* No 128-bit type available: build the product from 32-bit halves. */
+    uint64_t a0 = a & 0xffffffffull;
+    uint64_t a1 = a >> 32;
+    uint64_t b0 = b & 0xffffffffull;
+    uint64_t b1 = b >> 32;
+
+    uint64_t i0 = a0 * b0;
+    uint64_t i1 = a0 * b1;
+    uint64_t i2 = a1 * b0;
+    uint64_t i3 = a1 * b1;
+
+    uint64_t p0, p1;
+
+    /* Sum the middle partial products; a wrap of i2 is worth 2^32 in the high product word. */
+    i2 += (i0 >> 32);
+    i2 += i1;
+    if(i2 < i1)
+    { // overflow
+        i3 += 0x100000000ull;
+    }
+
+    p0 = (i0 & 0xffffffffull) | (i2 << 32);
+    p1 = i3 + (i2 >> 32);
+
+    /* Double the 128-bit product p1:p0; the bit shifted out of p1 goes straight into *r2. */
+    *r2 += (p1 >> 63);
+    p1 = (p1 << 1) | (p0 >> 63);
+    p0 <<= 1;
+
+    /* Add the doubled product to the accumulator, propagating carries upward. */
+    *r0 += p0;
+    *r1 += (p1 + (*r0 < p0));
+    *r2 += ((*r1 < p1) || (*r1 == p1 && *r0 < p0));
+#else
+    uECC_dword_t p = (uECC_dword_t)a * b;
+    uECC_dword_t r01 = ((uECC_dword_t)(*r1) << uECC_WORD_BITS) | *r0;
+    /* The top bit of p is lost by the doubling below, so account for it in *r2 first. */
+    *r2 += (p >> (uECC_WORD_BITS * 2 - 1));
+    p *= 2;
+    r01 += p;
+    *r2 += (r01 < p);
+    *r1 = r01 >> uECC_WORD_BITS;
+    *r0 = (uECC_word_t)r01;
+#endif
+}
+
+/* Squaring: p_result (2 * uECC_WORDS words) = p_left * p_left.
+   Each output column pairs index l_lo with index l_hi = column - l_lo; a pair
+   of different indices is added twice via mul2add(), the middle term (equal
+   indices) once via muladd(). */
+static void vli_square(uECC_word_t *p_result, uECC_word_t *p_left)
+{
+    uECC_word_t l_acc0 = 0;
+    uECC_word_t l_acc1 = 0;
+    uECC_word_t l_acc2 = 0;
+
+    wordcount_t l_col;
+
+    for(l_col = 0; l_col < uECC_WORDS*2 - 1; ++l_col)
+    {
+        wordcount_t l_lo = (l_col < uECC_WORDS ? 0 : (l_col + 1) - uECC_WORDS);
+        while(l_lo <= l_col && l_lo <= l_col - l_lo)
+        {
+            wordcount_t l_hi = l_col - l_lo;
+            if(l_lo == l_hi)
+            {
+                muladd(p_left[l_lo], p_left[l_hi], &l_acc0, &l_acc1, &l_acc2);
+            }
+            else
+            {
+                mul2add(p_left[l_lo], p_left[l_hi], &l_acc0, &l_acc1, &l_acc2);
+            }
+            ++l_lo;
+        }
+
+        /* Emit this column and shift the accumulator down by one word. */
+        p_result[l_col] = l_acc0;
+        l_acc0 = l_acc1;
+        l_acc1 = l_acc2;
+        l_acc2 = 0;
+    }
+
+    p_result[uECC_WORDS*2 - 1] = l_acc0;
+}
+#endif
+
+#else /* uECC_SQUARE_FUNC */
+
+/* Without a dedicated squaring routine, square by multiplying the operand with
+   itself. Takes the same two arguments as the function form of vli_square()
+   and forwards exactly the three arguments vli_mult() accepts. */
+#define vli_square(result, left) vli_mult((result), (left), (left))
+
+#endif /* uECC_SQUARE_FUNC */
+
+
+/* Modular addition: p_result = (p_left + p_right) % p_mod.
+   Requires p_left < p_mod and p_right < p_mod; p_result must not be p_mod itself. */
+#if !asm_modAdd
+static void vli_modAdd(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right, uECC_word_t *p_mod)
+{
+    /* No carry and a sum below the modulus: the sum is already reduced. */
+    if(vli_add(p_result, p_left, p_right) == 0 && vli_cmp(p_result, p_mod) < 0)
+    {
+        return;
+    }
+
+    /* Otherwise the sum is p_mod plus the remainder, so remove p_mod once. */
+    vli_sub(p_result, p_result, p_mod);
+}
+#endif
+
+/* Modular subtraction: p_result = (p_left - p_right) % p_mod.
+   Requires p_left < p_mod and p_right < p_mod; p_result must not be p_mod itself. */
+#if !asm_modSub
+static void vli_modSub(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right, uECC_word_t *p_mod)
+{
+    if(vli_sub(p_result, p_left, p_right))
+    {
+        /* A borrow means the difference wrapped around; adding p_mod (and
+           letting that addition overflow) gives the value in [0, p_mod). */
+        vli_add(p_result, p_result, p_mod);
+    }
+}
+#endif
+
+/* Unless an assembly version is provided, subtraction modulo curve_p is just
+   vli_modSub() with curve_p passed as the modulus. */
+#if !asm_modSub_fast
+    #define vli_modSub_fast(result, left, right) vli_modSub((result), (left), (right), curve_p)
+#endif
+
+#if !asm_mmod_fast
+
+#if (uECC_CURVE == uECC_secp160r1 || uECC_CURVE == uECC_secp256k1)
+/* omega_mult() is defined farther below for the different curves / word sizes */
+static void omega_mult(uECC_word_t * RESTRICT p_result, uECC_word_t * RESTRICT p_right);
+
+/* Computes p_result = p_product % curve_p
+    see http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf page 354
+
+    p_product holds 2 * uECC_WORDS words and is overwritten as scratch space;
+    p_result receives uECC_WORDS words in the range [0, curve_p).
+
+    Note that this only works if log2(omega) < log2(p)/2 */
+static void vli_mmod_fast(uECC_word_t *RESTRICT p_result, uECC_word_t *RESTRICT p_product)
+{
+    uECC_word_t l_tmp[2*uECC_WORDS];
+    uECC_word_t l_carry;
+
+    /* omega_mult() accumulates into its output, so start from zero. */
+    vli_clear(l_tmp);
+    vli_clear(l_tmp + uECC_WORDS);
+
+    omega_mult(l_tmp, p_product + uECC_WORDS); /* (Rq, q) = q * c */
+
+    l_carry = vli_add(p_result, p_product, l_tmp); /* (C, r) = r + q       */
+    vli_clear(p_product);
+    omega_mult(p_product, l_tmp + uECC_WORDS); /* Rq*c */
+    l_carry += vli_add(p_result, p_result, p_product); /* (C1, r) = r + Rq*c */
+
+    /* Each outstanding carry stands for one multiple of 2^(word bits * uECC_WORDS);
+       remove it by subtracting curve_p (the subtraction wraps on purpose). */
+    while(l_carry > 0)
+    {
+        --l_carry;
+        vli_sub(p_result, p_result, curve_p);
+    }
+
+    /* Final conditional subtraction. The comparison includes equality so that
+       a value exactly equal to curve_p is reduced to zero, matching the other
+       vli_mmod_fast() variants, which reduce while curve_p <= p_result. */
+    if(vli_cmp(p_result, curve_p) >= 0)
+    {
+        vli_sub(p_result, p_result, curve_p);
+    }
+}
+
+#endif
+
+#if uECC_CURVE == uECC_secp160r1
+
+#if uECC_WORD_SIZE == 1
+/* Byte-word version: computes p_result = p_right * (2^31 + 1).
+   p_result[0..2] are read by the addition below without being written first,
+   and the carry loop adds into bytes at index uECC_WORDS and above, so the
+   routine relies on the caller passing a zeroed buffer (vli_mmod_fast()
+   clears it before each call). */
+static void omega_mult(uint8_t * RESTRICT p_result, uint8_t * RESTRICT p_right)
+{
+    uint8_t l_carry;
+    uint8_t i;
+
+    /* Multiply by (2^31 + 1). */
+    /* Copying p_right 4 bytes up is a multiplication by 2^32 ... */
+    vli_set(p_result + 4, p_right); /* 2^32 */
+    /* ... and one right shift of that copy turns it into 2^31. */
+    vli_rshift1(p_result + 4); /* 2^31 */
+    /* The shift dropped bit 0 of p_right; it belongs in the top bit of byte 3. */
+    p_result[3] = p_right[0] << 7; /* get last bit from shift */
+
+    l_carry = vli_add(p_result, p_result, p_right); /* 2^31 + 1 */
+    /* Ripple the carry of the low-part addition into the upper bytes. */
+    for(i = uECC_WORDS; l_carry; ++i)
+    {
+        uint16_t l_sum = (uint16_t)p_result[i] + l_carry;
+        p_result[i] = (uint8_t)l_sum;
+        l_carry = l_sum >> 8;
+    }
+}
+#elif uECC_WORD_SIZE == 4
+/* 32-bit-word version: computes p_result = p_right * (2^31 + 1).
+   The carry loop adds into words at index uECC_WORDS and above, so the routine
+   relies on the caller passing a zeroed buffer (vli_mmod_fast() clears it
+   before each call). */
+static void omega_mult(uint32_t * RESTRICT p_result, uint32_t * RESTRICT p_right)
+{
+    uint32_t l_carry;
+    unsigned i;
+
+    /* Multiply by (2^31 + 1). */
+    /* Copying p_right one word up is a multiplication by 2^32 ... */
+    vli_set(p_result + 1, p_right); /* 2^32 */
+    /* ... and one right shift of that copy turns it into 2^31. */
+    vli_rshift1(p_result + 1); /* 2^31 */
+    /* The shift dropped bit 0 of p_right; it belongs in the top bit of word 0. */
+    p_result[0] = p_right[0] << 31; /* get last bit from shift */
+
+    l_carry = vli_add(p_result, p_result, p_right); /* 2^31 + 1 */
+    /* Ripple the carry of the low-part addition into the upper words. */
+    for(i = uECC_WORDS; l_carry; ++i)
+    {
+        uint64_t l_sum = (uint64_t)p_result[i] + l_carry;
+        p_result[i] = (uint32_t)l_sum;
+        l_carry = l_sum >> 32;
+    }
+}
+#endif /* uECC_WORD_SIZE */
+
+#elif uECC_CURVE == uECC_secp192r1
+
+/* Computes p_result = p_product % curve_p.
+   See algorithm 5 and 6 from http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf */
+#if uECC_WORD_SIZE == 1
+/* Byte-word reduction for secp192r1: p_result = p_product % curve_p.
+   p_product is 48 bytes; its low 24 bytes are the starting value and three
+   terms built from the high 24 bytes are added before the final reduction. */
+static void vli_mmod_fast(uint8_t *RESTRICT p_result, uint8_t *RESTRICT p_product)
+{
+    uint8_t l_term[uECC_WORDS];
+    uint8_t l_carry;
+    uint8_t l_pos;
+
+    /* Start from the low half of the product. */
+    vli_set(p_result, p_product);
+
+    /* Term 1: the high half unchanged. */
+    vli_set(l_term, &p_product[24]);
+    l_carry = vli_add(p_result, p_result, l_term);
+
+    /* Term 2: product bytes 24..39 placed at bytes 8..23, low 8 bytes zero. */
+    for(l_pos = 0; l_pos < 8; ++l_pos)
+    {
+        l_term[l_pos] = 0;
+    }
+    for(l_pos = 8; l_pos < 24; ++l_pos)
+    {
+        l_term[l_pos] = p_product[l_pos + 16];
+    }
+    l_carry += vli_add(p_result, p_result, l_term);
+
+    /* Term 3: product bytes 40..47 placed at bytes 0..7 and again at 8..15, top 8 bytes zero. */
+    for(l_pos = 0; l_pos < 8; ++l_pos)
+    {
+        l_term[l_pos] = l_term[l_pos + 8] = p_product[l_pos + 40];
+    }
+    for(l_pos = 16; l_pos < 24; ++l_pos)
+    {
+        l_term[l_pos] = 0;
+    }
+    l_carry += vli_add(p_result, p_result, l_term);
+
+    /* Subtract curve_p while a carry is outstanding or the value is not below curve_p. */
+    while(l_carry || vli_cmp(curve_p, p_result) != 1)
+    {
+        l_carry -= vli_sub(p_result, p_result, curve_p);
+    }
+}
+#elif uECC_WORD_SIZE == 4
+/* 32-bit-word reduction for secp192r1: p_result = p_product % curve_p.
+   p_product is 12 words; its low 6 words are the starting value and three
+   terms built from the high 6 words are added before the final reduction. */
+static void vli_mmod_fast(uint32_t *RESTRICT p_result, uint32_t *RESTRICT p_product)
+{
+    uint32_t l_term[uECC_WORDS];
+    int l_excess;
+
+    /* Start from the low half of the product. */
+    vli_set(p_result, p_product);
+
+    /* Term 1: the high half unchanged. */
+    vli_set(l_term, p_product + 6);
+    l_excess = vli_add(p_result, p_result, l_term);
+
+    /* Term 2: product words 6..9 placed at words 2..5, low two words zero. */
+    l_term[0] = 0;
+    l_term[1] = 0;
+    l_term[2] = p_product[6];
+    l_term[3] = p_product[7];
+    l_term[4] = p_product[8];
+    l_term[5] = p_product[9];
+    l_excess += vli_add(p_result, p_result, l_term);
+
+    /* Term 3: product words 10..11 placed at words 0..1 and again at 2..3, top two words zero. */
+    l_term[0] = p_product[10];
+    l_term[1] = p_product[11];
+    l_term[2] = p_product[10];
+    l_term[3] = p_product[11];
+    l_term[4] = 0;
+    l_term[5] = 0;
+    l_excess += vli_add(p_result, p_result, l_term);
+
+    /* Subtract curve_p while a carry is outstanding or the value is not below curve_p. */
+    while(l_excess || vli_cmp(curve_p, p_result) != 1)
+    {
+        l_excess -= vli_sub(p_result, p_result, curve_p);
+    }
+}
+#else
+/* 64-bit-word reduction for secp192r1: p_result = p_product % curve_p.
+   p_product is 6 words; its low 3 words are the starting value and three
+   terms built from the high 3 words are added before the final reduction. */
+static void vli_mmod_fast(uint64_t *RESTRICT p_result, uint64_t *RESTRICT p_product)
+{
+    uint64_t l_term[uECC_WORDS];
+    int l_excess;
+
+    /* Start from the low half of the product. */
+    vli_set(p_result, p_product);
+
+    /* Term 1: the high half unchanged. */
+    vli_set(l_term, p_product + 3);
+    l_excess = vli_add(p_result, p_result, l_term);
+
+    /* Term 2: product words 3..4 placed at words 1..2, low word zero. */
+    l_term[0] = 0;
+    l_term[1] = p_product[3];
+    l_term[2] = p_product[4];
+    l_excess += vli_add(p_result, p_result, l_term);
+
+    /* Term 3: product word 5 placed at words 0 and 1, top word zero. */
+    l_term[0] = p_product[5];
+    l_term[1] = p_product[5];
+    l_term[2] = 0;
+    l_excess += vli_add(p_result, p_result, l_term);
+
+    /* Subtract curve_p while a carry is outstanding or the value is not below curve_p. */
+    while(l_excess || vli_cmp(curve_p, p_result) != 1)
+    {
+        l_excess -= vli_sub(p_result, p_result, curve_p);
+    }
+}
+#endif /* uECC_WORD_SIZE */
+
+#elif uECC_CURVE == uECC_secp256r1
+
+/* Computes p_result = p_product % curve_p
+   from http://www.nsa.gov/ia/_files/nist-routines.pdf */
+#if uECC_WORD_SIZE == 1
+/* Byte-word reduction for secp256r1: p_result = p_product % curve_p.
+   p_product is 64 bytes. The low 32 bytes (t) are the starting value; four
+   terms (s1..s4) are added and four (d1..d4) subtracted, each assembled in
+   l_tmp from 4-byte groups of the high 32 bytes. Every 4-byte group here
+   corresponds to one word of the uECC_WORD_SIZE == 4 variant of this routine.
+   l_carry tracks carries minus borrows across all steps. */
+static void vli_mmod_fast(uint8_t *RESTRICT p_result, uint8_t *RESTRICT p_product)
+{
+    uint8_t l_tmp[uECC_BYTES];
+    int8_t l_carry;
+
+    /* t */
+    vli_set(p_result, p_product);
+
+    /* s1 */
+    l_tmp[0] = l_tmp[1] = l_tmp[2] = l_tmp[3] = 0;
+    l_tmp[4] = l_tmp[5] = l_tmp[6] = l_tmp[7] = 0;
+    l_tmp[8] = l_tmp[9] = l_tmp[10] = l_tmp[11] = 0;
+    l_tmp[12] = p_product[44]; l_tmp[13] = p_product[45]; l_tmp[14] = p_product[46]; l_tmp[15] = p_product[47];
+    l_tmp[16] = p_product[48]; l_tmp[17] = p_product[49]; l_tmp[18] = p_product[50]; l_tmp[19] = p_product[51];
+    l_tmp[20] = p_product[52]; l_tmp[21] = p_product[53]; l_tmp[22] = p_product[54]; l_tmp[23] = p_product[55];
+    l_tmp[24] = p_product[56]; l_tmp[25] = p_product[57]; l_tmp[26] = p_product[58]; l_tmp[27] = p_product[59];
+    l_tmp[28] = p_product[60]; l_tmp[29] = p_product[61]; l_tmp[30] = p_product[62]; l_tmp[31] = p_product[63];
+    /* s1 is added twice: double it in place, then add. */
+    l_carry = vli_add(l_tmp, l_tmp, l_tmp);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s2 */
+    /* Bytes 0..11 of l_tmp are still zero from s1 (doubling zero leaves zero). */
+    l_tmp[12] = p_product[48]; l_tmp[13] = p_product[49]; l_tmp[14] = p_product[50]; l_tmp[15] = p_product[51];
+    l_tmp[16] = p_product[52]; l_tmp[17] = p_product[53]; l_tmp[18] = p_product[54]; l_tmp[19] = p_product[55];
+    l_tmp[20] = p_product[56]; l_tmp[21] = p_product[57]; l_tmp[22] = p_product[58]; l_tmp[23] = p_product[59];
+    l_tmp[24] = p_product[60]; l_tmp[25] = p_product[61]; l_tmp[26] = p_product[62]; l_tmp[27] = p_product[63];
+    l_tmp[28] = l_tmp[29] = l_tmp[30] = l_tmp[31] = 0;
+    /* s2 is also added twice. */
+    l_carry += vli_add(l_tmp, l_tmp, l_tmp);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s3 */
+    l_tmp[0] = p_product[32]; l_tmp[1] = p_product[33]; l_tmp[2] = p_product[34]; l_tmp[3] = p_product[35];
+    l_tmp[4] = p_product[36]; l_tmp[5] = p_product[37]; l_tmp[6] = p_product[38]; l_tmp[7] = p_product[39];
+    l_tmp[8] = p_product[40]; l_tmp[9] = p_product[41]; l_tmp[10] = p_product[42]; l_tmp[11] = p_product[43];
+    l_tmp[12] = l_tmp[13] = l_tmp[14] = l_tmp[15] = 0;
+    l_tmp[16] = l_tmp[17] = l_tmp[18] = l_tmp[19] = 0;
+    l_tmp[20] = l_tmp[21] = l_tmp[22] = l_tmp[23] = 0;
+    l_tmp[24] = p_product[56]; l_tmp[25] = p_product[57]; l_tmp[26] = p_product[58]; l_tmp[27] = p_product[59];
+    l_tmp[28] = p_product[60]; l_tmp[29] = p_product[61]; l_tmp[30] = p_product[62]; l_tmp[31] = p_product[63];
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s4 */
+    l_tmp[0] = p_product[36]; l_tmp[1] = p_product[37]; l_tmp[2] = p_product[38]; l_tmp[3] = p_product[39];
+    l_tmp[4] = p_product[40]; l_tmp[5] = p_product[41]; l_tmp[6] = p_product[42]; l_tmp[7] = p_product[43];
+    l_tmp[8] = p_product[44]; l_tmp[9] = p_product[45]; l_tmp[10] = p_product[46]; l_tmp[11] = p_product[47];
+    l_tmp[12] = p_product[52]; l_tmp[13] = p_product[53]; l_tmp[14] = p_product[54]; l_tmp[15] = p_product[55];
+    l_tmp[16] = p_product[56]; l_tmp[17] = p_product[57]; l_tmp[18] = p_product[58]; l_tmp[19] = p_product[59];
+    l_tmp[20] = p_product[60]; l_tmp[21] = p_product[61]; l_tmp[22] = p_product[62]; l_tmp[23] = p_product[63];
+    l_tmp[24] = p_product[52]; l_tmp[25] = p_product[53]; l_tmp[26] = p_product[54]; l_tmp[27] = p_product[55];
+    l_tmp[28] = p_product[32]; l_tmp[29] = p_product[33]; l_tmp[30] = p_product[34]; l_tmp[31] = p_product[35];
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* d1 */
+    l_tmp[0] = p_product[44]; l_tmp[1] = p_product[45]; l_tmp[2] = p_product[46]; l_tmp[3] = p_product[47];
+    l_tmp[4] = p_product[48]; l_tmp[5] = p_product[49]; l_tmp[6] = p_product[50]; l_tmp[7] = p_product[51];
+    l_tmp[8] = p_product[52]; l_tmp[9] = p_product[53]; l_tmp[10] = p_product[54]; l_tmp[11] = p_product[55];
+    l_tmp[12] = l_tmp[13] = l_tmp[14] = l_tmp[15] = 0;
+    l_tmp[16] = l_tmp[17] = l_tmp[18] = l_tmp[19] = 0;
+    l_tmp[20] = l_tmp[21] = l_tmp[22] = l_tmp[23] = 0;
+    l_tmp[24] = p_product[32]; l_tmp[25] = p_product[33]; l_tmp[26] = p_product[34]; l_tmp[27] = p_product[35];
+    l_tmp[28] = p_product[40]; l_tmp[29] = p_product[41]; l_tmp[30] = p_product[42]; l_tmp[31] = p_product[43];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d2 */
+    l_tmp[0] = p_product[48]; l_tmp[1] = p_product[49]; l_tmp[2] = p_product[50]; l_tmp[3] = p_product[51];
+    l_tmp[4] = p_product[52]; l_tmp[5] = p_product[53]; l_tmp[6] = p_product[54]; l_tmp[7] = p_product[55];
+    l_tmp[8] = p_product[56]; l_tmp[9] = p_product[57]; l_tmp[10] = p_product[58]; l_tmp[11] = p_product[59];
+    l_tmp[12] = p_product[60]; l_tmp[13] = p_product[61]; l_tmp[14] = p_product[62]; l_tmp[15] = p_product[63];
+    l_tmp[16] = l_tmp[17] = l_tmp[18] = l_tmp[19] = 0;
+    l_tmp[20] = l_tmp[21] = l_tmp[22] = l_tmp[23] = 0;
+    l_tmp[24] = p_product[36]; l_tmp[25] = p_product[37]; l_tmp[26] = p_product[38]; l_tmp[27] = p_product[39];
+    l_tmp[28] = p_product[44]; l_tmp[29] = p_product[45]; l_tmp[30] = p_product[46]; l_tmp[31] = p_product[47];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d3 */
+    l_tmp[0] = p_product[52]; l_tmp[1] = p_product[53]; l_tmp[2] = p_product[54]; l_tmp[3] = p_product[55];
+    l_tmp[4] = p_product[56]; l_tmp[5] = p_product[57]; l_tmp[6] = p_product[58]; l_tmp[7] = p_product[59];
+    l_tmp[8] = p_product[60]; l_tmp[9] = p_product[61]; l_tmp[10] = p_product[62]; l_tmp[11] = p_product[63];
+    l_tmp[12] = p_product[32]; l_tmp[13] = p_product[33]; l_tmp[14] = p_product[34]; l_tmp[15] = p_product[35];
+    l_tmp[16] = p_product[36]; l_tmp[17] = p_product[37]; l_tmp[18] = p_product[38]; l_tmp[19] = p_product[39];
+    l_tmp[20] = p_product[40]; l_tmp[21] = p_product[41]; l_tmp[22] = p_product[42]; l_tmp[23] = p_product[43];
+    l_tmp[24] = l_tmp[25] = l_tmp[26] = l_tmp[27] = 0;
+    l_tmp[28] = p_product[48]; l_tmp[29] = p_product[49]; l_tmp[30] = p_product[50]; l_tmp[31] = p_product[51];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d4 */
+    l_tmp[0] = p_product[56]; l_tmp[1] = p_product[57]; l_tmp[2] = p_product[58]; l_tmp[3] = p_product[59];
+    l_tmp[4] = p_product[60]; l_tmp[5] = p_product[61]; l_tmp[6] = p_product[62]; l_tmp[7] = p_product[63];
+    l_tmp[8] = l_tmp[9] = l_tmp[10] = l_tmp[11] = 0;
+    l_tmp[12] = p_product[36]; l_tmp[13] = p_product[37]; l_tmp[14] = p_product[38]; l_tmp[15] = p_product[39];
+    l_tmp[16] = p_product[40]; l_tmp[17] = p_product[41]; l_tmp[18] = p_product[42]; l_tmp[19] = p_product[43];
+    l_tmp[20] = p_product[44]; l_tmp[21] = p_product[45]; l_tmp[22] = p_product[46]; l_tmp[23] = p_product[47];
+    l_tmp[24] = l_tmp[25] = l_tmp[26] = l_tmp[27] = 0;
+    l_tmp[28] = p_product[52]; l_tmp[29] = p_product[53]; l_tmp[30] = p_product[54]; l_tmp[31] = p_product[55];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* Net borrow: add curve_p back until the running carry is no longer negative. */
+    if(l_carry < 0)
+    {
+        do
+        {
+            l_carry += vli_add(p_result, p_result, curve_p);
+        } while(l_carry < 0);
+    }
+    else
+    {
+        /* Net carry (or value not below curve_p): subtract curve_p until in range. */
+        while(l_carry || vli_cmp(curve_p, p_result) != 1)
+        {
+            l_carry -= vli_sub(p_result, p_result, curve_p);
+        }
+    }
+}
+#elif uECC_WORD_SIZE == 4
+/* 32-bit-word reduction for secp256r1: p_result = p_product % curve_p.
+   p_product is 16 words. The low 8 words (t) are the starting value; four
+   terms (s1..s4) are added and four (d1..d4) subtracted, each assembled in
+   l_tmp from words 8..15 of the product. l_carry tracks carries minus borrows
+   across all steps. */
+static void vli_mmod_fast(uint32_t *RESTRICT p_result, uint32_t *RESTRICT p_product)
+{
+    uint32_t l_tmp[uECC_WORDS];
+    int l_carry;
+
+    /* t */
+    vli_set(p_result, p_product);
+
+    /* s1 */
+    l_tmp[0] = l_tmp[1] = l_tmp[2] = 0;
+    l_tmp[3] = p_product[11];
+    l_tmp[4] = p_product[12];
+    l_tmp[5] = p_product[13];
+    l_tmp[6] = p_product[14];
+    l_tmp[7] = p_product[15];
+    /* s1 is added twice: double it in place, then add. */
+    l_carry = vli_add(l_tmp, l_tmp, l_tmp);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s2 */
+    /* Words 0..2 of l_tmp are still zero from s1 (doubling zero leaves zero). */
+    l_tmp[3] = p_product[12];
+    l_tmp[4] = p_product[13];
+    l_tmp[5] = p_product[14];
+    l_tmp[6] = p_product[15];
+    l_tmp[7] = 0;
+    /* s2 is also added twice. */
+    l_carry += vli_add(l_tmp, l_tmp, l_tmp);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s3 */
+    l_tmp[0] = p_product[8];
+    l_tmp[1] = p_product[9];
+    l_tmp[2] = p_product[10];
+    l_tmp[3] = l_tmp[4] = l_tmp[5] = 0;
+    l_tmp[6] = p_product[14];
+    l_tmp[7] = p_product[15];
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s4 */
+    l_tmp[0] = p_product[9];
+    l_tmp[1] = p_product[10];
+    l_tmp[2] = p_product[11];
+    l_tmp[3] = p_product[13];
+    l_tmp[4] = p_product[14];
+    l_tmp[5] = p_product[15];
+    l_tmp[6] = p_product[13];
+    l_tmp[7] = p_product[8];
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* d1 */
+    l_tmp[0] = p_product[11];
+    l_tmp[1] = p_product[12];
+    l_tmp[2] = p_product[13];
+    l_tmp[3] = l_tmp[4] = l_tmp[5] = 0;
+    l_tmp[6] = p_product[8];
+    l_tmp[7] = p_product[10];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d2 */
+    l_tmp[0] = p_product[12];
+    l_tmp[1] = p_product[13];
+    l_tmp[2] = p_product[14];
+    l_tmp[3] = p_product[15];
+    l_tmp[4] = l_tmp[5] = 0;
+    l_tmp[6] = p_product[9];
+    l_tmp[7] = p_product[11];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d3 */
+    l_tmp[0] = p_product[13];
+    l_tmp[1] = p_product[14];
+    l_tmp[2] = p_product[15];
+    l_tmp[3] = p_product[8];
+    l_tmp[4] = p_product[9];
+    l_tmp[5] = p_product[10];
+    l_tmp[6] = 0;
+    l_tmp[7] = p_product[12];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d4 */
+    l_tmp[0] = p_product[14];
+    l_tmp[1] = p_product[15];
+    l_tmp[2] = 0;
+    l_tmp[3] = p_product[9];
+    l_tmp[4] = p_product[10];
+    l_tmp[5] = p_product[11];
+    l_tmp[6] = 0;
+    l_tmp[7] = p_product[13];
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* Net borrow: add curve_p back until the running carry is no longer negative. */
+    if(l_carry < 0)
+    {
+        do
+        {
+            l_carry += vli_add(p_result, p_result, curve_p);
+        } while(l_carry < 0);
+    }
+    else
+    {
+        /* Net carry (or value not below curve_p): subtract curve_p until in range. */
+        while(l_carry || vli_cmp(curve_p, p_result) != 1)
+        {
+            l_carry -= vli_sub(p_result, p_result, curve_p);
+        }
+    }
+}
+#else
+static void vli_mmod_fast(uint64_t *RESTRICT p_result, uint64_t *RESTRICT p_product)
+{   /* Reduces the double-width value p_product modulo curve_p into p_result.
+     * Variant for 64-bit words: p_product[4..7] is the high half, and it is
+     * folded back into the low half as
+     *     t + 2*s1 + 2*s2 + s3 + s4 - d1 - d2 - d3 - d4
+     * where every term is assembled in l_tmp from 32-bit pieces of the high
+     * half.
+     * NOTE(review): this term layout looks like the NIST P-256 fast
+     * reduction; the curve-selecting #if is outside this block - confirm.
+     * l_carry is the signed running total of carries out of the additions
+     * minus borrows out of the subtractions (assuming vli_add/vli_sub return
+     * the carry/borrow bit, as their use here suggests). */
+    uint64_t l_tmp[uECC_WORDS];
+    int l_carry;
+
+    /* t: start from the low half of the product (vli_set presumably copies
+     * uECC_WORDS words) */
+    vli_set(p_result, p_product);
+
+    /* s1, counted twice: doubled in place first, then added once */
+    l_tmp[0] = 0;
+    l_tmp[1] = p_product[5] & 0xffffffff00000000ull;
+    l_tmp[2] = p_product[6];
+    l_tmp[3] = p_product[7];
+    l_carry = vli_add(l_tmp, l_tmp, l_tmp);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s2, also counted twice; l_tmp[0] is not rewritten because it was 0 in
+     * s1 and doubling leaves it 0 */
+    l_tmp[1] = p_product[6] << 32;
+    l_tmp[2] = (p_product[6] >> 32) | (p_product[7] << 32);
+    l_tmp[3] = p_product[7] >> 32;
+    l_carry += vli_add(l_tmp, l_tmp, l_tmp);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s3, added once */
+    l_tmp[0] = p_product[4];
+    l_tmp[1] = p_product[5] & 0xffffffff;
+    l_tmp[2] = 0;
+    l_tmp[3] = p_product[7];
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* s4, added once */
+    l_tmp[0] = (p_product[4] >> 32) | (p_product[5] << 32);
+    l_tmp[1] = (p_product[5] >> 32) | (p_product[6] & 0xffffffff00000000ull);
+    l_tmp[2] = p_product[7];
+    l_tmp[3] = (p_product[6] >> 32) | (p_product[4] << 32);
+    l_carry += vli_add(p_result, p_result, l_tmp);
+
+    /* d1, subtracted */
+    l_tmp[0] = (p_product[5] >> 32) | (p_product[6] << 32);
+    l_tmp[1] = (p_product[6] >> 32);
+    l_tmp[2] = 0;
+    l_tmp[3] = (p_product[4] & 0xffffffff) | (p_product[5] << 32);
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d2, subtracted */
+    l_tmp[0] = p_product[6];
+    l_tmp[1] = p_product[7];
+    l_tmp[2] = 0;
+    l_tmp[3] = (p_product[4] >> 32) | (p_product[5] & 0xffffffff00000000ull);
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d3, subtracted */
+    l_tmp[0] = (p_product[6] >> 32) | (p_product[7] << 32);
+    l_tmp[1] = (p_product[7] >> 32) | (p_product[4] << 32);
+    l_tmp[2] = (p_product[4] >> 32) | (p_product[5] << 32);
+    l_tmp[3] = (p_product[6] << 32);
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    /* d4, subtracted */
+    l_tmp[0] = p_product[7];
+    l_tmp[1] = p_product[4] & 0xffffffff00000000ull;
+    l_tmp[2] = p_product[5];
+    l_tmp[3] = p_product[6] & 0xffffffff00000000ull;
+    l_carry -= vli_sub(p_result, p_result, l_tmp);
+
+    if(l_carry < 0)
+    {   /* net borrow: add curve_p back until the borrow is cancelled */
+        do
+        {
+            l_carry += vli_add(p_result, p_result, curve_p);
+        } while(l_carry < 0);
+    }
+    else
+    {   /* net carry, or p_result not below curve_p (vli_cmp(curve_p, p_result)
+         * is presumably 1 only when curve_p > p_result): subtract curve_p */
+        while(l_carry || vli_cmp(curve_p, p_result) != 1)
+        {
+            l_carry -= vli_sub(p_result, p_result, curve_p);
+        }
+    }
+}
+#endif /* uECC_WORD_SIZE */
+
+#elif uECC_CURVE == uECC_secp256k1
+
+#if uECC_WORD_SIZE == 1
+static void omega_mult(uint8_t * RESTRICT p_result, uint8_t * RESTRICT p_right)
+{
+    /* Multiply by (2^32 + 2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1).
+     * 8-bit word variant: p_result = p_right * 0x1000003D1. The 0x3D1 part is
+     * split into its bytes 0xD1 (low) and 0x03 (high) and accumulated column
+     * by column in r2:r1:r0; the 2^32 part is added at the end as p_right
+     * shifted up by four bytes.
+     * NOTE(review): muladd is defined outside this block; it presumably adds
+     * the product of its first two arguments into the r0/r1/r2 accumulator. */
+    uECC_word_t r0 = 0;
+    uECC_word_t r1 = 0;
+    uECC_word_t r2 = 0;
+
+    wordcount_t k;
+
+    /* Multiply by (2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1). */
+    muladd(0xD1, p_right[0], &r0, &r1, &r2);
+    p_result[0] = r0;
+    r0 = r1; /* shift the accumulator down one byte for the next column */
+    r1 = r2;
+    /* r2 is still 0 */
+
+    for(k = 1; k < uECC_WORDS; ++k)
+    {
+        muladd(0x03, p_right[k-1], &r0, &r1, &r2); /* high byte of 0x3D1 times the previous input byte */
+        muladd(0xD1, p_right[k], &r0, &r1, &r2); /* low byte of 0x3D1 times the current input byte */
+        p_result[k] = r0;
+        r0 = r1;
+        r1 = r2;
+        r2 = 0;
+    }
+
+    muladd(0x03, p_right[uECC_WORDS-1], &r0, &r1, &r2); /* last column has only the high-byte term */
+    p_result[uECC_WORDS] = r0;
+    p_result[uECC_WORDS + 1] = r1;
+
+    p_result[4 + uECC_WORDS] = vli_add(p_result + 4, p_result + 4, p_right); /* add the 2^32 multiple;
+        NOTE(review): if vli_add covers uECC_WORDS words this also reads p_result[uECC_WORDS + 2]
+        and p_result[uECC_WORDS + 3], which this function never writes - presumably the caller
+        zeroes p_result first; confirm */
+}
+#elif uECC_WORD_SIZE == 4
+static void omega_mult(uint32_t * RESTRICT p_result, uint32_t * RESTRICT p_right)
+{
+    /* Multiply by (2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1).
+     * 32-bit word variant: the loop computes p_right * 0x3D1 one word at a
+     * time through a 64-bit intermediate, carrying the high 32 bits of each
+     * partial product into the next word. The 2^32 multiple is added
+     * afterwards as p_right shifted up by one word, so the overall factor is
+     * 2^32 + 0x3D1. */
+    uint32_t l_carry = 0;
+    wordcount_t k;
+
+    for(k = 0; k < uECC_WORDS; ++k)
+    {
+        uint64_t p = (uint64_t)0x3D1 * p_right[k] + l_carry;
+        p_result[k] = (p & 0xffffffff);
+        l_carry = p >> 32;
+    }
+    p_result[uECC_WORDS] = l_carry; /* final carry becomes the extra top word */
+
+    p_result[1 + uECC_WORDS] = vli_add(p_result + 1, p_result + 1, p_right); /* add the 2^32 multiple */
+}
+#else
+static void omega_mult(uint64_t * RESTRICT p_result, uint64_t * RESTRICT p_right)
+{
+    uECC_word_t r0 = 0;
+    uECC_word_t r1 = 0;
+    uECC_word_t r2 = 0;
+
+    wordcount_t k;
+
+    /* Multiply by (2^32 + 2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1).
+     * 64-bit word variant: the whole factor 0x1000003D1 fits in one word, so
+     * each input word needs a single multiply-accumulate. p_result receives
+     * uECC_WORDS + 1 words.
+     * NOTE(review): muladd is defined outside this block; it presumably adds
+     * the product of its first two arguments into the r0/r1/r2 accumulator. */
+    for(k = 0; k < uECC_WORDS; ++k)
+    {
+        muladd(0x1000003D1ull, p_right[k], &r0, &r1, &r2);
+        p_result[k] = r0;
+        r0 = r1; /* shift the accumulator down one word for the next column */
+        r1 = r2;
+        r2 = 0;
+    }
+
+    p_result[uECC_WORDS] = r0; /* leftover carry word */
 }
+#endif /* uECC_WORD_SIZE */
 
-static uint32_t sub( const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length){
-       uint64_t d = 0;
-       int v;
-       for(v = 0;v < length; v++){
-               d = (uint64_t) x[v] - (uint64_t) y[v] - d;
-               result[v] = d & 0xFFFFFFFF;
-               d = d>>32;
-               d &= 0x1;
-       }       
-       return (uint32_t)d;
-}
-
-static void rshiftby(const uint32_t *in, uint8_t in_size, uint32_t *out, uint8_t out_size, uint8_t shift) {
-       int i;
-
-       for (i = 0; i < (in_size - shift) && i < out_size; i++)
-               out[i] = in[i + shift];
-       for (/* reuse i */; i < out_size; i++)
-               out[i] = 0;
-}
-
-//finite field functions
-//FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF
-static const uint32_t ecc_prime_m[8] = {0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
-                                       0x00000000, 0x00000000, 0x00000001, 0xffffffff};
-
-                                                       
-/* This is added after an static byte addition if the answer has a carry in MSB*/
-static const uint32_t ecc_prime_r[8] = {0x00000001, 0x00000000, 0x00000000, 0xffffffff,
-                                       0xffffffff, 0xffffffff, 0xfffffffe, 0x00000000};
-
-// ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
-static const uint32_t ecc_order_m[9] = {0xFC632551, 0xF3B9CAC2, 0xA7179E84, 0xBCE6FAAD,
-                                       0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
-                                       0x00000000};
-
-static const uint32_t ecc_order_r[8] = {0x039CDAAF, 0x0C46353D, 0x58E8617B, 0x43190552,
-                                       0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000};
-
-static const uint32_t ecc_order_mu[9] = {0xEEDF9BFE, 0x012FFD85, 0xDF1A6C21, 0x43190552,
-                                        0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0x00000000,
-                                        0x00000001};
-
-static const uint8_t ecc_order_k = 8;
-
-const uint32_t ecc_g_point_x[8] = { 0xD898C296, 0xF4A13945, 0x2DEB33A0, 0x77037D81,
-                                   0x63A440F2, 0xF8BCE6E5, 0xE12C4247, 0x6B17D1F2};
-const uint32_t ecc_g_point_y[8] = { 0x37BF51F5, 0xCBB64068, 0x6B315ECE, 0x2BCE3357,
-                                   0x7C0F9E16, 0x8EE7EB4A, 0xFE1A7F9B, 0x4FE342E2};
-
-
-static void setZero(uint32_t *A, const int length){
-       memset(A, 0x0, length * sizeof(uint32_t));
-}
-
-/*
- * copy one array to another
- */
-static void copy(const uint32_t *from, uint32_t *to, uint8_t length){
-       memcpy(to, from, length * sizeof(uint32_t));
-}
-
-static int isSame(const uint32_t *A, const uint32_t *B, uint8_t length){
-       return !memcmp(A, B, length * sizeof(uint32_t));
-}
-
-//is A greater than B?
-static int isGreater(const uint32_t *A, const uint32_t *B, uint8_t length){
-       int i;
-       for (i = length-1; i >= 0; --i)
-       {
-               if(A[i] > B[i])
-                       return 1;
-               if(A[i] < B[i])
-                       return -1;
-       }
-       return 0;
-}
-
-
-static int fieldAdd(const uint32_t *x, const uint32_t *y, const uint32_t *reducer, uint32_t *result){
-       if(add(x, y, result, arrayLength)){ //add prime if carry is still set!
-               uint32_t tempas[8];
-               setZero(tempas, 8);
-               add(result, reducer, tempas, arrayLength);
-               copy(tempas, result, arrayLength);
-       }
-       return 0;
-}
-
-static int fieldSub(const uint32_t *x, const uint32_t *y, const uint32_t *modulus, uint32_t *result){
-       if(sub(x, y, result, arrayLength)){ //add modulus if carry is set
-               uint32_t tempas[8];
-               setZero(tempas, 8);
-               add(result, modulus, tempas, arrayLength);
-               copy(tempas, result, arrayLength);
-       }
-       return 0;
-}
-
-//finite Field multiplication
-//32bit * 32bit = 64bit
-static int fieldMult(const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length){
-       uint32_t temp[length * 2];
-       setZero(temp, length * 2);
-       setZero(result, length * 2);
-       uint8_t k, n;
-       uint64_t l;
-       for (k = 0; k < length; k++){
-               for (n = 0; n < length; n++){ 
-                       l = (uint64_t)x[n]*(uint64_t)y[k];
-                       temp[n+k] = l&0xFFFFFFFF;
-                       temp[n+k+1] = l>>32;
-                       add(&temp[n+k], &result[n+k], &result[n+k], (length * 2) - (n + k));
-
-                       setZero(temp, length * 2);
-               }
-       }
-       return 0;
-}
-
-//TODO: maximum:
-//fffffffe00000002fffffffe0000000100000001fffffffe00000001fffffffe00000001fffffffefffffffffffffffffffffffe000000000000000000000001_16
-static void fieldModP(uint32_t *A, const uint32_t *B)
-{
-       uint32_t tempm[8];
-       uint32_t tempm2[8];
-       uint8_t n;
-       setZero(tempm, 8);
-       setZero(tempm2, 8);
-       /* A = T */ 
-       copy(B,A,arrayLength);
-
-       /* Form S1 */ 
-       for(n=0;n<3;n++) tempm[n]=0; 
-       for(n=3;n<8;n++) tempm[n]=B[n+8];
-
-       /* tempm2=T+S1 */ 
-       fieldAdd(A,tempm,ecc_prime_r,tempm2);
-       /* A=T+S1+S1 */ 
-       fieldAdd(tempm2,tempm,ecc_prime_r,A);
-       /* Form S2 */ 
-       for(n=0;n<3;n++) tempm[n]=0; 
-       for(n=3;n<7;n++) tempm[n]=B[n+9]; 
-       for(n=7;n<8;n++) tempm[n]=0;
-       /* tempm2=T+S1+S1+S2 */ 
-       fieldAdd(A,tempm,ecc_prime_r,tempm2);
-       /* A=T+S1+S1+S2+S2 */ 
-       fieldAdd(tempm2,tempm,ecc_prime_r,A);
-       /* Form S3 */ 
-       for(n=0;n<3;n++) tempm[n]=B[n+8]; 
-       for(n=3;n<6;n++) tempm[n]=0; 
-       for(n=6;n<8;n++) tempm[n]=B[n+8];
-       /* tempm2=T+S1+S1+S2+S2+S3 */ 
-       fieldAdd(A,tempm,ecc_prime_r,tempm2);
-       /* Form S4 */ 
-       for(n=0;n<3;n++) tempm[n]=B[n+9]; 
-       for(n=3;n<6;n++) tempm[n]=B[n+10]; 
-       for(n=6;n<7;n++) tempm[n]=B[n+7]; 
-       for(n=7;n<8;n++) tempm[n]=B[n+1];
-       /* A=T+S1+S1+S2+S2+S3+S4 */ 
-       fieldAdd(tempm2,tempm,ecc_prime_r,A);
-       /* Form D1 */ 
-       for(n=0;n<3;n++) tempm[n]=B[n+11]; 
-       for(n=3;n<6;n++) tempm[n]=0; 
-       for(n=6;n<7;n++) tempm[n]=B[n+2]; 
-       for(n=7;n<8;n++) tempm[n]=B[n+3];
-       /* tempm2=T+S1+S1+S2+S2+S3+S4-D1 */ 
-       fieldSub(A,tempm,ecc_prime_m,tempm2);
-       /* Form D2 */ 
-       for(n=0;n<4;n++) tempm[n]=B[n+12]; 
-       for(n=4;n<6;n++) tempm[n]=0; 
-       for(n=6;n<7;n++) tempm[n]=B[n+3]; 
-       for(n=7;n<8;n++) tempm[n]=B[n+4];
-       /* A=T+S1+S1+S2+S2+S3+S4-D1-D2 */ 
-       fieldSub(tempm2,tempm,ecc_prime_m,A);
-       /* Form D3 */ 
-       for(n=0;n<3;n++) tempm[n]=B[n+13]; 
-       for(n=3;n<6;n++) tempm[n]=B[n+5]; 
-       for(n=6;n<7;n++) tempm[n]=0; 
-       for(n=7;n<8;n++) tempm[n]=B[n+5];
-       /* tempm2=T+S1+S1+S2+S2+S3+S4-D1-D2-D3 */ 
-       fieldSub(A,tempm,ecc_prime_m,tempm2);
-       /* Form D4 */ 
-       for(n=0;n<2;n++) tempm[n]=B[n+14]; 
-       for(n=2;n<3;n++) tempm[n]=0; 
-       for(n=3;n<6;n++) tempm[n]=B[n+6]; 
-       for(n=6;n<7;n++) tempm[n]=0; 
-       for(n=7;n<8;n++) tempm[n]=B[n+6];
-       /* A=T+S1+S1+S2+S2+S3+S4-D1-D2-D3-D4 */ 
-       fieldSub(tempm2,tempm,ecc_prime_m,A);
-       if(isGreater(A, ecc_prime_m, arrayLength) >= 0){
-               fieldSub(A, ecc_prime_m, ecc_prime_m, tempm);
-               copy(tempm, A, arrayLength);
-       }
-}
-
-/**
- * calculate the result = A mod n.
- * n is the order of the eliptic curve.
- * A and result could point to the same value
- *
- * A: input value (max size * 4 bytes)
- * result: result of modulo calculation (max 36 bytes)
- * size: size of A
- *
- * This uses the Barrett modular reduction as described in the Handbook 
- * of Applied Cryptography 14.42 Algorithm Barrett modular reduction, 
- * see http://cacr.uwaterloo.ca/hac/about/chap14.pdf and 
- * http://everything2.com/title/Barrett+Reduction
- *
- * b = 32 (bite size of the processor architecture)
- * mu (ecc_order_mu) was precomputed in a java program
- */
-static void fieldModO(const uint32_t *A, uint32_t *result, uint8_t length) {
-       // This is used for value q1 and q3
-       uint32_t q1_q3[9];
-       // This is used for q2 and a temp var
-       uint32_t q2_tmp[18];
-
-       // return if the given value is smaller than the modulus
-       if (length == arrayLength && isGreater(A, ecc_order_m, arrayLength) <= 0) {
-               if (A != result)
-                       copy(A, result, length);
-               return;
-       }
-
-       rshiftby(A, length, q1_q3, 9, ecc_order_k - 1);
-
-       fieldMult(ecc_order_mu, q1_q3, q2_tmp, 9);
-
-       rshiftby(q2_tmp, 18, q1_q3, 8, ecc_order_k + 1);
-
-       // r1 = first 9 blocks of A
-
-       fieldMult(q1_q3, ecc_order_m, q2_tmp, 8);
-
-       // r2 = first 9 blocks of q2_tmp
-
-       sub(A, q2_tmp, result, 9);
-
-       while (isGreater(result, ecc_order_m, 9) >= 0)
-               sub(result, ecc_order_m, result, 9);
-}
-
-static int isOne(const uint32_t* A){
-       uint8_t n; 
-       for(n=1;n<8;n++) 
-               if (A[n]!=0) 
-                       break;
-
-       if ((n==8)&&(A[0]==1)) 
-               return 1;
-       else 
-               return 0;
-}
-
-static int isZero(const uint32_t* A){
-       uint8_t n, r=0;
-       for(n=0;n<8;n++){
-               if (A[n] == 0) r++;
-       }
-       return r==8;
-}
-
-static void rshift(uint32_t* A){
-       int n, i;
-       uint32_t nOld = 0;
-       for (i = 8; i--;)
-       {
-               n = A[i]&0x1;
-               A[i] = A[i]>>1 | nOld<<31;
-               nOld = n;
-       }
-}
-
-static int fieldAddAndDivide(const uint32_t *x, const uint32_t *modulus, const uint32_t *reducer, uint32_t* result){
-       uint32_t n = add(x, modulus, result, arrayLength);
-       rshift(result);
-       if(n){ //add prime if carry is still set!
-               result[7] |= 0x80000000;//add the carry
-               if (isGreater(result, modulus, arrayLength) == 1)
-               {
-                       uint32_t tempas[8];
-                       setZero(tempas, 8);
-                       add(result, reducer, tempas, 8);
-                       copy(tempas, result, arrayLength);
-               }
-               
-       }
-       return 0;
-}
-
-/*
- * Inverse A and output to B
- */
-static void fieldInv(const uint32_t *A, const uint32_t *modulus, const uint32_t *reducer, uint32_t *B){
-       uint32_t u[8],v[8],x1[8],x2[8];
-       uint32_t tempm[8];
-       uint32_t tempm2[8];
-       setZero(tempm, 8);
-       setZero(tempm2, 8);
-       setZero(u, 8);
-       setZero(v, 8);
-
-       uint8_t t;
-       copy(A,u,arrayLength); 
-       copy(modulus,v,arrayLength); 
-       setZero(x1, 8);
-       setZero(x2, 8);
-       x1[0]=1; 
-       /* While u !=1 and v !=1 */ 
-       while ((isOne(u) || isOne(v))==0) {
-               while(!(u[0]&1)) {                                      /* While u is even */
-                       rshift(u);                                              /* divide by 2 */
-                       if (!(x1[0]&1))                                 /*ifx1iseven*/
-                               rshift(x1);                                     /* Divide by 2 */
-                       else {
-                               fieldAddAndDivide(x1,modulus,reducer,tempm); /* tempm=x1+p */
-                               copy(tempm,x1,arrayLength);             /* x1=tempm */
-                               //rshift(x1);                                   /* Divide by 2 */
-                       }
-               } 
-               while(!(v[0]&1)) {                                      /* While v is even */
-                       rshift(v);                                              /* divide by 2 */ 
-                       if (!(x2[0]&1))                                 /*ifx1iseven*/
-                               rshift(x2);                             /* Divide by 2 */
-                       else
-                       {
-                               fieldAddAndDivide(x2,modulus,reducer,tempm);    /* tempm=x1+p */
-                               copy(tempm,x2,arrayLength);                     /* x1=tempm */ 
-                               //rshift(x2);                                   /* Divide by 2 */
-                       }
-                       
-               } 
-               t=sub(u,v,tempm,arrayLength);                           /* tempm=u-v */
-               if (t==0) {                                                     /* If u > 0 */
-                       copy(tempm,u,arrayLength);                                      /* u=u-v */
-                       fieldSub(x1,x2,modulus,tempm);                  /* tempm=x1-x2 */
-                       copy(tempm,x1,arrayLength);                                     /* x1=x1-x2 */
-               } else {
-                       sub(v,u,tempm,arrayLength);                     /* tempm=v-u */
-                       copy(tempm,v,arrayLength);                                      /* v=v-u */
-                       fieldSub(x2,x1,modulus,tempm);                  /* tempm=x2-x1 */
-                       copy(tempm,x2,arrayLength);                                     /* x2=x2-x1 */
-               }
-       } 
-       if (isOne(u)) {
-               copy(x1,B,arrayLength); 
-       } else {
-               copy(x2,B,arrayLength);
-       }
-}
-
-void static ec_double(const uint32_t *px, const uint32_t *py, uint32_t *Dx, uint32_t *Dy){
-       uint32_t tempA[8];
-       uint32_t tempB[8];
-       uint32_t tempC[8];
-       uint32_t tempD[16];
-
-       if(isZero(px) && isZero(py)){
-               copy(px, Dx,arrayLength);
-               copy(py, Dy,arrayLength);
-               return;
-       }
-
-       fieldMult(px, px, tempD, arrayLength);
-       fieldModP(tempA, tempD);
-       setZero(tempB, 8);
-       tempB[0] = 0x00000001;
-       fieldSub(tempA, tempB, ecc_prime_m, tempC); //tempC = (qx^2-1)
-       tempB[0] = 0x00000003;
-       fieldMult(tempC, tempB, tempD, arrayLength);
-       fieldModP(tempA, tempD);//tempA = 3*(qx^2-1)
-       fieldAdd(py, py, ecc_prime_r, tempB); //tempB = 2*qy
-       fieldInv(tempB, ecc_prime_m, ecc_prime_r, tempC); //tempC = 1/(2*qy)
-       fieldMult(tempA, tempC, tempD, arrayLength); //tempB = lambda = (3*(qx^2-1))/(2*qy)
-       fieldModP(tempB, tempD);
-
-       fieldMult(tempB, tempB, tempD, arrayLength); //tempC = lambda^2
-       fieldModP(tempC, tempD);
-       fieldSub(tempC, px, ecc_prime_m, tempA); //lambda^2 - Px
-       fieldSub(tempA, px, ecc_prime_m, Dx); //lambda^2 - Px - Qx
-
-       fieldSub(px, Dx, ecc_prime_m, tempA); //tempA = qx-dx
-       fieldMult(tempB, tempA, tempD, arrayLength); //tempC = lambda * (qx-dx)
-       fieldModP(tempC, tempD);
-       fieldSub(tempC, py, ecc_prime_m, Dy); //Dy = lambda * (qx-dx) - px
-}
-
-void static ec_add(const uint32_t *px, const uint32_t *py, const uint32_t *qx, const uint32_t *qy, uint32_t *Sx, uint32_t *Sy){
-       uint32_t tempA[8];
-       uint32_t tempB[8];
-       uint32_t tempC[8];
-       uint32_t tempD[16];
-
-       if(isZero(px) && isZero(py)){
-               copy(qx, Sx,arrayLength);
-               copy(qy, Sy,arrayLength);
-               return;
-       } else if(isZero(qx) && isZero(qy)) {
-               copy(px, Sx,arrayLength);
-               copy(py, Sy,arrayLength);
-               return;
-       }
-
-       if(isSame(px, qx, arrayLength)){
-               if(!isSame(py, qy, arrayLength)){
-                       setZero(Sx, 8);
-                       setZero(Sy, 8);
-                       return;
-               } else {
-                       ec_double(px, py, Sx, Sy);
-                       return;
-               }
-       }
-
-       fieldSub(py, qy, ecc_prime_m, tempA);
-       fieldSub(px, qx, ecc_prime_m, tempB);
-       fieldInv(tempB, ecc_prime_m, ecc_prime_r, tempB);
-       fieldMult(tempA, tempB, tempD, arrayLength); 
-       fieldModP(tempC, tempD); //tempC = lambda
-
-       fieldMult(tempC, tempC, tempD, arrayLength); //tempA = lambda^2
-       fieldModP(tempA, tempD);
-       fieldSub(tempA, px, ecc_prime_m, tempB); //lambda^2 - Px
-       fieldSub(tempB, qx, ecc_prime_m, Sx); //lambda^2 - Px - Qx
-
-       fieldSub(qx, Sx, ecc_prime_m, tempB);
-       fieldMult(tempC, tempB, tempD, arrayLength);
-       fieldModP(tempC, tempD);
-       fieldSub(tempC, qy, ecc_prime_m, Sy);
-}
-
-void ecc_ec_mult(const uint32_t *px, const uint32_t *py, const uint32_t *secret, uint32_t *resultx, uint32_t *resulty){
-       uint32_t Qx[8];
-       uint32_t Qy[8];
-       setZero(Qx, 8);
-       setZero(Qy, 8);
-
-       uint32_t tempx[8];
-       uint32_t tempy[8];
-
-       int i;
-       for (i = 256;i--;){
-               ec_double(Qx, Qy, tempx, tempy);
-               copy(tempx, Qx,arrayLength);
-               copy(tempy, Qy,arrayLength);
-               if (((secret[i / 32]) & ((uint32_t)1 << (i % 32)))) {
-                       ec_add(Qx, Qy, px, py, tempx, tempy); //eccAdd
-                       copy(tempx, Qx,arrayLength);
-                       copy(tempy, Qy,arrayLength);
-               }
-       }
-       copy(Qx, resultx,arrayLength);
-       copy(Qy, resulty,arrayLength);
-}
-
-/**
- * Calculate the ecdsa signature.
- *
- * For a description of this algorithm see
- * https://en.wikipedia.org/wiki/Elliptic_Curve_DSA#Signature_generation_algorithm
- *
- * input:
- *  d: private key on the curve secp256r1 (32 bytes)
- *  e: hash to sign (32 bytes)
- *  k: random data, this must be changed for every signature (32 bytes)
- *
- * output:
- *  r: r value of the signature (36 bytes)
- *  s: s value of the signature (36 bytes)
- *
- * return:
- *   0: everything is ok
- *  -1: can not create signature, try again with different k.
- */
-int ecc_ecdsa_sign(const uint32_t *d, const uint32_t *e, const uint32_t *k, uint32_t *r, uint32_t *s)
-{
-       uint32_t tmp1[16];
-       uint32_t tmp2[9];
-       uint32_t tmp3[9];
-
-       if (isZero(k))
-               return -1;
-
-       // 4. Calculate the curve point (x_1, y_1) = k * G.
-       ecc_ec_mult(ecc_g_point_x, ecc_g_point_y, k, r, tmp1);
-
-       // 5. Calculate r = x_1 \pmod{n}.
-       fieldModO(r, r, 8);
-
-       // 5. If r = 0, go back to step 3.
-       if (isZero(r))
-               return -1;
-
-       // 6. Calculate s = k^{-1}(z + r d_A) \pmod{n}.
-       // 6. r * d
-       fieldMult(r, d, tmp1, arrayLength);
-       fieldModO(tmp1, tmp2, 16);
-
-       // 6. z + (r d)
-       tmp1[8] = add(e, tmp2, tmp1, 8);
-       fieldModO(tmp1, tmp3, 9);
-
-       // 6. k^{-1}
-       fieldInv(k, ecc_order_m, ecc_order_r, tmp2);
-
-       // 6. (k^{-1}) (z + (r d))
-       fieldMult(tmp2, tmp3, tmp1, arrayLength);
-       fieldModO(tmp1, s, 16);
-
-       // 6. If s = 0, go back to step 3.
-       if (isZero(s))
-               return -1;
+#endif /* uECC_CURVE */
+#endif /* !asm_mmod_fast */
 
-       return 0;
+/* Computes p_result = (p_left * p_right) % curve_p.
+ *
+ * The full double-width product is built in a local buffer by vli_mult and
+ * then reduced into p_result by vli_mmod_fast. Because the product never
+ * lands in p_result directly, callers can pass p_result equal to p_left
+ * and/or p_right, which other functions in this file do. */
+static void vli_modMult_fast(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_product[2 * uECC_WORDS]; /* holds the unreduced product */
+    vli_mult(l_product, p_left, p_right);
+    vli_mmod_fast(p_result, l_product);
 }
 
-/**
- * Verifies a ecdsa signature.
- *
- * For a description of this algorithm see
- * https://en.wikipedia.org/wiki/Elliptic_Curve_DSA#Signature_verification_algorithm
- *
- * input:
- *  x: x coordinate of the public key (32 bytes)
- *  y: y coordinate of the public key (32 bytes)
- *  e: hash to verify the signature of (32 bytes)
- *  r: r value of the signature (32 bytes)
- *  s: s value of the signature (32 bytes)
- *
- * return:
- *  0: signature is ok
- *  -1: signature check failed the signature is invalid
- */
-int ecc_ecdsa_validate(const uint32_t *x, const uint32_t *y, const uint32_t *e, const uint32_t *r, const uint32_t *s)
+#if uECC_SQUARE_FUNC
+
+/* Computes p_result = p_left^2 % curve_p.
+ *
+ * Only compiled when uECC_SQUARE_FUNC is non-zero. The double-width square
+ * is built in a local buffer by vli_square and then reduced into p_result
+ * by vli_mmod_fast, so p_result may be the same array as p_left. */
+static void vli_modSquare_fast(uECC_word_t *p_result, uECC_word_t *p_left)
 {
-       uint32_t w[8];
-       uint32_t tmp[16];
-       uint32_t u1[9];
-       uint32_t u2[9];
-       uint32_t tmp1_x[8];
-       uint32_t tmp1_y[8];
-       uint32_t tmp2_x[8];
-       uint32_t tmp2_y[8];
-       uint32_t tmp3_x[8];
-       uint32_t tmp3_y[8];
+    uECC_word_t l_product[2 * uECC_WORDS]; /* holds the unreduced square */
+    vli_square(l_product, p_left);
+    vli_mmod_fast(p_result, l_product);
+}
 
-       // 3. Calculate w = s^{-1} \pmod{n}
-       fieldInv(s, ecc_order_m, ecc_order_r, w);
+#else /* uECC_SQUARE_FUNC */
 
-       // 4. Calculate u_1 = zw \pmod{n}
-       fieldMult(e, w, tmp, arrayLength);
-       fieldModO(tmp, u1, 16);
+#define vli_modSquare_fast(result, left) vli_modMult_fast((result), (left), (left)) /* fallback when uECC_SQUARE_FUNC is 0: square by multiplying left with itself */
 
-       // 4. Calculate u_2 = rw \pmod{n}
-       fieldMult(r, w, tmp, arrayLength);
-       fieldModO(tmp, u2, 16);
+#endif /* uECC_SQUARE_FUNC */
 
-       // 5. Calculate the curve point (x_1, y_1) = u_1 * G + u_2 * Q_A.
-       // tmp1 = u_1 * G
-       ecc_ec_mult(ecc_g_point_x, ecc_g_point_y, u1, tmp1_x, tmp1_y);
 
-       // tmp2 = u_2 * Q_A
-       ecc_ec_mult(x, y, u2, tmp2_x, tmp2_y);
+#define EVEN(vli) (!(vli[0] & 1)) /* non-zero when bit 0 of word 0 of vli is clear; vli is not parenthesized in the expansion, so pass a plain array or pointer name */
+/* Computes p_result = (1 / p_input) % p_mod. All VLIs are the same size.
+   See "From Euclid's GCD to Montgomery Multiplication to the Great Divide"
+   https://labs.oracle.com/techrep/2001/smli_tr-2001-95.pdf
+
+   A zero p_input produces a zero p_result. Otherwise the loop runs a binary
+   GCD on a (starting at p_input) and b (starting at p_mod) until a == b,
+   while u (starting at 1) and v (starting at 0) receive the matching
+   halvings and subtractions modulo p_mod; u is copied to p_result at the end.
+   NOTE(review): the loop appears to keep a == u * p_input and
+   b == v * p_input (mod p_mod); this presumably requires an odd p_mod that
+   is coprime to p_input - confirm against the callers. */
+#if !asm_modInv
+static void vli_modInv(uECC_word_t *p_result, uECC_word_t *p_input, uECC_word_t *p_mod)
+{
+    uECC_word_t a[uECC_WORDS], b[uECC_WORDS], u[uECC_WORDS], v[uECC_WORDS];
+    uECC_word_t l_carry;
+    cmpresult_t l_cmpResult;
+
+    if(vli_isZero(p_input))
+    {   /* zero has no inverse: report zero instead of looping */
+        vli_clear(p_result);
+        return;
+    }
+
+    vli_set(a, p_input);
+    vli_set(b, p_mod);
+    vli_clear(u);
+    u[0] = 1;
+    vli_clear(v);
+    while((l_cmpResult = vli_cmp(a, b)) != 0)
+    {
+        l_carry = 0; /* reset so a carry from an earlier pass never sets the high bit below */
+        if(EVEN(a))
+        {   /* a is even: halve a, and halve u modulo p_mod */
+            vli_rshift1(a);
+            if(!EVEN(u))
+            {
+                l_carry = vli_add(u, u, p_mod); /* u is odd: add p_mod before halving */
+            }
+            vli_rshift1(u);
+            if(l_carry)
+            {
+                u[uECC_WORDS-1] |= HIGH_BIT_SET; /* shift the carry of the addition back in at the top */
+            }
+        }
+        else if(EVEN(b))
+        {   /* b is even: same halving step applied to b and v */
+            vli_rshift1(b);
+            if(!EVEN(v))
+            {
+                l_carry = vli_add(v, v, p_mod);
+            }
+            vli_rshift1(v);
+            if(l_carry)
+            {
+                v[uECC_WORDS-1] |= HIGH_BIT_SET;
+            }
+        }
+        else if(l_cmpResult > 0)
+        {   /* a and b both odd, vli_cmp(a, b) > 0 (presumably a > b):
+               a = (a - b) / 2 and u = (u - v) / 2 modulo p_mod */
+            vli_sub(a, a, b);
+            vli_rshift1(a);
+            if(vli_cmp(u, v) < 0)
+            {
+                vli_add(u, u, p_mod); /* keeps u - v from going negative; the carry-out is not used */
+            }
+            vli_sub(u, u, v);
+            if(!EVEN(u))
+            {
+                l_carry = vli_add(u, u, p_mod);
+            }
+            vli_rshift1(u);
+            if(l_carry)
+            {
+                u[uECC_WORDS-1] |= HIGH_BIT_SET;
+            }
+        }
+        else
+        {   /* remaining case: the mirror image, b = (b - a) / 2 and
+               v = (v - u) / 2 modulo p_mod */
+            vli_sub(b, b, a);
+            vli_rshift1(b);
+            if(vli_cmp(v, u) < 0)
+            {
+                vli_add(v, v, p_mod);
+            }
+            vli_sub(v, v, u);
+            if(!EVEN(v))
+            {
+                l_carry = vli_add(v, v, p_mod);
+            }
+            vli_rshift1(v);
+            if(l_carry)
+            {
+                v[uECC_WORDS-1] |= HIGH_BIT_SET;
+            }
+        }
+    }
+
+    vli_set(p_result, u); /* the loop ended with a == b; u is returned as the inverse */
+}
+#endif /* !asm_modInv */
 
-       // tmp3 = tmp1 + tmp2
-       ec_add(tmp1_x, tmp1_y, tmp2_x, tmp2_y, tmp3_x, tmp3_y);
-       // TODO: this u_1 * G + u_2 * Q_A  could be optimiced with Straus's algorithm.
+/* ------ Point operations ------ */
 
-       return isSame(tmp3_x, r, arrayLength) ? 0 : -1;
+/* Returns 1 if p_point is the point at infinity, 0 otherwise.
+ *
+ * The test is that both coordinates are zero according to vli_isZero, i.e.
+ * the point at infinity is encoded here as (x, y) = (0, 0). p_point is only
+ * read. */
+static cmpresult_t EccPoint_isZero(EccPoint *p_point)
+{
+    return (vli_isZero(p_point->x) && vli_isZero(p_point->y));
 }
 
-int ecc_is_valid_key(const uint32_t * priv_key)
+/* Point multiplication algorithm using Montgomery's ladder with co-Z coordinates.
+From http://eprint.iacr.org/2011/338.pdf
+*/
+
+/* Double in place */
+#if (uECC_CURVE == uECC_secp256k1)
+static void EccPoint_double_jacobian(uECC_word_t * RESTRICT X1, uECC_word_t * RESTRICT Y1, uECC_word_t * RESTRICT Z1)
 {
-       return isGreater(ecc_order_m, priv_key, arrayLength) == 1;
+    /* t1 = X, t2 = Y, t3 = Z
+     * Doubles the Jacobian point (X1, Y1, Z1) in place; this variant is the
+     * one compiled when uECC_CURVE == uECC_secp256k1. A point with Z1 == 0
+     * is returned unchanged. With A = x1*y1^2 and B = 3/2 * x1^2 (no
+     * Z-dependent term appears in B here), the outputs are
+     *     x3 = B^2 - 2A,  y3 = B*(A - x3) - y1^4,  z3 = y1*z1
+     * all taken modulo curve_p. */
+    uECC_word_t t4[uECC_WORDS];
+    uECC_word_t t5[uECC_WORDS];
+
+    if(vli_isZero(Z1))
+    {
+        return;
+    }
+
+    vli_modSquare_fast(t5, Y1);   /* t5 = y1^2 */
+    vli_modMult_fast(t4, X1, t5); /* t4 = x1*y1^2 = A */
+    vli_modSquare_fast(X1, X1);   /* t1 = x1^2 */
+    vli_modSquare_fast(t5, t5);   /* t5 = y1^4 */
+    vli_modMult_fast(Z1, Y1, Z1); /* t3 = y1*z1 = z3 */
+
+    vli_modAdd(Y1, X1, X1, curve_p); /* t2 = 2*x1^2 */
+    vli_modAdd(Y1, Y1, X1, curve_p); /* t2 = 3*x1^2 */
+    if(vli_testBit(Y1, 0))
+    {   /* odd value: add curve_p first so that the halving is exact */
+        uECC_word_t l_carry = vli_add(Y1, Y1, curve_p);
+        vli_rshift1(Y1);
+        Y1[uECC_WORDS-1] |= l_carry << (uECC_WORD_BITS - 1); /* carry of the addition becomes the top bit */
+    }
+    else
+    {
+        vli_rshift1(Y1);
+    }
+    /* t2 = 3/2*(x1^2) = B */
+
+    vli_modSquare_fast(X1, Y1);   /* t1 = B^2 */
+    vli_modSub(X1, X1, t4, curve_p); /* t1 = B^2 - A */
+    vli_modSub(X1, X1, t4, curve_p); /* t1 = B^2 - 2A = x3 */
+
+    vli_modSub(t4, t4, X1, curve_p); /* t4 = A - x3 */
+    vli_modMult_fast(Y1, Y1, t4);    /* t2 = B * (A - x3) */
+    vli_modSub(Y1, Y1, t5, curve_p); /* t2 = B * (A - x3) - y1^4 = y3 */
 }
+#else
+static void EccPoint_double_jacobian(uECC_word_t * RESTRICT X1, uECC_word_t * RESTRICT Y1, uECC_word_t * RESTRICT Z1)
+{
+    /* t1 = X, t2 = Y, t3 = Z
+     * Doubles the Jacobian point (X1, Y1, Z1) in place; this variant is the
+     * one compiled when uECC_CURVE is not uECC_secp256k1. A point with
+     * Z1 == 0 is returned unchanged. With A = x1*y1^2 and
+     * B = 3/2 * (x1^2 - z1^4), computed as 3/2 * (x1 + z1^2) * (x1 - z1^2),
+     * the outputs are
+     *     x3 = B^2 - 2A,  y3 = B*(A - x3) - y1^4,  z3 = y1*z1
+     * all taken modulo curve_p. The intermediate values live in whichever
+     * of X1/Y1/Z1 is free, so the three vli_set calls at the end move each
+     * output into its proper array. */
+    uECC_word_t t4[uECC_WORDS];
+    uECC_word_t t5[uECC_WORDS];
+
+    if(vli_isZero(Z1))
+    {
+        return;
+    }
+
+    vli_modSquare_fast(t4, Y1);   /* t4 = y1^2 */
+    vli_modMult_fast(t5, X1, t4); /* t5 = x1*y1^2 = A */
+    vli_modSquare_fast(t4, t4);   /* t4 = y1^4 */
+    vli_modMult_fast(Y1, Y1, Z1); /* t2 = y1*z1 = z3 */
+    vli_modSquare_fast(Z1, Z1);   /* t3 = z1^2 */
+
+    vli_modAdd(X1, X1, Z1, curve_p); /* t1 = x1 + z1^2 */
+    vli_modAdd(Z1, Z1, Z1, curve_p); /* t3 = 2*z1^2 */
+    vli_modSub_fast(Z1, X1, Z1); /* t3 = x1 - z1^2 */
+    vli_modMult_fast(X1, X1, Z1);    /* t1 = x1^2 - z1^4 */
+
+    vli_modAdd(Z1, X1, X1, curve_p); /* t3 = 2*(x1^2 - z1^4) */
+    vli_modAdd(X1, X1, Z1, curve_p); /* t1 = 3*(x1^2 - z1^4) */
+    if(vli_testBit(X1, 0))
+    {   /* odd value: add curve_p first so that the halving is exact */
+        uECC_word_t l_carry = vli_add(X1, X1, curve_p);
+        vli_rshift1(X1);
+        X1[uECC_WORDS-1] |= l_carry << (uECC_WORD_BITS - 1); /* carry of the addition becomes the top bit */
+    }
+    else
+    {
+        vli_rshift1(X1);
+    }
+    /* t1 = 3/2*(x1^2 - z1^4) = B */
+
+    vli_modSquare_fast(Z1, X1);      /* t3 = B^2 */
+    vli_modSub_fast(Z1, Z1, t5); /* t3 = B^2 - A */
+    vli_modSub_fast(Z1, Z1, t5); /* t3 = B^2 - 2A = x3 */
+    vli_modSub_fast(t5, t5, Z1); /* t5 = A - x3 */
+    vli_modMult_fast(X1, X1, t5);    /* t1 = B * (A - x3) */
+    vli_modSub_fast(t4, X1, t4); /* t4 = B * (A - x3) - y1^4 = y3 */
+
+    vli_set(X1, Z1); /* x3 was built in Z1 */
+    vli_set(Z1, Y1); /* z3 was built in Y1 */
+    vli_set(Y1, t4); /* y3 was built in t4 */
+}
+#endif
 
-/*
- * This exports the low level functions so the tests can use them.
- * In real use the compiler is now bale to optimice the code better.
- */
-#ifdef TEST_INCLUDE
-uint32_t ecc_add( const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length)
+/* Modify (x1, y1) => (x1 * z^2, y1 * z^3)
+ *
+ * X1 and Y1 are updated in place; Z is only read. Every product goes
+ * through vli_modSquare_fast / vli_modMult_fast, i.e. it is reduced modulo
+ * curve_p. */
+static void apply_z(uECC_word_t * RESTRICT X1, uECC_word_t * RESTRICT Y1, uECC_word_t * RESTRICT Z)
 {
-       return add(x, y, result, length);
+    uECC_word_t t1[uECC_WORDS]; /* scratch: holds z^2, then z^3 */
+
+    vli_modSquare_fast(t1, Z);    /* z^2 */
+    vli_modMult_fast(X1, X1, t1); /* x1 * z^2 */
+    vli_modMult_fast(t1, t1, Z);  /* z^3 */
+    vli_modMult_fast(Y1, Y1, t1); /* y1 * z^3 */
 }
-uint32_t ecc_sub( const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length)
+
+/* P = (x1, y1) => 2P, (x2, y2) => P'
+
+   On entry (X1, Y1) holds P; (X2, Y2) is overwritten with a copy of it.
+   The working Z starts at 1, or at *p_initialZ when that pointer is non-NULL.
+   On return (X1, Y1) holds the doubled point and (X2, Y2) holds P rescaled
+   by the Z value that the doubling left behind. */
+static void XYcZ_initial_double(uECC_word_t * RESTRICT X1, uECC_word_t * RESTRICT Y1,
+    uECC_word_t * RESTRICT X2, uECC_word_t * RESTRICT Y2, const uECC_word_t * RESTRICT p_initialZ)
 {
-       return sub(x, y, result, length);
+    uECC_word_t z[uECC_WORDS];
+
+    /* Keep an untouched copy of P in (X2, Y2) before X1/Y1 are modified. */
+    vli_set(X2, X1);
+    vli_set(Y2, Y1);
+
+    /* z = 1 unless the caller supplied an initial Z. */
+    vli_clear(z);
+    z[0] = 1;
+    if(p_initialZ)
+    {
+        vli_set(z, p_initialZ);
+    }
+
+    /* Scale P by the initial z before doubling. */
+    apply_z(X1, Y1, z);
+
+    /* Doubles (X1, Y1) in place; z is updated by the call as well. */
+    EccPoint_double_jacobian(X1, Y1, z);
+
+    /* Scale the saved copy of P by the z produced by the doubling. */
+    apply_z(X2, Y2, z);
 }
-int ecc_fieldAdd(const uint32_t *x, const uint32_t *y, const uint32_t *reducer, uint32_t *result)
+
+/* Input P = (x1, y1, Z), Q = (x2, y2, Z)
+   Output P' = (x1', y1', Z3), P + Q = (x3, y3, Z3)
+   or P => P', Q => P + Q
+
+   All four arguments are modified in place:
+     X1 <- x1 * (x2 - x1)^2           (B)
+     Y1 <- y1 * (C - B)
+     X2 <- x3
+     Y2 <- y3
+   The statements reuse X2 and Y2 as temporaries, so their order matters.
+*/
+static void XYcZ_add(uECC_word_t * RESTRICT X1, uECC_word_t * RESTRICT Y1, uECC_word_t * RESTRICT X2, uECC_word_t * RESTRICT Y2)
 {
-       return fieldAdd(x, y, reducer, result);
+    /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */
+    uECC_word_t t5[uECC_WORDS];
+
+    vli_modSub_fast(t5, X2, X1); /* t5 = x2 - x1 */
+    vli_modSquare_fast(t5, t5);      /* t5 = (x2 - x1)^2 = A */
+    vli_modMult_fast(X1, X1, t5);    /* t1 = x1*A = B */
+    vli_modMult_fast(X2, X2, t5);    /* t3 = x2*A = C */
+    vli_modSub_fast(Y2, Y2, Y1); /* t4 = y2 - y1 */
+    vli_modSquare_fast(t5, Y2);      /* t5 = (y2 - y1)^2 = D */
+
+    vli_modSub_fast(t5, t5, X1); /* t5 = D - B */
+    vli_modSub_fast(t5, t5, X2); /* t5 = D - B - C = x3 */
+    vli_modSub_fast(X2, X2, X1); /* t3 = C - B */
+    vli_modMult_fast(Y1, Y1, X2);    /* t2 = y1*(C - B) */
+    vli_modSub_fast(X2, X1, t5); /* t3 = B - x3 */
+    vli_modMult_fast(Y2, Y2, X2);    /* t4 = (y2 - y1)*(B - x3) */
+    vli_modSub_fast(Y2, Y2, Y1); /* t4 = y3 */
+
+    /* x3 was kept in t5 because X2 was needed as a temporary above. */
+    vli_set(X2, t5);
 }
-int ecc_fieldSub(const uint32_t *x, const uint32_t *y, const uint32_t *modulus, uint32_t *result)
+
+/* Input P = (x1, y1, Z), Q = (x2, y2, Z)
+   Output P + Q = (x3, y3, Z3), P - Q = (x3', y3', Z3)
+   or P => P - Q, Q => P + Q
+
+   All four arguments are modified in place: on return (X2, Y2) holds
+   (x3, y3) and (X1, Y1) holds (x3', y3'). t5, t6 and t7 are local scratch
+   values; t5 carries (y2 + y1) from the first group of statements to the last.
+*/
+static void XYcZ_addC(uECC_word_t * RESTRICT X1, uECC_word_t * RESTRICT Y1, uECC_word_t * RESTRICT X2, uECC_word_t * RESTRICT Y2)
 {
-       return fieldSub(x, y, modulus, result);
+    /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */
+    uECC_word_t t5[uECC_WORDS];
+    uECC_word_t t6[uECC_WORDS];
+    uECC_word_t t7[uECC_WORDS];
+
+    vli_modSub_fast(t5, X2, X1); /* t5 = x2 - x1 */
+    vli_modSquare_fast(t5, t5);      /* t5 = (x2 - x1)^2 = A */
+    vli_modMult_fast(X1, X1, t5);    /* t1 = x1*A = B */
+    vli_modMult_fast(X2, X2, t5);    /* t3 = x2*A = C */
+    vli_modAdd(t5, Y2, Y1, curve_p); /* t5 = y2 + y1 */
+    vli_modSub_fast(Y2, Y2, Y1); /* t4 = y2 - y1 */
+
+    vli_modSub_fast(t6, X2, X1); /* t6 = C - B */
+    vli_modMult_fast(Y1, Y1, t6);    /* t2 = y1 * (C - B) */
+    vli_modAdd(t6, X1, X2, curve_p); /* t6 = B + C */
+    vli_modSquare_fast(X2, Y2);      /* t3 = (y2 - y1)^2 */
+    vli_modSub_fast(X2, X2, t6); /* t3 = x3 */
+
+    vli_modSub_fast(t7, X1, X2); /* t7 = B - x3 */
+    vli_modMult_fast(Y2, Y2, t7);    /* t4 = (y2 - y1)*(B - x3) */
+    vli_modSub_fast(Y2, Y2, Y1); /* t4 = y3 */
+
+    vli_modSquare_fast(t7, t5);      /* t7 = (y2 + y1)^2 = F */
+    vli_modSub_fast(t7, t7, t6); /* t7 = x3' */
+    vli_modSub_fast(t6, t7, X1); /* t6 = x3' - B */
+    vli_modMult_fast(t6, t6, t5);    /* t6 = (y2 + y1)*(x3' - B) */
+    vli_modSub_fast(Y1, t6, Y1); /* t2 = y3' */
+
+    /* X1 (= B) was still needed above, so x3' is stored only now. */
+    vli_set(X1, t7);
 }
-int ecc_fieldMult(const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length)
+
+/* Scalar multiplication: writes p_scalar * (*p_point) into *p_result.
+
+   p_point    - input point (x and y are read; not modified here).
+   p_scalar   - scalar; bits p_numBits-2 .. 0 are examined with vli_testBit.
+                Bit p_numBits-1 is never tested; the initial doubling below
+                stands in for it (so it is presumably expected to be set --
+                confirm against callers).
+   p_initialZ - optional initial Z value forwarded to XYcZ_initial_double;
+                may be 0 (NULL).
+   p_numBits  - number of scalar bits to process.
+
+   Two working points are kept in Rx[0..1] / Ry[0..1]. Every loop iteration
+   performs exactly one XYcZ_addC followed by one XYcZ_add, whatever the bit
+   value is; the bit only selects which slot plays which role (via nb). */
+static void EccPoint_mult(EccPoint * RESTRICT p_result, EccPoint * RESTRICT p_point,
+    const uECC_word_t * RESTRICT p_scalar, const uECC_word_t * RESTRICT p_initialZ, bitcount_t p_numBits)
 {
-       return fieldMult(x, y, result, length);
+    /* R0 and R1 */
+    uECC_word_t Rx[2][uECC_WORDS];
+    uECC_word_t Ry[2][uECC_WORDS];
+    uECC_word_t z[uECC_WORDS];
+
+    bitcount_t i;
+    uECC_word_t nb;
+
+    vli_set(Rx[1], p_point->x);
+    vli_set(Ry[1], p_point->y);
+
+    /* Slot 1 becomes 2P, slot 0 becomes P (rescaled to match). */
+    XYcZ_initial_double(Rx[1], Ry[1], Rx[0], Ry[0], p_initialZ);
+
+    /* Bits p_numBits-2 down to 1; bit 0 is handled separately below. */
+    for(i = p_numBits - 2; i > 0; --i)
+    {
+        nb = !vli_testBit(p_scalar, i);
+        XYcZ_addC(Rx[1-nb], Ry[1-nb], Rx[nb], Ry[nb]);
+        XYcZ_add(Rx[nb], Ry[nb], Rx[1-nb], Ry[1-nb]);
+    }
+
+    /* Final bit: the addC/add pair is split so that 1/Z can be computed
+       in between, from the values that exist right after the addC step. */
+    nb = !vli_testBit(p_scalar, 0);
+    XYcZ_addC(Rx[1-nb], Ry[1-nb], Rx[nb], Ry[nb]);
+
+    /* Find final 1/Z value. */
+    vli_modSub_fast(z, Rx[1], Rx[0]); /* X1 - X0 */
+    vli_modMult_fast(z, z, Ry[1-nb]);     /* Yb * (X1 - X0) */
+    vli_modMult_fast(z, z, p_point->x);   /* xP * Yb * (X1 - X0) */
+    vli_modInv(z, z, curve_p);            /* 1 / (xP * Yb * (X1 - X0)) */
+    vli_modMult_fast(z, z, p_point->y);   /* yP / (xP * Yb * (X1 - X0)) */
+    vli_modMult_fast(z, z, Rx[1-nb]);     /* Xb * yP / (xP * Yb * (X1 - X0)) */
+    /* End 1/Z calculation */
+
+    XYcZ_add(Rx[nb], Ry[nb], Rx[1-nb], Ry[1-nb]);
+
+    /* Rescale slot 0 with the recovered 1/Z and copy it out as the result. */
+    apply_z(Rx[0], Ry[0], z);
+
+    vli_set(p_result->x, Rx[0]);
+    vli_set(p_result->y, Ry[0]);
 }
-void ecc_fieldModP(uint32_t *A, const uint32_t *B)
+
+/* Compute a = sqrt(a) (mod curve_p).
+
+   The value is replaced in place by a^((curve_p + 1) / 4). The exponent is
+   never materialised: the loop walks the bits of (curve_p + 1) from the top
+   down and stops before bits 1 and 0, which is the same as using
+   (curve_p + 1) >> 2 as the exponent.
+
+   No check is made here that the input actually has a square root. */
+static void mod_sqrt(uECC_word_t *a)
 {
-       fieldModP(A, B);
+    bitcount_t i;
+    uECC_word_t p1[uECC_WORDS] = {1};
+    uECC_word_t l_result[uECC_WORDS] = {1};
+
+    /* Since curve_p == 3 (mod 4) for all supported curves, we can
+       compute sqrt(a) = a^((curve_p + 1) / 4) (mod curve_p). */
+    vli_add(p1, curve_p, p1); /* p1 = curve_p + 1 */
+    /* Square-and-multiply over bits numBits-1 .. 2 of p1. */
+    for(i = vli_numBits(p1, uECC_WORDS) - 1; i > 1; --i)
+    {
+        vli_modSquare_fast(l_result, l_result);
+        if(vli_testBit(p1, i))
+        {
+            vli_modMult_fast(l_result, l_result, a);
+        }
+    }
+    vli_set(a, l_result);
 }
-void ecc_fieldModO(const uint32_t *A, uint32_t *result, uint8_t length)
+
+#if uECC_WORD_SIZE == 1
+
+/* 1-byte-word build: copy uECC_BYTES bytes from p_src to p_dest in reversed
+   order (p_dest[0] receives the last source byte). Reversal is its own
+   inverse, which is why vli_bytesToNative is defined as an alias of this
+   function just below. */
+static void vli_nativeToBytes(uint8_t * RESTRICT p_dest, const uint8_t * RESTRICT p_src)
 {
-       fieldModO(A, result, length);
+    uint8_t i;
+    for(i=0; i<uECC_BYTES; ++i)
+    {
+        p_dest[i] = p_src[(uECC_BYTES - 1) - i];
+    }
 }
-void ecc_fieldInv(const uint32_t *A, const uint32_t *modulus, const uint32_t *reducer, uint32_t *B)
+
+#define vli_bytesToNative(dest, src) vli_nativeToBytes((dest), (src))
+
+#elif uECC_WORD_SIZE == 4
+
+/* 4-byte-word build: serialise uECC_WORDS 32-bit words into 4 * uECC_WORDS
+   bytes. Word i is written to the byte group at offset 4 * (uECC_WORDS - 1 - i),
+   most significant byte first, so p_native[0] ends up in the last four bytes
+   of p_bytes. */
+static void vli_nativeToBytes(uint8_t *p_bytes, const uint32_t *p_native)
 {
-       fieldInv(A, modulus, reducer, B);
+    unsigned i;
+    for(i=0; i<uECC_WORDS; ++i)
+    {
+        uint8_t *p_digit = p_bytes + 4 * (uECC_WORDS - 1 - i);
+        /* Assignments to uint8_t keep only the low 8 bits of each shift. */
+        p_digit[0] = p_native[i] >> 24;
+        p_digit[1] = p_native[i] >> 16;
+        p_digit[2] = p_native[i] >> 8;
+        p_digit[3] = p_native[i];
+    }
 }
-void ecc_copy(const uint32_t *from, uint32_t *to, uint8_t length)
+
+/* 4-byte-word build: inverse of vli_nativeToBytes. Reads 4 * uECC_WORDS bytes
+   and rebuilds uECC_WORDS 32-bit words; word i is assembled from the byte
+   group at offset 4 * (uECC_WORDS - 1 - i), most significant byte first. */
+static void vli_bytesToNative(uint32_t *p_native, const uint8_t *p_bytes)
 {
-       copy(from, to, length);
+    unsigned i;
+    for(i=0; i<uECC_WORDS; ++i)
+    {
+        const uint8_t *p_digit = p_bytes + 4 * (uECC_WORDS - 1 - i);
+        p_native[i] = ((uint32_t)p_digit[0] << 24) | ((uint32_t)p_digit[1] << 16) | ((uint32_t)p_digit[2] << 8) | (uint32_t)p_digit[3];
+    }
 }
-int ecc_isSame(const uint32_t *A, const uint32_t *B, uint8_t length)
+
+#else
+
+/* 8-byte-word build: serialise uECC_WORDS 64-bit words into 8 * uECC_WORDS
+   bytes. Word i is written to the byte group at offset 8 * (uECC_WORDS - 1 - i),
+   most significant byte first, so p_native[0] ends up in the last eight bytes
+   of p_bytes. */
+static void vli_nativeToBytes(uint8_t *p_bytes, const uint64_t *p_native)
 {
-       return isSame(A, B, length);
+    unsigned i;
+    for(i=0; i<uECC_WORDS; ++i)
+    {
+        uint8_t *p_digit = p_bytes + 8 * (uECC_WORDS - 1 - i);
+        /* Assignments to uint8_t keep only the low 8 bits of each shift. */
+        p_digit[0] = p_native[i] >> 56;
+        p_digit[1] = p_native[i] >> 48;
+        p_digit[2] = p_native[i] >> 40;
+        p_digit[3] = p_native[i] >> 32;
+        p_digit[4] = p_native[i] >> 24;
+        p_digit[5] = p_native[i] >> 16;
+        p_digit[6] = p_native[i] >> 8;
+        p_digit[7] = p_native[i];
+    }
 }
-void ecc_setZero(uint32_t *A, const int length)
+
+/* 8-byte-word build: inverse of vli_nativeToBytes. Reads 8 * uECC_WORDS bytes
+   and rebuilds uECC_WORDS 64-bit words; word i is assembled from the byte
+   group at offset 8 * (uECC_WORDS - 1 - i), most significant byte first. */
+static void vli_bytesToNative(uint64_t *p_native, const uint8_t *p_bytes)
 {
-       setZero(A, length);
+    unsigned i;
+    for(i=0; i<uECC_WORDS; ++i)
+    {
+        const uint8_t *p_digit = p_bytes + 8 * (uECC_WORDS - 1 - i);
+        p_native[i] = ((uint64_t)p_digit[0] << 56) | ((uint64_t)p_digit[1] << 48) | ((uint64_t)p_digit[2] << 40) | ((uint64_t)p_digit[3] << 32) |
+            ((uint64_t)p_digit[4] << 24) | ((uint64_t)p_digit[5] << 16) | ((uint64_t)p_digit[6] << 8) | (uint64_t)p_digit[7];
+    }
 }
-int ecc_isOne(const uint32_t* A)
+
+#endif /* uECC_WORD_SIZE */
+
+/* Generate a key pair.
+
+   p_publicKey  - receives x followed by y (uECC_BYTES each), in the byte
+                  layout produced by vli_nativeToBytes.
+   p_privateKey - receives the private scalar (uECC_BYTES).
+
+   Returns 1 on success. Returns 0 if g_rng reports failure or if the number
+   of attempts reaches MAX_TRIES. A candidate is rejected and redrawn when it
+   is zero, when it is not below curve_n (check compiled out for secp160r1),
+   or when the resulting public point is the zero point. */
+int uECC_make_key(uint8_t p_publicKey[uECC_BYTES*2], uint8_t p_privateKey[uECC_BYTES])
 {
-       return isOne(A);
+    EccPoint l_public;
+    uECC_word_t l_private[uECC_WORDS];
+    uECC_word_t l_tries = 0;
+
+    do
+    {
+    repeat:
+        /* l_tries is only incremented when the RNG call itself succeeded. */
+        if(!g_rng((uint8_t *)l_private, sizeof(l_private)) || (l_tries++ >= MAX_TRIES))
+        {
+            return 0;
+        }
+        if(vli_isZero(l_private))
+        {
+            goto repeat;
+        }
+
+        /* Make sure the private key is in the range [1, n-1]. */
+    #if uECC_CURVE != uECC_secp160r1
+        if(vli_cmp(curve_n, l_private) != 1)
+        {
+            goto repeat;
+        }
+    #endif
+
+        /* public = private * G; no initial Z is supplied (0), and the bit
+           count passed is the actual bit length of the private scalar. */
+        EccPoint_mult(&l_public, &curve_G, l_private, 0, vli_numBits(l_private, uECC_WORDS));
+    } while(EccPoint_isZero(&l_public));
+
+    vli_nativeToBytes(p_privateKey, l_private);
+    vli_nativeToBytes(p_publicKey, l_public.x);
+    vli_nativeToBytes(p_publicKey + uECC_BYTES, l_public.y);
+    return 1;
 }
-void ecc_rshift(uint32_t* A)
+
+/* Compute an ECDH shared secret.
+
+   p_publicKey  - the other party's public key: x followed by y, uECC_BYTES each.
+   p_privateKey - our private scalar, uECC_BYTES.
+   p_secret     - receives the x coordinate of privateKey * publicKey.
+
+   Returns 1 if the product is not the zero point, 0 otherwise. p_secret is
+   written in both cases.
+
+   A random value is used as the initial Z coordinate of the multiplication.
+   If the RNG reports failure, or returns all zeroes, the multiplication is
+   run without an initial Z instead of failing; the computed secret is the
+   same either way. The public key is not validated here. */
+int uECC_shared_secret(const uint8_t p_publicKey[uECC_BYTES*2], const uint8_t p_privateKey[uECC_BYTES], uint8_t p_secret[uECC_BYTES])
 {
-       rshift(A);
+    EccPoint l_public;
+    EccPoint l_product;
+    uECC_word_t l_private[uECC_WORDS];
+    uECC_word_t l_random[uECC_WORDS];
+
+    /* When g_rng fails it returns 0 and the buffer contents are unspecified.
+       Zero the buffer in that case so the vli_isZero() test below reads a
+       defined value and selects the "no initial Z" path. */
+    if(!g_rng((uint8_t *)l_random, sizeof(l_random)))
+    {
+        vli_clear(l_random);
+    }
+
+    vli_bytesToNative(l_private, p_privateKey);
+    vli_bytesToNative(l_public.x, p_publicKey);
+    vli_bytesToNative(l_public.y, p_publicKey + uECC_BYTES);
+
+    EccPoint_mult(&l_product, &l_public, l_private, (vli_isZero(l_random) ? 0: l_random), vli_numBits(l_private, uECC_WORDS));
+
+    vli_nativeToBytes(p_secret, l_product.x);
+
+    return !EccPoint_isZero(&l_product);
 }
-int ecc_isGreater(const uint32_t *A, const uint32_t *B, uint8_t length)
+
+/* Compress a public key.
+
+   p_publicKey  - x followed by y, uECC_BYTES each.
+   p_compressed - receives uECC_BYTES + 1 bytes: a prefix byte followed by
+                  the x bytes unchanged. The prefix is 2 plus the lowest bit
+                  of the last byte of y, i.e. 0x02 or 0x03. */
+void uECC_compress(const uint8_t p_publicKey[uECC_BYTES*2], uint8_t p_compressed[uECC_BYTES+1])
 {
-       return isGreater(A, B , length);
+    wordcount_t i;
+    for(i=0; i<uECC_BYTES; ++i)
+    {
+        p_compressed[i+1] = p_publicKey[i];
+    }
+    p_compressed[0] = 2 + (p_publicKey[uECC_BYTES * 2 - 1] & 0x01);
 }
 
-void ecc_ec_add(const uint32_t *px, const uint32_t *py, const uint32_t *qx, const uint32_t *qy, uint32_t *Sx, uint32_t *Sy)
+/* Expand a compressed public key (prefix byte + x) into x followed by y.
+
+   y is recomputed from the curve equation: x^3 + b for secp256k1,
+   x^3 - 3x + b for the other curves, followed by mod_sqrt(). Of the two
+   roots, the one whose lowest bit matches the lowest bit of the prefix
+   byte is kept.
+
+   NOTE(review): nothing here checks that the prefix is 0x02/0x03 or that the
+   right-hand side actually has a square root; callers presumably have to
+   validate the result themselves -- confirm. */
+void uECC_decompress(const uint8_t p_compressed[uECC_BYTES+1], uint8_t p_publicKey[uECC_BYTES*2])
 {
-       ec_add(px, py, qx, qy, Sx, Sy);
+    EccPoint l_point;
+    vli_bytesToNative(l_point.x, p_compressed + 1);
+
+#if (uECC_CURVE == uECC_secp256k1)
+    vli_modSquare_fast(l_point.y, l_point.x); /* r = x^2 */
+    vli_modMult_fast(l_point.y, l_point.y, l_point.x); /* r = x^3 */
+    vli_modAdd(l_point.y, l_point.y, curve_b, curve_p); /* r = x^3 + b */
+#else
+    uECC_word_t _3[uECC_WORDS] = {3}; /* -a = 3 */
+
+    vli_modSquare_fast(l_point.y, l_point.x); /* y = x^2 */
+    vli_modSub_fast(l_point.y, l_point.y, _3); /* y = x^2 - 3 */
+    vli_modMult_fast(l_point.y, l_point.y, l_point.x); /* y = x^3 - 3x */
+    vli_modAdd(l_point.y, l_point.y, curve_b, curve_p); /* y = x^3 - 3x + b */
+#endif
+
+    mod_sqrt(l_point.y);
+
+    /* Wrong parity: switch to the other root, curve_p - y. */
+    if((l_point.y[0] & 0x01) != (p_compressed[0] & 0x01))
+    {
+        vli_sub(l_point.y, curve_p, l_point.y);
+    }
+
+    vli_nativeToBytes(p_publicKey, l_point.x);
+    vli_nativeToBytes(p_publicKey + uECC_BYTES, l_point.y);
 }
-void ecc_ec_double(const uint32_t *px, const uint32_t *py, uint32_t *Dx, uint32_t *Dy)
+
+/* -------- ECDSA code -------- */
+
+#if (uECC_CURVE == uECC_secp160r1)
+/* secp160r1 only: zero a uECC_N_WORDS-word value. vli_clear() handles the
+   words it covers (presumably the first uECC_WORDS -- confirm) and the
+   top word, index uECC_N_WORDS - 1, is cleared explicitly. */
+static void vli_clear_n(uECC_word_t *p_vli)
 {
-       ec_double(px, py, Dx, Dy);
+    vli_clear(p_vli);
+    p_vli[uECC_N_WORDS - 1] = 0;
 }
 
-#endif /* TEST_INCLUDE */
+/* secp160r1 only: zero test for a uECC_N_WORDS-word value. Returns 0 as soon
+   as the top word is non-zero; otherwise the answer is whatever vli_isZero()
+   reports for the remaining words. */
+static uECC_word_t vli_isZero_n(const uECC_word_t *p_vli)
+{
+    const uECC_word_t l_top = p_vli[uECC_N_WORDS - 1];
+
+    return (l_top != 0) ? 0 : vli_isZero(p_vli);
+}
+
+/* secp160r1 only: copy a uECC_N_WORDS-word value from p_src to p_dest.
+   vli_set() copies the words it covers; the top word is copied explicitly. */
+static void vli_set_n(uECC_word_t *p_dest, const uECC_word_t *p_src)
+{
+    vli_set(p_dest, p_src);
+    p_dest[uECC_N_WORDS-1] = p_src[uECC_N_WORDS-1];
+}
+
+/* secp160r1 only: three-way compare of two uECC_N_WORDS-word values.
+   The top words decide the result when they differ (1 if left is larger,
+   -1 if right is larger); otherwise the result of vli_cmp() on the
+   remaining words is returned. */
+static cmpresult_t vli_cmp_n(uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    const uECC_word_t l_topLeft = p_left[uECC_N_WORDS-1];
+    const uECC_word_t l_topRight = p_right[uECC_N_WORDS-1];
+
+    if(l_topLeft != l_topRight)
+    {
+        return (l_topLeft > l_topRight) ? 1 : -1;
+    }
+    return vli_cmp(p_left, p_right);
+}
+
+/* secp160r1 only: shift a uECC_N_WORDS-word value right by one bit, in place.
+   vli_rshift1() shifts the lower words; the lowest bit of the top word is
+   then moved into the highest bit of word uECC_N_WORDS - 2 before the top
+   word itself is shifted. The order of the last two statements matters. */
+static void vli_rshift1_n(uECC_word_t *p_vli)
+{
+    vli_rshift1(p_vli);
+    p_vli[uECC_N_WORDS-2] |= p_vli[uECC_N_WORDS-1] << (uECC_WORD_BITS - 1);
+    p_vli[uECC_N_WORDS-1] = p_vli[uECC_N_WORDS-1] >> 1;
+}
+
+/* secp160r1 only: p_result = p_left + p_right over uECC_N_WORDS words.
+   Returns the carry out of the top word. The lower words are added by
+   vli_add(), whose carry is fed into the top-word addition. */
+static uECC_word_t vli_add_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_carry = vli_add(p_result, p_left, p_right);
+    uECC_word_t l_sum = p_left[uECC_N_WORDS-1] + p_right[uECC_N_WORDS-1] + l_carry;
+    /* If the sum equals the left operand the incoming carry is passed through
+       unchanged; otherwise a wrapped (smaller) sum signals a carry out. */
+    if(l_sum != p_left[uECC_N_WORDS-1])
+    {
+        l_carry = (l_sum < p_left[uECC_N_WORDS-1]);
+    }
+    p_result[uECC_N_WORDS-1] = l_sum;
+    return l_carry;
+}
+
+/* secp160r1 only: p_result = p_left - p_right over uECC_N_WORDS words.
+   Returns the borrow out of the top word. The lower words are subtracted by
+   vli_sub(), whose borrow is fed into the top-word subtraction. */
+static uECC_word_t vli_sub_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_borrow = vli_sub(p_result, p_left, p_right);
+    uECC_word_t l_diff = p_left[uECC_N_WORDS-1] - p_right[uECC_N_WORDS-1] - l_borrow;
+    /* If the difference equals the left operand the incoming borrow is passed
+       through unchanged; otherwise a wrapped (larger) result signals a borrow. */
+    if(l_diff != p_left[uECC_N_WORDS-1])
+    {
+        l_borrow = (l_diff > p_left[uECC_N_WORDS-1]);
+    }
+    p_result[uECC_N_WORDS-1] = l_diff;
+    return l_borrow;
+}
+
+/* Fallback definition, compiled only when no muladd has been provided earlier
+   (muladd_exists evaluates to 0). */
+#if !muladd_exists
+/* Multiply-accumulate: adds the double-word product a * b into the three-word
+   accumulator (r2:r1:r0). r0 and r1 receive the low and high words of the
+   updated sum; r2 is incremented when the double-word addition wraps. */
+static void muladd(uECC_word_t a, uECC_word_t b, uECC_word_t *r0, uECC_word_t *r1, uECC_word_t *r2)
+{
+    uECC_dword_t p = (uECC_dword_t)a * b;
+    uECC_dword_t r01 = ((uECC_dword_t)(*r1) << uECC_WORD_BITS) | *r0;
+    r01 += p;
+    *r2 += (r01 < p); /* wrapped sum is smaller than the addend => carry */
+    *r1 = r01 >> uECC_WORD_BITS;
+    *r0 = (uECC_word_t)r01;
+}
+#define muladd_exists 1
+#endif
+
+/* secp160r1 only: full product of two uECC_N_WORDS-word values.
+   p_result receives 2 * uECC_N_WORDS words. Each output word k is produced by
+   accumulating every partial product p_left[i] * p_right[k - i] into the
+   three-word accumulator (r2:r1:r0); the low word is then emitted and the
+   accumulator is shifted down by one word. */
+static void vli_mult_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t r0 = 0;
+    uECC_word_t r1 = 0;
+    uECC_word_t r2 = 0;
+
+    wordcount_t i, k;
+    for(k = 0; k < uECC_N_WORDS*2 - 1; ++k)
+    {
+        /* Range of i for which both i and k - i are valid word indices. */
+        wordcount_t l_min = (k < uECC_N_WORDS ? 0 : (k + 1) - uECC_N_WORDS);
+        wordcount_t l_max = (k < uECC_N_WORDS ? k : uECC_N_WORDS-1);
+        for(i = l_min; i <= l_max; ++i)
+        {
+            muladd(p_left[i], p_right[k-i], &r0, &r1, &r2);
+        }
+        p_result[k] = r0;
+        r0 = r1;
+        r1 = r2;
+        r2 = 0;
+    }
+
+    /* Whatever is left in the accumulator is the top word of the product. */
+    p_result[uECC_N_WORDS*2 - 1] = r0;
+}
+
+/* secp160r1 only: p_result = p_left + p_right, followed by a single
+   conditional subtraction of p_mod. The subtraction happens when the addition
+   carried out of the top word or when the sum is not below p_mod. */
+static void vli_modAdd_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right, uECC_word_t *p_mod)
+{
+    /* No carry and already below the modulus: nothing left to do. The
+       comparison is only evaluated when the addition did not carry. */
+    if(!vli_add_n(p_result, p_left, p_right) && vli_cmp_n(p_result, p_mod) < 0)
+    {
+        return;
+    }
+
+    vli_sub_n(p_result, p_result, p_mod);
+}
+
+/* secp160r1 only: modular inverse over uECC_N_WORDS words.
+   Writes to p_result the value u obtained when the loop below terminates;
+   per the call sites this is used as p_input^-1 mod p_mod. A zero input
+   yields a zero result.
+
+   Working state: a starts as p_input, b as p_mod, u as 1, v as 0. Each
+   iteration halves or subtracts on the (a, b) pair and applies the matching
+   operation to (u, v), adding p_mod first whenever a value has to be made
+   even or non-negative. The loop ends when a == b.
+   NOTE(review): this looks like a binary extended-Euclid style inversion and
+   therefore presumably needs p_mod to be odd -- confirm. */
+static void vli_modInv_n(uECC_word_t *p_result, uECC_word_t *p_input, uECC_word_t *p_mod)
+{
+    uECC_word_t a[uECC_N_WORDS], b[uECC_N_WORDS], u[uECC_N_WORDS], v[uECC_N_WORDS];
+    uECC_word_t l_carry;
+    cmpresult_t l_cmpResult;
+
+    if(vli_isZero_n(p_input))
+    {
+        vli_clear_n(p_result);
+        return;
+    }
+
+    vli_set_n(a, p_input);
+    vli_set_n(b, p_mod);
+    vli_clear_n(u);
+    u[0] = 1;
+    vli_clear_n(v);
+    while((l_cmpResult = vli_cmp_n(a, b)) != 0)
+    {
+        /* l_carry records a carry out of "x + p_mod" so that the bit can be
+           re-inserted at the top after the following right shift. */
+        l_carry = 0;
+        if(EVEN(a))
+        {
+            vli_rshift1_n(a);
+            if(!EVEN(u)) l_carry = vli_add_n(u, u, p_mod);
+            vli_rshift1_n(u);
+            if(l_carry) u[uECC_N_WORDS-1] |= HIGH_BIT_SET;
+        }
+        else if(EVEN(b))
+        {
+            vli_rshift1_n(b);
+            if(!EVEN(v)) l_carry = vli_add_n(v, v, p_mod);
+            vli_rshift1_n(v);
+            if(l_carry) v[uECC_N_WORDS-1] |= HIGH_BIT_SET;
+        }
+        else if(l_cmpResult > 0)
+        {
+            /* a > b, both odd: a = (a - b) / 2, u = (u - v) / 2 (mod p_mod). */
+            vli_sub_n(a, a, b);
+            vli_rshift1_n(a);
+            if(vli_cmp_n(u, v) < 0) vli_add_n(u, u, p_mod);
+            vli_sub_n(u, u, v);
+            if(!EVEN(u)) l_carry = vli_add_n(u, u, p_mod);
+            vli_rshift1_n(u);
+            if(l_carry) u[uECC_N_WORDS-1] |= HIGH_BIT_SET;
+        }
+        else
+        {
+            /* b > a, both odd: b = (b - a) / 2, v = (v - u) / 2 (mod p_mod). */
+            vli_sub_n(b, b, a);
+            vli_rshift1_n(b);
+            if(vli_cmp_n(v, u) < 0) vli_add_n(v, v, p_mod);
+            vli_sub_n(v, v, u);
+            if(!EVEN(v)) l_carry = vli_add_n(v, v, p_mod);
+            vli_rshift1_n(v);
+            if(l_carry) v[uECC_N_WORDS-1] |= HIGH_BIT_SET;
+        }
+    }
+
+    vli_set_n(p_result, u);
+}
+
+/* secp160r1 only: shift a 2 * uECC_N_WORDS-word value right by one bit, in
+   place. The low half is shifted first, the lowest bit of the high half is
+   moved into the top bit of the low half, and then the high half is shifted.
+   The statement order matters. */
+static void vli2_rshift1_n(uECC_word_t *p_vli)
+{
+    vli_rshift1_n(p_vli);
+    p_vli[uECC_N_WORDS-1] |= p_vli[uECC_N_WORDS] << (uECC_WORD_BITS - 1);
+    vli_rshift1_n(p_vli + uECC_N_WORDS);
+}
+
+/* secp160r1 only: p_result = p_left - p_right over 2 * uECC_N_WORDS words.
+   Returns the final borrow (non-zero when p_left < p_right). */
+static uECC_word_t vli2_sub_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_borrow = 0;
+    wordcount_t i;
+    for(i=0; i<uECC_N_WORDS*2; ++i)
+    {
+        uECC_word_t l_diff = p_left[i] - p_right[i] - l_borrow;
+        /* Equal result: the incoming borrow propagates unchanged. Otherwise a
+           wrapped (larger) result means this word borrowed. */
+        if(l_diff != p_left[i])
+        {
+            l_borrow = (l_diff > p_left[i]);
+        }
+        p_result[i] = l_diff;
+    }
+    return l_borrow;
+}
+
+/* Computes p_result = (p_left * p_right) % curve_n.
+
+   secp160r1 variant. The double-width product is reduced by repeated trial
+   subtraction: l_modMultiple starts as a left-shifted copy of curve_n and is
+   shifted right by one bit after every step. Two buffers are used
+   alternately; v[l_index] always points at the current remainder. */
+static void vli_modMult_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_product[2 * uECC_N_WORDS];
+    uECC_word_t l_modMultiple[2 * uECC_N_WORDS];
+    uECC_word_t l_tmp[2 * uECC_N_WORDS];
+    uECC_word_t *v[2] = {l_tmp, l_product};
+
+    vli_mult_n(l_product, p_left, p_right);
+    /* Build the shifted modulus. The low uECC_N_WORDS words are zero; the
+       upper words hold curve_n shifted so that its top bit lands in the top
+       bit of the buffer. NOTE(review): the explicit HIGH_BIT_SET writes
+       presumably rely on the known bit pattern of the secp160r1 order --
+       confirm against the curve_n definition. */
+    vli_clear_n(l_modMultiple);
+    vli_set(l_modMultiple + uECC_N_WORDS + 1, curve_n);
+    vli_rshift1(l_modMultiple + uECC_N_WORDS + 1);
+    l_modMultiple[2 * uECC_N_WORDS - 1] |= HIGH_BIT_SET;
+    l_modMultiple[uECC_N_WORDS] = HIGH_BIT_SET;
+
+    bitcount_t i;
+    uECC_word_t l_index = 1;
+    for(i=0; i<=((((bitcount_t)uECC_N_WORDS) << uECC_WORD_BITS_SHIFT) + (uECC_WORD_BITS - 1)); ++i)
+    {
+        /* Trial subtraction into the other buffer; it is adopted only when
+           the subtraction did not borrow. */
+        uECC_word_t l_borrow = vli2_sub_n(v[1-l_index], v[l_index], l_modMultiple);
+        l_index = !(l_index ^ l_borrow); /* Swap the index if there was no borrow */
+        vli2_rshift1_n(l_modMultiple);
+    }
+
+    vli_set_n(p_result, v[l_index]);
+}
+
+#else
+
+/* For curves other than secp160r1 the "_n" helpers are plain aliases of the
+   generic routines, so the ECDSA code below can use one set of names. */
+#define vli_modInv_n vli_modInv
+#define vli_modAdd_n vli_modAdd
+
+/* Shift a 2 * uECC_WORDS-word value right by one bit, in place. The low half
+   is shifted first, the lowest bit of the high half is moved into the top bit
+   of the low half, and then the high half is shifted. The statement order
+   matters. */
+static void vli2_rshift1(uECC_word_t *p_vli)
+{
+    vli_rshift1(p_vli);
+    p_vli[uECC_WORDS-1] |= p_vli[uECC_WORDS] << (uECC_WORD_BITS - 1);
+    vli_rshift1(p_vli + uECC_WORDS);
+}
+
+/* p_result = p_left - p_right over 2 * uECC_WORDS words.
+   Returns the final borrow (non-zero when p_left < p_right). */
+static uECC_word_t vli2_sub(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_borrow = 0;
+    wordcount_t i;
+    for(i=0; i<uECC_WORDS*2; ++i)
+    {
+        uECC_word_t l_diff = p_left[i] - p_right[i] - l_borrow;
+        /* Equal result: the incoming borrow propagates unchanged. Otherwise a
+           wrapped (larger) result means this word borrowed. */
+        if(l_diff != p_left[i])
+        {
+            l_borrow = (l_diff > p_left[i]);
+        }
+        p_result[i] = l_diff;
+    }
+    return l_borrow;
+}
+
+/* Computes p_result = (p_left * p_right) % curve_n.
+
+   Generic (non-secp160r1) variant. The double-width product is reduced by
+   repeated trial subtraction: l_modMultiple starts as curve_n placed in the
+   upper uECC_WORDS words (low words zero) and is shifted right by one bit
+   after each of the uECC_BYTES * 8 + 1 steps. Two buffers are used
+   alternately; v[l_index] always points at the current remainder. */
+static void vli_modMult_n(uECC_word_t *p_result, uECC_word_t *p_left, uECC_word_t *p_right)
+{
+    uECC_word_t l_product[2 * uECC_WORDS];
+    uECC_word_t l_modMultiple[2 * uECC_WORDS];
+    uECC_word_t l_tmp[2 * uECC_WORDS];
+    uECC_word_t *v[2] = {l_tmp, l_product};
+
+    vli_mult(l_product, p_left, p_right);
+    vli_set(l_modMultiple + uECC_WORDS, curve_n); /* works if curve_n has its highest bit set */
+    vli_clear(l_modMultiple);
+
+    bitcount_t i;
+    uECC_word_t l_index = 1;
+    for(i=0; i<=uECC_BYTES * 8; ++i)
+    {
+        /* Trial subtraction into the other buffer; it is adopted only when
+           the subtraction did not borrow. */
+        uECC_word_t l_borrow = vli2_sub(v[1-l_index], v[l_index], l_modMultiple);
+        l_index = !(l_index ^ l_borrow); /* Swap the index if there was no borrow */
+        vli2_rshift1(l_modMultiple);
+    }
+
+    vli_set(p_result, v[l_index]);
+}
+#endif /* (uECC_CURVE != uECC_secp160r1) */
+
+/* Produce an ECDSA signature.
+
+   p_privateKey - private scalar d, uECC_BYTES.
+   p_hash       - message hash e, uECC_BYTES.
+   p_signature  - receives r followed by s, uECC_BYTES each.
+
+   Returns 1 on success, 0 if g_rng reports failure or if the number of
+   attempts in either retry loop reaches MAX_TRIES.
+
+   Outline: draw a random k in [1, n-1]; compute p = k*G using a scalar of
+   fixed bit length (k + n or k + 2n); r = p.x reduced mod n; then
+   s = (e + r*d) / k mod n, with the inversion of k blinded by a second
+   random value. */
+int uECC_sign(const uint8_t p_privateKey[uECC_BYTES], const uint8_t p_hash[uECC_BYTES], uint8_t p_signature[uECC_BYTES*2])
+{
+    uECC_word_t k[uECC_N_WORDS];
+    uECC_word_t l_tmp[uECC_N_WORDS];
+    uECC_word_t s[uECC_N_WORDS];
+    /* k2[0] = k + n (in l_tmp), k2[1] = k + 2n (in s); see below. */
+    uECC_word_t *k2[2] = {l_tmp, s};
+    EccPoint p;
+    uECC_word_t l_tries = 0;
+
+    do
+    {
+    repeat:
+        /* l_tries is only incremented when the RNG call itself succeeded. */
+        if(!g_rng((uint8_t *)k, sizeof(k)) || (l_tries++ >= MAX_TRIES))
+        {
+            return 0;
+        }
+
+        if(vli_isZero(k))
+        {
+            goto repeat;
+        }
+
+    #if (uECC_CURVE == uECC_secp160r1)
+        /* Keep only the lowest bit of the extra top word of k. */
+        k[uECC_WORDS] &= 0x01;
+        if(vli_cmp_n(curve_n, k) != 1)
+        {
+            goto repeat;
+        }
+
+        /* make sure that we don't leak timing information about k. See http://eprint.iacr.org/2011/232.pdf */
+        vli_add_n(l_tmp, k, curve_n);
+        uECC_word_t l_carry = (l_tmp[uECC_WORDS] & 0x02);
+        vli_add_n(s, l_tmp, curve_n);
+
+        /* p = k * G */
+        EccPoint_mult(&p, &curve_G, k2[!l_carry], 0, (uECC_BYTES * 8) + 2);
+    #else
+        if(vli_cmp(curve_n, k) != 1)
+        {
+            goto repeat;
+        }
+
+        /* make sure that we don't leak timing information about k. See http://eprint.iacr.org/2011/232.pdf */
+        uECC_word_t l_carry = vli_add(l_tmp, k, curve_n);
+        vli_add(s, l_tmp, curve_n);
+
+        /* p = k * G */
+        EccPoint_mult(&p, &curve_G, k2[!l_carry], 0, (uECC_BYTES * 8) + 1);
+
+        /* r = x1 (mod n) */
+        if(vli_cmp(curve_n, p.x) != 1)
+        {
+            vli_sub(p.x, p.x, curve_n);
+        }
+    #endif
+    } while(vli_isZero(p.x));
+
+    /* Draw a non-zero random blinding value into l_tmp. */
+    l_tries = 0;
+    do
+    {
+        if(!g_rng((uint8_t *)l_tmp, sizeof(l_tmp)) || (l_tries++ >= MAX_TRIES))
+        {
+            return 0;
+        }
+    } while(vli_isZero(l_tmp));
+
+    /* Prevent side channel analysis of vli_modInv() to determine
+       bits of k / the private key by premultiplying by a random number */
+    vli_modMult_n(k, k, l_tmp); /* k' = rand * k */
+    vli_modInv_n(k, k, curve_n); /* k = 1 / k' */
+    vli_modMult_n(k, k, l_tmp); /* k = 1 / k */
+
+    vli_nativeToBytes(p_signature, p.x); /* store r */
+
+    /* The top word is cleared first because the byte conversions and vli_set
+       below only fill the lower words of these uECC_N_WORDS buffers. */
+    l_tmp[uECC_N_WORDS-1] = 0;
+    vli_bytesToNative(l_tmp, p_privateKey); /* tmp = d */
+    s[uECC_N_WORDS-1] = 0;
+    vli_set(s, p.x);
+    vli_modMult_n(s, l_tmp, s); /* s = r*d */
+
+    vli_bytesToNative(l_tmp, p_hash);
+    vli_modAdd_n(s, l_tmp, s, curve_n); /* s = e + r*d */
+    vli_modMult_n(s, s, k); /* s = (e + r*d) / k */
+#if (uECC_CURVE == uECC_secp160r1)
+    /* s does not fit in uECC_BYTES: start over with a fresh k. This jumps
+       back into the first loop; l_tries keeps the value left by the second
+       loop. */
+    if(s[uECC_N_WORDS-1])
+    {
+        goto repeat;
+    }
+#endif
+    vli_nativeToBytes(p_signature + uECC_BYTES, s);
+
+    return 1;
+}
+
+/* Return the larger of two bit counts; b is returned when they are equal. */
+static bitcount_t smax(bitcount_t a, bitcount_t b)
+{
+    if(a > b)
+    {
+        return a;
+    }
+    return b;
+}
+
+/* Verify an ECDSA signature.
+
+   p_publicKey - signer's public key Q: x followed by y, uECC_BYTES each.
+   p_hash      - message hash e, uECC_BYTES.
+   p_signature - r followed by s, uECC_BYTES each.
+
+   Returns 1 if the signature is accepted, 0 otherwise. r and s are rejected
+   when zero and (except on secp160r1) when not below curve_n.
+
+   Outline: u1 = e/s, u2 = r/s (mod n); compute u1*G + u2*Q with a joint
+   double-and-add over both scalars, using a precomputed G + Q; accept when
+   the x coordinate of the result, reduced mod n, equals r.
+
+   NOTE(review): the public key is not validated in this function, and the
+   G + Q precomputation uses XYcZ_add, which presumably cannot handle
+   Q == G or Q == -G (x2 - x1 == 0) -- confirm. */
+int uECC_verify(const uint8_t p_publicKey[uECC_BYTES*2], const uint8_t p_hash[uECC_BYTES], const uint8_t p_signature[uECC_BYTES*2])
+{
+    uECC_word_t u1[uECC_N_WORDS], u2[uECC_N_WORDS];
+    uECC_word_t z[uECC_N_WORDS];
+    EccPoint l_public, l_sum;
+    uECC_word_t rx[uECC_WORDS];
+    uECC_word_t ry[uECC_WORDS];
+    uECC_word_t tx[uECC_WORDS];
+    uECC_word_t ty[uECC_WORDS];
+    uECC_word_t tz[uECC_WORDS];
+
+    uECC_word_t r[uECC_N_WORDS], s[uECC_N_WORDS];
+    /* Clear the top words: the byte conversions below only fill the lower
+       words of these uECC_N_WORDS buffers. */
+    r[uECC_N_WORDS-1] = 0;
+    s[uECC_N_WORDS-1] = 0;
+
+    vli_bytesToNative(l_public.x, p_publicKey);
+    vli_bytesToNative(l_public.y, p_publicKey + uECC_BYTES);
+    vli_bytesToNative(r, p_signature);
+    vli_bytesToNative(s, p_signature + uECC_BYTES);
+
+    if(vli_isZero(r) || vli_isZero(s))
+    { /* r, s must not be 0. */
+        return 0;
+    }
+
+#if (uECC_CURVE != uECC_secp160r1)
+    if(vli_cmp(curve_n, r) != 1 || vli_cmp(curve_n, s) != 1)
+    { /* r, s must be < n. */
+        return 0;
+    }
+#endif
+
+    /* Calculate u1 and u2. */
+    vli_modInv_n(z, s, curve_n); /* Z = s^-1 */
+    u1[uECC_N_WORDS-1] = 0;
+    vli_bytesToNative(u1, p_hash);
+    vli_modMult_n(u1, u1, z); /* u1 = e/s */
+    vli_modMult_n(u2, r, z); /* u2 = r/s */
+
+    /* Calculate l_sum = G + Q. */
+    vli_set(l_sum.x, l_public.x);
+    vli_set(l_sum.y, l_public.y);
+    vli_set(tx, curve_G.x);
+    vli_set(ty, curve_G.y);
+    vli_modSub_fast(z, l_sum.x, tx); /* Z = x2 - x1 */
+    XYcZ_add(tx, ty, l_sum.x, l_sum.y);
+    vli_modInv(z, z, curve_p); /* Z = 1/Z */
+    apply_z(l_sum.x, l_sum.y, z);
+
+    /* Use Shamir's trick to calculate u1*G + u2*Q */
+    /* Table indexed by (bit of u1) | (bit of u2) << 1; entry 0 means
+       "nothing to add for this bit position". */
+    EccPoint *l_points[4] = {0, &curve_G, &l_public, &l_sum};
+    bitcount_t l_numBits = smax(vli_numBits(u1, uECC_N_WORDS), vli_numBits(u2, uECC_N_WORDS));
+
+    /* Start from the table entry selected by the top bit position. */
+    EccPoint *l_point = l_points[(!!vli_testBit(u1, l_numBits-1)) | ((!!vli_testBit(u2, l_numBits-1)) << 1)];
+    vli_set(rx, l_point->x);
+    vli_set(ry, l_point->y);
+    vli_clear(z);
+    z[0] = 1;
+
+    bitcount_t i;
+    /* The loop counts down past zero, so it relies on bitcount_t being a
+       signed type (its typedef is outside this view). */
+    for(i = l_numBits - 2; i >= 0; --i)
+    {
+        EccPoint_double_jacobian(rx, ry, z);
+
+        uECC_word_t l_index = (!!vli_testBit(u1, i)) | ((!!vli_testBit(u2, i)) << 1);
+        l_point = l_points[l_index];
+        if(l_point)
+        {
+            /* Bring the table point to the current Z, add it, and fold the
+               (x2 - x1) factor into the running Z. */
+            vli_set(tx, l_point->x);
+            vli_set(ty, l_point->y);
+            apply_z(tx, ty, z);
+            vli_modSub_fast(tz, rx, tx); /* Z = x2 - x1 */
+            XYcZ_add(tx, ty, rx, ry);
+            vli_modMult_fast(z, z, tz);
+        }
+    }
+
+    vli_modInv(z, z, curve_p); /* Z = 1/Z */
+    apply_z(rx, ry, z);
+
+    /* v = x1 (mod n) */
+#if (uECC_CURVE != uECC_secp160r1)
+    if(vli_cmp(curve_n, rx) != 1)
+    {
+        vli_sub(rx, rx, curve_n);
+    }
+#endif
+
+    /* Accept only if v == r. */
+    return (vli_cmp(rx, r) == 0);
+}
old mode 100644 (file)
new mode 100755 (executable)
index 3c0d9b1..92e1313
-/*
- * Copyright (c) 2009 Chris K Cockrum <ckc@cockrum.net>
- *
- * Copyright (c) 2013 Jens Trillmann <jtrillma@tzi.de>
- * Copyright (c) 2013 Marc Müller-Weinhardt <muewei@tzi.de>
- * Copyright (c) 2013 Lars Schmertmann <lars@tzi.de>
- * Copyright (c) 2013 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *
- * This implementation is based in part on the paper Implementation of an
- * Elliptic Curve Cryptosystem on an 8-bit Microcontroller [0] by
- * Chris K Cockrum <ckc@cockrum.net>.
- *
- * [0]: http://cockrum.net/Implementation_of_ECC_on_an_8-bit_microcontroller.pdf
- *
- * This is a efficient ECC implementation on the secp256r1 curve for 32 Bit CPU
- * architectures. It provides basic operations on the secp256r1 curve and support
- * for ECDH and ECDSA.
- */
-#include <inttypes.h>
-
-#define keyLengthInBytes 32
-#define arrayLength 8
-
-extern const uint32_t ecc_g_point_x[8];
-extern const uint32_t ecc_g_point_y[8];
-
-//ec Functions
-void ecc_ec_mult(const uint32_t *px, const uint32_t *py, const uint32_t *secret, uint32_t *resultx, uint32_t *resulty);
-
-static inline void ecc_ecdh(const uint32_t *px, const uint32_t *py, const uint32_t *secret, uint32_t *resultx, uint32_t *resulty) {
-       ecc_ec_mult(px, py, secret, resultx, resulty);
-}
-int ecc_ecdsa_validate(const uint32_t *x, const uint32_t *y, const uint32_t *e, const uint32_t *r, const uint32_t *s);
-int ecc_ecdsa_sign(const uint32_t *d, const uint32_t *e, const uint32_t *k, uint32_t *r, uint32_t *s);
-
-int ecc_is_valid_key(const uint32_t * priv_key);
-static inline void ecc_gen_pub_key(const uint32_t *priv_key, uint32_t *pub_x, uint32_t *pub_y)
+/* Copyright 2014, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _MICRO_ECC_H_
+#define _MICRO_ECC_H_
+
+#include <stdint.h>
+
+/* Platform selection options.
+If uECC_PLATFORM is not defined, the code will try to guess it based on compiler macros.
+Possible values for uECC_PLATFORM are defined below: */
+#define uECC_arch_other 0
+#define uECC_x86        1
+#define uECC_x86_64     2
+#define uECC_arm        3
+#define uECC_arm_thumb  4
+#define uECC_avr        5
+
+/* If desired, you can define uECC_WORD_SIZE as appropriate for your platform (1, 4, or 8 bytes).
+If uECC_WORD_SIZE is not explicitly defined then it will be automatically set based on your platform. */
+
+/* Inline assembly options.
+uECC_asm_none  - Use standard C99 only.
+uECC_asm_small - Use GCC inline assembly for the target platform (if available), optimized for minimum size.
+uECC_asm_fast  - Use GCC inline assembly optimized for maximum speed. */
+#define uECC_asm_none  0
+#define uECC_asm_small 1
+#define uECC_asm_fast  2
+#ifndef uECC_ASM
+    #define uECC_ASM uECC_asm_none//uECC_asm_fast
+#endif
+
+/* Curve selection options. */
+#define uECC_secp160r1 1
+#define uECC_secp192r1 2
+#define uECC_secp256r1 3
+#define uECC_secp256k1 4
+#ifndef uECC_CURVE
+    #define uECC_CURVE uECC_secp256r1
+#endif
+
+/* uECC_SQUARE_FUNC - If enabled (defined as nonzero), this will cause a specific function to be used for (scalar) squaring
+    instead of the generic multiplication function. This will make things faster by about 8% but increases the code size. */
+#define uECC_SQUARE_FUNC 1
+
+/* Two-level token pasting: going through uECC_CONCAT lets the arguments be
+   macro-expanded before uECC_CONCAT1 pastes them together. */
+#define uECC_CONCAT1(a, b) a##b
+#define uECC_CONCAT(a, b) uECC_CONCAT1(a, b)
+
+/* Size in bytes of one coordinate / private key, indexed by the numeric
+   value of the curve selector defined above. */
+#define uECC_size_1 20 /* secp160r1 */
+#define uECC_size_2 24 /* secp192r1 */
+#define uECC_size_3 32 /* secp256r1 */
+#define uECC_size_4 32 /* secp256k1 */
+
+/* Expands to uECC_size_<uECC_CURVE>, i.e. the byte size for the selected curve. */
+#define uECC_BYTES uECC_CONCAT(uECC_size_, uECC_CURVE)
+
+#ifdef __cplusplus
+extern "C"
 {
-       ecc_ec_mult(ecc_g_point_x, ecc_g_point_y, priv_key, pub_x, pub_y);
-}
-
-#ifdef TEST_INCLUDE
-//ec Functions
-void ecc_ec_add(const uint32_t *px, const uint32_t *py, const uint32_t *qx, const uint32_t *qy, uint32_t *Sx, uint32_t *Sy);
-void ecc_ec_double(const uint32_t *px, const uint32_t *py, uint32_t *Dx, uint32_t *Dy);
-
-//simple Functions for addition and substraction of big numbers
-uint32_t ecc_add( const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length);
-uint32_t ecc_sub( const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length);
-
-//field functions for big numbers
-int ecc_fieldAdd(const uint32_t *x, const uint32_t *y, const uint32_t *reducer, uint32_t *result);
-int ecc_fieldSub(const uint32_t *x, const uint32_t *y, const uint32_t *modulus, uint32_t *result);
-int ecc_fieldMult(const uint32_t *x, const uint32_t *y, uint32_t *result, uint8_t length);
-void ecc_fieldModP(uint32_t *A, const uint32_t *B);
-void ecc_fieldModO(const uint32_t *A, uint32_t *result, uint8_t length);
-void ecc_fieldInv(const uint32_t *A, const uint32_t *modulus, const uint32_t *reducer, uint32_t *B);
-
-//simple functions to work with the big numbers
-void ecc_copy(const uint32_t *from, uint32_t *to, uint8_t length);
-int ecc_isSame(const uint32_t *A, const uint32_t *B, uint8_t length);
-void ecc_setZero(uint32_t *A, const int length);
-int ecc_isOne(const uint32_t* A);
-void ecc_rshift(uint32_t* A);
-int ecc_isGreater(const uint32_t *A, const uint32_t *B, uint8_t length);
-
-#endif /* TEST_INCLUDE */
+#endif
+
+/* uECC_RNG_Function type
+The RNG function should fill p_size random bytes into p_dest. It should return 1 if
+p_dest was filled with random data, or 0 if the random data could not be generated.
+The filled-in values should be either truly random, or from a cryptographically-secure PRNG.
+
+A correctly functioning RNG function must be set (using uECC_set_rng()) before calling
+uECC_make_key() or uECC_sign().
+
+A correct RNG function is set by default when building for Windows, Linux, or OS X.
+If you are building on another POSIX-compliant system that supports /dev/random or /dev/urandom,
+you can define uECC_POSIX to use the predefined RNG. For embedded platforms there is no predefined
+RNG function; you must provide your own.
+*/
+typedef int (*uECC_RNG_Function)(uint8_t *p_dest, unsigned p_size);
+
+/* uECC_set_rng() function.
+Set the function that will be used to generate random bytes. The RNG function should
+return 1 if the random data was generated, or 0 if the random data could not be generated.
+
+On platforms where there is no predefined RNG function (eg embedded platforms), this must
+be called before uECC_make_key() or uECC_sign() are used.
+
+Inputs:
+    p_rng  - The function that will be used to generate random bytes.
+*/
+void uECC_set_rng(uECC_RNG_Function p_rng);
+
+/* uECC_make_key() function.
+Create a public/private key pair.
+
+Outputs:
+    p_publicKey  - Will be filled in with the public key.
+    p_privateKey - Will be filled in with the private key.
+
+Returns 1 if the key pair was generated successfully, 0 if an error occurred.
+*/
+int uECC_make_key(uint8_t p_publicKey[uECC_BYTES*2], uint8_t p_privateKey[uECC_BYTES]);
+
+/* uECC_shared_secret() function.
+Compute a shared secret given your secret key and someone else's public key.
+Note: It is recommended that you hash the result of uECC_shared_secret() before using it for symmetric encryption or HMAC.
+
+Inputs:
+    p_publicKey  - The public key of the remote party.
+    p_privateKey - Your private key.
+
+Outputs:
+    p_secret - Will be filled in with the shared secret value.
+
+Returns 1 if the shared secret was generated successfully, 0 if an error occurred.
+*/
+int uECC_shared_secret(const uint8_t p_publicKey[uECC_BYTES*2], const uint8_t p_privateKey[uECC_BYTES], uint8_t p_secret[uECC_BYTES]);
+
+/* uECC_compress() function.
+Compress a public key.
+
+Inputs:
+    p_publicKey - The public key to compress.
+
+Outputs:
+    p_compressed - Will be filled in with the compressed public key.
+*/
+void uECC_compress(const uint8_t p_publicKey[uECC_BYTES*2], uint8_t p_compressed[uECC_BYTES+1]);
+
+/* uECC_decompress() function.
+Decompress a compressed public key.
+
+Inputs:
+    p_compressed - The compressed public key.
+
+Outputs:
+    p_publicKey - Will be filled in with the decompressed public key.
+*/
+void uECC_decompress(const uint8_t p_compressed[uECC_BYTES+1], uint8_t p_publicKey[uECC_BYTES*2]);
+
+/* uECC_sign() function.
+Generate an ECDSA signature for a given hash value.
+
+Usage: Compute a hash of the data you wish to sign (SHA-2 is recommended) and pass it in to
+this function along with your private key.
+
+Inputs:
+    p_privateKey - Your private key.
+    p_hash       - The message hash to sign.
+
+Outputs:
+    p_signature  - Will be filled in with the signature value.
+
+Returns 1 if the signature was generated successfully, 0 if an error occurred.
+*/
+int uECC_sign(const uint8_t p_privateKey[uECC_BYTES], const uint8_t p_hash[uECC_BYTES], uint8_t p_signature[uECC_BYTES*2]);
+
+/* uECC_verify() function.
+Verify an ECDSA signature.
+
+Usage: Compute the hash of the signed data using the same hash as the signer and
+pass it to this function along with the signer's public key and the signature values (r and s).
+
+Inputs:
+    p_publicKey - The signer's public key.
+    p_hash      - The hash of the signed data.
+    p_signature - The signature value.
+
+Returns 1 if the signature is valid, 0 if it is invalid.
+*/
+int uECC_verify(const uint8_t p_publicKey[uECC_BYTES*2], const uint8_t p_hash[uECC_BYTES], const uint8_t p_signature[uECC_BYTES*2]);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* _MICRO_ECC_H_ */
diff --git a/extlibs/tinydtls/ecc/test/ecc_test/ecc_test.ino b/extlibs/tinydtls/ecc/test/ecc_test/ecc_test.ino
new file mode 100755 (executable)
index 0000000..04c5e4a
--- /dev/null
@@ -0,0 +1,115 @@
+#include <uECC.h>
+
+#include <j0g.h>
+#include <js0n.h>
+
+#include <lwm.h>
+
+#include <bitlash.h>
+
+#include <GS.h>
+
+#include <SPI.h>
+#include <Wire.h>
+#include <Scout.h>
+#include <Shell.h>
+
+
+#include <uECC.h>
+
+extern "C" {
+
+static int RNG(uint8_t *p_dest, unsigned p_size)
+{
+  // Use the least-significant bits from the ADC for an unconnected pin (or connected to a source of random noise)
+  // This can take a long time to generate random data if the result of analogRead(0) doesn't change very frequently.
+  while(p_size) {
+    uint8_t l_val = 0;
+    for(unsigned i=0; i<8; ++i)
+    {
+      int l_init = analogRead(0);
+      int l_count = 0;
+      while(analogRead(0) == l_init)
+      {
+        ++l_count;
+      }
+      
+      if(l_count == 0)
+      {
+         l_val = (l_val << 1) | (l_init & 0x01);
+      }
+      else
+      {
+         l_val = (l_val << 1) | (l_count & 0x01);
+      }
+    }
+    *p_dest = l_val;
+    ++p_dest;
+    --p_size;
+  }
+  
+  // NOTE: it would be a good idea to hash the resulting random data using SHA-256 or similar.
+  return 1;
+}
+
+}
+
+void setup()
+{
+  Scout.setup();
+  
+  Serial.print("Testing ecc\n");
+  
+  uECC_set_rng(&RNG);
+}
+
+void loop() {
+  uint8_t l_private1[uECC_BYTES];
+  uint8_t l_private2[uECC_BYTES];
+  
+  uint8_t l_public1[uECC_BYTES * 2];
+  uint8_t l_public2[uECC_BYTES * 2];
+  
+  uint8_t l_secret1[uECC_BYTES];
+  uint8_t l_secret2[uECC_BYTES];
+  
+  unsigned long a = millis();
+  uECC_make_key(l_public1, l_private1);
+  unsigned long b = millis();
+  
+  Serial.print("Made key 1 in "); Serial.println(b-a);
+  a = millis();
+  uECC_make_key(l_public2, l_private2);
+  b = millis();
+  Serial.print("Made key 2 in "); Serial.println(b-a);
+
+  a = millis();
+  int r = uECC_shared_secret(l_public2, l_private1, l_secret1);
+  b = millis();
+  Serial.print("Shared secret 1 in "); Serial.println(b-a);
+  if(!r)
+  {
+    Serial.print("shared_secret() failed (1)\n");
+    return;
+  }
+
+  a = millis();
+  r = uECC_shared_secret(l_public1, l_private2, l_secret2);
+  b = millis();
+  Serial.print("Shared secret 2 in "); Serial.println(b-a);
+  if(!r)
+  {
+    Serial.print("shared_secret() failed (2)\n");
+    return;
+  }
+    
+  if(memcmp(l_secret1, l_secret2, sizeof(l_secret1)) != 0)
+  {
+    Serial.print("Shared secrets are not identical!\n");
+  }
+  else
+  {
+    Serial.print("Shared secrets are identical\n");
+  }
+}
+
diff --git a/extlibs/tinydtls/ecc/test/emk_rules.py b/extlibs/tinydtls/ecc/test/emk_rules.py
new file mode 100755 (executable)
index 0000000..956ccf5
--- /dev/null
@@ -0,0 +1,4 @@
+c, link = emk.module("c", "link")  # fetch emk's "c" and "link" modules
+link.depdirs += [  # append to the link module's dependency-directory list
+    "$:proj:$"  # emk placeholder; presumably the project directory - confirm against the emk docs
+]
diff --git a/extlibs/tinydtls/ecc/test/test_ecdh.c b/extlibs/tinydtls/ecc/test/test_ecdh.c
new file mode 100755 (executable)
index 0000000..cef274d
--- /dev/null
@@ -0,0 +1,130 @@
+/* Copyright 2014, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#include "../ecc.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#if LPC11XX
+
+#include "/Projects/lpc11xx/peripherals/uart.h"
+#include "/Projects/lpc11xx/peripherals/time.h"
+
+/* State of the xorshift generator behind fake_rng(). */
+static uint64_t g_rand = 88172645463325252ull;
+
+/* Deterministic stand-in RNG for the LPC11XX build of this test.
+ *
+ * Writes p_size bytes to p_dest, taking up to 8 bytes from each step of a
+ * 64-bit xorshift sequence. The output is predictable: use it for tests only,
+ * never to generate real keys.
+ *
+ * Always returns 1, the uECC_RNG_Function success value.
+ */
+int fake_rng(uint8_t *p_dest, unsigned p_size)
+{
+    while(p_size)
+    {
+        g_rand ^= (g_rand << 13);
+        g_rand ^= (g_rand >> 7);
+        g_rand ^= (g_rand << 17);
+
+        unsigned l_amount = (p_size > 8 ? 8 : p_size);
+        memcpy(p_dest, &g_rand, l_amount);
+        /* Advance the write position; otherwise every chunk overwrites the
+           first 8 bytes and the rest of the buffer is never filled. */
+        p_dest += l_amount;
+        p_size -= l_amount;
+    }
+    return 1;
+}
+
+#endif
+
+/* Print the p_size bytes of p_vli as two-digit hex values, highest index first. */
+void vli_print(const uint8_t *p_vli, unsigned int p_size)
+{
+    while(p_size)
+    {
+        printf("%02X ", (unsigned)p_vli[p_size - 1]);
+        --p_size;
+    }
+}
+
+/* ECDH self-test: for 256 iterations, generate two key pairs and check that both
+ * sides derive the same shared secret.
+ *
+ * Returns 0 when every iteration agrees; returns 1 if any uECC call fails or any
+ * pair of shared secrets differs.
+ */
+int main()
+{
+#if LPC11XX
+    uartInit(BAUD_115200);
+    initTime();
+
+    uECC_set_rng(&fake_rng);
+#endif
+
+    int i;
+    int l_mismatches = 0;
+
+    uint8_t l_private1[uECC_BYTES];
+    uint8_t l_private2[uECC_BYTES];
+
+    uint8_t l_public1[uECC_BYTES * 2];
+    uint8_t l_public2[uECC_BYTES * 2];
+
+    uint8_t l_secret1[uECC_BYTES];
+    uint8_t l_secret2[uECC_BYTES];
+
+    printf("Testing 256 random private key pairs\n");
+
+    for(i=0; i<256; ++i)
+    {
+        printf(".");
+    #if !LPC11XX
+        fflush(stdout);
+    #endif
+
+        if(!uECC_make_key(l_public1, l_private1) || !uECC_make_key(l_public2, l_private2))
+        {
+            printf("uECC_make_key() failed\n");
+            return 1;
+        }
+
+        if(!uECC_shared_secret(l_public2, l_private1, l_secret1))
+        {
+            printf("shared_secret() failed (1)\n");
+            return 1;
+        }
+
+        if(!uECC_shared_secret(l_public1, l_private2, l_secret2))
+        {
+            printf("shared_secret() failed (2)\n");
+            return 1;
+        }
+
+        if(memcmp(l_secret1, l_secret2, sizeof(l_secret1)) != 0)
+        {
+            /* Keep going so every mismatch is reported, but remember the failure. */
+            ++l_mismatches;
+            printf("Shared secrets are not identical!\n");
+            printf("Shared secret 1 = ");
+            vli_print(l_secret1, uECC_BYTES);
+            printf("\n");
+            printf("Shared secret 2 = ");
+            vli_print(l_secret2, uECC_BYTES);
+            printf("\n");
+            printf("Private key 1 = ");
+            vli_print(l_private1, uECC_BYTES);
+            printf("\n");
+            printf("Private key 2 = ");
+            vli_print(l_private2, uECC_BYTES);
+            printf("\n");
+        }
+    }
+    printf("\n");
+
+    return (l_mismatches != 0);
+}
diff --git a/extlibs/tinydtls/ecc/test/test_ecdsa.c b/extlibs/tinydtls/ecc/test/test_ecdsa.c
new file mode 100755 (executable)
index 0000000..f3983bb
--- /dev/null
@@ -0,0 +1,102 @@
+/* Copyright 2014, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#include "../ecc.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#if LPC11XX
+
+#include "/Projects/lpc11xx/peripherals/uart.h"
+#include "/Projects/lpc11xx/peripherals/time.h"
+
+/* State of the xorshift generator behind fake_rng(). */
+static uint64_t g_rand = 88172645463325252ull;
+
+/* Deterministic stand-in RNG for the LPC11XX build of this test.
+ *
+ * Writes p_size bytes to p_dest, taking up to 8 bytes from each step of a
+ * 64-bit xorshift sequence. The output is predictable: use it for tests only,
+ * never to generate real keys or signatures.
+ *
+ * Always returns 1, the uECC_RNG_Function success value.
+ */
+int fake_rng(uint8_t *p_dest, unsigned p_size)
+{
+    while(p_size)
+    {
+        g_rand ^= (g_rand << 13);
+        g_rand ^= (g_rand >> 7);
+        g_rand ^= (g_rand << 17);
+
+        unsigned l_amount = (p_size > 8 ? 8 : p_size);
+        memcpy(p_dest, &g_rand, l_amount);
+        /* Advance the write position; otherwise every chunk overwrites the
+           first 8 bytes and the rest of the buffer is never filled. */
+        p_dest += l_amount;
+        p_size -= l_amount;
+    }
+    return 1;
+}
+
+#endif
+
+/* ECDSA self-test: for 256 iterations, generate a key pair, sign a stand-in hash
+ * (the first uECC_BYTES bytes of the public key) and verify the signature.
+ *
+ * Every failure is reported and the loop continues. Returns 0 when all
+ * iterations pass, 1 if any step failed.
+ */
+int main()
+{
+#if LPC11XX
+    uartInit(BAUD_115200);
+    initTime();
+
+    uECC_set_rng(&fake_rng);
+#endif
+
+    uint8_t l_public[uECC_BYTES*2];
+    uint8_t l_private[uECC_BYTES];
+
+    uint8_t l_hash[uECC_BYTES];
+
+    uint8_t l_sig[uECC_BYTES*2];
+
+    int i;
+    int l_failures = 0;
+
+    printf("Testing 256 signatures\n");
+
+    for(i=0; i<256; ++i)
+    {
+        printf(".");
+    #if !LPC11XX
+        fflush(stdout);
+    #endif
+
+        if(!uECC_make_key(l_public, l_private))
+        {
+            printf("uECC_make_key() failed\n");
+            ++l_failures;
+            continue;
+        }
+        memcpy(l_hash, l_public, uECC_BYTES);
+
+        if(!uECC_sign(l_private, l_hash, l_sig))
+        {
+            printf("uECC_sign() failed\n");
+            ++l_failures;
+            continue;
+        }
+
+        if(!uECC_verify(l_public, l_hash, l_sig))
+        {
+            printf("uECC_verify() failed\n");
+            ++l_failures;
+        }
+    }
+    printf("\n");
+
+    return (l_failures != 0);
+}
diff --git a/extlibs/tinydtls/ecc/test_helper.c b/extlibs/tinydtls/ecc/test_helper.c
deleted file mode 100644 (file)
index bda44ba..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2009 Chris K Cockrum <ckc@cockrum.net>
- *
- * Copyright (c) 2013 Jens Trillmann <jtrillma@tzi.de>
- * Copyright (c) 2013 Marc Müller-Weinhardt <muewei@tzi.de>
- * Copyright (c) 2013 Lars Schmertmann <lars@tzi.de>
- * Copyright (c) 2013 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *
- * This implementation is based in part on the paper Implementation of an
- * Elliptic Curve Cryptosystem on an 8-bit Microcontroller [0] by
- * Chris K Cockrum <ckc@cockrum.net>.
- *
- * [0]: http://cockrum.net/Implementation_of_ECC_on_an_8-bit_microcontroller.pdf
- *
- * This is a efficient ECC implementation on the secp256r1 curve for 32 Bit CPU
- * architectures. It provides basic operations on the secp256r1 curve and support
- * for ECDH and ECDSA.
- */
-#include "test_helper.h"
-#include "ecc.h"
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-void ecc_printNumber(const uint32_t *x, int numberLength){ //here the values are turned to MSB!
-       int n;
-
-       for(n = numberLength - 1; n >= 0; n--){
-               printf("%08x", x[n]);
-       }
-       printf("\n");
-}
-
-void ecc_setRandom(uint32_t *secret){
-       int i;
-
-       for (i = 0; i < arrayLength; ++i)
-       {
-               secret[i] = rand();
-       }
-}
-const uint32_t ecc_prime_m[8] = {0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
-                                0x00000000, 0x00000000, 0x00000001, 0xffffffff};
-
-                                                       
-/* This is added after an static byte addition if the answer has a carry in MSB*/
-const uint32_t ecc_prime_r[8] = {0x00000001, 0x00000000, 0x00000000, 0xffffffff,
-                                0xffffffff, 0xffffffff, 0xfffffffe, 0x00000000};
-
-#ifdef CONTIKI
-void
-test_assert(const char *file, int lineno)
-{
-  printf("Assertion failed: file %s, line %d.\n", file, lineno);
-  /*
-   * loop for a while;
-   * call _reset_vector__();
-   */
-}
-#endif
diff --git a/extlibs/tinydtls/ecc/test_helper.h b/extlibs/tinydtls/ecc/test_helper.h
deleted file mode 100644 (file)
index 38a194e..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2009 Chris K Cockrum <ckc@cockrum.net>
- *
- * Copyright (c) 2013 Jens Trillmann <jtrillma@tzi.de>
- * Copyright (c) 2013 Marc Müller-Weinhardt <muewei@tzi.de>
- * Copyright (c) 2013 Lars Schmertmann <lars@tzi.de>
- * Copyright (c) 2013 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *
- * This implementation is based in part on the paper Implementation of an
- * Elliptic Curve Cryptosystem on an 8-bit Microcontroller [0] by
- * Chris K Cockrum <ckc@cockrum.net>.
- *
- * [0]: http://cockrum.net/Implementation_of_ECC_on_an_8-bit_microcontroller.pdf
- *
- * This is a efficient ECC implementation on the secp256r1 curve for 32 Bit CPU
- * architectures. It provides basic operations on the secp256r1 curve and support
- * for ECDH and ECDSA.
- */
-#include <inttypes.h>
-
-extern const uint32_t ecc_prime_m[8];
-extern const uint32_t ecc_prime_r[8];
-
-//debug function to print long numbers
-void ecc_printNumber(const uint32_t *x, int numberLength);
-void ecc_setRandom(uint32_t *secret);
-
-#ifdef CONTIKI
-#undef assert
-#define assert(e) ((e) ? (void)0 : test_assert(__FILE__, __LINE__))
-void test_assert(const char *, int);
-#endif
diff --git a/extlibs/tinydtls/ecc/testecc.c b/extlibs/tinydtls/ecc/testecc.c
deleted file mode 100644 (file)
index b36d46b..0000000
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2009 Chris K Cockrum <ckc@cockrum.net>
- *
- * Copyright (c) 2013 Jens Trillmann <jtrillma@tzi.de>
- * Copyright (c) 2013 Marc Müller-Weinhardt <muewei@tzi.de>
- * Copyright (c) 2013 Lars Schmertmann <lars@tzi.de>
- * Copyright (c) 2013 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *
- * This implementation is based in part on the paper Implementation of an
- * Elliptic Curve Cryptosystem on an 8-bit Microcontroller [0] by
- * Chris K Cockrum <ckc@cockrum.net>.
- *
- * [0]: http://cockrum.net/Implementation_of_ECC_on_an_8-bit_microcontroller.pdf
- *
- * This is a efficient ECC implementation on the secp256r1 curve for 32 Bit CPU
- * architectures. It provides basic operations on the secp256r1 curve and support
- * for ECDH and ECDSA.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-
-#include "ecc.h"
-#include "test_helper.h"
-
-#ifdef CONTIKI
-#include "contiki.h"
-#else
-#include <time.h>
-#endif /* CONTIKI */
-
-//These are testvalues taken from the NIST P-256 definition
-//6b17d1f2 e12c4247 f8bce6e5 63a440f2 77037d81 2deb33a0 f4a13945 d898c296
-uint32_t BasePointx[8] = {     0xd898c296, 0xf4a13945, 0x2deb33a0, 0x77037d81,
-                                                       0x63a440f2, 0xf8bce6e5, 0xe12c4247, 0x6b17d1f2};
-
-//4fe342e2 fe1a7f9b 8ee7eb4a 7c0f9e16 2bce3357 6b315ece cbb64068 37bf51f5
-uint32_t BasePointy[8] = {     0x37bf51f5, 0xcbb64068, 0x6b315ece, 0x2bce3357,
-                                                       0x7c0f9e16, 0x8ee7eb4a, 0xfe1a7f9b, 0x4fe342e2};
-
-//de2444be bc8d36e6 82edd27e 0f271508 617519b3 221a8fa0 b77cab39 89da97c9
-uint32_t Sx[8] = {     0x89da97c9, 0xb77cab39, 0x221a8fa0, 0x617519b3, 
-                                       0x0f271508, 0x82edd27e, 0xbc8d36e6, 0xde2444be};
-
-//c093ae7f f36e5380 fc01a5aa d1e66659 702de80f 53cec576 b6350b24 3042a256
-uint32_t Sy[8] = {     0x3042a256, 0xb6350b24, 0x53cec576, 0x702de80f,
-                                       0xd1e66659, 0xfc01a5aa, 0xf36e5380, 0xc093ae7f};
-
-//55a8b00f 8da1d44e 62f6b3b2 5316212e 39540dc8 61c89575 bb8cf92e 35e0986b
-uint32_t Tx[8] = {     0x35e0986b, 0xbb8cf92e, 0x61c89575, 0x39540dc8,
-                                       0x5316212e, 0x62f6b3b2, 0x8da1d44e, 0x55a8b00f};
-
-//5421c320 9c2d6c70 4835d82a c4c3dd90 f61a8a52 598b9e7a b656e9d8 c8b24316
-uint32_t Ty[8] = {     0xc8b24316, 0xb656e9d8, 0x598b9e7a, 0xf61a8a52,
-                                       0xc4c3dd90, 0x4835d82a, 0x9c2d6c70, 0x5421c320};
-
-//c51e4753 afdec1e6 b6c6a5b9 92f43f8d d0c7a893 3072708b 6522468b 2ffb06fd
-uint32_t secret[8] = { 0x2ffb06fd, 0x6522468b, 0x3072708b, 0xd0c7a893,
-                                               0x92f43f8d, 0xb6c6a5b9, 0xafdec1e6, 0xc51e4753};
-                                                       
-//72b13dd4 354b6b81 745195e9 8cc5ba69 70349191 ac476bd4 553cf35a 545a067e
-uint32_t resultAddx[8] = {     0x545a067e, 0x553cf35a, 0xac476bd4, 0x70349191,
-                                                       0x8cc5ba69, 0x745195e9, 0x354b6b81, 0x72b13dd4};
-
-//8d585cbb 2e1327d7 5241a8a1 22d7620d c33b1331 5aa5c9d4 6d013011 744ac264
-uint32_t resultAddy[8] = {     0x744ac264, 0x6d013011, 0x5aa5c9d4, 0xc33b1331,
-                                                       0x22d7620d, 0x5241a8a1, 0x2e1327d7, 0x8d585cbb};
-
-//7669e690 1606ee3b a1a8eef1 e0024c33 df6c22f3 b17481b8 2a860ffc db6127b0
-uint32_t resultDoublex[8] = {  0xdb6127b0, 0x2a860ffc, 0xb17481b8, 0xdf6c22f3,
-                                                               0xe0024c33, 0xa1a8eef1, 0x1606ee3b, 0x7669e690};
-
-//fa878162 187a54f6 c39f6ee0 072f33de 389ef3ee cd03023d e10ca2c1 db61d0c7
-uint32_t resultDoubley[8] = {  0xdb61d0c7, 0xe10ca2c1, 0xcd03023d, 0x389ef3ee,
-                                                               0x072f33de, 0xc39f6ee0, 0x187a54f6, 0xfa878162};
-
-//51d08d5f 2d427888 2946d88d 83c97d11 e62becc3 cfc18bed acc89ba3 4eeca03f
-uint32_t resultMultx[8] = {    0x4eeca03f, 0xacc89ba3, 0xcfc18bed, 0xe62becc3,
-                                                       0x83c97d11, 0x2946d88d, 0x2d427888, 0x51d08d5f};
-
-//75ee68eb 8bf626aa 5b673ab5 1f6e744e 06f8fcf8 a6c0cf30 35beca95 6a7b41d5
-uint32_t resultMulty[8] = {    0x6a7b41d5, 0x35beca95, 0xa6c0cf30, 0x06f8fcf8,
-                                                       0x1f6e744e, 0x5b673ab5, 0x8bf626aa, 0x75ee68eb};
-
-static const uint32_t ecdsaTestMessage[] = { 0x65637572, 0x20612073, 0x68206F66, 0x20686173, 0x69732061, 0x68697320, 0x6F2C2054, 0x48616C6C};
-
-static const uint32_t ecdsaTestSecret[] = {0x94A949FA, 0x401455A1, 0xAD7294CA, 0x896A33BB, 0x7A80E714, 0x4321435B, 0x51247A14, 0x41C1CB6B};
-
-static const uint32_t ecdsaTestRand1[] = { 0x1D1E1F20, 0x191A1B1C, 0x15161718, 0x11121314, 0x0D0E0F10, 0x090A0B0C, 0x05060708, 0x01020304};
-static const uint32_t ecdsaTestresultR1[] = { 0xC3B4035F, 0x515AD0A6, 0xBF375DCA, 0x0CC1E997, 0x7F54FDCD, 0x04D3FECA, 0xB9E396B9, 0x515C3D6E};
-static const uint32_t ecdsaTestresultS1[] = { 0x5366B1AB, 0x0F1DBF46, 0xB0C8D3C4, 0xDB755B6F, 0xB9BF9243, 0xE644A8BE, 0x55159A59, 0x6F9E52A6};
-
-static const uint32_t ecdsaTestRand2[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x01FFFFFF};
-static const uint32_t ecdsaTestresultR2[] = { 0x14146C91, 0xE878724D, 0xCD4FF928, 0xCC24BC04, 0xAC403390, 0x650C0060, 0x4A30B3F1, 0x9C69B726};
-static const uint32_t ecdsaTestresultS2[] = { 0x433AAB6F, 0x808250B1, 0xE46F90F4, 0xB342E972, 0x18B2F7E4, 0x2DB981A2, 0x6A288FA4, 0x41CF59DB};
-
-void addTest(){
-       uint32_t tempx[8];
-       uint32_t tempy[8];
-
-       ecc_ec_add(Tx, Ty, Sx, Sy, tempx, tempy);
-       assert(ecc_isSame(tempx, resultAddx, arrayLength));
-       assert(ecc_isSame(tempy, resultAddy, arrayLength));
-}
-
-void doubleTest(){
-       uint32_t tempx[8];
-       uint32_t tempy[8];
-
-       ecc_ec_double(Sx, Sy, tempx, tempy);
-       assert(ecc_isSame(tempx, resultDoublex, arrayLength));
-       assert(ecc_isSame(tempy, resultDoubley, arrayLength));
-}
-
-void multTest(){
-       uint32_t tempx[8];
-       uint32_t tempy[8];
-
-       ecc_ec_mult(Sx, Sy, secret, tempx, tempy);
-       assert(ecc_isSame(tempx, resultMultx, arrayLength));
-       assert(ecc_isSame(tempy, resultMulty, arrayLength));
-}
-
-void eccdhTest(){
-       uint32_t tempx[8];
-       uint32_t tempy[8];
-       uint32_t tempAx2[8];
-       uint32_t tempAy2[8];
-       uint32_t tempBx1[8];
-       uint32_t tempBy1[8];
-       uint32_t tempBx2[8];
-       uint32_t tempBy2[8];    
-       uint32_t secretA[8];
-       uint32_t secretB[8];
-       ecc_setRandom(secretA);
-       ecc_printNumber(secretA, 8);
-       ecc_setRandom(secretB);
-       ecc_printNumber(secretB, 8);
-       ecc_ec_mult(BasePointx, BasePointy, secretA, tempx, tempy);
-       ecc_ec_mult(BasePointx, BasePointy, secretB, tempBx1, tempBy1);
-       //public key exchange
-       ecc_ec_mult(tempBx1, tempBy1, secretA, tempAx2, tempAy2);
-       ecc_ec_mult(tempx, tempy, secretB, tempBx2, tempBy2);
-       assert(ecc_isSame(tempAx2, tempBx2, arrayLength));
-       assert(ecc_isSame(tempAy2, tempBy2, arrayLength));
-
-}
-
-void ecdsaTest() {
-       int ret __attribute__((unused));
-       uint32_t tempx[9];
-       uint32_t tempy[9];
-       uint32_t pub_x[8];
-       uint32_t pub_y[8];
-
-       ecc_ec_mult(BasePointx, BasePointy, ecdsaTestSecret, pub_x, pub_y);
-
-       ret = ecc_ecdsa_sign(ecdsaTestSecret, ecdsaTestMessage, ecdsaTestRand1, tempx, tempy);
-       assert(ecc_isSame(tempx, ecdsaTestresultR1, arrayLength));
-       assert(ecc_isSame(tempy, ecdsaTestresultS1, arrayLength));
-       assert(ret == 0);
-
-       ret = ecc_ecdsa_validate(pub_x, pub_y, ecdsaTestMessage, tempx, tempy);
-       assert(!ret);
-
-
-       ret = ecc_ecdsa_sign(ecdsaTestSecret, ecdsaTestMessage, ecdsaTestRand2, tempx, tempy);
-       assert(ecc_isSame(tempx, ecdsaTestresultR2, arrayLength));
-       assert(ecc_isSame(tempy, ecdsaTestresultS2, arrayLength));
-       assert(ret == 0);
-
-       ret = ecc_ecdsa_validate(pub_x, pub_y, ecdsaTestMessage, tempx, tempy);
-       assert(!ret);
-}
-
-#ifdef CONTIKI
-PROCESS(ecc_filed_test, "ECC test");
-AUTOSTART_PROCESSES(&ecc_filed_test);
-PROCESS_THREAD(ecc_filed_test, ev, d)
-{
-       PROCESS_BEGIN();
-
-       srand(1234);
-       addTest();
-       doubleTest();
-       multTest();
-       eccdhTest();
-       ecdsaTest();
-       printf("%s\n", "All Tests successful.");
-
-       PROCESS_END();
-}
-#else /* CONTIKI */
-int main(int argc, char const *argv[])
-{
-       srand(time(NULL));
-       addTest();
-       doubleTest();
-       multTest();
-       eccdhTest();
-       ecdsaTest();
-       printf("%s\n", "All Tests successful.");
-       return 0;
-}
-#endif /* CONTIKI */
diff --git a/extlibs/tinydtls/ecc/testfield.c b/extlibs/tinydtls/ecc/testfield.c
deleted file mode 100644 (file)
index 30a690e..0000000
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright (c) 2009 Chris K Cockrum <ckc@cockrum.net>
- *
- * Copyright (c) 2013 Jens Trillmann <jtrillma@tzi.de>
- * Copyright (c) 2013 Marc Müller-Weinhardt <muewei@tzi.de>
- * Copyright (c) 2013 Lars Schmertmann <lars@tzi.de>
- * Copyright (c) 2013 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *
- * This implementation is based in part on the paper Implementation of an
- * Elliptic Curve Cryptosystem on an 8-bit Microcontroller [0] by
- * Chris K Cockrum <ckc@cockrum.net>.
- *
- * [0]: http://cockrum.net/Implementation_of_ECC_on_an_8-bit_microcontroller.pdf
- *
- * This is a efficient ECC implementation on the secp256r1 curve for 32 Bit CPU
- * architectures. It provides basic operations on the secp256r1 curve and support
- * for ECDH and ECDSA.
- */
-#include <assert.h>
-#include <string.h>
-#include <stdio.h>
-#include "ecc.h"
-#include "test_helper.h"
-
-#ifdef CONTIKI
-#include "contiki.h"
-#endif /* CONTIKI */
-
-//arbitrary test values and results
-uint32_t null[8] = {   0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t null64[16] = {        0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t one[8] = {    0x00000001,0x00000000,0x00000000,0x00000000,
-                                       0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t one64[16] = { 0x00000001,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t two[8] = {    0x00000002,0x00000000,0x00000000,0x00000000,
-                                       0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t two64[16] = { 0x00000002,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t three[8] = {  0x00000003,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t four[8] = {0x00000004,0x00000000,0x00000000,0x00000000,
-                                       0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t four64[16] = {        0x00000004,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t six[8] = {    0x00000006,0x00000000,0x00000000,0x00000000,
-                                       0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t eight[8] = {  0x00000008,0x00000000,0x00000000,0x00000000,
-                                               0x00000000,0x00000000,0x00000000,0x00000000};
-uint32_t full[8] = {   0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
-                                               0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-//00000000fffffffeffffffffffffffffffffffff000000000000000000000001_16
-uint32_t resultFullAdd[8] = {  0x00000001,0x00000000,0x00000000,0xFFFFFFFF,
-                                                               0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0x00000000};
-uint32_t primeMinusOne[8]=     {       0xfffffffe,0xffffffff,0xffffffff,0x00000000,
-                                                               0x00000000,0x00000000,0x00000001,0xffffffff};
-uint32_t resultDoubleMod[8] = { 0xfffffffd,0xffffffff,0xffffffff,0x00000000,
-                                                               0x00000000,0x00000000,0x00000001,0xffffffff};
-//fffffffe00000002fffffffe0000000100000001fffffffe00000001fffffffc00000003fffffffcfffffffffffffffffffffffc000000000000000000000004_16
-uint32_t resultQuadMod[16] = { 0x00000004,0x00000000,0x00000000,0xFFFFFFFC,
-                                                               0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFC,0x00000003,
-                                                               0xFFFFFFFC,0x00000001,0xFFFFFFFE,0x00000001,
-                                                               0x00000001,0xFFFFFFFE,0x00000002,0xFFFFFFFE};
-//00000002fffffffffffffffffffffffefffffffdffffffff0000000000000002_16
-uint32_t resultFullMod[8] = {  0x00000002,0x00000000,0xFFFFFFFF,0xFFFFFFFD,
-                                                               0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0x00000002};
-
-static const uint32_t orderMinusOne[8] = {0xFC632550, 0xF3B9CAC2, 0xA7179E84, 0xBCE6FAAD,
-                                       0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF};
-static const uint32_t orderResultDoubleMod[8] = {0xFC63254F, 0xF3B9CAC2, 0xA7179E84, 0xBCE6FAAD, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF};
-
-uint32_t temp[8];
-uint32_t temp2[16];
-
-void nullEverything(){
-       memset(temp, 0, sizeof(temp));
-       memset(temp2, 0, sizeof(temp));
-}
-
-void fieldAddTest(){
-       assert(ecc_isSame(one, one, arrayLength));
-       ecc_fieldAdd(one, null, ecc_prime_r, temp);
-       assert(ecc_isSame(temp, one, arrayLength));
-       nullEverything();
-       ecc_fieldAdd(one, one, ecc_prime_r, temp);
-       assert(ecc_isSame(temp, two, arrayLength));
-       nullEverything();
-       ecc_add(full, one, temp, 32);
-       assert(ecc_isSame(null, temp, arrayLength));
-       nullEverything();
-       ecc_fieldAdd(full, one, ecc_prime_r, temp);
-       assert(ecc_isSame(temp, resultFullAdd, arrayLength));
-}
-
-void fieldSubTest(){
-       assert(ecc_isSame(one, one, arrayLength));
-       ecc_fieldSub(one, null, ecc_prime_m, temp);
-       assert(ecc_isSame(one, temp, arrayLength));
-       nullEverything();
-       ecc_fieldSub(one, one, ecc_prime_m, temp);
-       assert(ecc_isSame(null, temp, arrayLength));
-       nullEverything();
-       ecc_fieldSub(null, one, ecc_prime_m, temp);
-       assert(ecc_isSame(primeMinusOne, temp, arrayLength));
-}
-
-void fieldMultTest(){
-       ecc_fieldMult(one, null, temp2, arrayLength);
-       assert(ecc_isSame(temp2, null64, arrayLength * 2));
-       nullEverything();
-       ecc_fieldMult(one, two, temp2, arrayLength);
-       assert(ecc_isSame(temp2, two64, arrayLength * 2));
-       nullEverything();
-       ecc_fieldMult(two, two, temp2, arrayLength);
-       assert(ecc_isSame(temp2, four64, arrayLength * 2));
-       nullEverything();
-       ecc_fieldMult(primeMinusOne, primeMinusOne, temp2, arrayLength);
-       assert(ecc_isSame(temp2, resultQuadMod, arrayLength * 2));
-       nullEverything();
-       ecc_fieldInv(two, ecc_prime_m, ecc_prime_r, temp);
-       ecc_fieldMult(temp, two, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(temp, one, arrayLength));
-}
-
-void fieldModPTest(){
-       ecc_fieldMult(primeMinusOne, primeMinusOne, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(temp, one, arrayLength));
-       nullEverything();
-       ecc_fieldModP(temp, one64);
-       assert(ecc_isSame(temp, one, arrayLength));
-       nullEverything();
-       ecc_fieldMult(two, primeMinusOne, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(temp, resultDoubleMod, arrayLength));
-       nullEverything();
-       /*fieldMult(full, full, temp2, arrayLength); //not working, maybe because of the number bigger than p^2?
-       fieldModP(temp, temp2);
-       assert(ecc_isSame(temp, resultFullMod, arrayLength));*/
-}
-
-void fieldModOTest(){
-       ecc_fieldMult(orderMinusOne, orderMinusOne, temp2, arrayLength);
-       ecc_fieldModO(temp2, temp, arrayLength * 2);
-       assert(ecc_isSame(temp, one, arrayLength));
-       nullEverything();
-       ecc_fieldModO(one64, temp, arrayLength * 2);
-       assert(ecc_isSame(temp, one, arrayLength));
-       nullEverything();
-       ecc_fieldMult(two, orderMinusOne, temp2, arrayLength);
-       ecc_fieldModO(temp2, temp, arrayLength * 2);
-       assert(ecc_isSame(temp, orderResultDoubleMod, arrayLength));
-       nullEverything();
-}
-
-
-// void rShiftTest(){
-//     printNumber(full, 32);
-//     rshift(full);
-//     printNumber(full, 32);
-//     printNumber(two, 32);
-//     rshift(two);
-//     printNumber(two, 32);
-//     printNumber(four, 32);
-//     rshift(four);
-//     printNumber(four, 32);
-// }
-
-// void isOneTest(){
-//     printf("%d\n", isone(one));
-//     printf("%d\n", isone(two));
-//     printf("%d\n", isone(four));
-//     printf("%d\n", isone(full));
-//     printf("%d\n", isone(null));
-// }
-
-void fieldInvTest(){
-       nullEverything();
-       ecc_fieldInv(two, ecc_prime_m, ecc_prime_r, temp);
-       ecc_fieldMult(temp, two, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(one, temp, arrayLength));
-       nullEverything();
-       ecc_fieldInv(eight, ecc_prime_m, ecc_prime_r, temp);
-       ecc_fieldMult(temp, eight, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(one, temp, arrayLength));
-       nullEverything();
-       ecc_fieldInv(three, ecc_prime_m, ecc_prime_r, temp);
-       ecc_fieldMult(temp, three, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(one, temp, arrayLength));
-       nullEverything();
-       ecc_fieldInv(six, ecc_prime_m, ecc_prime_r, temp);
-       ecc_fieldMult(temp, six, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(one, temp, arrayLength));
-       nullEverything();
-       ecc_fieldInv(primeMinusOne, ecc_prime_m, ecc_prime_r, temp);
-       ecc_fieldMult(temp, primeMinusOne, temp2, arrayLength);
-       ecc_fieldModP(temp, temp2);
-       assert(ecc_isSame(one, temp, arrayLength));
-}
-
-// void randomStuff(){
-
-// }
-
-#ifdef CONTIKI
-PROCESS(ecc_filed_test, "ECC field test");
-AUTOSTART_PROCESSES(&ecc_filed_test);
-PROCESS_THREAD(ecc_filed_test, ev, d)
-{
-       PROCESS_BEGIN();
-
-       nullEverything();
-       //randomStuff();
-       nullEverything();
-       fieldAddTest();
-       nullEverything();
-       fieldSubTest();
-       nullEverything();
-       fieldMultTest();
-       nullEverything();
-       fieldModPTest();
-       nullEverything();
-       fieldModOTest();
-       nullEverything();
-       fieldInvTest();
-       nullEverything();
-       //rShiftTest();
-       //isOneTest();
-       printf("%s\n", "All Tests succesfull!");
-
-       PROCESS_END();
-}
-#else /* CONTIKI */
-int main(int argc, char const *argv[])
-{
-       nullEverything();
-       //randomStuff();
-       nullEverything();
-       fieldAddTest();
-       nullEverything();
-       fieldSubTest();
-       nullEverything();
-       fieldMultTest();
-       nullEverything();
-       fieldModPTest();
-       nullEverything();
-       fieldModOTest();
-       nullEverything();
-       fieldInvTest();
-       nullEverything();
-       //rShiftTest();
-       //isOneTest();
-       printf("%s\n", "All Tests succesfull!");
-       return 0;
-}
-#endif /* CONTIKI */