Make sure that "make dist" is functional.
author jbj <devnull@localhost>
Sat, 15 Jun 2002 14:19:38 +0000 (14:19 +0000)
committer jbj <devnull@localhost>
Sat, 15 Jun 2002 14:19:38 +0000 (14:19 +0000)
Rename fips180opt -> sha1opt platform specific asm files.

CVS patchset: 5487
CVS date: 2002/06/15 14:19:38

beecrypt/gas/Makefile.am
beecrypt/gas/Makefile.in
beecrypt/gas/blowfishopt.powerpc.S [new file with mode: 0644]
beecrypt/gas/fips180opt.ia64.S [deleted file]
beecrypt/gas/fips180opt.powerpc.S [deleted file]
beecrypt/gas/sha1opt.ia64.S [new file with mode: 0644]
beecrypt/gas/sha1opt.powerpc.S [new file with mode: 0644]

index 4a65cbbe075e85c13cbcc23899fe2e0b1a486494..8b80be9149c461abeb8c3421aa434b7e5dba0aeb 100644 (file)
@@ -22,4 +22,4 @@
 
 AUTOMAKE_OPTIONS = gnu no-dependencies
 
-EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S sha1opt.i586.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S
+EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S sha1opt.i586.S sha1opt.ia64.S sha1opt.powerpc.S
index bac50e0907d4ddf0943edda5a2695728222aba52..f987868c5d0f2117a8010119f224750425646b37 100644 (file)
@@ -127,7 +127,7 @@ uint8_type = @uint8_type@
 
 AUTOMAKE_OPTIONS = gnu no-dependencies
 
-EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S sha1opt.i586.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S
+EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S sha1opt.i586.S sha1opt.ia64.S sha1opt.powerpc.S
 subdir = gas
 mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
 CONFIG_HEADER = $(top_builddir)/config.h
diff --git a/beecrypt/gas/blowfishopt.powerpc.S b/beecrypt/gas/blowfishopt.powerpc.S
new file mode 100644 (file)
index 0000000..28076f0
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * blowfishopt.powerpc.S
+ *
+ * Assembler optimized Blowfish routines for PowerPC processors
+ *
+ * Compile target is GNU Assembler
+ *
+ * Copyright (c) 2002 Bob Deblier <bob@virtualunlimited.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "beecrypt.gas.h"
+
+       .file "blowfish.powerpc.S"
+
+       .text
+
+       .macro round    xl xr offset            /* one round: xl ^= word at offset(r3); xr ^= F(xl). Clobbers r9-r12; expects table bases in r28-r31 */
+       lwz r9,\offset(r3)                      /* r9 = 32-bit word at r3 + offset (presumably this round's subkey) */
+       xor \xl,\xl,r9                          /* xl ^= r9 */
+       rlwinm r9,\xl,10,22,29                  /* r9  = (xl >> 24) * 4: most significant byte, scaled to a word offset */
+       rlwinm r10,\xl,18,22,29                 /* r10 = ((xl >> 16) & 0xff) * 4 */
+       lwzx r9,r9,r28                          /* r9  = word at r28 + r9 (first table) */
+       lwzx r10,r10,r29                        /* r10 = word at r29 + r10 (second table) */
+       rlwinm r11,\xl,26,22,29                 /* r11 = ((xl >> 8) & 0xff) * 4 */
+       add r9,r9,r10                           /* r9 = T0[a] + T1[b] */
+       lwzx r11,r11,r30                        /* r11 = word at r30 + r11 (third table) */
+       rlwinm r12,\xl,2,22,29                  /* r12 = (xl & 0xff) * 4 */
+       xor r9,r9,r11                           /* r9 = (T0[a] + T1[b]) ^ T2[c] */
+       lwzx r12,r12,r31                        /* r12 = word at r31 + r12 (fourth table) */
+       add r9,r9,r12                           /* r9 = ((T0[a] + T1[b]) ^ T2[c]) + T3[d] */
+       xor \xr,\xr,r9                          /* xr ^= r9 */
+       .endm
+
+       .macro eblock                           /* forward pass over the block in r7:r8: 16 rounds using words 0..15 at r3, then a final xor with words 16 and 17 */
+       round xl=r7 xr=r8 offset=0              /* r7 and r8 swap roles on every round, so no register moves are needed */
+       round xl=r8 xr=r7 offset=4
+       round xl=r7 xr=r8 offset=8
+       round xl=r8 xr=r7 offset=12
+       round xl=r7 xr=r8 offset=16
+       round xl=r8 xr=r7 offset=20
+       round xl=r7 xr=r8 offset=24
+       round xl=r8 xr=r7 offset=28
+       round xl=r7 xr=r8 offset=32
+       round xl=r8 xr=r7 offset=36
+       round xl=r7 xr=r8 offset=40
+       round xl=r8 xr=r7 offset=44
+       round xl=r7 xr=r8 offset=48
+       round xl=r8 xr=r7 offset=52
+       round xl=r7 xr=r8 offset=56
+       round xl=r8 xr=r7 offset=60
+
+       lwz r9,64(r3)                           /* r9  = word 16 */
+       lwz r10,68(r3)                          /* r10 = word 17 */
+       xor r7,r7,r9                            /* r7 ^= word 16 */
+       xor r8,r8,r10                           /* r8 ^= word 17; callers store r8 first, then r7 */
+       .endm
+
+       .macro dblock                           /* reverse pass over the block in r7:r8: 16 rounds using words 17..2 at r3 in descending order, then a final xor with words 1 and 0 */
+       round xl=r7 xr=r8 offset=68             /* r7 and r8 swap roles on every round, so no register moves are needed */
+       round xl=r8 xr=r7 offset=64
+       round xl=r7 xr=r8 offset=60
+       round xl=r8 xr=r7 offset=56
+       round xl=r7 xr=r8 offset=52
+       round xl=r8 xr=r7 offset=48
+       round xl=r7 xr=r8 offset=44
+       round xl=r8 xr=r7 offset=40
+       round xl=r7 xr=r8 offset=36
+       round xl=r8 xr=r7 offset=32
+       round xl=r7 xr=r8 offset=28
+       round xl=r8 xr=r7 offset=24
+       round xl=r7 xr=r8 offset=20
+       round xl=r8 xr=r7 offset=16
+       round xl=r7 xr=r8 offset=12
+       round xl=r8 xr=r7 offset=8
+
+       lwz r9,4(r3)                            /* r9  = word 1 */
+       lwz r10,0(r3)                           /* r10 = word 0 */
+       xor r7,r7,r9                            /* r7 ^= word 1 */
+       xor r8,r8,r10                           /* r8 ^= word 0; callers store r8 first, then r7 */
+       .endm
+
+C_FUNCTION_BEGIN(blowfishEncrypt)              /* encrypt one 8-byte block. In: r3 = key material base, r4 = output pointer, r5 = input pointer. Out: r3 = 0 */
+LABEL(blowfishEncrypt)
+       la r1,-16(r1)                           /* reserve 16 bytes of stack. NOTE(review): la does not write a back chain the way stwu would; confirm this is acceptable for the target ABI */
+       stmw r28,0(r1)                          /* save r28-r31, which hold the table bases below */
+
+       la r28,72(r3)                           /* first lookup table: starts after the 18 words at offsets 0..68 */
+       la r29,1096(r3)                         /* second table, 1024 bytes further on */
+       la r30,2120(r3)                         /* third table */
+       la r31,3144(r3)                         /* fourth table */
+
+       #if WORDS_BIGENDIAN
+       lwz r7,0(r5)                            /* r7 = first input word */
+       lwz r8,4(r5)                            /* r8 = second input word */
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+       
+       eblock
+
+       #if WORDS_BIGENDIAN
+       stw r7,4(r4)                            /* halves are written swapped: r7 becomes the second output word */
+       stw r8,0(r4)                            /* r8 becomes the first output word */
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+
+       li r3,0                                 /* return 0 */
+       lmw r28,0(r1)                           /* restore r28-r31 */
+       la r1,16(r1)                            /* release the 16 stack bytes */
+       blr
+C_FUNCTION_END(blowfishEncrypt, LOCAL(blowfishEncrypt_size))
+
+
+C_FUNCTION_BEGIN(blowfishDecrypt)              /* decrypt one 8-byte block. In: r3 = key material base, r4 = output pointer, r5 = input pointer. Out: r3 = 0 */
+LABEL(blowfishDecrypt)
+       la r1,-16(r1)                           /* reserve 16 bytes of stack. NOTE(review): la does not write a back chain the way stwu would; confirm this is acceptable for the target ABI */
+       stmw r28,0(r1)                          /* save r28-r31, which hold the table bases below */
+
+       la r28,72(r3)                           /* first lookup table: starts after the 18 words at offsets 0..68 */
+       la r29,1096(r3)                         /* second table, 1024 bytes further on */
+       la r30,2120(r3)                         /* third table */
+       la r31,3144(r3)                         /* fourth table */
+
+       #if WORDS_BIGENDIAN
+       lwz r7,0(r5)                            /* r7 = first input word */
+       lwz r8,4(r5)                            /* r8 = second input word */
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+       
+       dblock
+
+       #if WORDS_BIGENDIAN
+       stw r7,4(r4)                            /* halves are written swapped: r7 becomes the second output word */
+       stw r8,0(r4)                            /* r8 becomes the first output word */
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+
+       li r3,0                                 /* return 0 */
+       lmw r28,0(r1)                           /* restore r28-r31 */
+       la r1,16(r1)                            /* release the 16 stack bytes */
+       blr
+C_FUNCTION_END(blowfishDecrypt, LOCAL(blowfishDecrypt_size))
+
+
+/*
+ * blowfishECBEncrypt: encrypt consecutive 8-byte blocks, one after another.
+ *
+ * In:   r3 = key material base (18 words at offsets 0..68, then four
+ *            1024-byte lookup tables starting at offset 72)
+ *       r4 = number of blocks
+ *       r5 = destination pointer
+ *       r6 = source pointer
+ * Out:  r3 = 0
+ * Uses: r5-r12, ctr, cr0; r28-r31 are saved and restored.
+ *
+ * A count of zero or less returns immediately.  bdnz decrements CTR
+ * before testing it, so entering the loop with CTR == 0 would wrap and
+ * run the loop 2^32 times, far beyond both buffers.
+ * NOTE(review): the guard treats r4 as a signed 32-bit value; confirm
+ * that this matches the C prototype.
+ */
+C_FUNCTION_BEGIN(blowfishECBEncrypt)
+LABEL(blowfishECBEncrypt)
+       cmpwi r4,0                              /* nothing to do for count <= 0 */
+       ble LOCAL(02)
+
+       la r1,-16(r1)                           /* reserve 16 bytes of stack */
+       stmw r28,0(r1)                          /* save r28-r31 */
+
+       mtctr r4                                /* CTR = block count (> 0 here) */
+
+       la r28,72(r3)                           /* bases of the four lookup tables */
+       la r29,1096(r3)
+       la r30,2120(r3)
+       la r31,3144(r3)
+
+LOCAL(00):
+       #if WORDS_BIGENDIAN
+       lwz r7,0(r6)                            /* load the two input words */
+       lwz r8,4(r6)
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+
+       eblock
+
+       #if WORDS_BIGENDIAN
+       stw r7,4(r5)                            /* halves are written swapped */
+       stw r8,0(r5)
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+
+       la r5,8(r5)                             /* advance to the next 8-byte block */
+       la r6,8(r6)
+
+       bdnz LOCAL(00)
+
+       lmw r28,0(r1)                           /* restore r28-r31 */
+       la r1,16(r1)                            /* release the 16 stack bytes */
+LOCAL(02):
+       li r3,0                                 /* return 0 */
+       blr
+C_FUNCTION_END(blowfishECBEncrypt, LOCAL(blowfishECBEncrypt_size))
+
+
+/*
+ * blowfishECBDecrypt: decrypt consecutive 8-byte blocks, one after another.
+ *
+ * In:   r3 = key material base (18 words at offsets 0..68, then four
+ *            1024-byte lookup tables starting at offset 72)
+ *       r4 = number of blocks
+ *       r5 = destination pointer
+ *       r6 = source pointer
+ * Out:  r3 = 0
+ * Uses: r5-r12, ctr, cr0; r28-r31 are saved and restored.
+ *
+ * A count of zero or less returns immediately.  bdnz decrements CTR
+ * before testing it, so entering the loop with CTR == 0 would wrap and
+ * run the loop 2^32 times, far beyond both buffers.
+ * NOTE(review): the guard treats r4 as a signed 32-bit value; confirm
+ * that this matches the C prototype.
+ */
+C_FUNCTION_BEGIN(blowfishECBDecrypt)
+LABEL(blowfishECBDecrypt)
+       cmpwi r4,0                              /* nothing to do for count <= 0 */
+       ble LOCAL(03)
+
+       la r1,-16(r1)                           /* reserve 16 bytes of stack */
+       stmw r28,0(r1)                          /* save r28-r31 */
+
+       mtctr r4                                /* CTR = block count (> 0 here) */
+
+       la r28,72(r3)                           /* bases of the four lookup tables */
+       la r29,1096(r3)
+       la r30,2120(r3)
+       la r31,3144(r3)
+
+LOCAL(01):
+       #if WORDS_BIGENDIAN
+       lwz r7,0(r6)                            /* load the two input words */
+       lwz r8,4(r6)
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+
+       dblock
+
+       #if WORDS_BIGENDIAN
+       stw r7,4(r5)                            /* halves are written swapped */
+       stw r8,0(r5)
+       #else
+       # error ppc little-endian mode not supported
+       #endif
+
+       la r5,8(r5)                             /* advance to the next 8-byte block */
+       la r6,8(r6)
+
+       bdnz LOCAL(01)
+
+       lmw r28,0(r1)                           /* restore r28-r31 */
+       la r1,16(r1)                            /* release the 16 stack bytes */
+LOCAL(03):
+       li r3,0                                 /* return 0 */
+       blr
+C_FUNCTION_END(blowfishECBDecrypt, LOCAL(blowfishECBDecrypt_size))
diff --git a/beecrypt/gas/fips180opt.ia64.S b/beecrypt/gas/fips180opt.ia64.S
deleted file mode 100644 (file)
index 0b057c7..0000000
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * fips180opt.ia64.S
- *
- * Assembler optimized SHA-1 routines for ia64 (Intel Itanium)
- *
- * Warning: this code is incomplete and only contains a rough prototype!
- *
- * Compile target is GNU Assembler
- *
- * Copyright (c) 2001 Virtual Unlimited B.V.
- *
- * Author: Bob Deblier <bob@virtualunlimited.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include "beecrypt.gas.h"
-
-#define saved_pfs      r14
-#define saved_lc       r15
-
-#define param          r16
-
-       .file   "fips180opt.ia64.S"
-
-       .text
-
-       .equ    K00,    0x5a827999
-       .equ    K20,    0x6ed9eba1
-       .equ    K40,    0x8f1bbcdc
-       .equ    K60,    0xca62c1d6
-
-       .equ    PARAM_H,        0
-       .equ    PARAM_DATA,     20
-
-/* for optimization, I have to see how I can parallellize the code
-
-       e = ROTL32(a, 5) + ((b&(c^d))^d) + e + w + K
-       b = ROTR32(b, 2);
-
-step1: load w, tmp0 = mix a, tmp1 = c xor d, e += K;;
-step2: tmp0 >>= 27, tmp1 &= b, e += w, b = mix b;;
-step3: b >>= 2, e += tmp0, tmp1 ^= d;;
-step4: e += tmp1, load next w, tmp0 = mix d, tmp1 = b xor d, d += K;;
-step5: etc.
-
-       d = ROTL32(d, 5) + ((a&(b^c))^c) + d + w + K
-       a = ROTR32(a, 2)
-*/
-
-       .macro  subround1       a b c d e w
-       ld4 r19 = [\w],4
-       add \e = $K00,\e
-       xor r21 = \c,\d
-       mix4.r r20 = \a,\a;;
-       add \e = \e,r19
-       and r21 = r21,\b
-       shr.u r20 = 27,r20
-       mix4.r r22 = \b,\b;;
-       add \e = r20,\e
-       xor r21 = r21,\d
-       shr.u \b = 2,r22;;
-       add \e = r21,\e
-       .endm
-
-       .macro  subround2       a b c d e w
-       ld4 r19 = [\w],4
-       add \e = $K20,\e
-       xor r21 = \b,\c
-       mix4.r r20 = \a,\a;;
-       add \e = \e,r19
-       xor r21 = r21,\d
-       shr.u r20 = 27,r20
-       mix4.r \b = \b,\b;;
-       add \e = r20,\e
-       shr.u \b = 2,\b;;
-       add \e = r21,\e
-       .endm
-
-       .macro  subround3       a b c d e w
-       ld4 r19 = [\w],4
-       add \e = $K40,\e
-       xor r21 = \b,\c
-       and r22 = \b,\c
-       mix4.r r20 = \a,\a;;
-       add \e = \e,r19
-       and r21 = r21,\d
-       shr.u r20 = 27,r20
-       mix4.r \b = \b,\b;;
-       add \e = r20,\e
-       or r21 = r21,r22
-       shr.u \b = 2,\b;;
-       add \e = r21,\e
-       .endm
-
-       .macro  subround4       a b c d e w
-       ld4 r19 = [\w],4
-       add \e = $K60,\e
-       xor r21 = \b,\c
-       mix4.r r20 = \a,\a;;
-       add \e = \e,r19
-       xor r21 = r21,\d
-       shr.u r20 = 27,r20
-       mix4.r \b = \b,\b;;
-       add \e = r20,\e
-       shr.u \b = 2,\b;;
-       add \e = r21,\e
-       .endm
-
-       .align  32
-       .global sha1Process#
-       .proc   sha1Process#
-
-sha1Process:
-       alloc saved_pfs = ar.pfs,2,0,0,0
-       mov saved_lc = ar.lc
-
-/*     r16 will be h */
-/*     r17 will be pdata */
-/*     There must be something neat I can do to speed up expansion (xor/rotate)
-
-       The following should work, if we use 24 rotating registers; speedup should be dramatic
-       preload with swapped values 0-15
-
-       rought draft: have to translate this to more precise rotating registers and predicates.
-
-       /----------\
-       |xor[2],[0]|
-       +----------+----------\
-       |xor[8]    |xor[3],[1]|
-       +----------+----------+----------\
-       |xor[13]   |xor[9]    |xor[4],[2]|
-       +----------+----------+----------+----------\
-       |mix4.r[16]|xor[14]   |xor[10]   |xor[5],[3]|
-       +----------+----------+----------+----------+-----------\
-       |shr[16]   |mix4.r[17]|xor[15]   |xor[11]   |xor[6],[4] |
-       +----------+----------+----------+----------+-----------+----------\
-       |store[16] |shr[17]   |mix4.r[18]|xor[16]   |xor[12]    |xor[7],[5]|
-       \----------+----------+----------+----------+-----------+----------+----------\
-                  |store[17] |shr[18]   |mix4.r[19]|xor[17]    |xor[13]   |xor[8],[6]|
-                  \----------+----------+----------+-----------+----------+----------+----------\
-                             |store[18] |shr[19]   |mix4.r[20] |xor[18]   |xor[14]   |xor[9],[7]|
-                             \----------+----------+-----------+----------+----------+----------+----------\
-                                        |          |           |          |          |          |          |
-*/
-       alloc saved_pfs = ar.pfs,3,21,0,24
-
-       /* look into big-endian loads, followed by little-endian stores */
-#if !WORD_BIGENDIAN
-       // save UM.be 
-       // set UM.be to one
-#endif
-       /*
-.L00:
-       ld4 r32 = [ra],4
-       
-br.ctop.sptk .L00;;
-#if !WORD_BIGENDIAN
-       // restore UM.be
-       /*
-       mov ra = rd
-       mov rb = rd;;
-       add rb = 4,rd;;
-       st4 [ra],8 = r48
-       st4 [rb],8 = r47;;
-       st4 [ra],8 = r46
-       st4 [rb],8 = r45;;
-       st4 [ra],8 = r44
-       st4 [rb],8 = r43;;
-       st4 [ra],8 = r42
-       st4 [rb],8 = r41;;
-       st4 [ra],8 = r40
-       st4 [rb],8 = r39;;
-       st4 [ra],8 = r38
-       st4 [rb],8 = r37;;
-       st4 [ra],8 = r36
-       st4 [rb],8 = r35;;
-       st4 [ra],8 = r34
-       st4 [rb],8 = r33;;
-       */
-#endif
-       /* also add a conditional which will save the original swapped words! */
-       /* the expansion loop will translate to something like this: */
-.L01:
-       /* put three xors together */
-       (p16) xor r32 = r46,r48
-       (p17) xor r33 = r33,r41
-       (p18) xor r34 = r34,r37
-       (p19) mix4.r r35 = r35,r35
-       (p20) shr.u r36 = 31,r36
-       (p21) st4 [],4 = r37
-       br.ctop.sptk .L01;;
-
-       etc.
-*/
-       mov ar.lc = r15
-       mov ar.pfs = r14
-       br.ret.sptk b0
-       .endp   sha1Process#
-
diff --git a/beecrypt/gas/fips180opt.powerpc.S b/beecrypt/gas/fips180opt.powerpc.S
deleted file mode 100644 (file)
index db63149..0000000
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * fips180opt.powerpc.S
- *
- * Assembler optimized SHA-1 routines for PowerPC processors
- *
- * Warning: this code is incomplete and only contains a rough prototype!
- *
- * Compile target is GNU Assembler
- *
- * Copyright (c) 2000, 2001 Virtual Unlimited B.V.
- *
- * Author: Bob Deblier <bob@virtualunlimited.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include "beecrypt.gas.h"
-
-       .file "fips180opt.powerpc.S"
-
-       .text
-
-#if DARWIN
-# define reg0  r0
-# define reg3  r3
-# define reg4  r4
-# define reg5  r5
-# define reg6  r6
-# define reg7  r7
-# define reg8  r8
-# define reg9  r9
-# define reg26 r26
-# define reg27 r27
-# define reg28 r28
-# define reg29 r29
-# define reg30 r30
-# define reg31 r31
-#else
-# define reg0  %r0
-# define reg3  %r3
-# define reg4  %r4
-# define reg5  %r5
-# define reg6  %r6
-# define reg7  %r7
-# define reg8  %r8
-# define reg9  %r9
-# define reg26 %r26
-# define reg27 %r27
-# define reg28 %r28
-# define reg29 %r29
-# define reg30 %r30
-# define reg31 %r31
-#endif
-
-       .equ    K00,    0x5a827999
-       .equ    K20,    0x6ed9eba1
-       .equ    K40,    0x8f1bbcdc
-       .equ    K60,    0xca62c1d6
-
-       .equ    PARAM_H,                0
-       .equ    PARAM_DATA,             20
-
-/* sha1Param: param in reg3 */
-
-       .macro  subround1 a b c d e w
-       lwzu reg7,4(\w)
-       rotlwi  reg5,\a,5
-       dbct r0,\w
-       xor reg6,\c,\d
-       add \e,\e,K00
-       and reg6,reg6,\b
-       add \e,\e,reg7
-       xor reg6,reg6,\d
-       add \e,\e,reg5
-       rotrwi \b,\b,2
-       add \e,\e,reg6
-       .endm
-
-       .macro  subround2 a b c d e w
-       lwzu reg7,4(\w)
-       rotlwi reg5,\a,5
-       dbct r0,\w
-       add \e,\e,K20
-       xor reg6,\b,\c
-       add \e,\e,reg5
-       xor reg6,reg6,\d
-       add \e,\e,reg7
-       rotrwi \b,\b,2
-       add \e,\e,reg6
-       .endm
-
-       .macro  subround3 a b c d e w
-       lwzu reg7,4(\w)
-       rotlwi reg5,\a,5
-       dbct r0,\w
-       xor reg6,\b,\c
-       add \e,\e,reg5
-       and reg6,reg6,\d
-       add \e,\e,K40
-       and reg5,\b,\c
-       add \e,\e,reg7
-       or reg6,reg6,reg5
-       rotrwi \b,\b,2
-       add \e,\e,reg6
-       .endm
-
-       .macro  subround4 a b c d e w
-       lwzu reg7,4(\w)
-       rotlwi reg5,\a,5
-       dbct r0,\w
-       add \e,\e,K60
-       xor reg6,\b,\c
-       add \e,\e,reg5
-       xor reg6,reg6,\d
-       add \e,\e,reg7
-       rotrwi \b,\b,2
-       add \e,\e,reg6
-       .endm
-
-C_FUNCTION_BEGIN(sha1Process)
-/* zero reg0 for general use */
-       li reg0,0
-/* for a,b,c,d,e use r26,r27,r28,r29,r30, for w use r31 */
-       
-/* we need to save registers before loading them */
-       stmw reg26,-24(reg1)
-/* load the frame pointer with parameter data, and hint cache */
-       addi reg31,reg3,PARAM_DATA
-       dbct reg31
-
-#if !WORDS_BIGENDIAN /* have to provide for PowerPC little-endian mode
-       /* loop of 16 entries */
-       li reg5,60
-       mtctr reg6
-.L00:
-       lwbrx reg6,reg31,reg5
-       stwx reg6,reg31,reg5
-       subi. reg5,reg5,4
-       bcge cr0,.L00
-       addi reg31,reg3,PARAM_DATA
-#endif
-
-/* do the initial mixing */
-       li reg8,64
-       addi reg26,reg3,PARAM_DATA+64-4
-       addi reg27,reg3,PARAM_DATA+64-3*4-4
-       addi reg28,reg3,PARAM_DATA+64-8*4-4
-       addi reg29,reg3,PARAM_DATA+64-14*4-4
-       addi reg30,reg3,PARAM_DATA+64-16*4-4
-       mtctr reg8
-
-.L10:
-       lwzu reg5,4(reg27)
-       lwzu reg6,4(reg28)
-       lwzu reg7,4(reg29)
-       lwzu reg8,4(reg30)
-       xor reg5,reg5,reg6
-       xor reg7,reg7,reg8
-       xor reg5,reg5,reg7
-       stwu reg5,4(reg26)
-       bdnz .L10
-
-       lwz reg26,PARAM_H   (reg3)
-       lwz reg27,PARAM_H+4 (reg3)
-       lwz reg28,PARAM_H+8 (reg3)
-       lwz reg29,PARAM_H+12(reg3)
-       lwz reg30,PARAM_H+16(reg3)
-
-       subround1 reg26,reg27,reg28,reg29,reg30,reg31
-       subround1 reg30,reg26,reg27,reg28,reg29,reg31
-       subround1 reg29,reg30,reg26,reg27,reg28,reg31
-       subround1 reg28,reg29,reg30,reg26,reg27,reg31
-       subround1 reg27,reg28,reg29,reg30,reg26,reg31
-       subround1 reg26,reg27,reg28,reg29,reg30,reg31
-       subround1 reg30,reg26,reg27,reg28,reg29,reg31
-       subround1 reg29,reg30,reg26,reg27,reg28,reg31
-       subround1 reg28,reg29,reg30,reg26,reg27,reg31
-       subround1 reg27,reg28,reg29,reg30,reg26,reg31
-       subround1 reg26,reg27,reg28,reg29,reg30,reg31
-       subround1 reg30,reg26,reg27,reg28,reg29,reg31
-       subround1 reg29,reg30,reg26,reg27,reg28,reg31
-       subround1 reg28,reg29,reg30,reg26,reg27,reg31
-       subround1 reg27,reg28,reg29,reg30,reg26,reg31
-       subround1 reg26,reg27,reg28,reg29,reg30,reg31
-       subround1 reg30,reg26,reg27,reg28,reg29,reg31
-       subround1 reg29,reg30,reg26,reg27,reg28,reg31
-       subround1 reg28,reg29,reg30,reg26,reg27,reg31
-       subround1 reg27,reg28,reg29,reg30,reg26,reg31
-
-       subround2 reg26,reg27,reg28,reg29,reg30,reg31
-       subround2 reg30,reg26,reg27,reg28,reg29,reg31
-       subround2 reg29,reg30,reg26,reg27,reg28,reg31
-       subround2 reg28,reg29,reg30,reg26,reg27,reg31
-       subround2 reg27,reg28,reg29,reg30,reg26,reg31
-       subround2 reg26,reg27,reg28,reg29,reg30,reg31
-       subround2 reg30,reg26,reg27,reg28,reg29,reg31
-       subround2 reg29,reg30,reg26,reg27,reg28,reg31
-       subround2 reg28,reg29,reg30,reg26,reg27,reg31
-       subround2 reg27,reg28,reg29,reg30,reg26,reg31
-       subround2 reg26,reg27,reg28,reg29,reg30,reg31
-       subround2 reg30,reg26,reg27,reg28,reg29,reg31
-       subround2 reg29,reg30,reg26,reg27,reg28,reg31
-       subround2 reg28,reg29,reg30,reg26,reg27,reg31
-       subround2 reg27,reg28,reg29,reg30,reg26,reg31
-       subround2 reg26,reg27,reg28,reg29,reg30,reg31
-       subround2 reg30,reg26,reg27,reg28,reg29,reg31
-       subround2 reg29,reg30,reg26,reg27,reg28,reg31
-       subround2 reg28,reg29,reg30,reg26,reg27,reg31
-       subround2 reg27,reg28,reg29,reg30,reg26,reg31
-
-       subround3 reg26,reg27,reg28,reg29,reg30,reg31
-       subround3 reg30,reg26,reg27,reg28,reg29,reg31
-       subround3 reg29,reg30,reg26,reg27,reg28,reg31
-       subround3 reg28,reg29,reg30,reg26,reg27,reg31
-       subround3 reg27,reg28,reg29,reg30,reg26,reg31
-       subround3 reg26,reg27,reg28,reg29,reg30,reg31
-       subround3 reg30,reg26,reg27,reg28,reg29,reg31
-       subround3 reg29,reg30,reg26,reg27,reg28,reg31
-       subround3 reg28,reg29,reg30,reg26,reg27,reg31
-       subround3 reg27,reg28,reg29,reg30,reg26,reg31
-       subround3 reg26,reg27,reg28,reg29,reg30,reg31
-       subround3 reg30,reg26,reg27,reg28,reg29,reg31
-       subround3 reg29,reg30,reg26,reg27,reg28,reg31
-       subround3 reg28,reg29,reg30,reg26,reg27,reg31
-       subround3 reg27,reg28,reg29,reg30,reg26,reg31
-       subround3 reg26,reg27,reg28,reg29,reg30,reg31
-       subround3 reg30,reg26,reg27,reg28,reg29,reg31
-       subround3 reg29,reg30,reg26,reg27,reg28,reg31
-       subround3 reg28,reg29,reg30,reg26,reg27,reg31
-       subround3 reg27,reg28,reg29,reg30,reg26,reg31
-
-       subround4 reg26,reg27,reg28,reg29,reg30,reg31
-       subround4 reg30,reg26,reg27,reg28,reg29,reg31
-       subround4 reg29,reg30,reg26,reg27,reg28,reg31
-       subround4 reg28,reg29,reg30,reg26,reg27,reg31
-       subround4 reg27,reg28,reg29,reg30,reg26,reg31
-       subround4 reg26,reg27,reg28,reg29,reg30,reg31
-       subround4 reg30,reg26,reg27,reg28,reg29,reg31
-       subround4 reg29,reg30,reg26,reg27,reg28,reg31
-       subround4 reg28,reg29,reg30,reg26,reg27,reg31
-       subround4 reg27,reg28,reg29,reg30,reg26,reg31
-       subround4 reg26,reg27,reg28,reg29,reg30,reg31
-       subround4 reg30,reg26,reg27,reg28,reg29,reg31
-       subround4 reg29,reg30,reg26,reg27,reg28,reg31
-       subround4 reg28,reg29,reg30,reg26,reg27,reg31
-       subround4 reg27,reg28,reg29,reg30,reg26,reg31
-       subround4 reg26,reg27,reg28,reg29,reg30,reg31
-       subround4 reg30,reg26,reg27,reg28,reg29,reg31
-       subround4 reg29,reg30,reg26,reg27,reg28,reg31
-       subround4 reg28,reg29,reg30,reg26,reg27,reg31
-       subround4 reg27,reg28,reg29,reg30,reg26,reg31
-
-/* then store the five values into registers */
-       lwz reg5,PARAM_H   (reg3)
-       lwz reg6,PARAM_H+4 (reg3)
-       lwz reg7,PARAM_H+8 (reg3)
-       lwz reg8,PARAM_H+12(reg3)
-       lwz reg9,PARAM_H+16(reg3)
-       add reg26,reg5,reg26
-       add reg27,reg5,reg27
-       add reg28,reg5,reg28
-       add reg29,reg5,reg29
-       add reg30,reg5,reg30
-       stw reg26,PARAM_H   (reg3)
-       stw reg27,PARAM_H+4 (reg3)
-       stw reg28,PARAM_H+8 (reg3)
-       stw reg29,PARAM_H+12(reg3)
-       stw reg30,PARAM_H+16(reg3)
-
-/* finally, restore registers */
-       lmw reg26,-24(reg1)
-/* and return */
-       blr
-C_FUNCION_END(sha1Process, .Lsha1Process_size)
diff --git a/beecrypt/gas/sha1opt.ia64.S b/beecrypt/gas/sha1opt.ia64.S
new file mode 100644 (file)
index 0000000..0b057c7
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * sha1opt.ia64.S
+ *
+ * Assembler optimized SHA-1 routines for ia64 (Intel Itanium)
+ *
+ * Warning: this code is incomplete and only contains a rough prototype!
+ *
+ * Compile target is GNU Assembler
+ *
+ * Copyright (c) 2001 Virtual Unlimited B.V.
+ *
+ * Author: Bob Deblier <bob@virtualunlimited.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "beecrypt.gas.h"
+
+#define saved_pfs      r14
+#define saved_lc       r15
+
+#define param          r16
+
+       .file   "fips180opt.ia64.S"
+
+       .text
+
+       .equ    K00,    0x5a827999
+       .equ    K20,    0x6ed9eba1
+       .equ    K40,    0x8f1bbcdc
+       .equ    K60,    0xca62c1d6
+
+       .equ    PARAM_H,        0
+       .equ    PARAM_DATA,     20
+
+/* for optimization, I have to see how I can parallelize the code
+
+       e = ROTL32(a, 5) + ((b&(c^d))^d) + e + w + K
+       b = ROTR32(b, 2);
+
+step1: load w, tmp0 = mix a, tmp1 = c xor d, e += K;;
+step2: tmp0 >>= 27, tmp1 &= b, e += w, b = mix b;;
+step3: b >>= 2, e += tmp0, tmp1 ^= d;;
+step4: e += tmp1, load next w, tmp0 = mix d, tmp1 = b xor d, d += K;;
+step5: etc.
+
+       d = ROTL32(d, 5) + ((a&(b^c))^c) + d + w + K
+       a = ROTR32(a, 2)
+*/
+
+       .macro  subround1       a b c d e w     /* draft of e += ROTL32(a,5) + ((b&(c^d))^d) + w + K00; b = ROTR32(b,2). Uses r19-r22 as scratch */
+       ld4 r19 = [\w],4                        /* r19 = 4-byte word at [w]; w is post-incremented by 4 */
+       add \e = $K00,\e                        /* e += K00. NOTE(review): confirm this immediate form assembles */
+       xor r21 = \c,\d                         /* r21 = c ^ d */
+       mix4.r r20 = \a,\a;;                    /* presumably copies the low 32 bits of a into both halves so a 64-bit shift acts as a 32-bit rotate; confirm */
+       add \e = \e,r19                         /* e += w */
+       and r21 = r21,\b                        /* r21 = (c ^ d) & b */
+       shr.u r20 = 27,r20                      /* intended: r20 >>= 27. NOTE(review): operands look reversed versus shr.u dst = src,count; confirm */
+       mix4.r r22 = \b,\b;;                    /* same duplication for b, into scratch r22 */
+       add \e = r20,\e                         /* e += rotated a */
+       xor r21 = r21,\d                        /* r21 = ((c ^ d) & b) ^ d */
+       shr.u \b = 2,r22;;                      /* intended: b = r22 >> 2. NOTE(review): same operand-order question */
+       add \e = r21,\e                         /* e += r21 */
+       .endm
+
+       .macro  subround2       a b c d e w     /* draft of e += rot(a) + (b^c^d) + w + K20; b = rot(b). Uses r19-r21 as scratch */
+       ld4 r19 = [\w],4                        /* r19 = 4-byte word at [w]; w is post-incremented by 4 */
+       add \e = $K20,\e                        /* e += K20. NOTE(review): confirm this immediate form assembles */
+       xor r21 = \b,\c                         /* r21 = b ^ c */
+       mix4.r r20 = \a,\a;;                    /* presumably copies the low 32 bits of a into both halves for the rotate; confirm */
+       add \e = \e,r19                         /* e += w */
+       xor r21 = r21,\d                        /* r21 = b ^ c ^ d */
+       shr.u r20 = 27,r20                      /* intended: r20 >>= 27. NOTE(review): operands look reversed versus shr.u dst = src,count; confirm */
+       mix4.r \b = \b,\b;;                     /* same duplication for b, in place */
+       add \e = r20,\e                         /* e += rotated a */
+       shr.u \b = 2,\b;;                       /* intended: b >>= 2. NOTE(review): same operand-order question */
+       add \e = r21,\e                         /* e += r21 */
+       .endm
+
+       .macro  subround3       a b c d e w     /* draft of e += rot(a) + (((b^c)&d)|(b&c)) + w + K40; b = rot(b). Uses r19-r22 as scratch */
+       ld4 r19 = [\w],4                        /* r19 = 4-byte word at [w]; w is post-incremented by 4 */
+       add \e = $K40,\e                        /* e += K40. NOTE(review): confirm this immediate form assembles */
+       xor r21 = \b,\c                         /* r21 = b ^ c */
+       and r22 = \b,\c                         /* r22 = b & c */
+       mix4.r r20 = \a,\a;;                    /* presumably copies the low 32 bits of a into both halves for the rotate; confirm */
+       add \e = \e,r19                         /* e += w */
+       and r21 = r21,\d                        /* r21 = (b ^ c) & d */
+       shr.u r20 = 27,r20                      /* intended: r20 >>= 27. NOTE(review): operands look reversed versus shr.u dst = src,count; confirm */
+       mix4.r \b = \b,\b;;                     /* same duplication for b, in place */
+       add \e = r20,\e                         /* e += rotated a */
+       or r21 = r21,r22                        /* r21 = ((b ^ c) & d) | (b & c); NOTE(review): b was already overwritten by mix4.r above, which is harmless here only because r21/r22 were computed first */
+       shr.u \b = 2,\b;;                       /* intended: b >>= 2. NOTE(review): same operand-order question */
+       add \e = r21,\e                         /* e += r21 */
+       .endm
+
+       .macro  subround4       a b c d e w     /* same instruction sequence as subround2, with constant K60. Uses r19-r21 as scratch */
+       ld4 r19 = [\w],4                        /* r19 = 4-byte word at [w]; w is post-incremented by 4 */
+       add \e = $K60,\e                        /* e += K60. NOTE(review): confirm this immediate form assembles */
+       xor r21 = \b,\c                         /* r21 = b ^ c */
+       mix4.r r20 = \a,\a;;                    /* presumably copies the low 32 bits of a into both halves for the rotate; confirm */
+       add \e = \e,r19                         /* e += w */
+       xor r21 = r21,\d                        /* r21 = b ^ c ^ d */
+       shr.u r20 = 27,r20                      /* intended: r20 >>= 27. NOTE(review): operands look reversed versus shr.u dst = src,count; confirm */
+       mix4.r \b = \b,\b;;                     /* same duplication for b, in place */
+       add \e = r20,\e                         /* e += rotated a */
+       shr.u \b = 2,\b;;                       /* intended: b >>= 2. NOTE(review): same operand-order question */
+       add \e = r21,\e                         /* e += r21 */
+       .endm
+
+       .align  32
+       .global sha1Process#
+       .proc   sha1Process#
+
+sha1Process:                                   /* unfinished prototype (see the warning in the file header): only the save/restore of ar.pfs and ar.lc is concrete; the rest is draft notes */
+       alloc saved_pfs = ar.pfs,2,0,0,0        /* saved_pfs (r14) = ar.pfs */
+       mov saved_lc = ar.lc                    /* saved_lc (r15) = ar.lc */
+
+/*     r16 will be h */
+/*     r17 will be pdata */
+/*     There must be something neat I can do to speed up expansion (xor/rotate)
+
+       The following should work, if we use 24 rotating registers; speedup should be dramatic
+       preload with swapped values 0-15
+
+       rough draft: have to translate this to more precise rotating registers and predicates.
+
+       /----------\
+       |xor[2],[0]|
+       +----------+----------\
+       |xor[8]    |xor[3],[1]|
+       +----------+----------+----------\
+       |xor[13]   |xor[9]    |xor[4],[2]|
+       +----------+----------+----------+----------\
+       |mix4.r[16]|xor[14]   |xor[10]   |xor[5],[3]|
+       +----------+----------+----------+----------+-----------\
+       |shr[16]   |mix4.r[17]|xor[15]   |xor[11]   |xor[6],[4] |
+       +----------+----------+----------+----------+-----------+----------\
+       |store[16] |shr[17]   |mix4.r[18]|xor[16]   |xor[12]    |xor[7],[5]|
+       \----------+----------+----------+----------+-----------+----------+----------\
+                  |store[17] |shr[18]   |mix4.r[19]|xor[17]    |xor[13]   |xor[8],[6]|
+                  \----------+----------+----------+-----------+----------+----------+----------\
+                             |store[18] |shr[19]   |mix4.r[20] |xor[18]   |xor[14]   |xor[9],[7]|
+                             \----------+----------+-----------+----------+----------+----------+----------\
+                                        |          |           |          |          |          |          |
+*/
+       alloc saved_pfs = ar.pfs,3,21,0,24      /* NOTE(review): second alloc writes saved_pfs again and declares 3 inputs where the first declared 2; confirm which is intended */
+
+       /* look into big-endian loads, followed by little-endian stores */
+#if !WORD_BIGENDIAN /* NOTE(review): the PowerPC sources test WORDS_BIGENDIAN; confirm this macro name */
+       // save UM.be 
+       // set UM.be to one
+#endif
+       /*
+.L00:
+       ld4 r32 = [ra],4
+       
+br.ctop.sptk .L00;;
+#if !WORD_BIGENDIAN
+       // restore UM.be
+       /*
+       mov ra = rd
+       mov rb = rd;;
+       add rb = 4,rd;;
+       st4 [ra],8 = r48
+       st4 [rb],8 = r47;;
+       st4 [ra],8 = r46
+       st4 [rb],8 = r45;;
+       st4 [ra],8 = r44
+       st4 [rb],8 = r43;;
+       st4 [ra],8 = r42
+       st4 [rb],8 = r41;;
+       st4 [ra],8 = r40
+       st4 [rb],8 = r39;;
+       st4 [ra],8 = r38
+       st4 [rb],8 = r37;;
+       st4 [ra],8 = r36
+       st4 [rb],8 = r35;;
+       st4 [ra],8 = r34
+       st4 [rb],8 = r33;;
+       */
+#endif /* NOTE(review): the #if this is meant to close sits inside the block comment that ends on the previous line, so this #endif looks unbalanced; confirm */
+       /* also add a conditional which will save the original swapped words! */
+       /* the expansion loop will translate to something like this: */
+.L01:
+       /* put three xors together */
+       (p16) xor r32 = r46,r48
+       (p17) xor r33 = r33,r41
+       (p18) xor r34 = r34,r37
+       (p19) mix4.r r35 = r35,r35
+       (p20) shr.u r36 = 31,r36
+       (p21) st4 [],4 = r37
+       br.ctop.sptk .L01;;
+
+       etc.
+*/ /* NOTE(review): no comment is open at this point, so this terminator and the draft lines from .L01 down to "etc." reach the assembler as live text; confirm */
+       mov ar.lc = r15                         /* restore ar.lc from r15 (saved_lc) */
+       mov ar.pfs = r14                        /* restore ar.pfs from r14 (saved_pfs) */
+       br.ret.sptk b0                          /* return through b0 */
+       .endp   sha1Process#
+
diff --git a/beecrypt/gas/sha1opt.powerpc.S b/beecrypt/gas/sha1opt.powerpc.S
new file mode 100644 (file)
index 0000000..db63149
--- /dev/null
@@ -0,0 +1,287 @@
+/*
+ * sha1opt.powerpc.S
+ *
+ * Assembler optimized SHA-1 routines for PowerPC processors
+ *
+ * Warning: this code is incomplete and only contains a rough prototype!
+ *
+ * Compile target is GNU Assembler
+ *
+ * Copyright (c) 2000, 2001 Virtual Unlimited B.V.
+ *
+ * Author: Bob Deblier <bob@virtualunlimited.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/* shared assembler support macros (C_FUNCTION_BEGIN and friends) */
+#include "beecrypt.gas.h"
+
+/* keep the recorded source name in step with the actual file name */
+       .file "sha1opt.powerpc.S"
+
+       .text
+
+/*
+ * Register aliases used throughout this file.
+ *
+ * When DARWIN is set the registers are spelled with bare names (r3);
+ * otherwise they are spelled with a '%' prefix (%r3).  Code below must
+ * only ever use the regN aliases so that it assembles either way.
+ *
+ * reg1 is the base register of the stmw/lmw register save area in
+ * sha1Process; it was used there but had no alias.
+ */
+#if DARWIN
+# define reg0  r0
+# define reg1  r1
+# define reg3  r3
+# define reg4  r4
+# define reg5  r5
+# define reg6  r6
+# define reg7  r7
+# define reg8  r8
+# define reg9  r9
+# define reg26 r26
+# define reg27 r27
+# define reg28 r28
+# define reg29 r29
+# define reg30 r30
+# define reg31 r31
+#else
+# define reg0  %r0
+# define reg1  %r1
+# define reg3  %r3
+# define reg4  %r4
+# define reg5  %r5
+# define reg6  %r6
+# define reg7  %r7
+# define reg8  %r8
+# define reg9  %r9
+# define reg26 %r26
+# define reg27 %r27
+# define reg28 %r28
+# define reg29 %r29
+# define reg30 %r30
+# define reg31 %r31
+#endif
+
+/* additive constants, one per group of twenty subrounds (subround1..4) */
+       .set    K00,    0x5a827999
+       .set    K20,    0x6ed9eba1
+       .set    K40,    0x8f1bbcdc
+       .set    K60,    0xca62c1d6
+
+/* byte offsets from the parameter pointer in reg3 */
+       .set    PARAM_H,                0       /* five 32-bit chaining words */
+       .set    PARAM_DATA,             20      /* start of the word array */
+
+/* sha1Process receives the address of its sha1Param structure in reg3 */
+
+/*
+ * subround1 a b c d e w -- one SHA-1 step using constant K00:
+ *
+ *     e += rotl(a,5) + (((c ^ d) & b) ^ d) + K00 + *++w
+ *     b  = rotr(b,2)
+ *
+ * a, c, d  registers that are only read
+ * b, e     registers that are read and updated; e must not be r0, because
+ *          addis/addi treat r0 in the source position as the value zero
+ * w        word pointer; lwzu advances it by 4 BEFORE loading, so on entry
+ *          it must point one word below the word to be consumed
+ *
+ * Clobbers reg5, reg6 and reg7.
+ */
+
+/*
+ * A 32-bit constant cannot be an operand of the register-register 'add',
+ * so K00 is added with an addis/addi pair.  Both instructions sign-extend
+ * a 16-bit immediate: the low half is folded into the signed range and
+ * the high half is rounded up to compensate.
+ */
+       .equ    K00_HA, ((((K00+0x8000)>>16)&0xffff)^0x8000)-0x8000
+       .equ    K00_LO, ((K00&0xffff)^0x8000)-0x8000
+
+       .macro  subround1 a b c d e w
+       lwzu reg7,4(\w)
+       rotlwi reg5,\a,5
+/* cache hint for the block at \w; r0 as base register reads as zero */
+       dcbt reg0,\w
+       xor reg6,\c,\d
+       addis \e,\e,K00_HA
+       and reg6,reg6,\b
+       addi \e,\e,K00_LO
+       add \e,\e,reg7
+       xor reg6,reg6,\d
+       add \e,\e,reg5
+/* b is rotated only after its old value has been folded into reg6 */
+       rotrwi \b,\b,2
+       add \e,\e,reg6
+       .endm
+
+/*
+ * subround2 a b c d e w -- one SHA-1 step using constant K20:
+ *
+ *     e += rotl(a,5) + (b ^ c ^ d) + K20 + *++w
+ *     b  = rotr(b,2)
+ *
+ * a, c, d  registers that are only read
+ * b, e     registers that are read and updated; e must not be r0, because
+ *          addis/addi treat r0 in the source position as the value zero
+ * w        word pointer; lwzu advances it by 4 BEFORE loading, so on entry
+ *          it must point one word below the word to be consumed
+ *
+ * Clobbers reg5, reg6 and reg7.
+ */
+
+/*
+ * A 32-bit constant cannot be an operand of the register-register 'add',
+ * so K20 is added with an addis/addi pair.  Both instructions sign-extend
+ * a 16-bit immediate: the low half is folded into the signed range and
+ * the high half is rounded up to compensate.
+ */
+       .equ    K20_HA, ((((K20+0x8000)>>16)&0xffff)^0x8000)-0x8000
+       .equ    K20_LO, ((K20&0xffff)^0x8000)-0x8000
+
+       .macro  subround2 a b c d e w
+       lwzu reg7,4(\w)
+       rotlwi reg5,\a,5
+/* cache hint for the block at \w; r0 as base register reads as zero */
+       dcbt reg0,\w
+       addis \e,\e,K20_HA
+       xor reg6,\b,\c
+       addi \e,\e,K20_LO
+       add \e,\e,reg5
+       xor reg6,reg6,\d
+       add \e,\e,reg7
+/* b is rotated only after its old value has been folded into reg6 */
+       rotrwi \b,\b,2
+       add \e,\e,reg6
+       .endm
+
+/*
+ * subround3 a b c d e w -- one SHA-1 step using constant K40:
+ *
+ *     e += rotl(a,5) + (((b ^ c) & d) | (b & c)) + K40 + *++w
+ *     b  = rotr(b,2)
+ *
+ * a, c, d  registers that are only read
+ * b, e     registers that are read and updated; e must not be r0, because
+ *          addis/addi treat r0 in the source position as the value zero
+ * w        word pointer; lwzu advances it by 4 BEFORE loading, so on entry
+ *          it must point one word below the word to be consumed
+ *
+ * Clobbers reg5, reg6 and reg7.
+ */
+
+/*
+ * A 32-bit constant cannot be an operand of the register-register 'add',
+ * so K40 is added with an addis/addi pair.  Both instructions sign-extend
+ * a 16-bit immediate: the low half is folded into the signed range and
+ * the high half is rounded up to compensate.
+ */
+       .equ    K40_HA, ((((K40+0x8000)>>16)&0xffff)^0x8000)-0x8000
+       .equ    K40_LO, ((K40&0xffff)^0x8000)-0x8000
+
+       .macro  subround3 a b c d e w
+       lwzu reg7,4(\w)
+       rotlwi reg5,\a,5
+/* cache hint for the block at \w; r0 as base register reads as zero */
+       dcbt reg0,\w
+       xor reg6,\b,\c
+       add \e,\e,reg5
+       and reg6,reg6,\d
+       addis \e,\e,K40_HA
+/* reg5 is free again here: rotl(a,5) has already been added to e */
+       and reg5,\b,\c
+       addi \e,\e,K40_LO
+       add \e,\e,reg7
+       or reg6,reg6,reg5
+/* b is rotated only after its old value has been folded into reg6 */
+       rotrwi \b,\b,2
+       add \e,\e,reg6
+       .endm
+
+/*
+ * subround4 a b c d e w -- one SHA-1 step using constant K60:
+ *
+ *     e += rotl(a,5) + (b ^ c ^ d) + K60 + *++w
+ *     b  = rotr(b,2)
+ *
+ * a, c, d  registers that are only read
+ * b, e     registers that are read and updated; e must not be r0, because
+ *          addis/addi treat r0 in the source position as the value zero
+ * w        word pointer; lwzu advances it by 4 BEFORE loading, so on entry
+ *          it must point one word below the word to be consumed
+ *
+ * Clobbers reg5, reg6 and reg7.
+ */
+
+/*
+ * A 32-bit constant cannot be an operand of the register-register 'add',
+ * so K60 is added with an addis/addi pair.  Both instructions sign-extend
+ * a 16-bit immediate: the low half is folded into the signed range and
+ * the high half is rounded up to compensate.
+ */
+       .equ    K60_HA, ((((K60+0x8000)>>16)&0xffff)^0x8000)-0x8000
+       .equ    K60_LO, ((K60&0xffff)^0x8000)-0x8000
+
+       .macro  subround4 a b c d e w
+       lwzu reg7,4(\w)
+       rotlwi reg5,\a,5
+/* cache hint for the block at \w; r0 as base register reads as zero */
+       dcbt reg0,\w
+       addis \e,\e,K60_HA
+       xor reg6,\b,\c
+       addi \e,\e,K60_LO
+       add \e,\e,reg5
+       xor reg6,reg6,\d
+       add \e,\e,reg7
+/* b is rotated only after its old value has been folded into reg6 */
+       rotrwi \b,\b,2
+       add \e,\e,reg6
+       .endm
+
+/*
+ * sha1Process -- run the SHA-1 compression function over one block.
+ *
+ * In:   reg3 = address of the parameter block.  The five 32-bit chaining
+ *              words live at PARAM_H; the word array starts at PARAM_DATA,
+ *              its first 16 words holding the input block.  80 words of
+ *              that array are used.
+ * Out:  the five words at PARAM_H are updated in place.  The word array is
+ *       overwritten with the expanded message schedule (and, in the
+ *       !WORDS_BIGENDIAN build, its first 16 words are byte-swapped).
+ *
+ * Register use:
+ *   reg0          zero
+ *   reg5..reg9    scratch
+ *   reg26..reg30  working variables a, b, c, d, e (saved and restored)
+ *   reg31         word pointer w                  (saved and restored)
+ *   ctr           loop counter of the expansion loop
+ *   cr0, XER[CA]  changed by addic. in the !WORDS_BIGENDIAN build only
+ *
+ * NOTE(review): the six saved registers are stored in the 24 bytes just
+ * below reg1 without allocating a stack frame.  This presumes the target
+ * ABI allows a leaf function to use that area -- confirm per platform.
+ */
+C_FUNCTION_BEGIN(sha1Process)
+/* zero reg0 for general use */
+       li reg0,0
+/* for a,b,c,d,e use r26,r27,r28,r29,r30, for w use r31 */
+
+/* we need to save registers before loading them */
+       stmw reg26,-24(reg1)
+/* point reg31 at the word array and hint the cache */
+       addi reg31,reg3,PARAM_DATA
+       dcbt reg0,reg31
+
+#if !WORDS_BIGENDIAN
+/*
+ * PowerPC little-endian mode: byte-swap the 16 input words in place.
+ * reg5 is the byte offset of the current word and walks from 60 down to
+ * 0; the loop ends once the decrement makes it negative.
+ */
+       li reg5,60
+.L00:
+       lwbrx reg6,reg31,reg5
+       stwx reg6,reg31,reg5
+       addic. reg5,reg5,-4
+       bge cr0,.L00
+#endif
+
+/*
+ * do the initial mixing: expand the 16 input words to 80 with
+ *     W[t] = rotl(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1),  t = 16..79
+ * Every pointer starts one word low because lwzu/stwu step by 4 before
+ * they access memory.
+ */
+       li reg8,64
+       addi reg26,reg3,PARAM_DATA+64-4
+       addi reg27,reg3,PARAM_DATA+64-3*4-4
+       addi reg28,reg3,PARAM_DATA+64-8*4-4
+       addi reg29,reg3,PARAM_DATA+64-14*4-4
+       addi reg30,reg3,PARAM_DATA+64-16*4-4
+       mtctr reg8
+
+.L10:
+       lwzu reg5,4(reg27)
+       lwzu reg6,4(reg28)
+       lwzu reg7,4(reg29)
+       lwzu reg8,4(reg30)
+       xor reg5,reg5,reg6
+       xor reg7,reg7,reg8
+       xor reg5,reg5,reg7
+/* the one-bit rotate is what distinguishes SHA-1 from the original SHA */
+       rotlwi reg5,reg5,1
+       stwu reg5,4(reg26)
+       bdnz .L10
+
+/* load the chaining words into a..e */
+       lwz reg26,PARAM_H   (reg3)
+       lwz reg27,PARAM_H+4 (reg3)
+       lwz reg28,PARAM_H+8 (reg3)
+       lwz reg29,PARAM_H+12(reg3)
+       lwz reg30,PARAM_H+16(reg3)
+
+/*
+ * The subround macros fetch with lwzu, which steps the pointer before
+ * loading, so w has to start one word below W[0] for the 80 subrounds to
+ * consume W[0]..W[79].
+ */
+       addi reg31,reg3,PARAM_DATA-4
+
+/*
+ * 80 subrounds.  The register roles rotate by one position per subround
+ * and return to their starting assignment every five subrounds.
+ */
+       subround1 reg26,reg27,reg28,reg29,reg30,reg31
+       subround1 reg30,reg26,reg27,reg28,reg29,reg31
+       subround1 reg29,reg30,reg26,reg27,reg28,reg31
+       subround1 reg28,reg29,reg30,reg26,reg27,reg31
+       subround1 reg27,reg28,reg29,reg30,reg26,reg31
+       subround1 reg26,reg27,reg28,reg29,reg30,reg31
+       subround1 reg30,reg26,reg27,reg28,reg29,reg31
+       subround1 reg29,reg30,reg26,reg27,reg28,reg31
+       subround1 reg28,reg29,reg30,reg26,reg27,reg31
+       subround1 reg27,reg28,reg29,reg30,reg26,reg31
+       subround1 reg26,reg27,reg28,reg29,reg30,reg31
+       subround1 reg30,reg26,reg27,reg28,reg29,reg31
+       subround1 reg29,reg30,reg26,reg27,reg28,reg31
+       subround1 reg28,reg29,reg30,reg26,reg27,reg31
+       subround1 reg27,reg28,reg29,reg30,reg26,reg31
+       subround1 reg26,reg27,reg28,reg29,reg30,reg31
+       subround1 reg30,reg26,reg27,reg28,reg29,reg31
+       subround1 reg29,reg30,reg26,reg27,reg28,reg31
+       subround1 reg28,reg29,reg30,reg26,reg27,reg31
+       subround1 reg27,reg28,reg29,reg30,reg26,reg31
+
+       subround2 reg26,reg27,reg28,reg29,reg30,reg31
+       subround2 reg30,reg26,reg27,reg28,reg29,reg31
+       subround2 reg29,reg30,reg26,reg27,reg28,reg31
+       subround2 reg28,reg29,reg30,reg26,reg27,reg31
+       subround2 reg27,reg28,reg29,reg30,reg26,reg31
+       subround2 reg26,reg27,reg28,reg29,reg30,reg31
+       subround2 reg30,reg26,reg27,reg28,reg29,reg31
+       subround2 reg29,reg30,reg26,reg27,reg28,reg31
+       subround2 reg28,reg29,reg30,reg26,reg27,reg31
+       subround2 reg27,reg28,reg29,reg30,reg26,reg31
+       subround2 reg26,reg27,reg28,reg29,reg30,reg31
+       subround2 reg30,reg26,reg27,reg28,reg29,reg31
+       subround2 reg29,reg30,reg26,reg27,reg28,reg31
+       subround2 reg28,reg29,reg30,reg26,reg27,reg31
+       subround2 reg27,reg28,reg29,reg30,reg26,reg31
+       subround2 reg26,reg27,reg28,reg29,reg30,reg31
+       subround2 reg30,reg26,reg27,reg28,reg29,reg31
+       subround2 reg29,reg30,reg26,reg27,reg28,reg31
+       subround2 reg28,reg29,reg30,reg26,reg27,reg31
+       subround2 reg27,reg28,reg29,reg30,reg26,reg31
+
+       subround3 reg26,reg27,reg28,reg29,reg30,reg31
+       subround3 reg30,reg26,reg27,reg28,reg29,reg31
+       subround3 reg29,reg30,reg26,reg27,reg28,reg31
+       subround3 reg28,reg29,reg30,reg26,reg27,reg31
+       subround3 reg27,reg28,reg29,reg30,reg26,reg31
+       subround3 reg26,reg27,reg28,reg29,reg30,reg31
+       subround3 reg30,reg26,reg27,reg28,reg29,reg31
+       subround3 reg29,reg30,reg26,reg27,reg28,reg31
+       subround3 reg28,reg29,reg30,reg26,reg27,reg31
+       subround3 reg27,reg28,reg29,reg30,reg26,reg31
+       subround3 reg26,reg27,reg28,reg29,reg30,reg31
+       subround3 reg30,reg26,reg27,reg28,reg29,reg31
+       subround3 reg29,reg30,reg26,reg27,reg28,reg31
+       subround3 reg28,reg29,reg30,reg26,reg27,reg31
+       subround3 reg27,reg28,reg29,reg30,reg26,reg31
+       subround3 reg26,reg27,reg28,reg29,reg30,reg31
+       subround3 reg30,reg26,reg27,reg28,reg29,reg31
+       subround3 reg29,reg30,reg26,reg27,reg28,reg31
+       subround3 reg28,reg29,reg30,reg26,reg27,reg31
+       subround3 reg27,reg28,reg29,reg30,reg26,reg31
+
+       subround4 reg26,reg27,reg28,reg29,reg30,reg31
+       subround4 reg30,reg26,reg27,reg28,reg29,reg31
+       subround4 reg29,reg30,reg26,reg27,reg28,reg31
+       subround4 reg28,reg29,reg30,reg26,reg27,reg31
+       subround4 reg27,reg28,reg29,reg30,reg26,reg31
+       subround4 reg26,reg27,reg28,reg29,reg30,reg31
+       subround4 reg30,reg26,reg27,reg28,reg29,reg31
+       subround4 reg29,reg30,reg26,reg27,reg28,reg31
+       subround4 reg28,reg29,reg30,reg26,reg27,reg31
+       subround4 reg27,reg28,reg29,reg30,reg26,reg31
+       subround4 reg26,reg27,reg28,reg29,reg30,reg31
+       subround4 reg30,reg26,reg27,reg28,reg29,reg31
+       subround4 reg29,reg30,reg26,reg27,reg28,reg31
+       subround4 reg28,reg29,reg30,reg26,reg27,reg31
+       subround4 reg27,reg28,reg29,reg30,reg26,reg31
+       subround4 reg26,reg27,reg28,reg29,reg30,reg31
+       subround4 reg30,reg26,reg27,reg28,reg29,reg31
+       subround4 reg29,reg30,reg26,reg27,reg28,reg31
+       subround4 reg28,reg29,reg30,reg26,reg27,reg31
+       subround4 reg27,reg28,reg29,reg30,reg26,reg31
+
+/* reload the previous chaining words, one scratch register per word */
+       lwz reg5,PARAM_H   (reg3)
+       lwz reg6,PARAM_H+4 (reg3)
+       lwz reg7,PARAM_H+8 (reg3)
+       lwz reg8,PARAM_H+12(reg3)
+       lwz reg9,PARAM_H+16(reg3)
+/* add each working variable to ITS OWN chaining word */
+       add reg26,reg5,reg26
+       add reg27,reg6,reg27
+       add reg28,reg7,reg28
+       add reg29,reg8,reg29
+       add reg30,reg9,reg30
+/* and write the new chaining words back */
+       stw reg26,PARAM_H   (reg3)
+       stw reg27,PARAM_H+4 (reg3)
+       stw reg28,PARAM_H+8 (reg3)
+       stw reg29,PARAM_H+12(reg3)
+       stw reg30,PARAM_H+16(reg3)
+
+/* finally, restore registers */
+       lmw reg26,-24(reg1)
+/* and return */
+       blr
+C_FUNCTION_END(sha1Process, .Lsha1Process_size)