Rename fips180opt -> sha1opt platform specific asm files.
CVS patchset: 5487
CVS date: 2002/06/15 14:19:38
AUTOMAKE_OPTIONS = gnu no-dependencies
-EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S sha1opt.i586.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S
+EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S sha1opt.i586.S sha1opt.ia64.S sha1opt.powerpc.S
AUTOMAKE_OPTIONS = gnu no-dependencies
-EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S sha1opt.i586.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S
+EXTRA_DIST = aesopt.i586.S aesopt.powerpc.S blowfishopt.i586.S blowfishopt.powerpc.S mp32opt.arm.S mp32opt.i386.S mp32opt.ia64.S mp32opt.powerpc.S mp32opt.sparcv8.S mp32opt.sparcv9.S mp64opt.ia64.S sha1opt.i586.S sha1opt.ia64.S sha1opt.powerpc.S
subdir = gas
mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
CONFIG_HEADER = $(top_builddir)/config.h
--- /dev/null
+/*
+ * blowfishopt.powerpc.S
+ *
+ * Assembler optimized Blowfish routines for PowerPC processors
+ *
+ * Compile target is GNU Assembler
+ *
+ * Copyright (c) 2002 Bob Deblier <bob@virtualunlimited.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "beecrypt.gas.h"
+
+ /* name recorded by the assembler for this source; kept equal to the real file name */
+ .file "blowfishopt.powerpc.S"
+
+ .text
+
+ /*
+  * round xl xr offset
+  *
+  * One Blowfish Feistel round:
+  *   xl ^= word at offset(r3)
+  *   xr ^= ((T0[xl >> 24] + T1[(xl >> 16) & 0xff]) ^ T2[(xl >> 8) & 0xff])
+  *         + T3[xl & 0xff]
+  * where T0..T3 are the tables of 32-bit words addressed by r28..r31.
+  *
+  * Each rlwinm rotates xl and masks bits 22-29, i.e. it extracts one byte
+  * of xl already multiplied by 4 so it can be used directly as a byte
+  * offset in lwzx.
+  *
+  * Expects r3 and r28-r31 to be set up by the caller; uses r9-r12 as scratch.
+  * The table loads are interleaved with the index computations.
+  */
+ .macro round xl xr offset
+ lwz r9,\offset(r3)
+ xor \xl,\xl,r9
+ rlwinm r9,\xl,10,22,29
+ rlwinm r10,\xl,18,22,29
+ lwzx r9,r9,r28
+ lwzx r10,r10,r29
+ rlwinm r11,\xl,26,22,29
+ add r9,r9,r10
+ lwzx r11,r11,r30
+ rlwinm r12,\xl,2,22,29
+ xor r9,r9,r11
+ lwzx r12,r12,r31
+ add r9,r9,r12
+ xor \xr,\xr,r9
+ .endm
+
+ /*
+  * eblock
+  *
+  * Encrypts the block held in r7/r8: sixteen rounds walk the words at
+  * offsets 0..60 from r3 in ascending order, exchanging the roles of r7 and
+  * r8 on every round, then the words at offsets 64 and 68 are xor-ed into
+  * r7 and r8 respectively. Clobbers r9-r12.
+  */
+ .macro eblock
+ round r7,r8,0
+ round r8,r7,4
+ round r7,r8,8
+ round r8,r7,12
+ round r7,r8,16
+ round r8,r7,20
+ round r7,r8,24
+ round r8,r7,28
+ round r7,r8,32
+ round r8,r7,36
+ round r7,r8,40
+ round r8,r7,44
+ round r7,r8,48
+ round r8,r7,52
+ round r7,r8,56
+ round r8,r7,60
+
+ /* final whitening with the last two words */
+ lwz r10,68(r3)
+ lwz r9,64(r3)
+ xor r8,r8,r10
+ xor r7,r7,r9
+ .endm
+
+ /*
+  * dblock
+  *
+  * Decrypts the block held in r7/r8: sixteen rounds walk the words at
+  * offsets 68..8 from r3 in descending order, exchanging the roles of r7
+  * and r8 on every round, then the words at offsets 4 and 0 are xor-ed into
+  * r7 and r8 respectively. Clobbers r9-r12.
+  */
+ .macro dblock
+ round r7,r8,68
+ round r8,r7,64
+ round r7,r8,60
+ round r8,r7,56
+ round r7,r8,52
+ round r8,r7,48
+ round r7,r8,44
+ round r8,r7,40
+ round r7,r8,36
+ round r8,r7,32
+ round r7,r8,28
+ round r8,r7,24
+ round r7,r8,20
+ round r8,r7,16
+ round r7,r8,12
+ round r8,r7,8
+
+ /* final whitening with the first two words */
+ lwz r10,0(r3)
+ lwz r9,4(r3)
+ xor r8,r8,r10
+ xor r7,r7,r9
+ .endm
+
+/*
+ * blowfishEncrypt
+ *
+ * Encrypts a single 8-byte block.
+ *   r3 = key material: words at offsets 0..68, followed by four tables
+ *        1024 bytes apart starting at offset 72
+ *   r4 = destination, two words written
+ *   r5 = source, two words read
+ * Returns 0 in r3. r28-r31 are saved in a 16-byte stack area and restored;
+ * r7-r12 are clobbered.
+ */
+C_FUNCTION_BEGIN(blowfishEncrypt)
+LABEL(blowfishEncrypt)
+ #if !WORDS_BIGENDIAN
+ # error ppc little-endian mode not supported
+ #endif
+
+ /* make room for, and save, the four registers used as table bases */
+ addi r1,r1,-16
+ stmw r28,0(r1)
+
+ addi r31,r3,3144
+ addi r30,r3,2120
+ addi r29,r3,1096
+ addi r28,r3,72
+
+ lwz r8,4(r5)
+ lwz r7,0(r5)
+
+ eblock
+
+ /* halves are written crosswise: r8 to the first word, r7 to the second */
+ stw r8,0(r4)
+ stw r7,4(r4)
+
+ lmw r28,0(r1)
+ addi r1,r1,16
+ li r3,0
+ blr
+C_FUNCTION_END(blowfishEncrypt, LOCAL(blowfishEncrypt_size))
+
+
+/*
+ * blowfishDecrypt
+ *
+ * Decrypts a single 8-byte block.
+ *   r3 = key material: words at offsets 0..68, followed by four tables
+ *        1024 bytes apart starting at offset 72
+ *   r4 = destination, two words written
+ *   r5 = source, two words read
+ * Returns 0 in r3. r28-r31 are saved in a 16-byte stack area and restored;
+ * r7-r12 are clobbered.
+ */
+C_FUNCTION_BEGIN(blowfishDecrypt)
+LABEL(blowfishDecrypt)
+ #if !WORDS_BIGENDIAN
+ # error ppc little-endian mode not supported
+ #endif
+
+ /* make room for, and save, the four registers used as table bases */
+ addi r1,r1,-16
+ stmw r28,0(r1)
+
+ addi r31,r3,3144
+ addi r30,r3,2120
+ addi r29,r3,1096
+ addi r28,r3,72
+
+ lwz r8,4(r5)
+ lwz r7,0(r5)
+
+ dblock
+
+ /* halves are written crosswise: r8 to the first word, r7 to the second */
+ stw r8,0(r4)
+ stw r7,4(r4)
+
+ lmw r28,0(r1)
+ addi r1,r1,16
+ li r3,0
+ blr
+C_FUNCTION_END(blowfishDecrypt, LOCAL(blowfishDecrypt_size))
+
+
+/*
+ * blowfishECBEncrypt
+ *
+ * Encrypts consecutive 8-byte blocks.
+ *   r3 = key material: words at offsets 0..68, followed by four tables
+ *        1024 bytes apart starting at offset 72
+ *   r4 = number of blocks
+ *   r5 = destination, advanced by 8 per block
+ *   r6 = source, advanced by 8 per block
+ * Returns 0 in r3. r28-r31 are saved and restored; r5-r12, CTR and cr0 are
+ * clobbered.
+ *
+ * A block count of zero returns at once. The loop is counted with
+ * mtctr/bdnz, which runs the body before decrementing and testing CTR, so
+ * without the guard a zero count would process one block and then keep
+ * going after CTR wraps.
+ */
+C_FUNCTION_BEGIN(blowfishECBEncrypt)
+LABEL(blowfishECBEncrypt)
+ cmpwi r4,0
+ beq LOCAL(02)
+
+ la r1,-16(r1)
+ stmw r28,0(r1)
+
+ mtctr r4
+
+ /* r28-r31 = bases of the four lookup tables */
+ la r28,72(r3)
+ la r29,1096(r3)
+ la r30,2120(r3)
+ la r31,3144(r3)
+
+LOCAL(00):
+ #if WORDS_BIGENDIAN
+ lwz r7,0(r6)
+ lwz r8,4(r6)
+ #else
+ # error ppc little-endian mode not supported
+ #endif
+
+ eblock
+
+ /* halves are written crosswise: r8 to the first word, r7 to the second */
+ #if WORDS_BIGENDIAN
+ stw r7,4(r5)
+ stw r8,0(r5)
+ #else
+ # error ppc little-endian mode not supported
+ #endif
+
+ la r5,8(r5)
+ la r6,8(r6)
+
+ bdnz LOCAL(00)
+
+ lmw r28,0(r1)
+ la r1,16(r1)
+
+LOCAL(02):
+ li r3,0
+ blr
+C_FUNCTION_END(blowfishECBEncrypt, LOCAL(blowfishECBEncrypt_size))
+
+
+/*
+ * blowfishECBDecrypt
+ *
+ * Decrypts consecutive 8-byte blocks.
+ *   r3 = key material: words at offsets 0..68, followed by four tables
+ *        1024 bytes apart starting at offset 72
+ *   r4 = number of blocks
+ *   r5 = destination, advanced by 8 per block
+ *   r6 = source, advanced by 8 per block
+ * Returns 0 in r3. r28-r31 are saved and restored; r5-r12, CTR and cr0 are
+ * clobbered.
+ *
+ * A block count of zero returns at once. The loop is counted with
+ * mtctr/bdnz, which runs the body before decrementing and testing CTR, so
+ * without the guard a zero count would process one block and then keep
+ * going after CTR wraps.
+ */
+C_FUNCTION_BEGIN(blowfishECBDecrypt)
+LABEL(blowfishECBDecrypt)
+ cmpwi r4,0
+ beq LOCAL(03)
+
+ la r1,-16(r1)
+ stmw r28,0(r1)
+
+ mtctr r4
+
+ /* r28-r31 = bases of the four lookup tables */
+ la r28,72(r3)
+ la r29,1096(r3)
+ la r30,2120(r3)
+ la r31,3144(r3)
+
+LOCAL(01):
+ #if WORDS_BIGENDIAN
+ lwz r7,0(r6)
+ lwz r8,4(r6)
+ #else
+ # error ppc little-endian mode not supported
+ #endif
+
+ dblock
+
+ /* halves are written crosswise: r8 to the first word, r7 to the second */
+ #if WORDS_BIGENDIAN
+ stw r7,4(r5)
+ stw r8,0(r5)
+ #else
+ # error ppc little-endian mode not supported
+ #endif
+
+ la r5,8(r5)
+ la r6,8(r6)
+
+ bdnz LOCAL(01)
+
+ lmw r28,0(r1)
+ la r1,16(r1)
+
+LOCAL(03):
+ li r3,0
+ blr
+C_FUNCTION_END(blowfishECBDecrypt, LOCAL(blowfishECBDecrypt_size))
+++ /dev/null
-/*
- * fips180opt.ia64.S
- *
- * Assembler optimized SHA-1 routines for ia64 (Intel Itanium)
- *
- * Warning: this code is incomplete and only contains a rough prototype!
- *
- * Compile target is GNU Assembler
- *
- * Copyright (c) 2001 Virtual Unlimited B.V.
- *
- * Author: Bob Deblier <bob@virtualunlimited.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#include "beecrypt.gas.h"
-
-#define saved_pfs r14
-#define saved_lc r15
-
-#define param r16
-
- .file "fips180opt.ia64.S"
-
- .text
-
- .equ K00, 0x5a827999
- .equ K20, 0x6ed9eba1
- .equ K40, 0x8f1bbcdc
- .equ K60, 0xca62c1d6
-
- .equ PARAM_H, 0
- .equ PARAM_DATA, 20
-
-/* for optimization, I have to see how I can parallellize the code
-
- e = ROTL32(a, 5) + ((b&(c^d))^d) + e + w + K
- b = ROTR32(b, 2);
-
-step1: load w, tmp0 = mix a, tmp1 = c xor d, e += K;;
-step2: tmp0 >>= 27, tmp1 &= b, e += w, b = mix b;;
-step3: b >>= 2, e += tmp0, tmp1 ^= d;;
-step4: e += tmp1, load next w, tmp0 = mix d, tmp1 = b xor d, d += K;;
-step5: etc.
-
- d = ROTL32(d, 5) + ((a&(b^c))^c) + d + w + K
- a = ROTR32(a, 2)
-*/
-
- .macro subround1 a b c d e w
- ld4 r19 = [\w],4
- add \e = $K00,\e
- xor r21 = \c,\d
- mix4.r r20 = \a,\a;;
- add \e = \e,r19
- and r21 = r21,\b
- shr.u r20 = 27,r20
- mix4.r r22 = \b,\b;;
- add \e = r20,\e
- xor r21 = r21,\d
- shr.u \b = 2,r22;;
- add \e = r21,\e
- .endm
-
- .macro subround2 a b c d e w
- ld4 r19 = [\w],4
- add \e = $K20,\e
- xor r21 = \b,\c
- mix4.r r20 = \a,\a;;
- add \e = \e,r19
- xor r21 = r21,\d
- shr.u r20 = 27,r20
- mix4.r \b = \b,\b;;
- add \e = r20,\e
- shr.u \b = 2,\b;;
- add \e = r21,\e
- .endm
-
- .macro subround3 a b c d e w
- ld4 r19 = [\w],4
- add \e = $K40,\e
- xor r21 = \b,\c
- and r22 = \b,\c
- mix4.r r20 = \a,\a;;
- add \e = \e,r19
- and r21 = r21,\d
- shr.u r20 = 27,r20
- mix4.r \b = \b,\b;;
- add \e = r20,\e
- or r21 = r21,r22
- shr.u \b = 2,\b;;
- add \e = r21,\e
- .endm
-
- .macro subround4 a b c d e w
- ld4 r19 = [\w],4
- add \e = $K60,\e
- xor r21 = \b,\c
- mix4.r r20 = \a,\a;;
- add \e = \e,r19
- xor r21 = r21,\d
- shr.u r20 = 27,r20
- mix4.r \b = \b,\b;;
- add \e = r20,\e
- shr.u \b = 2,\b;;
- add \e = r21,\e
- .endm
-
- .align 32
- .global sha1Process#
- .proc sha1Process#
-
-sha1Process:
- alloc saved_pfs = ar.pfs,2,0,0,0
- mov saved_lc = ar.lc
-
-/* r16 will be h */
-/* r17 will be pdata */
-/* There must be something neat I can do to speed up expansion (xor/rotate)
-
- The following should work, if we use 24 rotating registers; speedup should be dramatic
- preload with swapped values 0-15
-
- rought draft: have to translate this to more precise rotating registers and predicates.
-
- /----------\
- |xor[2],[0]|
- +----------+----------\
- |xor[8] |xor[3],[1]|
- +----------+----------+----------\
- |xor[13] |xor[9] |xor[4],[2]|
- +----------+----------+----------+----------\
- |mix4.r[16]|xor[14] |xor[10] |xor[5],[3]|
- +----------+----------+----------+----------+-----------\
- |shr[16] |mix4.r[17]|xor[15] |xor[11] |xor[6],[4] |
- +----------+----------+----------+----------+-----------+----------\
- |store[16] |shr[17] |mix4.r[18]|xor[16] |xor[12] |xor[7],[5]|
- \----------+----------+----------+----------+-----------+----------+----------\
- |store[17] |shr[18] |mix4.r[19]|xor[17] |xor[13] |xor[8],[6]|
- \----------+----------+----------+-----------+----------+----------+----------\
- |store[18] |shr[19] |mix4.r[20] |xor[18] |xor[14] |xor[9],[7]|
- \----------+----------+-----------+----------+----------+----------+----------\
- | | | | | | |
-*/
- alloc saved_pfs = ar.pfs,3,21,0,24
-
- /* look into big-endian loads, followed by little-endian stores */
-#if !WORD_BIGENDIAN
- // save UM.be
- // set UM.be to one
-#endif
- /*
-.L00:
- ld4 r32 = [ra],4
-
-br.ctop.sptk .L00;;
-#if !WORD_BIGENDIAN
- // restore UM.be
- /*
- mov ra = rd
- mov rb = rd;;
- add rb = 4,rd;;
- st4 [ra],8 = r48
- st4 [rb],8 = r47;;
- st4 [ra],8 = r46
- st4 [rb],8 = r45;;
- st4 [ra],8 = r44
- st4 [rb],8 = r43;;
- st4 [ra],8 = r42
- st4 [rb],8 = r41;;
- st4 [ra],8 = r40
- st4 [rb],8 = r39;;
- st4 [ra],8 = r38
- st4 [rb],8 = r37;;
- st4 [ra],8 = r36
- st4 [rb],8 = r35;;
- st4 [ra],8 = r34
- st4 [rb],8 = r33;;
- */
-#endif
- /* also add a conditional which will save the original swapped words! */
- /* the expansion loop will translate to something like this: */
-.L01:
- /* put three xors together */
- (p16) xor r32 = r46,r48
- (p17) xor r33 = r33,r41
- (p18) xor r34 = r34,r37
- (p19) mix4.r r35 = r35,r35
- (p20) shr.u r36 = 31,r36
- (p21) st4 [],4 = r37
- br.ctop.sptk .L01;;
-
- etc.
-*/
- mov ar.lc = r15
- mov ar.pfs = r14
- br.ret.sptk b0
- .endp sha1Process#
-
+++ /dev/null
-/*
- * fips180opt.powerpc.S
- *
- * Assembler optimized SHA-1 routines for PowerPC processors
- *
- * Warning: this code is incomplete and only contains a rough prototype!
- *
- * Compile target is GNU Assembler
- *
- * Copyright (c) 2000, 2001 Virtual Unlimited B.V.
- *
- * Author: Bob Deblier <bob@virtualunlimited.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#include "beecrypt.gas.h"
-
- .file "fips180opt.powerpc.S"
-
- .text
-
-#if DARWIN
-# define reg0 r0
-# define reg3 r3
-# define reg4 r4
-# define reg5 r5
-# define reg6 r6
-# define reg7 r7
-# define reg8 r8
-# define reg9 r9
-# define reg26 r26
-# define reg27 r27
-# define reg28 r28
-# define reg29 r29
-# define reg30 r30
-# define reg31 r31
-#else
-# define reg0 %r0
-# define reg3 %r3
-# define reg4 %r4
-# define reg5 %r5
-# define reg6 %r6
-# define reg7 %r7
-# define reg8 %r8
-# define reg9 %r9
-# define reg26 %r26
-# define reg27 %r27
-# define reg28 %r28
-# define reg29 %r29
-# define reg30 %r30
-# define reg31 %r31
-#endif
-
- .equ K00, 0x5a827999
- .equ K20, 0x6ed9eba1
- .equ K40, 0x8f1bbcdc
- .equ K60, 0xca62c1d6
-
- .equ PARAM_H, 0
- .equ PARAM_DATA, 20
-
-/* sha1Param: param in reg3 */
-
- .macro subround1 a b c d e w
- lwzu reg7,4(\w)
- rotlwi reg5,\a,5
- dbct r0,\w
- xor reg6,\c,\d
- add \e,\e,K00
- and reg6,reg6,\b
- add \e,\e,reg7
- xor reg6,reg6,\d
- add \e,\e,reg5
- rotrwi \b,\b,2
- add \e,\e,reg6
- .endm
-
- .macro subround2 a b c d e w
- lwzu reg7,4(\w)
- rotlwi reg5,\a,5
- dbct r0,\w
- add \e,\e,K20
- xor reg6,\b,\c
- add \e,\e,reg5
- xor reg6,reg6,\d
- add \e,\e,reg7
- rotrwi \b,\b,2
- add \e,\e,reg6
- .endm
-
- .macro subround3 a b c d e w
- lwzu reg7,4(\w)
- rotlwi reg5,\a,5
- dbct r0,\w
- xor reg6,\b,\c
- add \e,\e,reg5
- and reg6,reg6,\d
- add \e,\e,K40
- and reg5,\b,\c
- add \e,\e,reg7
- or reg6,reg6,reg5
- rotrwi \b,\b,2
- add \e,\e,reg6
- .endm
-
- .macro subround4 a b c d e w
- lwzu reg7,4(\w)
- rotlwi reg5,\a,5
- dbct r0,\w
- add \e,\e,K60
- xor reg6,\b,\c
- add \e,\e,reg5
- xor reg6,reg6,\d
- add \e,\e,reg7
- rotrwi \b,\b,2
- add \e,\e,reg6
- .endm
-
-C_FUNCTION_BEGIN(sha1Process)
-/* zero reg0 for general use */
- li reg0,0
-/* for a,b,c,d,e use r26,r27,r28,r29,r30, for w use r31 */
-
-/* we need to save registers before loading them */
- stmw reg26,-24(reg1)
-/* load the frame pointer with parameter data, and hint cache */
- addi reg31,reg3,PARAM_DATA
- dbct reg31
-
-#if !WORDS_BIGENDIAN /* have to provide for PowerPC little-endian mode
- /* loop of 16 entries */
- li reg5,60
- mtctr reg6
-.L00:
- lwbrx reg6,reg31,reg5
- stwx reg6,reg31,reg5
- subi. reg5,reg5,4
- bcge cr0,.L00
- addi reg31,reg3,PARAM_DATA
-#endif
-
-/* do the initial mixing */
- li reg8,64
- addi reg26,reg3,PARAM_DATA+64-4
- addi reg27,reg3,PARAM_DATA+64-3*4-4
- addi reg28,reg3,PARAM_DATA+64-8*4-4
- addi reg29,reg3,PARAM_DATA+64-14*4-4
- addi reg30,reg3,PARAM_DATA+64-16*4-4
- mtctr reg8
-
-.L10:
- lwzu reg5,4(reg27)
- lwzu reg6,4(reg28)
- lwzu reg7,4(reg29)
- lwzu reg8,4(reg30)
- xor reg5,reg5,reg6
- xor reg7,reg7,reg8
- xor reg5,reg5,reg7
- stwu reg5,4(reg26)
- bdnz .L10
-
- lwz reg26,PARAM_H (reg3)
- lwz reg27,PARAM_H+4 (reg3)
- lwz reg28,PARAM_H+8 (reg3)
- lwz reg29,PARAM_H+12(reg3)
- lwz reg30,PARAM_H+16(reg3)
-
- subround1 reg26,reg27,reg28,reg29,reg30,reg31
- subround1 reg30,reg26,reg27,reg28,reg29,reg31
- subround1 reg29,reg30,reg26,reg27,reg28,reg31
- subround1 reg28,reg29,reg30,reg26,reg27,reg31
- subround1 reg27,reg28,reg29,reg30,reg26,reg31
- subround1 reg26,reg27,reg28,reg29,reg30,reg31
- subround1 reg30,reg26,reg27,reg28,reg29,reg31
- subround1 reg29,reg30,reg26,reg27,reg28,reg31
- subround1 reg28,reg29,reg30,reg26,reg27,reg31
- subround1 reg27,reg28,reg29,reg30,reg26,reg31
- subround1 reg26,reg27,reg28,reg29,reg30,reg31
- subround1 reg30,reg26,reg27,reg28,reg29,reg31
- subround1 reg29,reg30,reg26,reg27,reg28,reg31
- subround1 reg28,reg29,reg30,reg26,reg27,reg31
- subround1 reg27,reg28,reg29,reg30,reg26,reg31
- subround1 reg26,reg27,reg28,reg29,reg30,reg31
- subround1 reg30,reg26,reg27,reg28,reg29,reg31
- subround1 reg29,reg30,reg26,reg27,reg28,reg31
- subround1 reg28,reg29,reg30,reg26,reg27,reg31
- subround1 reg27,reg28,reg29,reg30,reg26,reg31
-
- subround2 reg26,reg27,reg28,reg29,reg30,reg31
- subround2 reg30,reg26,reg27,reg28,reg29,reg31
- subround2 reg29,reg30,reg26,reg27,reg28,reg31
- subround2 reg28,reg29,reg30,reg26,reg27,reg31
- subround2 reg27,reg28,reg29,reg30,reg26,reg31
- subround2 reg26,reg27,reg28,reg29,reg30,reg31
- subround2 reg30,reg26,reg27,reg28,reg29,reg31
- subround2 reg29,reg30,reg26,reg27,reg28,reg31
- subround2 reg28,reg29,reg30,reg26,reg27,reg31
- subround2 reg27,reg28,reg29,reg30,reg26,reg31
- subround2 reg26,reg27,reg28,reg29,reg30,reg31
- subround2 reg30,reg26,reg27,reg28,reg29,reg31
- subround2 reg29,reg30,reg26,reg27,reg28,reg31
- subround2 reg28,reg29,reg30,reg26,reg27,reg31
- subround2 reg27,reg28,reg29,reg30,reg26,reg31
- subround2 reg26,reg27,reg28,reg29,reg30,reg31
- subround2 reg30,reg26,reg27,reg28,reg29,reg31
- subround2 reg29,reg30,reg26,reg27,reg28,reg31
- subround2 reg28,reg29,reg30,reg26,reg27,reg31
- subround2 reg27,reg28,reg29,reg30,reg26,reg31
-
- subround3 reg26,reg27,reg28,reg29,reg30,reg31
- subround3 reg30,reg26,reg27,reg28,reg29,reg31
- subround3 reg29,reg30,reg26,reg27,reg28,reg31
- subround3 reg28,reg29,reg30,reg26,reg27,reg31
- subround3 reg27,reg28,reg29,reg30,reg26,reg31
- subround3 reg26,reg27,reg28,reg29,reg30,reg31
- subround3 reg30,reg26,reg27,reg28,reg29,reg31
- subround3 reg29,reg30,reg26,reg27,reg28,reg31
- subround3 reg28,reg29,reg30,reg26,reg27,reg31
- subround3 reg27,reg28,reg29,reg30,reg26,reg31
- subround3 reg26,reg27,reg28,reg29,reg30,reg31
- subround3 reg30,reg26,reg27,reg28,reg29,reg31
- subround3 reg29,reg30,reg26,reg27,reg28,reg31
- subround3 reg28,reg29,reg30,reg26,reg27,reg31
- subround3 reg27,reg28,reg29,reg30,reg26,reg31
- subround3 reg26,reg27,reg28,reg29,reg30,reg31
- subround3 reg30,reg26,reg27,reg28,reg29,reg31
- subround3 reg29,reg30,reg26,reg27,reg28,reg31
- subround3 reg28,reg29,reg30,reg26,reg27,reg31
- subround3 reg27,reg28,reg29,reg30,reg26,reg31
-
- subround4 reg26,reg27,reg28,reg29,reg30,reg31
- subround4 reg30,reg26,reg27,reg28,reg29,reg31
- subround4 reg29,reg30,reg26,reg27,reg28,reg31
- subround4 reg28,reg29,reg30,reg26,reg27,reg31
- subround4 reg27,reg28,reg29,reg30,reg26,reg31
- subround4 reg26,reg27,reg28,reg29,reg30,reg31
- subround4 reg30,reg26,reg27,reg28,reg29,reg31
- subround4 reg29,reg30,reg26,reg27,reg28,reg31
- subround4 reg28,reg29,reg30,reg26,reg27,reg31
- subround4 reg27,reg28,reg29,reg30,reg26,reg31
- subround4 reg26,reg27,reg28,reg29,reg30,reg31
- subround4 reg30,reg26,reg27,reg28,reg29,reg31
- subround4 reg29,reg30,reg26,reg27,reg28,reg31
- subround4 reg28,reg29,reg30,reg26,reg27,reg31
- subround4 reg27,reg28,reg29,reg30,reg26,reg31
- subround4 reg26,reg27,reg28,reg29,reg30,reg31
- subround4 reg30,reg26,reg27,reg28,reg29,reg31
- subround4 reg29,reg30,reg26,reg27,reg28,reg31
- subround4 reg28,reg29,reg30,reg26,reg27,reg31
- subround4 reg27,reg28,reg29,reg30,reg26,reg31
-
-/* then store the five values into registers */
- lwz reg5,PARAM_H (reg3)
- lwz reg6,PARAM_H+4 (reg3)
- lwz reg7,PARAM_H+8 (reg3)
- lwz reg8,PARAM_H+12(reg3)
- lwz reg9,PARAM_H+16(reg3)
- add reg26,reg5,reg26
- add reg27,reg5,reg27
- add reg28,reg5,reg28
- add reg29,reg5,reg29
- add reg30,reg5,reg30
- stw reg26,PARAM_H (reg3)
- stw reg27,PARAM_H+4 (reg3)
- stw reg28,PARAM_H+8 (reg3)
- stw reg29,PARAM_H+12(reg3)
- stw reg30,PARAM_H+16(reg3)
-
-/* finally, restore registers */
- lmw reg26,-24(reg1)
-/* and return */
- blr
-C_FUNCION_END(sha1Process, .Lsha1Process_size)
--- /dev/null
+/*
+ * sha1opt.ia64.S
+ *
+ * Assembler optimized SHA-1 routines for ia64 (Intel Itanium)
+ *
+ * Warning: this code is incomplete and only contains a rough prototype!
+ *
+ * Compile target is GNU Assembler
+ *
+ * Copyright (c) 2001 Virtual Unlimited B.V.
+ *
+ * Author: Bob Deblier <bob@virtualunlimited.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "beecrypt.gas.h"
+
+/* registers that receive ar.pfs and ar.lc at the top of sha1Process */
+#define saved_pfs r14
+#define saved_lc r15
+
+#define param r16
+
+ /* name recorded by the assembler for this source; kept equal to the real file name */
+ .file "sha1opt.ia64.S"
+
+ .text
+
+/* constants added in by subround1 .. subround4 respectively */
+ .equ K00, 0x5a827999
+ .equ K20, 0x6ed9eba1
+ .equ K40, 0x8f1bbcdc
+ .equ K60, 0xca62c1d6
+
+/* presumably byte offsets of the hash state and the data words inside the
+   parameter block; not referenced yet in this file -- TODO confirm */
+ .equ PARAM_H, 0
+ .equ PARAM_DATA, 20
+
+/* for optimization, I have to see how I can parallelize the code
+
+ e = ROTL32(a, 5) + ((b&(c^d))^d) + e + w + K
+ b = ROTR32(b, 2);
+
+step1: load w, tmp0 = mix a, tmp1 = c xor d, e += K;;
+step2: tmp0 >>= 27, tmp1 &= b, e += w, b = mix b;;
+step3: b >>= 2, e += tmp0, tmp1 ^= d;;
+step4: e += tmp1, load next w, tmp0 = mix d, tmp1 = b xor d, d += K;;
+step5: etc.
+
+ d = ROTL32(d, 5) + ((a&(b^c))^c) + d + w + K
+ a = ROTR32(a, 2)
+*/
+
+ /*
+  * subround1 a b c d e w
+  *
+  * Draft of one step of the form
+  *   e = ROTL32(a,5) + ((b & (c ^ d)) ^ d) + e + word + K00
+  *   b = ROTR32(b,2)
+  * The word is loaded from [w] with a post-increment of 4.
+  * Scratch: r19 (word), r20 (rotated a), r21 (boolean function), r22.
+  * Each ";;" closes an instruction group.
+  *
+  * NOTE(review): the mix4.r / shr.u pairs are the draft's way of forming the
+  * two rotates; operand order and the "$K00" immediate have not been
+  * checked against the assembler -- verify before use.
+  */
+ .macro subround1 a b c d e w
+ ld4 r19 = [\w],4
+ add \e = $K00,\e
+ xor r21 = \c,\d
+ mix4.r r20 = \a,\a;;
+ add \e = \e,r19
+ and r21 = r21,\b
+ shr.u r20 = 27,r20
+ mix4.r r22 = \b,\b;;
+ add \e = r20,\e
+ xor r21 = r21,\d
+ shr.u \b = 2,r22;;
+ add \e = r21,\e
+ .endm
+
+ /*
+  * subround2 a b c d e w
+  *
+  * Draft of one step of the form
+  *   e = ROTL32(a,5) + (b ^ c ^ d) + e + word + K20
+  *   b = ROTR32(b,2)
+  * The word is loaded from [w] with a post-increment of 4.
+  * Scratch: r19 (word), r20 (rotated a), r21 (boolean function); b is
+  * rotated in place. Each ";;" closes an instruction group.
+  *
+  * NOTE(review): the mix4.r / shr.u pairs are the draft's way of forming the
+  * two rotates; operand order and the "$K20" immediate have not been
+  * checked against the assembler -- verify before use.
+  */
+ .macro subround2 a b c d e w
+ ld4 r19 = [\w],4
+ add \e = $K20,\e
+ xor r21 = \b,\c
+ mix4.r r20 = \a,\a;;
+ add \e = \e,r19
+ xor r21 = r21,\d
+ shr.u r20 = 27,r20
+ mix4.r \b = \b,\b;;
+ add \e = r20,\e
+ shr.u \b = 2,\b;;
+ add \e = r21,\e
+ .endm
+
+ /*
+  * subround3 a b c d e w
+  *
+  * Draft of one step of the form
+  *   e = ROTL32(a,5) + (((b ^ c) & d) | (b & c)) + e + word + K40
+  *   b = ROTR32(b,2)
+  * The word is loaded from [w] with a post-increment of 4.
+  * Scratch: r19 (word), r20 (rotated a), r21 and r22 (boolean function);
+  * b is rotated in place. Each ";;" closes an instruction group.
+  *
+  * NOTE(review): the mix4.r / shr.u pairs are the draft's way of forming the
+  * two rotates; operand order and the "$K40" immediate have not been
+  * checked against the assembler -- verify before use.
+  */
+ .macro subround3 a b c d e w
+ ld4 r19 = [\w],4
+ add \e = $K40,\e
+ xor r21 = \b,\c
+ and r22 = \b,\c
+ mix4.r r20 = \a,\a;;
+ add \e = \e,r19
+ and r21 = r21,\d
+ shr.u r20 = 27,r20
+ mix4.r \b = \b,\b;;
+ add \e = r20,\e
+ or r21 = r21,r22
+ shr.u \b = 2,\b;;
+ add \e = r21,\e
+ .endm
+
+ /*
+  * subround4 a b c d e w
+  *
+  * Draft of one step of the form
+  *   e = ROTL32(a,5) + (b ^ c ^ d) + e + word + K60
+  *   b = ROTR32(b,2)
+  * The word is loaded from [w] with a post-increment of 4.
+  * Scratch: r19 (word), r20 (rotated a), r21 (boolean function); b is
+  * rotated in place. Each ";;" closes an instruction group.
+  *
+  * NOTE(review): the mix4.r / shr.u pairs are the draft's way of forming the
+  * two rotates; operand order and the "$K60" immediate have not been
+  * checked against the assembler -- verify before use.
+  */
+ .macro subround4 a b c d e w
+ ld4 r19 = [\w],4
+ add \e = $K60,\e
+ xor r21 = \b,\c
+ mix4.r r20 = \a,\a;;
+ add \e = \e,r19
+ xor r21 = r21,\d
+ shr.u r20 = 27,r20
+ mix4.r \b = \b,\b;;
+ add \e = r20,\e
+ shr.u \b = 2,\b;;
+ add \e = r21,\e
+ .endm
+
+ /*
+  * sha1Process (ia64) -- unfinished prototype.
+  *
+  * What the live instructions do: allocate a register frame, copy ar.pfs
+  * to saved_pfs (r14) and ar.lc to saved_lc (r15), and at the end restore
+  * both from r14/r15 and return through b0. Everything in between is
+  * design notes and draft code for the message expansion; the subround
+  * macros are not invoked yet.
+  *
+  * NOTE(review): the comment opened just before ".L00:" is closed by the
+  * first terminator that follows (nested openers do not count), which
+  * appears to leave the second "#endif", the ".L01" loop, the "etc." line
+  * and the final terminator outside any comment -- presumably this does
+  * not preprocess/assemble as is. Confirm before enabling this file.
+  * NOTE(review): the guards test WORD_BIGENDIAN; autoconf defines
+  * WORDS_BIGENDIAN -- presumably a typo, verify.
+  * NOTE(review): the second alloc overwrites saved_pfs written by the first.
+  */
+ .align 32
+ .global sha1Process#
+ .proc sha1Process#
+
+sha1Process:
+ alloc saved_pfs = ar.pfs,2,0,0,0
+ mov saved_lc = ar.lc
+
+/* r16 will be h */
+/* r17 will be pdata */
+/* There must be something neat I can do to speed up expansion (xor/rotate)
+
+ The following should work, if we use 24 rotating registers; speedup should be dramatic
+ preload with swapped values 0-15
+
+ rough draft: have to translate this to more precise rotating registers and predicates.
+
+ /----------\
+ |xor[2],[0]|
+ +----------+----------\
+ |xor[8] |xor[3],[1]|
+ +----------+----------+----------\
+ |xor[13] |xor[9] |xor[4],[2]|
+ +----------+----------+----------+----------\
+ |mix4.r[16]|xor[14] |xor[10] |xor[5],[3]|
+ +----------+----------+----------+----------+-----------\
+ |shr[16] |mix4.r[17]|xor[15] |xor[11] |xor[6],[4] |
+ +----------+----------+----------+----------+-----------+----------\
+ |store[16] |shr[17] |mix4.r[18]|xor[16] |xor[12] |xor[7],[5]|
+ \----------+----------+----------+----------+-----------+----------+----------\
+ |store[17] |shr[18] |mix4.r[19]|xor[17] |xor[13] |xor[8],[6]|
+ \----------+----------+----------+-----------+----------+----------+----------\
+ |store[18] |shr[19] |mix4.r[20] |xor[18] |xor[14] |xor[9],[7]|
+ \----------+----------+-----------+----------+----------+----------+----------\
+ | | | | | | |
+*/
+ alloc saved_pfs = ar.pfs,3,21,0,24
+
+ /* look into big-endian loads, followed by little-endian stores */
+#if !WORD_BIGENDIAN
+ // save UM.be
+ // set UM.be to one
+#endif
+ /*
+.L00:
+ ld4 r32 = [ra],4
+
+br.ctop.sptk .L00;;
+#if !WORD_BIGENDIAN
+ // restore UM.be
+ /*
+ mov ra = rd
+ mov rb = rd;;
+ add rb = 4,rd;;
+ st4 [ra],8 = r48
+ st4 [rb],8 = r47;;
+ st4 [ra],8 = r46
+ st4 [rb],8 = r45;;
+ st4 [ra],8 = r44
+ st4 [rb],8 = r43;;
+ st4 [ra],8 = r42
+ st4 [rb],8 = r41;;
+ st4 [ra],8 = r40
+ st4 [rb],8 = r39;;
+ st4 [ra],8 = r38
+ st4 [rb],8 = r37;;
+ st4 [ra],8 = r36
+ st4 [rb],8 = r35;;
+ st4 [ra],8 = r34
+ st4 [rb],8 = r33;;
+ */
+#endif
+ /* also add a conditional which will save the original swapped words! */
+ /* the expansion loop will translate to something like this: */
+.L01:
+ /* put three xors together */
+ (p16) xor r32 = r46,r48
+ (p17) xor r33 = r33,r41
+ (p18) xor r34 = r34,r37
+ (p19) mix4.r r35 = r35,r35
+ (p20) shr.u r36 = 31,r36
+ (p21) st4 [],4 = r37
+ br.ctop.sptk .L01;;
+
+ etc.
+*/
+ mov ar.lc = r15
+ mov ar.pfs = r14
+ br.ret.sptk b0
+ .endp sha1Process#
+
--- /dev/null
+/*
+ * sha1opt.powerpc.S
+ *
+ * Assembler optimized SHA-1 routines for PowerPC processors
+ *
+ * Warning: this code is incomplete and only contains a rough prototype!
+ *
+ * Compile target is GNU Assembler
+ *
+ * Copyright (c) 2000, 2001 Virtual Unlimited B.V.
+ *
+ * Author: Bob Deblier <bob@virtualunlimited.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "beecrypt.gas.h"
+
+ /* name recorded by the assembler for this source; kept equal to the real file name */
+ .file "sha1opt.powerpc.S"
+
+ .text
+
+/*
+ * Register aliases: bare rN names when DARWIN is set, %rN otherwise.
+ * reg1 is the base register sha1Process uses for its register save area;
+ * it was referenced there without having an alias here.
+ */
+#if DARWIN
+# define reg0 r0
+# define reg1 r1
+# define reg3 r3
+# define reg4 r4
+# define reg5 r5
+# define reg6 r6
+# define reg7 r7
+# define reg8 r8
+# define reg9 r9
+# define reg26 r26
+# define reg27 r27
+# define reg28 r28
+# define reg29 r29
+# define reg30 r30
+# define reg31 r31
+#else
+# define reg0 %r0
+# define reg1 %r1
+# define reg3 %r3
+# define reg4 %r4
+# define reg5 %r5
+# define reg6 %r6
+# define reg7 %r7
+# define reg8 %r8
+# define reg9 %r9
+# define reg26 %r26
+# define reg27 %r27
+# define reg28 %r28
+# define reg29 %r29
+# define reg30 %r30
+# define reg31 %r31
+#endif
+
+/* constants added in by subround1 .. subround4 respectively */
+ .equ K00, 0x5a827999
+ .equ K20, 0x6ed9eba1
+ .equ K40, 0x8f1bbcdc
+ .equ K60, 0xca62c1d6
+
+/* offsets, from the parameter pointer in reg3, of the five state words and
+   of the data words */
+ .equ PARAM_H, 0
+ .equ PARAM_DATA, 20
+
+/* sha1Param: param in reg3 */
+
+ /*
+  * subround1 a b c d e w
+  *
+  * One step with f = ((c ^ d) & b) ^ d and constant K00:
+  *   e += ROTL32(a,5) + f + K00 + next word;  b = ROTR32(b,2)
+  * The lwzu advances w by 4 and then loads from the new address, so w must
+  * point one word before the word to be consumed.
+  * Scratch: reg5, reg6, reg7.
+  *
+  * The cache-touch hint was spelled "dbct", which is not a PowerPC
+  * mnemonic; it is now dcbt with a zero base.
+  *
+  * NOTE(review): K00 is a 32-bit .equ value while add takes three register
+  * operands, so "add \e,\e,K00" presumably still needs the constant
+  * materialised in a register (or an addis/addi pair) -- confirm and fix
+  * before this prototype is enabled.
+  */
+ .macro subround1 a b c d e w
+ lwzu reg7,4(\w)
+ rotlwi reg5,\a,5
+ dcbt 0,\w
+ xor reg6,\c,\d
+ add \e,\e,K00
+ and reg6,reg6,\b
+ add \e,\e,reg7
+ xor reg6,reg6,\d
+ add \e,\e,reg5
+ rotrwi \b,\b,2
+ add \e,\e,reg6
+ .endm
+
+ /*
+  * subround2 a b c d e w
+  *
+  * One step with f = b ^ c ^ d and constant K20:
+  *   e += ROTL32(a,5) + f + K20 + next word;  b = ROTR32(b,2)
+  * The lwzu advances w by 4 and then loads from the new address, so w must
+  * point one word before the word to be consumed.
+  * Scratch: reg5, reg6, reg7.
+  *
+  * The cache-touch hint was spelled "dbct", which is not a PowerPC
+  * mnemonic; it is now dcbt with a zero base.
+  *
+  * NOTE(review): K20 is a 32-bit .equ value while add takes three register
+  * operands, so "add \e,\e,K20" presumably still needs the constant
+  * materialised in a register (or an addis/addi pair) -- confirm and fix
+  * before this prototype is enabled.
+  */
+ .macro subround2 a b c d e w
+ lwzu reg7,4(\w)
+ rotlwi reg5,\a,5
+ dcbt 0,\w
+ add \e,\e,K20
+ xor reg6,\b,\c
+ add \e,\e,reg5
+ xor reg6,reg6,\d
+ add \e,\e,reg7
+ rotrwi \b,\b,2
+ add \e,\e,reg6
+ .endm
+
+ /*
+  * subround3 a b c d e w
+  *
+  * One step with f = ((b ^ c) & d) | (b & c) and constant K40:
+  *   e += ROTL32(a,5) + f + K40 + next word;  b = ROTR32(b,2)
+  * The lwzu advances w by 4 and then loads from the new address, so w must
+  * point one word before the word to be consumed.
+  * Scratch: reg5 (reused for b & c once the rotated a has been added),
+  * reg6, reg7.
+  *
+  * The cache-touch hint was spelled "dbct", which is not a PowerPC
+  * mnemonic; it is now dcbt with a zero base.
+  *
+  * NOTE(review): K40 is a 32-bit .equ value while add takes three register
+  * operands, so "add \e,\e,K40" presumably still needs the constant
+  * materialised in a register (or an addis/addi pair) -- confirm and fix
+  * before this prototype is enabled.
+  */
+ .macro subround3 a b c d e w
+ lwzu reg7,4(\w)
+ rotlwi reg5,\a,5
+ dcbt 0,\w
+ xor reg6,\b,\c
+ add \e,\e,reg5
+ and reg6,reg6,\d
+ add \e,\e,K40
+ and reg5,\b,\c
+ add \e,\e,reg7
+ or reg6,reg6,reg5
+ rotrwi \b,\b,2
+ add \e,\e,reg6
+ .endm
+
+ /*
+  * subround4 a b c d e w
+  *
+  * One step with f = b ^ c ^ d and constant K60:
+  *   e += ROTL32(a,5) + f + K60 + next word;  b = ROTR32(b,2)
+  * The lwzu advances w by 4 and then loads from the new address, so w must
+  * point one word before the word to be consumed.
+  * Scratch: reg5, reg6, reg7.
+  *
+  * The cache-touch hint was spelled "dbct", which is not a PowerPC
+  * mnemonic; it is now dcbt with a zero base.
+  *
+  * NOTE(review): K60 is a 32-bit .equ value while add takes three register
+  * operands, so "add \e,\e,K60" presumably still needs the constant
+  * materialised in a register (or an addis/addi pair) -- confirm and fix
+  * before this prototype is enabled.
+  */
+ .macro subround4 a b c d e w
+ lwzu reg7,4(\w)
+ rotlwi reg5,\a,5
+ dcbt 0,\w
+ add \e,\e,K60
+ xor reg6,\b,\c
+ add \e,\e,reg5
+ xor reg6,reg6,\d
+ add \e,\e,reg7
+ rotrwi \b,\b,2
+ add \e,\e,reg6
+ .endm
+
+/*
+ * sha1Process
+ *
+ * Runs the 80 SHA-1 steps over one block.
+ *   reg3 = parameter block: five state words at PARAM_H, data words at
+ *          PARAM_DATA (16 input words, expanded in place to 80)
+ * Working variables a..e live in reg26..reg30, the word pointer in reg31;
+ * reg26-reg31 are saved and restored. reg5-reg9, CTR and cr0 are scratch.
+ *
+ * Fixes relative to the first prototype:
+ *  - the feed-forward added reg5 (state word 0) to all five variables; each
+ *    variable now gets its own state word (reg5..reg9)
+ *  - the expansion lacked the rotate-left-by-one that SHA-1 requires
+ *  - the subround macros pre-increment the word pointer, so it now starts
+ *    one word before the data instead of skipping the first word
+ *  - "dbct", "subi." and "bcge" are not PowerPC mnemonics (dcbt, addic.,
+ *    bge), and a stray mtctr of an unset register was dropped
+ *  - the closing macro name was misspelled
+ *
+ * NOTE(review): the registers are saved below reg1 without moving it;
+ * presumably this relies on the ABI providing a protected area below the
+ * stack pointer -- confirm for every target ABI.
+ * NOTE(review): still a prototype; the subround macros need checking before
+ * this is enabled.
+ */
+C_FUNCTION_BEGIN(sha1Process)
+/* zero reg0 for general use */
+ li reg0,0
+/* for a,b,c,d,e use r26,r27,r28,r29,r30, for w use r31 */
+
+/* we need to save registers before loading them */
+ stmw reg26,-24(reg1)
+/* point reg31 at the data words, and hint cache */
+ addi reg31,reg3,PARAM_DATA
+ dcbt 0,reg31
+
+#if !WORDS_BIGENDIAN
+/* PowerPC little-endian mode: byte-reverse the 16 input words in place,
+   walking the byte offset in reg5 from 60 down to 0 */
+ li reg5,60
+.L00:
+ lwbrx reg6,reg31,reg5
+ stwx reg6,reg31,reg5
+ addic. reg5,reg5,-4
+ bge cr0,.L00
+#endif
+
+/* do the initial mixing: 64 new words, each the xor of the words 3, 8, 14
+   and 16 places back, rotated left by one; every pointer starts one word
+   early because lwzu/stwu pre-increment */
+ li reg8,64
+ addi reg26,reg3,PARAM_DATA+64-4
+ addi reg27,reg3,PARAM_DATA+64-3*4-4
+ addi reg28,reg3,PARAM_DATA+64-8*4-4
+ addi reg29,reg3,PARAM_DATA+64-14*4-4
+ addi reg30,reg3,PARAM_DATA+64-16*4-4
+ mtctr reg8
+
+.L10:
+ lwzu reg5,4(reg27)
+ lwzu reg6,4(reg28)
+ lwzu reg7,4(reg29)
+ lwzu reg8,4(reg30)
+ xor reg5,reg5,reg6
+ xor reg7,reg7,reg8
+ xor reg5,reg5,reg7
+ rotlwi reg5,reg5,1
+ stwu reg5,4(reg26)
+ bdnz .L10
+
+ lwz reg26,PARAM_H (reg3)
+ lwz reg27,PARAM_H+4 (reg3)
+ lwz reg28,PARAM_H+8 (reg3)
+ lwz reg29,PARAM_H+12(reg3)
+ lwz reg30,PARAM_H+16(reg3)
+
+/* the subround macros advance w by 4 before loading, so start one word early */
+ addi reg31,reg3,PARAM_DATA-4
+
+ subround1 reg26,reg27,reg28,reg29,reg30,reg31
+ subround1 reg30,reg26,reg27,reg28,reg29,reg31
+ subround1 reg29,reg30,reg26,reg27,reg28,reg31
+ subround1 reg28,reg29,reg30,reg26,reg27,reg31
+ subround1 reg27,reg28,reg29,reg30,reg26,reg31
+ subround1 reg26,reg27,reg28,reg29,reg30,reg31
+ subround1 reg30,reg26,reg27,reg28,reg29,reg31
+ subround1 reg29,reg30,reg26,reg27,reg28,reg31
+ subround1 reg28,reg29,reg30,reg26,reg27,reg31
+ subround1 reg27,reg28,reg29,reg30,reg26,reg31
+ subround1 reg26,reg27,reg28,reg29,reg30,reg31
+ subround1 reg30,reg26,reg27,reg28,reg29,reg31
+ subround1 reg29,reg30,reg26,reg27,reg28,reg31
+ subround1 reg28,reg29,reg30,reg26,reg27,reg31
+ subround1 reg27,reg28,reg29,reg30,reg26,reg31
+ subround1 reg26,reg27,reg28,reg29,reg30,reg31
+ subround1 reg30,reg26,reg27,reg28,reg29,reg31
+ subround1 reg29,reg30,reg26,reg27,reg28,reg31
+ subround1 reg28,reg29,reg30,reg26,reg27,reg31
+ subround1 reg27,reg28,reg29,reg30,reg26,reg31
+
+ subround2 reg26,reg27,reg28,reg29,reg30,reg31
+ subround2 reg30,reg26,reg27,reg28,reg29,reg31
+ subround2 reg29,reg30,reg26,reg27,reg28,reg31
+ subround2 reg28,reg29,reg30,reg26,reg27,reg31
+ subround2 reg27,reg28,reg29,reg30,reg26,reg31
+ subround2 reg26,reg27,reg28,reg29,reg30,reg31
+ subround2 reg30,reg26,reg27,reg28,reg29,reg31
+ subround2 reg29,reg30,reg26,reg27,reg28,reg31
+ subround2 reg28,reg29,reg30,reg26,reg27,reg31
+ subround2 reg27,reg28,reg29,reg30,reg26,reg31
+ subround2 reg26,reg27,reg28,reg29,reg30,reg31
+ subround2 reg30,reg26,reg27,reg28,reg29,reg31
+ subround2 reg29,reg30,reg26,reg27,reg28,reg31
+ subround2 reg28,reg29,reg30,reg26,reg27,reg31
+ subround2 reg27,reg28,reg29,reg30,reg26,reg31
+ subround2 reg26,reg27,reg28,reg29,reg30,reg31
+ subround2 reg30,reg26,reg27,reg28,reg29,reg31
+ subround2 reg29,reg30,reg26,reg27,reg28,reg31
+ subround2 reg28,reg29,reg30,reg26,reg27,reg31
+ subround2 reg27,reg28,reg29,reg30,reg26,reg31
+
+ subround3 reg26,reg27,reg28,reg29,reg30,reg31
+ subround3 reg30,reg26,reg27,reg28,reg29,reg31
+ subround3 reg29,reg30,reg26,reg27,reg28,reg31
+ subround3 reg28,reg29,reg30,reg26,reg27,reg31
+ subround3 reg27,reg28,reg29,reg30,reg26,reg31
+ subround3 reg26,reg27,reg28,reg29,reg30,reg31
+ subround3 reg30,reg26,reg27,reg28,reg29,reg31
+ subround3 reg29,reg30,reg26,reg27,reg28,reg31
+ subround3 reg28,reg29,reg30,reg26,reg27,reg31
+ subround3 reg27,reg28,reg29,reg30,reg26,reg31
+ subround3 reg26,reg27,reg28,reg29,reg30,reg31
+ subround3 reg30,reg26,reg27,reg28,reg29,reg31
+ subround3 reg29,reg30,reg26,reg27,reg28,reg31
+ subround3 reg28,reg29,reg30,reg26,reg27,reg31
+ subround3 reg27,reg28,reg29,reg30,reg26,reg31
+ subround3 reg26,reg27,reg28,reg29,reg30,reg31
+ subround3 reg30,reg26,reg27,reg28,reg29,reg31
+ subround3 reg29,reg30,reg26,reg27,reg28,reg31
+ subround3 reg28,reg29,reg30,reg26,reg27,reg31
+ subround3 reg27,reg28,reg29,reg30,reg26,reg31
+
+ subround4 reg26,reg27,reg28,reg29,reg30,reg31
+ subround4 reg30,reg26,reg27,reg28,reg29,reg31
+ subround4 reg29,reg30,reg26,reg27,reg28,reg31
+ subround4 reg28,reg29,reg30,reg26,reg27,reg31
+ subround4 reg27,reg28,reg29,reg30,reg26,reg31
+ subround4 reg26,reg27,reg28,reg29,reg30,reg31
+ subround4 reg30,reg26,reg27,reg28,reg29,reg31
+ subround4 reg29,reg30,reg26,reg27,reg28,reg31
+ subround4 reg28,reg29,reg30,reg26,reg27,reg31
+ subround4 reg27,reg28,reg29,reg30,reg26,reg31
+ subround4 reg26,reg27,reg28,reg29,reg30,reg31
+ subround4 reg30,reg26,reg27,reg28,reg29,reg31
+ subround4 reg29,reg30,reg26,reg27,reg28,reg31
+ subround4 reg28,reg29,reg30,reg26,reg27,reg31
+ subround4 reg27,reg28,reg29,reg30,reg26,reg31
+ subround4 reg26,reg27,reg28,reg29,reg30,reg31
+ subround4 reg30,reg26,reg27,reg28,reg29,reg31
+ subround4 reg29,reg30,reg26,reg27,reg28,reg31
+ subround4 reg28,reg29,reg30,reg26,reg27,reg31
+ subround4 reg27,reg28,reg29,reg30,reg26,reg31
+
+/* add each working variable to its own state word and store the result */
+ lwz reg5,PARAM_H (reg3)
+ lwz reg6,PARAM_H+4 (reg3)
+ lwz reg7,PARAM_H+8 (reg3)
+ lwz reg8,PARAM_H+12(reg3)
+ lwz reg9,PARAM_H+16(reg3)
+ add reg26,reg5,reg26
+ add reg27,reg6,reg27
+ add reg28,reg7,reg28
+ add reg29,reg8,reg29
+ add reg30,reg9,reg30
+ stw reg26,PARAM_H (reg3)
+ stw reg27,PARAM_H+4 (reg3)
+ stw reg28,PARAM_H+8 (reg3)
+ stw reg29,PARAM_H+12(reg3)
+ stw reg30,PARAM_H+16(reg3)
+
+/* finally, restore registers */
+ lmw reg26,-24(reg1)
+/* and return */
+ blr
+C_FUNCTION_END(sha1Process, .Lsha1Process_size)