From 3de9f02e926f066b8ccd1cd3c1b58285d9034646 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 1 Mar 1996 18:43:45 +0000 Subject: [PATCH] Updated from /src/gmp-1.937 --- sysdeps/alpha/addmul_1.s | 23 +--- sysdeps/alpha/alphaev5/add_n.s | 175 ++++++++++++++----------- sysdeps/alpha/alphaev5/lshift.s | 30 ++--- sysdeps/alpha/alphaev5/rshift.s | 28 ++-- sysdeps/alpha/alphaev5/sub_n.s | 148 +++++++++++++++++++++ sysdeps/alpha/lshift.s | 14 +- sysdeps/alpha/mul_1.s | 2 +- sysdeps/alpha/rshift.s | 16 +-- sysdeps/alpha/submul_1.s | 23 +--- sysdeps/alpha/udiv_qrnnd.S | 34 +++-- sysdeps/m68k/add_n.S | 75 +++++------ sysdeps/m68k/lshift.S | 150 ++++++++++++++++++++++ sysdeps/m68k/m68020/addmul_1.S | 75 +++++------ sysdeps/m68k/m68020/mul_1.S | 83 ++++++------ sysdeps/m68k/m68020/submul_1.S | 75 +++++------ sysdeps/m68k/rshift.S | 149 ++++++++++++++++++++++ sysdeps/m68k/sub_n.S | 75 +++++------ sysdeps/m88k/add_n.s | 2 +- sysdeps/m88k/m88110/add_n.S | 199 +++++++++++++++++++++++++++++ sysdeps/m88k/m88110/addmul_1.s | 60 +++++++++ sysdeps/m88k/m88110/mul_1.s | 28 +--- sysdeps/m88k/m88110/sub_n.S | 275 ++++++++++++++++++++++++++++++++++++++++ sysdeps/m88k/mul_1.s | 74 ++++++----- sysdeps/m88k/sub_n.s | 7 +- sysdeps/mips/addmul_1.s | 4 +- sysdeps/mips/mips3/addmul_1.s | 2 +- sysdeps/mips/mips3/mul_1.s | 2 +- sysdeps/mips/mips3/submul_1.s | 2 +- sysdeps/mips/mul_1.s | 4 +- sysdeps/mips/submul_1.s | 4 +- sysdeps/rs6000/add_n.s | 2 +- sysdeps/rs6000/sub_n.s | 2 +- sysdeps/vax/gmp-mparam.h | 2 +- sysdeps/z8000/mul_1.s | 2 +- 34 files changed, 1410 insertions(+), 436 deletions(-) create mode 100644 sysdeps/alpha/alphaev5/sub_n.s create mode 100644 sysdeps/m68k/lshift.S create mode 100644 sysdeps/m68k/rshift.S create mode 100644 sysdeps/m88k/m88110/add_n.S create mode 100644 sysdeps/m88k/m88110/addmul_1.s create mode 100644 sysdeps/m88k/m88110/sub_n.S diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s index 46d277d..8b168cb 100644 --- a/sysdeps/alpha/addmul_1.s +++ b/sysdeps/alpha/addmul_1.s @@ -26,16 +26,7 @@ # size r18 # s2_limb r19 - # This code runs at 42 cycles/limb on the 21064. - - # To improve performance for long multiplications, we would use - # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use - # these instructions without slowing down the general code: 1. We can - # only have two prefetches in operation at any time in the Alpha - # architecture. 2. There will seldom be any special alignment - # between RES_PTR and S1_PTR. Maybe we can simply divide the current - # loop into an inner and outer loop, having the inner loop handle - # exactly one prefetch block? + # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. .set noreorder .set noat @@ -52,7 +43,7 @@ __mpn_addmul_1: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr umulh $2,$19,$0 # $0 = prod_high - beq $18,Lend1 # jump if size was == 1 + beq $18,.Lend1 # jump if size was == 1 ldq $2,0($17) # $2 = s1_limb addq $17,8,$17 # s1_ptr++ subq $18,1,$18 # size-- @@ -60,10 +51,10 @@ __mpn_addmul_1: cmpult $3,$5,$4 stq $3,0($16) addq $16,8,$16 # res_ptr++ - beq $18,Lend2 # jump if size was == 2 + beq $18,.Lend2 # jump if size was == 2 .align 3 -Loop: mulq $2,$19,$3 # $3 = prod_low +.Loop: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' subq $18,1,$18 # size-- @@ -77,9 +68,9 @@ Loop: mulq $2,$19,$3 # $3 = prod_low stq $3,0($16) addq $16,8,$16 # res_ptr++ addq $5,$0,$0 # combine carries - bne $18,Loop + bne $18,.Loop -Lend2: mulq $2,$19,$3 # $3 = prod_low +.Lend2: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' umulh $2,$19,$4 # $4 = cy_limb @@ -91,7 +82,7 @@ Lend2: mulq $2,$19,$3 # $3 = prod_low addq $5,$0,$0 # combine carries addq $4,$0,$0 # cy_limb = prod_high + cy ret $31,($26),1 -Lend1: addq $5,$3,$3 +.Lend1: addq $5,$3,$3 cmpult $3,$5,$5 stq $3,0($16) addq $0,$5,$0 diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s index 2aaf041..66cf82b 100644 --- a/sysdeps/alpha/alphaev5/add_n.s +++ b/sysdeps/alpha/alphaev5/add_n.s @@ -35,84 +35,113 @@ __mpn_add_n: .frame $30,0,$26,0 - ldq $3,0($17) - ldq $4,0($18) - - subq $19,1,$19 - and $19,4-1,$2 # number of limbs in first loop - bis $31,$31,$0 - beq $2,.L0 # if multiple of 4 limbs, skip first loop - - subq $19,$2,$19 - -.Loop0: subq $2,1,$2 + or $31,$31,$25 # clear cy + subq $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldq $0,0($18) + ldq $1,8($18) + ldq $4,0($17) ldq $5,8($17) - addq $4,$0,$4 - ldq $6,8($18) - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 - - addq $17,8,$17 - addq $18,8,$18 - bis $5,$5,$3 - bis $6,$6,$4 - addq $16,8,$16 - bne $2,.Loop0 - -.L0: beq $19,.Lend - + addq $17,32,$17 # update s1_ptr + ldq $2,16($18) + addq $0,$4,$20 # 1st main add + ldq $3,24($18) + subq $19,4,$19 # decr loop cnt + ldq $6,-16($17) + cmpult $20,$0,$25 # compute cy from last add + ldq $7,-8($17) + addq $1,$25,$28 # cy add + addq $18,32,$18 # update s2_ptr + addq $5,$28,$21 # 2nd main add + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline .align 4 -.Loop: subq $19,4,$19 - unop - - ldq $6,8($18) - addq $4,$0,$0 +.Loop: cmpult $21,$28,$25 # compute cy from last add + ldq $0,0($18) + or $8,$25,$25 # combine cy from the two adds + ldq $1,8($18) + addq $2,$25,$28 # cy add + ldq $4,0($17) + addq $28,$6,$22 # 3rd main add ldq $5,8($17) - cmpult $0,$4,$1 - ldq $4,16($18) - addq $3,$0,$20 - cmpult $20,$3,$0 - ldq $3,16($17) - or $0,$1,$0 - addq $6,$0,$0 - cmpult $0,$6,$1 - ldq $6,24($18) - addq $5,$0,$21 - cmpult $21,$5,$0 - ldq $5,24($17) - or $0,$1,$0 - addq $4,$0,$0 - cmpult $0,$4,$1 - ldq $4,32($18) - addq $3,$0,$22 - cmpult $22,$3,$0 - ldq $3,32($17) - or $0,$1,$0 - addq $6,$0,$0 - cmpult $0,$6,$1 - addq $5,$0,$23 - cmpult $23,$5,$0 - or $0,$1,$0 - + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds stq $21,8($16) - stq $22,16($16) - stq $23,24($16) - - addq $17,32,$17 - addq $18,32,$18 - addq $16,32,$16 - bne $19,.Loop + addq $3,$25,$28 # cy add + addq $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + addq $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + addq $0,$25,$28 # cy add + ldq $2,16($18) + addq $4,$28,$20 # 1st main add + ldq $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldq $6,-16($17) + cmpult $20,$28,$25 # compute cy from last add + ldq $7,-8($17) + or $8,$25,$25 # combine cy from the two adds + subq $19,4,$19 # decr loop cnt + stq $22,-16($16) + addq $1,$25,$28 # cy add + stq $23,-8($16) + addq $5,$28,$21 # 2nd main add + addq $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $21,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $2,$25,$28 # cy add + addq $28,$6,$22 # 3rd main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + addq $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + stq $22,-16($16) + stq $23,-8($16) +.Lend2: addq $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldq $0,0($18) + ldq $4,0($17) + subq $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addq $0,$25,$28 # cy add + ldq $0,8($18) + addq $4,$28,$20 # main add + ldq $4,8($17) + addq $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addq $17,8,$17 + stq $20,0($16) + cmpult $20,$28,$25 # compute cy from last add + subq $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two adds + addq $16,8,$16 + bne $19,.Loop0 +.Lend0: addq $0,$25,$28 # cy add + addq $4,$28,$20 # main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $20,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds -.Lend: addq $4,$0,$4 - cmpult $4,$0,$1 - addq $3,$4,$4 - cmpult $4,$3,$0 - stq $4,0($16) - or $0,$1,$0 +.Lret: or $25,$31,$0 # return cy ret $31,($26),1 - .end __mpn_add_n diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s index fdb0895..392b424 100644 --- a/sysdeps/alpha/alphaev5/lshift.s +++ b/sysdeps/alpha/alphaev5/lshift.s @@ -25,7 +25,7 @@ # size r18 # cnt r19 - # This code runs at 4.25 cycles/limb on the EV5. + # This code runs at 3.25 cycles/limb on the EV5. .set noreorder .set noat @@ -44,11 +44,11 @@ __mpn_lshift: and $18,4-1,$28 # number of limbs in first loop srl $4,$20,$0 # compute function result - beq $28,L0 + beq $28,.L0 subq $18,$28,$18 .align 3 -Loop0: ldq $3,-16($17) +.Loop0: ldq $3,-16($17) subq $16,8,$16 sll $4,$19,$5 subq $17,8,$17 @@ -57,17 +57,17 @@ Loop0: ldq $3,-16($17) or $3,$3,$4 or $5,$6,$8 stq $8,0($16) - bne $28,Loop0 + bne $28,.Loop0 -L0: sll $4,$19,$24 - beq $18,Lend +.L0: sll $4,$19,$24 + beq $18,.Lend # warm up phase 1 ldq $1,-16($17) subq $18,4,$18 ldq $2,-24($17) ldq $3,-32($17) ldq $4,-40($17) - beq $18,Lcool1 + beq $18,.Lend1 # warm up phase 2 srl $1,$20,$7 sll $1,$19,$21 @@ -84,10 +84,10 @@ L0: sll $4,$19,$24 sll $4,$19,$24 ldq $4,-72($17) subq $18,4,$18 - beq $18,Lcool1 + beq $18,.Lend2 .align 4 # main loop -Loop: stq $7,-8($16) +.Loop: stq $7,-8($16) or $5,$22,$5 stq $8,-16($16) or $6,$23,$6 @@ -113,16 +113,14 @@ Loop: stq $7,-8($16) subq $16,32,$16 srl $4,$20,$6 - ldq $3,-96($17 + ldq $3,-96($17) sll $4,$19,$24 ldq $4,-104($17) subq $17,32,$17 - bne $18,Loop - unop - unop + bne $18,.Loop # cool down phase 2/1 -Lcool1: stq $7,-8($16) +.Lend2: stq $7,-8($16) or $5,$22,$5 stq $8,-16($16) or $6,$23,$6 @@ -150,7 +148,7 @@ Lcool1: stq $7,-8($16) ret $31,($26),1 # cool down phase 1/1 -Lcool1: srl $1,$20,$7 +.Lend1: srl $1,$20,$7 sll $1,$19,$21 srl $2,$20,$8 sll $2,$19,$22 @@ -170,6 +168,6 @@ Lcool1: srl $1,$20,$7 stq $24,-40($16) ret $31,($26),1 -Lend stq $24,-8($16) +.Lend: stq $24,-8($16) ret $31,($26),1 .end __mpn_lshift diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s index 1da9960..d20dde3 100644 --- a/sysdeps/alpha/alphaev5/rshift.s +++ b/sysdeps/alpha/alphaev5/rshift.s @@ -25,7 +25,7 @@ # size r18 # cnt r19 - # This code runs at 4.25 cycles/limb on the EV5. + # This code runs at 3.25 cycles/limb on the EV5. .set noreorder .set noat @@ -42,11 +42,11 @@ __mpn_rshift: and $18,4-1,$28 # number of limbs in first loop sll $4,$20,$0 # compute function result - beq $28,L0 + beq $28,.L0 subq $18,$28,$18 .align 3 -Loop0: ldq $3,8($17) +.Loop0: ldq $3,8($17) addq $16,8,$16 srl $4,$19,$5 addq $17,8,$17 @@ -55,17 +55,17 @@ Loop0: ldq $3,8($17) or $3,$3,$4 or $5,$6,$8 stq $8,-8($16) - bne $28,Loop0 + bne $28,.Loop0 -L0: srl $4,$19,$24 - beq $18,Lend +.L0: srl $4,$19,$24 + beq $18,.Lend # warm up phase 1 ldq $1,8($17) subq $18,4,$18 ldq $2,16($17) ldq $3,24($17) ldq $4,32($17) - beq $18,Lcool1 + beq $18,.Lend1 # warm up phase 2 sll $1,$20,$7 srl $1,$19,$21 @@ -82,10 +82,10 @@ L0: srl $4,$19,$24 srl $4,$19,$24 ldq $4,64($17) subq $18,4,$18 - beq $18,Lcool2 + beq $18,.Lend2 .align 4 # main loop -Loop: stq $7,0($16) +.Loop: stq $7,0($16) or $5,$22,$5 stq $8,8($16) or $6,$23,$6 @@ -116,11 +116,9 @@ Loop: stq $7,0($16) ldq $4,96($17) addq $17,32,$17 - bne $18,Loop - unop - unop + bne $18,.Loop # cool down phase 2/1 -Lcool2: stq $7,0($16) +.Lend2: stq $7,0($16) or $5,$22,$5 stq $8,8($16) or $6,$23,$6 @@ -148,7 +146,7 @@ Lcool2: stq $7,0($16) ret $31,($26),1 # cool down phase 1/1 -Lcool1: sll $1,$20,$7 +.Lend1: sll $1,$20,$7 srl $1,$19,$21 sll $2,$20,$8 srl $2,$19,$22 @@ -168,6 +166,6 @@ Lcool1: sll $1,$20,$7 stq $24,32($16) ret $31,($26),1 -Lend: stq $24,0($16) +.Lend: stq $24,0($16) ret $31,($26),1 .end __mpn_rshift diff --git a/sysdeps/alpha/alphaev5/sub_n.s b/sysdeps/alpha/alphaev5/sub_n.s new file mode 100644 index 0000000..c9f3a4e --- /dev/null +++ b/sysdeps/alpha/alphaev5/sub_n.s @@ -0,0 +1,148 @@ + # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subq $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldq $0,0($18) + ldq $1,8($18) + ldq $4,0($17) + ldq $5,8($17) + addq $17,32,$17 # update s1_ptr + ldq $2,16($18) + subq $4,$0,$20 # 1st main sub + ldq $3,24($18) + subq $19,4,$19 # decr loop cnt + ldq $6,-16($17) + cmpult $4,$20,$25 # compute cy from last sub + ldq $7,-8($17) + addq $1,$25,$28 # cy add + addq $18,32,$18 # update s2_ptr + subq $5,$28,$21 # 2nd main sub + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $5,$21,$25 # compute cy from last add + ldq $0,0($18) + or $8,$25,$25 # combine cy from the two adds + ldq $1,8($18) + addq $2,$25,$28 # cy add + ldq $4,0($17) + subq $6,$28,$22 # 3rd main sub + ldq $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + subq $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + addq $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + addq $0,$25,$28 # cy add + ldq $2,16($18) + subq $4,$28,$20 # 1st main sub + ldq $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldq $6,-16($17) + cmpult $4,$20,$25 # compute cy from last add + ldq $7,-8($17) + or $8,$25,$25 # combine cy from the two adds + subq $19,4,$19 # decr loop cnt + stq $22,-16($16) + addq $1,$25,$28 # cy add + stq $23,-8($16) + subq $5,$28,$21 # 2nd main sub + addq $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $5,$21,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $2,$25,$28 # cy add + subq $6,$28,$22 # 3rd main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + subq $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + stq $22,-16($16) + stq $23,-8($16) +.Lend2: addq $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldq $0,0($18) + ldq $4,0($17) + subq $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addq $0,$25,$28 # cy add + ldq $0,8($18) + subq $4,$28,$20 # main sub + ldq $1,8($17) + addq $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addq $17,8,$17 + stq $20,0($16) + cmpult $4,$20,$25 # compute cy from last add + subq $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two adds + addq $16,8,$16 + or $1,$31,$4 + bne $19,.Loop0 +.Lend0: addq $0,$25,$28 # cy add + subq $4,$28,$20 # main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $4,$20,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + +.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_sub_n diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s index c284349..aa8417b 100644 --- a/sysdeps/alpha/lshift.s +++ b/sysdeps/alpha/lshift.s @@ -53,11 +53,11 @@ __mpn_lshift: and $18,4-1,$20 # number of limbs in first loop srl $4,$7,$0 # compute function result - beq $20,L0 + beq $20,.L0 subq $18,$20,$18 .align 3 -Loop0: +.Loop0: ldq $3,-8($17) subq $16,8,$16 subq $17,8,$17 @@ -67,12 +67,12 @@ Loop0: bis $3,$3,$4 bis $5,$6,$8 stq $8,0($16) - bne $20,Loop0 + bne $20,.Loop0 -L0: beq $18,Lend +.L0: beq $18,.Lend .align 3 -Loop: ldq $3,-8($17) +.Loop: ldq $3,-8($17) subq $16,32,$16 subq $18,4,$18 sll $4,$19,$5 @@ -100,9 +100,9 @@ Loop: ldq $3,-8($17) bis $1,$2,$8 stq $8,0($16) - bgt $18,Loop + bgt $18,.Loop -Lend: sll $4,$19,$8 +.Lend: sll $4,$19,$8 stq $8,-8($16) ret $31,($26),1 .end __mpn_lshift diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s index 3ef194d..58a63df 100644 --- a/sysdeps/alpha/mul_1.s +++ b/sysdeps/alpha/mul_1.s @@ -1,7 +1,7 @@ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. # This file is part of the GNU MP Library. diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s index 74eab04..037b776 100644 --- a/sysdeps/alpha/rshift.s +++ b/sysdeps/alpha/rshift.s @@ -34,7 +34,7 @@ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. # 2. Only aligned instruction pairs can be paired. # 3. The store buffer or silo might not be able to deal with the bandwidth. - + .set noreorder .set noat .text @@ -51,11 +51,11 @@ __mpn_rshift: and $18,4-1,$20 # number of limbs in first loop sll $4,$7,$0 # compute function result - beq $20,L0 + beq $20,.L0 subq $18,$20,$18 .align 3 -Loop0: +.Loop0: ldq $3,0($17) addq $16,8,$16 addq $17,8,$17 @@ -65,12 +65,12 @@ Loop0: bis $3,$3,$4 bis $5,$6,$8 stq $8,-8($16) - bne $20,Loop0 + bne $20,.Loop0 -L0: beq $18,Lend +.L0: beq $18,.Lend .align 3 -Loop: ldq $3,0($17) +.Loop: ldq $3,0($17) addq $16,32,$16 subq $18,4,$18 srl $4,$19,$5 @@ -98,9 +98,9 @@ Loop: ldq $3,0($17) bis $1,$2,$8 stq $8,-8($16) - bgt $18,Loop + bgt $18,.Loop -Lend: srl $4,$19,$8 +.Lend: srl $4,$19,$8 stq $8,0($16) ret $31,($26),1 .end __mpn_rshift diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s index acaa11c..292b2c1 100644 --- a/sysdeps/alpha/submul_1.s +++ b/sysdeps/alpha/submul_1.s @@ -26,16 +26,7 @@ # size r18 # s2_limb r19 - # This code runs at 42 cycles/limb on the 21064. - - # To improve performance for long multiplications, we would use - # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use - # these instructions without slowing down the general code: 1. We can - # only have two prefetches in operation at any time in the Alpha - # architecture. 2. There will seldom be any special alignment - # between RES_PTR and S1_PTR. Maybe we can simply divide the current - # loop into an inner and outer loop, having the inner loop handle - # exactly one prefetch block? + # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. .set noreorder .set noat @@ -52,7 +43,7 @@ __mpn_submul_1: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr umulh $2,$19,$0 # $0 = prod_high - beq $18,Lend1 # jump if size was == 1 + beq $18,.Lend1 # jump if size was == 1 ldq $2,0($17) # $2 = s1_limb addq $17,8,$17 # s1_ptr++ subq $18,1,$18 # size-- @@ -60,10 +51,10 @@ __mpn_submul_1: cmpult $5,$3,$4 stq $3,0($16) addq $16,8,$16 # res_ptr++ - beq $18,Lend2 # jump if size was == 2 + beq $18,.Lend2 # jump if size was == 2 .align 3 -Loop: mulq $2,$19,$3 # $3 = prod_low +.Loop: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' subq $18,1,$18 # size-- @@ -77,9 +68,9 @@ Loop: mulq $2,$19,$3 # $3 = prod_low stq $3,0($16) addq $16,8,$16 # res_ptr++ addq $5,$0,$0 # combine carries - bne $18,Loop + bne $18,.Loop -Lend2: mulq $2,$19,$3 # $3 = prod_low +.Lend2: mulq $2,$19,$3 # $3 = prod_low ldq $5,0($16) # $5 = *res_ptr addq $4,$0,$0 # cy_limb = cy_limb + 'cy' umulh $2,$19,$4 # $4 = cy_limb @@ -91,7 +82,7 @@ Lend2: mulq $2,$19,$3 # $3 = prod_low addq $5,$0,$0 # combine carries addq $4,$0,$0 # cy_limb = prod_high + cy ret $31,($26),1 -Lend1: subq $5,$3,$3 +.Lend1: subq $5,$3,$3 cmpult $5,$3,$5 stq $3,0($16) addq $0,$5,$0 diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S index bafafd6..ce590ed 100644 --- a/sysdeps/alpha/udiv_qrnnd.S +++ b/sysdeps/alpha/udiv_qrnnd.S @@ -1,6 +1,6 @@ # Alpha 21064 __udiv_qrnnd - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. # This file is part of the GNU MP Library. @@ -21,13 +21,11 @@ .set noreorder .set noat - .text - .align 3 - .globl __udiv_qrnnd - .ent __udiv_qrnnd 0 + .align 3 + .globl __udiv_qrnnd + .ent __udiv_qrnnd __udiv_qrnnd: -__udiv_qrnnd..ng: .frame $30,0,$26,0 .prologue 0 #define cnt $2 @@ -39,9 +37,9 @@ __udiv_qrnnd..ng: #define qb $20 ldiq cnt,16 - blt d,Largedivisor + blt d,.Largedivisor -Loop1: cmplt n0,0,tmp +.Loop1: cmplt n0,0,tmp addq n1,n1,n1 bis n1,tmp,n1 addq n0,n0,n0 @@ -74,12 +72,12 @@ Loop1: cmplt n0,0,tmp cmovne qb,tmp,n1 bis n0,qb,n0 subq cnt,1,cnt - bgt cnt,Loop1 + bgt cnt,.Loop1 stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 -Largedivisor: +.Largedivisor: and n0,1,$4 srl n0,1,n0 @@ -91,7 +89,7 @@ Largedivisor: srl d,1,$5 addq $5,$6,$5 -Loop2: cmplt n0,0,tmp +.Loop2: cmplt n0,0,tmp addq n1,n1,n1 bis n1,tmp,n1 addq n0,n0,n0 @@ -124,27 +122,27 @@ Loop2: cmplt n0,0,tmp cmovne qb,tmp,n1 bis n0,qb,n0 subq cnt,1,cnt - bgt cnt,Loop2 + bgt cnt,.Loop2 addq n1,n1,n1 addq $4,n1,n1 - bne $6,Odd + bne $6,.LOdd stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 -Odd: +.LOdd: /* q' in n0. r' in n1 */ addq n1,n0,n1 cmpult n1,n0,tmp # tmp := carry from addq - beq tmp,LLp6 + beq tmp,.LLp6 addq n0,1,n0 subq n1,d,n1 -LLp6: cmpult n1,d,tmp - bne tmp,LLp7 +.LLp6: cmpult n1,d,tmp + bne tmp,.LLp7 addq n0,1,n0 subq n1,d,n1 -LLp7: +.LLp7: stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 diff --git a/sysdeps/m68k/add_n.S b/sysdeps/m68k/add_n.S index ea7a4458..754af9f 100644 --- a/sysdeps/m68k/add_n.S +++ b/sysdeps/m68k/add_n.S @@ -1,7 +1,7 @@ /* mc68020 __mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -27,50 +27,53 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ size (sp + 12) */ +#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL ___mpn_add_n + GLOBL C_SYMBOL_NAME(__mpn_add_n) -LAB(___mpn_add_n) +C_SYMBOL_NAME(__mpn_add_n:) +PROLOG(__mpn_add_n) /* Save used registers on the stack. */ - INSN2(move,l ,MEM_PREDEC(sp),d2) - INSN2(move,l ,MEM_PREDEC(sp),a2) + movel R(d2),MEM_PREDEC(sp) + movel R(a2),MEM_PREDEC(sp) /* Copy the arguments to registers. Better use movem? */ - INSN2(move,l ,a2,MEM_DISP(sp,12)) - INSN2(move,l ,a0,MEM_DISP(sp,16)) - INSN2(move,l ,a1,MEM_DISP(sp,20)) - INSN2(move,l ,d2,MEM_DISP(sp,24)) - - INSN2(eor,w ,d2,#1) - INSN2(lsr,l ,d2,#1) - bcc L1 - INSN2(subq,l ,d2,#1) /* clears cy as side effect */ - -LAB(Loop) - INSN2(move,l ,d0,MEM_POSTINC(a0)) - INSN2(move,l ,d1,MEM_POSTINC(a1)) - INSN2(addx,l ,d0,d1) - INSN2(move,l ,MEM_POSTINC(a2),d0) -LAB(L1) INSN2(move,l ,d0,MEM_POSTINC(a0)) - INSN2(move,l ,d1,MEM_POSTINC(a1)) - INSN2(addx,l ,d0,d1) - INSN2(move,l ,MEM_POSTINC(a2),d0) - - dbf d2,Loop /* loop until 16 lsb of %4 == -1 */ - INSN2(subx,l ,d0,d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ - INSN2(sub,l ,d2,#0x10000) - bcs L2 - INSN2(add,l ,d0,d0) /* restore cy */ - bra Loop - -LAB(L2) - INSN1(neg,l ,d0) + movel MEM_DISP(sp,12),R(a2) + movel MEM_DISP(sp,16),R(a0) + movel MEM_DISP(sp,20),R(a1) + movel MEM_DISP(sp,24),R(d2) + + eorw #1,R(d2) + lsrl #1,R(d2) + bcc L(L1) + subql #1,R(d2) /* clears cy as side effect */ + +L(Loop:) + movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + addxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) +L(L1:) movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + addxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) + + dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */ + subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ + subl #0x10000,R(d2) + bcs L(L2) + addl R(d0),R(d0) /* restore cy */ + bra L(Loop) + +L(L2:) + negl R(d0) /* Restore used registers from stack frame. */ - INSN2(move,l ,a2,MEM_POSTINC(sp)) - INSN2(move,l ,d2,MEM_POSTINC(sp)) + movel MEM_POSTINC(sp),R(a2) + movel MEM_POSTINC(sp),R(d2) rts +EPILOG(__mpn_add_n) diff --git a/sysdeps/m68k/lshift.S b/sysdeps/m68k/lshift.S new file mode 100644 index 0000000..c58594a --- /dev/null +++ b/sysdeps/m68k/lshift.S @@ -0,0 +1,150 @@ +/* mc68020 __mpn_lshift -- Shift left a low-level natural-number integer. + +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + s_size (sp + 16) + cnt (sp + 12) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr a1 +#define s_ptr a0 +#define s_size d6 +#define cnt d4 + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__mpn_lshift) + +C_SYMBOL_NAME(__mpn_lshift:) +PROLOG(__mpn_lshift) + +/* Save used registers on the stack. */ + moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. */ + movel MEM_DISP(sp,28),R(res_ptr) + movel MEM_DISP(sp,32),R(s_ptr) + movel MEM_DISP(sp,36),R(s_size) + movel MEM_DISP(sp,40),R(cnt) + + moveql #1,R(d5) + cmpl R(d5),R(cnt) + bne L(Lnormal) + cmpl R(s_ptr),R(res_ptr) + bls L(Lspecial) /* jump if s_ptr >= res_ptr */ +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(a2) +#else /* not mc68020 */ + movel R(s_size),R(d0) + asll #2,R(d0) + lea MEM_INDX(s_ptr,d0,l),R(a2) +#endif + cmpl R(res_ptr),R(a2) + bls L(Lspecial) /* jump if res_ptr >= s_ptr + s_size */ + +L(Lnormal:) + moveql #32,R(d5) + subl R(cnt),R(d5) + +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr) + lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr) +#else /* not mc68000 */ + movel R(s_size),R(d0) + asll #2,R(d0) + addl R(s_size),R(s_ptr) + addl R(s_size),R(res_ptr) +#endif + movel MEM_PREDEC(s_ptr),R(d2) + movel R(d2),R(d0) + lsrl R(d5),R(d0) /* compute carry limb */ + + lsll R(cnt),R(d2) + movel R(d2),R(d1) + subql #1,R(s_size) + beq L(Lend) + lsrl #1,R(s_size) + bcs L(L1) + subql #1,R(s_size) + +L(Loop:) + movel MEM_PREDEC(s_ptr),R(d2) + movel R(d2),R(d3) + lsrl R(d5),R(d3) + orl R(d3),R(d1) + movel R(d1),MEM_PREDEC(res_ptr) + lsll R(cnt),R(d2) +L(L1:) + movel MEM_PREDEC(s_ptr),R(d1) + movel R(d1),R(d3) + lsrl R(d5),R(d3) + orl R(d3),R(d2) + movel R(d2),MEM_PREDEC(res_ptr) + lsll R(cnt),R(d1) + + dbf R(s_size),L(Loop) + subl #0x10000,R(s_size) + bcc L(Loop) + +L(Lend:) + movel R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */ + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. */ + +L(Lspecial:) + clrl R(d0) /* initialize carry */ + eorw #1,R(s_size) + lsrl #1,R(s_size) + bcc L(LL1) + subql #1,R(s_size) + +L(LLoop:) + movel MEM_POSTINC(s_ptr),R(d2) + addxl R(d2),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) +L(LL1:) + movel MEM_POSTINC(s_ptr),R(d2) + addxl R(d2),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) + + dbf R(s_size),L(LLoop) + addxl R(d0),R(d0) /* save cy in lsb */ + subl #0x10000,R(s_size) + bcs L(LLend) + lsrl #1,R(d0) /* restore cy */ + bra L(LLoop) + +L(LLend:) +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts +EPILOG(__mpn_lshift) diff --git a/sysdeps/m68k/m68020/addmul_1.S b/sysdeps/m68k/m68020/addmul_1.S index 3f244c4..169f113 100644 --- a/sysdeps/m68k/m68020/addmul_1.S +++ b/sysdeps/m68k/m68020/addmul_1.S @@ -1,7 +1,7 @@ /* mc68020 __mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a second limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -23,58 +23,61 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ INPUT PARAMETERS res_ptr (sp + 4) s1_ptr (sp + 8) - size (sp + 12) + s1_size (sp + 12) s2_limb (sp + 16) */ +#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL ___mpn_addmul_1 + GLOBL C_SYMBOL_NAME(__mpn_addmul_1) -LAB(___mpn_addmul_1) +C_SYMBOL_NAME(__mpn_addmul_1:) +PROLOG(__mpn_addmul_1) #define res_ptr a0 #define s1_ptr a1 -#define size d2 +#define s1_size d2 #define s2_limb d4 /* Save used registers on the stack. */ - INSN2(movem,l ,MEM_PREDEC(sp),d2-d5) + moveml R(d2)-R(d5),MEM_PREDEC(sp) /* Copy the arguments to registers. Better use movem? */ - INSN2(move,l ,res_ptr,MEM_DISP(sp,20)) - INSN2(move,l ,s1_ptr,MEM_DISP(sp,24)) - INSN2(move,l ,size,MEM_DISP(sp,28)) - INSN2(move,l ,s2_limb,MEM_DISP(sp,32)) - - INSN2(eor,w ,size,#1) - INSN1(clr,l ,d1) - INSN1(clr,l ,d5) - INSN2(lsr,l ,size,#1) - bcc L1 - INSN2(subq,l ,size,#1) - INSN2(sub,l ,d0,d0) /* (d0,cy) <= (0,0) */ - -LAB(Loop) - INSN2(move,l ,d3,MEM_POSTINC(s1_ptr)) - INSN2(mulu,l ,d1:d3,s2_limb) - INSN2(addx,l ,d3,d0) - INSN2(addx,l ,d1,d5) - INSN2(add,l ,MEM_POSTINC(res_ptr),d3) -LAB(L1) INSN2(move,l ,d3,MEM_POSTINC(s1_ptr)) - INSN2(mulu,l ,d0:d3,s2_limb) - INSN2(addx,l ,d3,d1) - INSN2(addx,l ,d0,d5) - INSN2(add,l ,MEM_POSTINC(res_ptr),d3) - - dbf size,Loop - INSN2(addx,l ,d0,d5) - INSN2(sub,l ,size,#0x10000) - bcc Loop + movel MEM_DISP(sp,20),R(res_ptr) + movel MEM_DISP(sp,24),R(s1_ptr) + movel MEM_DISP(sp,28),R(s1_size) + movel MEM_DISP(sp,32),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + clrl R(d5) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + addxl R(d5),R(d1) + addl R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + addxl R(d5),R(d0) + addl R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + addxl R(d5),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) /* Restore used registers from stack frame. */ - INSN2(movem,l ,d2-d5,MEM_POSTINC(sp)) + moveml MEM_POSTINC(sp),R(d2)-R(d5) rts +EPILOG(__mpn_addmul_1) diff --git a/sysdeps/m68k/m68020/mul_1.S b/sysdeps/m68k/m68020/mul_1.S index 548ca00..4db1cca 100644 --- a/sysdeps/m68k/m68020/mul_1.S +++ b/sysdeps/m68k/m68020/mul_1.S @@ -1,7 +1,7 @@ /* mc68020 __mpn_mul_1 -- Multiply a limb vector with a limb and store the result in a second limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -23,65 +23,68 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ INPUT PARAMETERS res_ptr (sp + 4) s1_ptr (sp + 8) - size (sp + 12) + s1_size (sp + 12) s2_limb (sp + 16) */ +#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL ___mpn_mul_1 + GLOBL C_SYMBOL_NAME(__mpn_mul_1) -LAB(___mpn_mul_1) +C_SYMBOL_NAME(__mpn_mul_1:) +PROLOG(__mpn_mul_1) #define res_ptr a0 #define s1_ptr a1 -#define size d2 +#define s1_size d2 #define s2_limb d4 /* Save used registers on the stack. */ - INSN2(movem,l ,MEM_PREDEC(sp),d2-d4) + moveml R(d2)-R(d4),MEM_PREDEC(sp) #if 0 - INSN2(move,l ,MEM_PREDEC(sp),d2) - INSN2(move,l ,MEM_PREDEC(sp),d3) - INSN2(move,l ,MEM_PREDEC(sp),d4) + movel R(d2),MEM_PREDEC(sp) + movel R(d3),MEM_PREDEC(sp) + movel R(d4),MEM_PREDEC(sp) #endif /* Copy the arguments to registers. Better use movem? */ - INSN2(move,l ,res_ptr,MEM_DISP(sp,16)) - INSN2(move,l ,s1_ptr,MEM_DISP(sp,20)) - INSN2(move,l ,size,MEM_DISP(sp,24)) - INSN2(move,l ,s2_limb,MEM_DISP(sp,28)) - - INSN2(eor,w ,size,#1) - INSN1(clr,l ,d1) - INSN2(lsr,l ,size,#1) - bcc L1 - INSN2(subq,l ,size,#1) - INSN2(sub,l ,d0,d0) /* (d0,cy) <= (0,0) */ - -LAB(Loop) - INSN2(move,l ,d3,MEM_POSTINC(s1_ptr)) - INSN2(mulu,l ,d1:d3,s2_limb) - INSN2(addx,l ,d3,d0) - INSN2(move,l ,MEM_POSTINC(res_ptr),d3) -LAB(L1) INSN2(move,l ,d3,MEM_POSTINC(s1_ptr)) - INSN2(mulu,l ,d0:d3,s2_limb) - INSN2(addx,l ,d3,d1) - INSN2(move,l ,MEM_POSTINC(res_ptr),d3) - - dbf size,Loop - INSN1(clr,l ,d3) - INSN2(addx,l ,d0,d3) - INSN2(sub,l ,size,#0x10000) - bcc Loop + movel MEM_DISP(sp,16),R(res_ptr) + movel MEM_DISP(sp,20),R(s1_ptr) + movel MEM_DISP(sp,24),R(s1_size) + movel MEM_DISP(sp,28),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + movel R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + movel R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + clrl R(d3) + addxl R(d3),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) /* Restore used registers from stack frame. */ - INSN2(movem,l ,d2-d4,MEM_POSTINC(sp)) + moveml MEM_POSTINC(sp),R(d2)-R(d4) #if 0 - INSN2(move,l ,d4,MEM_POSTINC(sp)) - INSN2(move,l ,d3,MEM_POSTINC(sp)) - INSN2(move,l ,d2,MEM_POSTINC(sp)) + movel MEM_POSTINC(sp),R(d4) + movel MEM_POSTINC(sp),R(d3) + movel MEM_POSTINC(sp),R(d2) #endif rts +EPILOG(__mpn_mul_1) diff --git a/sysdeps/m68k/m68020/submul_1.S b/sysdeps/m68k/m68020/submul_1.S index ef7f39d..cf30029 100644 --- a/sysdeps/m68k/m68020/submul_1.S +++ b/sysdeps/m68k/m68020/submul_1.S @@ -1,7 +1,7 @@ /* mc68020 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract the result from a second limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -23,58 +23,61 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ INPUT PARAMETERS res_ptr (sp + 4) s1_ptr (sp + 8) - size (sp + 12) + s1_size (sp + 12) s2_limb (sp + 16) */ +#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL ___mpn_submul_1 + GLOBL C_SYMBOL_NAME(__mpn_submul_1) -LAB(___mpn_submul_1) +C_SYMBOL_NAME(__mpn_submul_1:) +PROLOG(__mpn_submul_1) #define res_ptr a0 #define s1_ptr a1 -#define size d2 +#define s1_size d2 #define s2_limb d4 /* Save used registers on the stack. */ - INSN2(movem,l ,MEM_PREDEC(sp),d2-d5) + moveml R(d2)-R(d5),MEM_PREDEC(sp) /* Copy the arguments to registers. Better use movem? */ - INSN2(move,l ,res_ptr,MEM_DISP(sp,20)) - INSN2(move,l ,s1_ptr,MEM_DISP(sp,24)) - INSN2(move,l ,size,MEM_DISP(sp,28)) - INSN2(move,l ,s2_limb,MEM_DISP(sp,32)) - - INSN2(eor,w ,size,#1) - INSN1(clr,l ,d1) - INSN1(clr,l ,d5) - INSN2(lsr,l ,size,#1) - bcc L1 - INSN2(subq,l ,size,#1) - INSN2(sub,l ,d0,d0) /* (d0,cy) <= (0,0) */ - -LAB(Loop) - INSN2(move,l ,d3,MEM_POSTINC(s1_ptr)) - INSN2(mulu,l ,d1:d3,s2_limb) - INSN2(addx,l ,d3,d0) - INSN2(addx,l ,d1,d5) - INSN2(sub,l ,MEM_POSTINC(res_ptr),d3) -LAB(L1) INSN2(move,l ,d3,MEM_POSTINC(s1_ptr)) - INSN2(mulu,l ,d0:d3,s2_limb) - INSN2(addx,l ,d3,d1) - INSN2(addx,l ,d0,d5) - INSN2(sub,l ,MEM_POSTINC(res_ptr),d3) - - dbf size,Loop - INSN2(addx,l ,d0,d5) - INSN2(sub,l ,size,#0x10000) - bcc Loop + movel MEM_DISP(sp,20),R(res_ptr) + movel MEM_DISP(sp,24),R(s1_ptr) + movel MEM_DISP(sp,28),R(s1_size) + movel MEM_DISP(sp,32),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + clrl R(d5) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + addxl R(d5),R(d1) + subl R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + addxl R(d5),R(d0) + subl R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + addxl R(d5),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) /* Restore used registers from stack frame. */ - INSN2(movem,l ,d2-d5,MEM_POSTINC(sp)) + moveml MEM_POSTINC(sp),R(d2)-R(d5) rts +EPILOG(__mpn_submul_1) diff --git a/sysdeps/m68k/rshift.S b/sysdeps/m68k/rshift.S new file mode 100644 index 0000000..494dfcb --- /dev/null +++ b/sysdeps/m68k/rshift.S @@ -0,0 +1,149 @@ +/* mc68020 __mpn_rshift -- Shift right a low-level natural-number integer. + +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + s_size (sp + 16) + cnt (sp + 12) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr a1 +#define s_ptr a0 +#define s_size d6 +#define cnt d4 + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__mpn_rshift) + +C_SYMBOL_NAME(__mpn_rshift:) +PROLOG(__mpn_rshift) +/* Save used registers on the stack. */ + moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. */ + movel MEM_DISP(sp,28),R(res_ptr) + movel MEM_DISP(sp,32),R(s_ptr) + movel MEM_DISP(sp,36),R(s_size) + movel MEM_DISP(sp,40),R(cnt) + + moveql #1,R(d5) + cmpl R(d5),R(cnt) + bne L(Lnormal) + cmpl R(res_ptr),R(s_ptr) + bls L(Lspecial) /* jump if res_ptr >= s_ptr */ +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(res_ptr,s_size,l,4),R(a2) +#else /* not mc68020 */ + movel R(s_size),R(d0) + asll #2,R(d0) + lea MEM_INDX(res_ptr,d0,l),R(a2) +#endif + cmpl R(s_ptr),R(a2) + bls L(Lspecial) /* jump if s_ptr >= res_ptr + s_size */ + +L(Lnormal:) + moveql #32,R(d5) + subl R(cnt),R(d5) + movel MEM_POSTINC(s_ptr),R(d2) + movel R(d2),R(d0) + lsll R(d5),R(d0) /* compute carry limb */ + + lsrl R(cnt),R(d2) + movel R(d2),R(d1) + subql #1,R(s_size) + beq L(Lend) + lsrl #1,R(s_size) + bcs L(L1) + subql #1,R(s_size) + +L(Loop:) + movel MEM_POSTINC(s_ptr),R(d2) + movel R(d2),R(d3) + lsll R(d5),R(d3) + orl R(d3),R(d1) + movel R(d1),MEM_POSTINC(res_ptr) + lsrl R(cnt),R(d2) +L(L1:) + movel MEM_POSTINC(s_ptr),R(d1) + movel R(d1),R(d3) + lsll R(d5),R(d3) + orl R(d3),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) + lsrl R(cnt),R(d1) + + dbf R(s_size),L(Loop) + subl #0x10000,R(s_size) + bcc L(Loop) + +L(Lend:) + movel R(d1),MEM(res_ptr) /* store most significant limb */ + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts + +/* We loop from most significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. */ + +L(Lspecial:) +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr) + lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr) +#else /* not mc68000 */ + movel R(s_size),R(d0) + asll #2,R(d0) + addl R(s_size),R(s_ptr) + addl R(s_size),R(res_ptr) +#endif + + clrl R(d0) /* initialize carry */ + eorw #1,R(s_size) + lsrl #1,R(s_size) + bcc L(LL1) + subql #1,R(s_size) + +L(LLoop:) + movel MEM_PREDEC(s_ptr),R(d2) + roxrl #1,R(d2) + movel R(d2),MEM_PREDEC(res_ptr) +L(LL1:) + movel MEM_PREDEC(s_ptr),R(d2) + roxrl #1,R(d2) + movel R(d2),MEM_PREDEC(res_ptr) + + dbf R(s_size),L(LLoop) + roxrl #1,R(d0) /* save cy in msb */ + subl #0x10000,R(s_size) + bcs L(LLend) + addl R(d0),R(d0) /* restore cy */ + bra L(LLoop) + +L(LLend:) +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts +EPILOG(__mpn_rshift) diff --git a/sysdeps/m68k/sub_n.S b/sysdeps/m68k/sub_n.S index 19f0ec1..39f5161 100644 --- a/sysdeps/m68k/sub_n.S +++ b/sysdeps/m68k/sub_n.S @@ -1,7 +1,7 @@ /* mc68020 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store difference in a third limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -27,50 +27,53 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ size (sp + 12) */ +#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL ___mpn_sub_n + GLOBL C_SYMBOL_NAME(__mpn_sub_n) -LAB(___mpn_sub_n) +C_SYMBOL_NAME(__mpn_sub_n:) +PROLOG(__mpn_sub_n) /* Save used registers on the stack. */ - INSN2(move,l ,MEM_PREDEC(sp),d2) - INSN2(move,l ,MEM_PREDEC(sp),a2) + movel R(d2),MEM_PREDEC(sp) + movel R(a2),MEM_PREDEC(sp) /* Copy the arguments to registers. Better use movem? */ - INSN2(move,l ,a2,MEM_DISP(sp,12)) - INSN2(move,l ,a0,MEM_DISP(sp,16)) - INSN2(move,l ,a1,MEM_DISP(sp,20)) - INSN2(move,l ,d2,MEM_DISP(sp,24)) - - INSN2(eor,w ,d2,#1) - INSN2(lsr,l ,d2,#1) - bcc L1 - INSN2(subq,l ,d2,#1) /* clears cy as side effect */ - -LAB(Loop) - INSN2(move,l ,d0,MEM_POSTINC(a0)) - INSN2(move,l ,d1,MEM_POSTINC(a1)) - INSN2(subx,l ,d0,d1) - INSN2(move,l ,MEM_POSTINC(a2),d0) -LAB(L1) INSN2(move,l ,d0,MEM_POSTINC(a0)) - INSN2(move,l ,d1,MEM_POSTINC(a1)) - INSN2(subx,l ,d0,d1) - INSN2(move,l ,MEM_POSTINC(a2),d0) - - dbf d2,Loop /* loop until 16 lsb of %4 == -1 */ - INSN2(subx,l ,d0,d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ - INSN2(sub,l ,d2,#0x10000) - bcs L2 - INSN2(add,l ,d0,d0) /* restore cy */ - bra Loop - -LAB(L2) - INSN1(neg,l ,d0) + movel MEM_DISP(sp,12),R(a2) + movel MEM_DISP(sp,16),R(a0) + movel MEM_DISP(sp,20),R(a1) + movel MEM_DISP(sp,24),R(d2) + + eorw #1,R(d2) + lsrl #1,R(d2) + bcc L(L1) + subql #1,R(d2) /* clears cy as side effect */ + +L(Loop:) + movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + subxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) +L(L1:) movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + subxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) + + dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */ + subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ + subl #0x10000,R(d2) + bcs L(L2) + addl R(d0),R(d0) /* restore cy */ + bra L(Loop) + +L(L2:) + negl R(d0) /* Restore used registers from stack frame. */ - INSN2(move,l ,a2,MEM_POSTINC(sp)) - INSN2(move,l ,d2,MEM_POSTINC(sp)) + movel MEM_POSTINC(sp),R(a2) + movel MEM_POSTINC(sp),R(d2) rts +EPILOG(__mpn_sub_n) diff --git a/sysdeps/m88k/add_n.s b/sysdeps/m88k/add_n.s index 7e4cccc..d564479 100644 --- a/sysdeps/m88k/add_n.s +++ b/sysdeps/m88k/add_n.s @@ -1,7 +1,7 @@ ; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. diff --git a/sysdeps/m88k/m88110/add_n.S b/sysdeps/m88k/m88110/add_n.S new file mode 100644 index 0000000..ab20630 --- /dev/null +++ b/sysdeps/m88k/m88110/add_n.S @@ -0,0 +1,199 @@ +; mc88110 __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1995, 1996 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__mpn_add_n) +C_SYMBOL_NAME(__mpn_add_n): + addu.co r0,r0,r0 ; clear cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned? +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + addu.cio r6,r10,r8 + ld r10,s1_ptr,16 + addu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,24 + addu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + addu.cio r6,r10,r8 + ld r10,s1_ptr,32 + addu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: addu.cio r6,r10,r8 + addu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + addu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + or r12,r0,s2_ptr + or s2_ptr,r0,s1_ptr + or s1_ptr,r0,r12 + br L0 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. */ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + addu.cio r8,r8,r6 + st r8,res_ptr,8 + addu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + addu.cio r8,r8,r6 + st r8,res_ptr,16 + addu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + addu.cio r8,r8,r6 + st r8,res_ptr,24 + addu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + addu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/sysdeps/m88k/m88110/addmul_1.s b/sysdeps/m88k/m88110/addmul_1.s new file mode 100644 index 0000000..1a4dfa1 --- /dev/null +++ b/sysdeps/m88k/m88110/addmul_1.s @@ -0,0 +1,60 @@ +; mc88110 __mpn_addmul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1996 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___mpn_addmul_1 +___mpn_addmul_1: + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + subu r8,r8,4 + bcnd.n eq0,r4,Lend + mulu.d r10,r6,r5 + +Loop: ld r7,r8[r4] + ld r6,r3[r4] + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd ne0,r4,Loop + +Lend: ld r7,r8,0 + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8,0 + jmp.n r1 + addu.ci r2,r2,r0 diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s index 08c3ca0..b1352ce 100644 --- a/sysdeps/m88k/m88110/mul_1.s +++ b/sysdeps/m88k/m88110/mul_1.s @@ -1,7 +1,7 @@ ; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and ; store the product in a second limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. @@ -56,29 +56,3 @@ Lend: addu.cio r9,r11,r2 st r9,r8,4 jmp.n r1 addu.ci r2,r10,r0 - -; This is the Right Way to do this on '110. 4 cycles / 64-bit limb. -; ld.d r10, -; mulu.d -; addu.cio -; addu.cio -; st.d -; mulu.d ,r11,r5 -; ld.d r12, -; mulu.d ,r10,r5 -; addu.cio -; addu.cio -; st.d -; mulu.d -; ld.d r10, -; mulu.d -; addu.cio -; addu.cio -; st.d -; mulu.d -; ld.d r10, -; mulu.d -; addu.cio -; addu.cio -; st.d -; mulu.d diff --git a/sysdeps/m88k/m88110/sub_n.S b/sysdeps/m88k/m88110/sub_n.S new file mode 100644 index 0000000..74ee0ae --- /dev/null +++ b/sysdeps/m88k/m88110/sub_n.S @@ -0,0 +1,275 @@ +; mc88110 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1995, 1996 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__mpn_sub_n) +C_SYMBOL_NAME(__mpn_sub_n): + subu.co r0,r0,r0 ; set cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r10,r8 + ld r10,s1_ptr,16 + subu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,24 + subu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r10,r8 + ld r10,s1_ptr,32 + subu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: subu.cio r6,r10,r8 + subu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + subu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s1_ptr */ + ld r10,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + ld r8,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + subu size,size,1 + subu.co r6,r8,r10 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1b: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s2_ptr,0 + ld r12,s2_ptr,4 + ld.d r8,s1_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1b +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1b: subu size,size,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r8,r10 + ld r10,s2_ptr,16 + subu.cio r7,r9,r12 + ld r12,s2_ptr,20 + ld.d r8,s1_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,24 + subu.cio r7,r9,r12 + ld r12,s2_ptr,28 + ld.d r8,s1_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r8,r10 + ld r10,s2_ptr,32 + subu.cio r7,r9,r12 + ld r12,s2_ptr,36 + addu s2_ptr,s2_ptr,32 + ld.d r8,s1_ptr,32 + addu s1_ptr,s1_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1b + +Lfin1b: addu size,size,8-2 + bcnd lt0,size,Lend1b +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1b:subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1b +Lend1b: subu.cio r6,r8,r10 + subu.cio r7,r9,r12 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1b +/* Add last limb */ + ld r10,s2_ptr,8 + ld r8,s1_ptr,8 + subu.cio r6,r8,r10 + st r6,res_ptr,8 + +Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. */ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + subu.cio r8,r8,r6 + st r8,res_ptr,8 + subu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + subu.cio r8,r8,r6 + st r8,res_ptr,16 + subu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + subu.cio r8,r8,r6 + st r8,res_ptr,24 + subu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + subu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/sysdeps/m88k/mul_1.s b/sysdeps/m88k/mul_1.s index 35c238d..6b8492c 100644 --- a/sysdeps/m88k/mul_1.s +++ b/sysdeps/m88k/mul_1.s @@ -1,7 +1,7 @@ ; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and ; store the product in a second limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. @@ -55,14 +55,14 @@ ___mpn_mul_1: ; Make S1_PTR and RES_PTR point at the end of their blocks ; and negate SIZE. lda r3,r3[r4] - lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval + lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval subu r4,r0,r4 - addu.co r2,r0,r0 ; r2 = cy = 0 + addu.co r2,r0,r0 ; r2 = cy = 0 ld r9,r3[r4] - mask r7,r5,0xffff ; r7 = lo(S2_LIMB) - extu r8,r5,16 ; r8 = hi(S2_LIMB) - bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0) + mask r7,r5,0xffff ; r7 = lo(S2_LIMB) + extu r8,r5,16 ; r8 = hi(S2_LIMB) + bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0) subu r6,r6,4 ; General code for any value of S2_LIMB. @@ -75,28 +75,27 @@ ___mpn_mul_1: br.n L1 addu r4,r4,1 -Loop: - ld r9,r3[r4] +Loop: ld r9,r3[r4] st r26,r6[r4] -; bcnd ne0,r0,0 ; bubble +; bcnd ne0,r0,0 ; bubble addu r4,r4,1 -L1: mul r26,r9,r5 ; low word of product mul_1 WB ld - mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1 - mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1 - mul r10,r12,r8 ; r10 = prod_1a mul_3 - extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1 - mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1 - mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2 - extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3 - addu r10,r10,r11 ; addu_1 WB extu_2 -; bcnd ne0,r0,0 ; bubble WB addu_1 - addu.co r10,r10,r12 ; WB mul_4 - mask.u r10,r10,0xffff ; move the 16 most significant bits... - addu.ci r10,r10,r0 ; ...to the low half of the word... - rot r10,r10,16 ; ...and put carry in pos 16. - addu.co r26,r26,r2 ; add old carry limb +L1: mul r26,r9,r5 ; low word of product mul_1 WB ld + mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1 + mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1 + mul r10,r12,r8 ; r10 = prod_1a mul_3 + extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1 + mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1 + mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2 + extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3 + addu r10,r10,r11 ; addu_1 WB extu_2 +; bcnd ne0,r0,0 ; bubble WB addu_1 + addu.co r10,r10,r12 ; WB mul_4 + mask.u r10,r10,0xffff ; move the 16 most significant bits... + addu.ci r10,r10,r0 ; ...to the low half of the word... + rot r10,r10,16 ; ...and put carry in pos 16. + addu.co r26,r26,r2 ; add old carry limb bcnd.n ne0,r4,Loop - addu.ci r2,r25,r10 ; compute new carry limb + addu.ci r2,r25,r10 ; compute new carry limb st r26,r6[r4] ld.d r25,r31,8 @@ -109,20 +108,19 @@ Lsmall: br.n SL1 addu r4,r4,1 -SLoop: - ld r9,r3[r4] ; - st r8,r6[r4] ; - addu r4,r4,1 ; -SL1: mul r8,r9,r5 ; low word of product - mask r12,r9,0xffff ; r12 = lo(s1_limb) - extu r13,r9,16 ; r13 = hi(s1_limb) - mul r11,r12,r7 ; r11 = prod_0 - mul r12,r13,r7 ; r12 = prod_1b - addu.cio r8,r8,r2 ; add old carry limb - extu r10,r11,16 ; r11 = hi(prod_0) - addu r10,r10,r12 ; +SLoop: ld r9,r3[r4] ; + st r8,r6[r4] ; + addu r4,r4,1 ; +SL1: mul r8,r9,r5 ; low word of product + mask r12,r9,0xffff ; r12 = lo(s1_limb) + extu r13,r9,16 ; r13 = hi(s1_limb) + mul r11,r12,r7 ; r11 = prod_0 + mul r12,r13,r7 ; r12 = prod_1b + addu.cio r8,r8,r2 ; add old carry limb + extu r10,r11,16 ; r11 = hi(prod_0) + addu r10,r10,r12 ; bcnd.n ne0,r4,SLoop - extu r2,r10,16 ; r2 = new carry limb + extu r2,r10,16 ; r2 = new carry limb jmp.n r1 st r8,r6[r4] diff --git a/sysdeps/m88k/sub_n.s b/sysdeps/m88k/sub_n.s index 3963cd5..cd0b791 100644 --- a/sysdeps/m88k/sub_n.s +++ b/sysdeps/m88k/sub_n.s @@ -1,7 +1,7 @@ ; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. @@ -41,9 +41,10 @@ ___mpn_sub_n: extu r10,r5,3 ld r7,r4,0 ; read first limb from s2_ptr - subu.co r5,r0,r5 ; (clear carry as side effect) + subu r5,r0,r5 mak r5,r5,3<4> - bcnd eq0,r5,Lzero + bcnd.n eq0,r5,Lzero + subu.co r0,r0,r0 ; initialize carry or r12,r0,lo16(Lbase) or.u r12,r12,hi16(Lbase) diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s index abc2fb8..917af1b 100644 --- a/sysdeps/mips/addmul_1.s +++ b/sysdeps/mips/addmul_1.s @@ -1,7 +1,7 @@ # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and # add the product to a second limb vector. - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. # This file is part of the GNU MP Library. @@ -63,7 +63,7 @@ Loop: lw $10,0($4) addu $2,$2,$10 sw $3,0($4) addiu $4,$4,4 - bne $6,$0,Loop # should be "bnel" + bne $6,$0,Loop addu $2,$9,$2 # add high product limb and carry from addition # cool down phase 1 diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s index 7af0172..7dbc9ad 100644 --- a/sysdeps/mips/mips3/addmul_1.s +++ b/sysdeps/mips/mips3/addmul_1.s @@ -63,7 +63,7 @@ Loop: ld $10,0($4) daddu $2,$2,$10 sd $3,0($4) daddiu $4,$4,8 - bne $6,$0,Loop # should be "bnel" + bne $6,$0,Loop daddu $2,$9,$2 # add high product limb and carry from addition # cool down phase 1 diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s index 87954e5..8376a02 100644 --- a/sysdeps/mips/mips3/mul_1.s +++ b/sysdeps/mips/mips3/mul_1.s @@ -59,7 +59,7 @@ Loop: mflo $10 sltu $2,$10,$2 # carry from previous addition -> $2 sd $10,0($4) daddiu $4,$4,8 - bne $6,$0,Loop # should be "bnel" + bne $6,$0,Loop daddu $2,$9,$2 # add high product limb and carry from addition # cool down phase 1 diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s index f28c6a5..f041f6c 100644 --- a/sysdeps/mips/mips3/submul_1.s +++ b/sysdeps/mips/mips3/submul_1.s @@ -63,7 +63,7 @@ Loop: ld $10,0($4) daddu $2,$2,$10 sd $3,0($4) daddiu $4,$4,8 - bne $6,$0,Loop # should be "bnel" + bne $6,$0,Loop daddu $2,$9,$2 # add high product limb and carry from addition # cool down phase 1 diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s index 01327e2..6f5324c 100644 --- a/sysdeps/mips/mul_1.s +++ b/sysdeps/mips/mul_1.s @@ -1,7 +1,7 @@ # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and # store the product in a second limb vector. - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. # This file is part of the GNU MP Library. @@ -59,7 +59,7 @@ Loop: mflo $10 sltu $2,$10,$2 # carry from previous addition -> $2 sw $10,0($4) addiu $4,$4,4 - bne $6,$0,Loop # should be "bnel" + bne $6,$0,Loop addu $2,$9,$2 # add high product limb and carry from addition # cool down phase 1 diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s index 616dd1b..a78072a 100644 --- a/sysdeps/mips/submul_1.s +++ b/sysdeps/mips/submul_1.s @@ -1,7 +1,7 @@ # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and # subtract the product from a second limb vector. - # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. # This file is part of the GNU MP Library. @@ -63,7 +63,7 @@ Loop: lw $10,0($4) addu $2,$2,$10 sw $3,0($4) addiu $4,$4,4 - bne $6,$0,Loop # should be "bnel" + bne $6,$0,Loop addu $2,$9,$2 # add high product limb and carry from addition # cool down phase 1 diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s index 7090cf1..e2536d5 100644 --- a/sysdeps/rs6000/add_n.s +++ b/sysdeps/rs6000/add_n.s @@ -1,6 +1,6 @@ # IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. # This file is part of the GNU MP Library. diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s index 40fe7d6..c57675b 100644 --- a/sysdeps/rs6000/sub_n.s +++ b/sysdeps/rs6000/sub_n.s @@ -1,7 +1,7 @@ # IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and # store difference in a third limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. # This file is part of the GNU MP Library. diff --git a/sysdeps/vax/gmp-mparam.h b/sysdeps/vax/gmp-mparam.h index 687f12a..ddc308a 100644 --- a/sysdeps/vax/gmp-mparam.h +++ b/sysdeps/vax/gmp-mparam.h @@ -1,6 +1,6 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s index 2075225..0150e85 100644 --- a/sysdeps/z8000/mul_1.s +++ b/sysdeps/z8000/mul_1.s @@ -1,7 +1,7 @@ ! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store ! the result in a second limb vector. -! Copyright (C) 1993, 1994 Free Software Foundation, Inc. +! Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. -- 2.7.4