Updated from ../=mpn/gmp-1.906.7

author Roland McGrath <roland@gnu.org>

Mon, 16 Oct 1995 01:18:40 +0000 (01:18 +0000)

committer Roland McGrath <roland@gnu.org>

Mon, 16 Oct 1995 01:18:40 +0000 (01:18 +0000)
author Roland McGrath <roland@gnu.org>
Mon, 16 Oct 1995 01:18:40 +0000 (01:18 +0000)
committer Roland McGrath <roland@gnu.org>
Mon, 16 Oct 1995 01:18:40 +0000 (01:18 +0000)
diff --git a/sysdeps/alpha/add_n.s b/sysdeps/alpha/add_n.s

new file mode 100644 (file)

index 0000000..e1ad460
--- /dev/null
+++ b/sysdeps/alpha/add_n.s
@@ -0,0 +1,119 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:                   # res = s1 + s2 over size limbs; the carry out is left in $0
+       .frame  $30,0,$26,0
+
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # size - 1: the final limb is always added at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # $0 = running carry, starts at 0
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # $19 = limbs left for the 4-way unrolled loop
+
+.Loop0:        subq    $2,1,$2         # one limb per iteration
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + carry in
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = carry out of (s2 limb + carry in)
+       addq    $3,$4,$4        # $4 = s1 limb + s2 limb + carry in
+       cmpult  $4,$3,$0        # $0 = carry out of that addition
+       stq     $4,0($16)       # store the sum limb
+       or      $0,$1,$0        # merge the two partial carries
+
+       addq    $17,8,$17       # s1_ptr++
+       addq    $18,8,$18       # s2_ptr++
+       bis     $5,$5,$3        # $3 = preloaded s1 limb
+       bis     $6,$6,$4        # $4 = preloaded s2 limb
+       addq    $16,8,$16       # res_ptr++
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # no group of 4 left: only the final limb remains
+
+       .align  3
+.Loop: subq    $19,4,$19       # four limbs per iteration
+
+       ldq     $5,8($17)       # limb 0 uses $3/$4; preload limb 1 into $5/$6
+       addq    $4,$0,$4
+       ldq     $6,8($18)
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)       # store sum limb 0
+       or      $0,$1,$0        # carry into limb 1
+
+       ldq     $3,16($17)      # limb 1 uses $5/$6; preload limb 2 into $3/$4
+       addq    $6,$0,$6
+       ldq     $4,16($18)
+       cmpult  $6,$0,$1
+       addq    $5,$6,$6
+       cmpult  $6,$5,$0
+       stq     $6,8($16)       # store sum limb 1
+       or      $0,$1,$0        # carry into limb 2
+
+       ldq     $5,24($17)      # limb 2 uses $3/$4; preload limb 3 into $5/$6
+       addq    $4,$0,$4
+       ldq     $6,24($18)
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,16($16)      # store sum limb 2
+       or      $0,$1,$0        # carry into limb 3
+
+       ldq     $3,32($17)      # limb 3 uses $5/$6; preload the limb that follows this group
+       addq    $6,$0,$6
+       ldq     $4,32($18)
+       cmpult  $6,$0,$1
+       addq    $5,$6,$6
+       cmpult  $6,$5,$0
+       stq     $6,24($16)      # store sum limb 3
+       or      $0,$1,$0        # carry into the next group
+
+       addq    $17,32,$17      # advance all three pointers by four limbs
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # final limb: operands are already in $3/$4
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)
+       or      $0,$1,$0        # $0 = carry out of the whole addition
+       ret     $31,($26),1
+
+       .end    __mpn_add_n
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s

new file mode 100644 (file)

index 0000000..46d277d
--- /dev/null
+++ b/sysdeps/alpha/addmul_1.s
@@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_addmul_1
+       .ent    __mpn_addmul_1 2
+__mpn_addmul_1:                        # res[] += s1[] * s2_limb over size limbs; final carry limb is left in $0
+       .frame  $30,0,$26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,Lend1       # jump if size was == 1
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       addq    $5,$3,$3        # $3 = *res_ptr + prod_low
+       cmpult  $3,$5,$4        # $4 = carry from that addition
+       stq     $3,0($16)       # store first result limb
+       addq    $16,8,$16       # res_ptr++
+       beq     $18,Lend2       # jump if size was == 2
+
+       .align  3
+Loop:  mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $5,$3,$3        # add in *res_ptr
+       cmpult  $3,$5,$5        # $5 = carry from adding *res_ptr
+       stq     $3,0($16)       # store result limb
+       addq    $16,8,$16       # res_ptr++
+       addq    $5,$0,$0        # combine carries
+       bne     $18,Loop
+
+Lend2: mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $5,$3,$3        # add in *res_ptr
+       cmpult  $3,$5,$5        # $5 = carry from adding *res_ptr
+       stq     $3,0($16)       # store last result limb
+       addq    $5,$0,$0        # combine carries
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1
+Lend1: addq    $5,$3,$3        # size == 1: $3 = *res_ptr + prod_low
+       cmpult  $3,$5,$5        # $5 = carry from that addition
+       stq     $3,0($16)       # store the only result limb
+       addq    $0,$5,$0        # $0 = prod_high + carry
+       ret     $31,($26),1
+
+       .end    __mpn_addmul_1
diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s

new file mode 100644 (file)

index 0000000..2aaf041
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/add_n.s
@@ -0,0 +1,118 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:                   # res = s1 + s2 over size limbs; the carry out is left in $0
+       .frame  $30,0,$26,0
+
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # size - 1: the final limb is always added at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # $0 = running carry, starts at 0
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # $19 = limbs left for the 4-way unrolled loop
+
+.Loop0:        subq    $2,1,$2         # one limb per iteration
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + carry in
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = carry out of (s2 limb + carry in)
+       addq    $3,$4,$4        # $4 = s1 limb + s2 limb + carry in
+       cmpult  $4,$3,$0        # $0 = carry out of that addition
+       stq     $4,0($16)       # store the sum limb
+       or      $0,$1,$0        # merge the two partial carries
+
+       addq    $17,8,$17       # s1_ptr++
+       addq    $18,8,$18       # s2_ptr++
+       bis     $5,$5,$3        # $3 = preloaded s1 limb
+       bis     $6,$6,$4        # $4 = preloaded s2 limb
+       addq    $16,8,$16       # res_ptr++
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # no group of 4 left: only the final limb remains
+
+       .align  4
+.Loop: subq    $19,4,$19       # four limbs per iteration
+       unop                    # no-op filler (presumably for issue-slot alignment)
+
+       ldq     $6,8($18)       # preload s2 limb 1
+       addq    $4,$0,$0        # $0 = s2 limb 0 + carry in
+       ldq     $5,8($17)       # preload s1 limb 1
+       cmpult  $0,$4,$1        # $1 = carry out of that addition
+       ldq     $4,16($18)      # preload s2 limb 2
+       addq    $3,$0,$20       # $20 = sum limb 0
+       cmpult  $20,$3,$0       # $0 = carry out of adding the s1 limb
+       ldq     $3,16($17)      # preload s1 limb 2
+       or      $0,$1,$0        # carry into limb 1
+       addq    $6,$0,$0
+       cmpult  $0,$6,$1
+       ldq     $6,24($18)      # preload s2 limb 3
+       addq    $5,$0,$21       # $21 = sum limb 1
+       cmpult  $21,$5,$0
+       ldq     $5,24($17)      # preload s1 limb 3
+       or      $0,$1,$0        # carry into limb 2
+       addq    $4,$0,$0
+       cmpult  $0,$4,$1
+       ldq     $4,32($18)      # preload the s2 limb that follows this group
+       addq    $3,$0,$22       # $22 = sum limb 2
+       cmpult  $22,$3,$0
+       ldq     $3,32($17)      # preload the s1 limb that follows this group
+       or      $0,$1,$0        # carry into limb 3
+       addq    $6,$0,$0
+       cmpult  $0,$6,$1
+       addq    $5,$0,$23       # $23 = sum limb 3
+       cmpult  $23,$5,$0
+       or      $0,$1,$0        # carry into the next group
+
+       stq     $20,0($16)      # the four sums are stored only after all loads are done
+       stq     $21,8($16)
+       stq     $22,16($16)
+       stq     $23,24($16)
+
+       addq    $17,32,$17      # advance all three pointers by four limbs
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # final limb: operands are already in $3/$4
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)
+       or      $0,$1,$0        # $0 = carry out of the whole addition
+       ret     $31,($26),1
+
+       .end    __mpn_add_n
diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s

new file mode 100644 (file)

index 0000000..fdb0895
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/lshift.s
@@ -0,0 +1,175 @@
+ # Alpha EV5 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:                  # res = s1 << cnt over size limbs, highest limb first; shifted-out bits returned in $0
+       .frame  $30,0,$26,0
+
+       s8addq  $18,$17,$17     # make r17 point at end of s1
+       ldq     $4,-8($17)      # load first limb
+       subq    $31,$19,$20     # $20 = -cnt; as a shift count it acts as 64-cnt
+       s8addq  $18,$16,$16     # make r16 point at end of RES
+       subq    $18,1,$18       # size - 1: the lowest limb is stored separately at the end
+       and     $18,4-1,$28     # number of limbs in first loop
+       srl     $4,$20,$0       # compute function result
+
+       beq     $28,L0
+       subq    $18,$28,$18     # $18 = limbs left for the 4-way pipelined code
+
+       .align  3
+Loop0: ldq     $3,-16($17)     # next lower limb
+       subq    $16,8,$16
+       sll     $4,$19,$5       # high part: current limb << cnt
+       subq    $17,8,$17
+       subq    $28,1,$28
+       srl     $3,$20,$6       # low part: bits moved up from the next lower limb
+       or      $3,$3,$4        # the next lower limb becomes the current limb
+       or      $5,$6,$8
+       stq     $8,0($16)
+       bne     $28,Loop0
+
+L0:    sll     $4,$19,$24      # $24 = pending high part of the current limb
+       beq     $18,Lend        # nothing left but the lowest limb
+ # warm up phase 1
+       ldq     $1,-16($17)     # load the next four limbs into $1..$4
+       subq    $18,4,$18
+       ldq     $2,-24($17)
+       ldq     $3,-32($17)
+       ldq     $4,-40($17)
+       beq     $18,Lcool1      # only these four limbs remain: finish without the loop
+ # warm up phase 2
+       srl     $1,$20,$7       # $7,$8,$5,$6 = low parts; $21..$24 = high parts
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       ldq     $1,-48($17)     # start loading the following four limbs
+       sll     $2,$19,$22
+       ldq     $2,-56($17)
+       srl     $3,$20,$5
+       or      $7,$24,$7       # first result limb of the group is complete
+       sll     $3,$19,$23
+       or      $8,$21,$8       # second result limb of the group is complete
+       srl     $4,$20,$6
+       ldq     $3,-64($17)
+       sll     $4,$19,$24
+       ldq     $4,-72($17)
+       subq    $18,4,$18
+       beq     $18,Lcool2      # results are pending in $7/$8/$5/$6: drain them via phase 2
+       .align  4
+ # main loop
+Loop:  stq     $7,-8($16)      # store the two results completed in the previous round
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+
+       srl     $1,$20,$7
+       subq    $18,4,$18
+       sll     $1,$19,$21
+       unop    # ldq   $31,-96($17)
+
+       srl     $2,$20,$8
+       ldq     $1,-80($17)
+       sll     $2,$19,$22
+       ldq     $2,-88($17)
+
+       stq     $5,-24($16)
+       or      $7,$24,$7
+       stq     $6,-32($16)
+       or      $8,$21,$8
+
+       srl     $3,$20,$5
+       unop    # ldq   $31,-96($17)
+       sll     $3,$19,$23
+       subq    $16,32,$16      # res_ptr moves down four limbs
+
+       srl     $4,$20,$6
+       ldq     $3,-96($17)
+       sll     $4,$19,$24
+       ldq     $4,-104($17)
+
+       subq    $17,32,$17      # s1_ptr moves down four limbs
+       bne     $18,Loop
+       unop
+       unop
+ # cool down phase 2/1
+Lcool2:        stq     $7,-8($16)      # drain the group computed before the last loads
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+       srl     $1,$20,$7       # then shift the final four limbs already in $1..$4
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       sll     $2,$19,$22
+       stq     $5,-24($16)
+       or      $7,$24,$7
+       stq     $6,-32($16)
+       or      $8,$21,$8
+       srl     $3,$20,$5
+       sll     $3,$19,$23
+       srl     $4,$20,$6
+       sll     $4,$19,$24
+ # cool down phase 2/2
+       stq     $7,-40($16)
+       or      $5,$22,$5
+       stq     $8,-48($16)
+       or      $6,$23,$6
+       stq     $5,-56($16)
+       stq     $6,-64($16)
+ # cool down phase 2/3
+       stq     $24,-72($16)    # lowest limb: high part only, zeros shifted in
+       ret     $31,($26),1
+
+ # cool down phase 1/1
+Lcool1:        srl     $1,$20,$7       # exactly four limbs in $1..$4, nothing pending except $24
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       sll     $2,$19,$22
+       srl     $3,$20,$5
+       or      $7,$24,$7
+       sll     $3,$19,$23
+       or      $8,$21,$8
+       srl     $4,$20,$6
+       sll     $4,$19,$24
+ # cool down phase 1/2
+       stq     $7,-8($16)
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+       stq     $5,-24($16)
+       stq     $6,-32($16)
+       stq     $24,-40($16)    # lowest limb: high part only, zeros shifted in
+       ret     $31,($26),1
+
+Lend:  stq     $24,-8($16)     # lowest limb: high part only, zeros shifted in
+       ret     $31,($26),1
+       .end    __mpn_lshift
diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s

new file mode 100644 (file)

index 0000000..1da9960
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/rshift.s
@@ -0,0 +1,173 @@
+ # Alpha EV5 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:                  # res = s1 >> cnt over size limbs, lowest limb first; shifted-out bits returned in $0
+       .frame  $30,0,$26,0
+
+       ldq     $4,0($17)       # load first limb
+       subq    $31,$19,$20     # $20 = -cnt; as a shift count it acts as 64-cnt
+       subq    $18,1,$18       # size - 1: the highest limb is stored separately at the end
+       and     $18,4-1,$28     # number of limbs in first loop
+       sll     $4,$20,$0       # compute function result
+
+       beq     $28,L0
+       subq    $18,$28,$18     # $18 = limbs left for the 4-way pipelined code
+
+       .align  3
+Loop0: ldq     $3,8($17)       # next higher limb
+       addq    $16,8,$16
+       srl     $4,$19,$5       # low part: current limb >> cnt
+       addq    $17,8,$17
+       subq    $28,1,$28
+       sll     $3,$20,$6       # high part: bits moved down from the next higher limb
+       or      $3,$3,$4        # the next higher limb becomes the current limb
+       or      $5,$6,$8
+       stq     $8,-8($16)
+       bne     $28,Loop0
+
+L0:    srl     $4,$19,$24      # $24 = pending low part of the current limb
+       beq     $18,Lend        # nothing left but the highest limb
+ # warm up phase 1
+       ldq     $1,8($17)       # load the next four limbs into $1..$4
+       subq    $18,4,$18
+       ldq     $2,16($17)
+       ldq     $3,24($17)
+       ldq     $4,32($17)
+       beq     $18,Lcool1      # only these four limbs remain: finish without the loop
+ # warm up phase 2
+       sll     $1,$20,$7       # $7,$8,$5,$6 = high parts; $21..$24 = low parts
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       ldq     $1,40($17)      # start loading the following four limbs
+       srl     $2,$19,$22
+       ldq     $2,48($17)
+       sll     $3,$20,$5
+       or      $7,$24,$7       # first result limb of the group is complete
+       srl     $3,$19,$23
+       or      $8,$21,$8       # second result limb of the group is complete
+       sll     $4,$20,$6
+       ldq     $3,56($17)
+       srl     $4,$19,$24
+       ldq     $4,64($17)
+       subq    $18,4,$18
+       beq     $18,Lcool2      # results are pending in $7/$8/$5/$6: drain them via phase 2
+       .align  4
+ # main loop
+Loop:  stq     $7,0($16)       # store the two results completed in the previous round
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+
+       sll     $1,$20,$7
+       subq    $18,4,$18
+       srl     $1,$19,$21
+       unop    # ldq   $31,-96($17)
+
+       sll     $2,$20,$8
+       ldq     $1,72($17)
+       srl     $2,$19,$22
+       ldq     $2,80($17)
+
+       stq     $5,16($16)
+       or      $7,$24,$7
+       stq     $6,24($16)
+       or      $8,$21,$8
+
+       sll     $3,$20,$5
+       unop    # ldq   $31,-96($17)
+       srl     $3,$19,$23
+       addq    $16,32,$16      # res_ptr moves up four limbs
+
+       sll     $4,$20,$6
+       ldq     $3,88($17)
+       srl     $4,$19,$24
+       ldq     $4,96($17)
+
+       addq    $17,32,$17      # s1_ptr moves up four limbs
+       bne     $18,Loop
+       unop
+       unop
+ # cool down phase 2/1
+Lcool2:        stq     $7,0($16)       # drain the group computed before the last loads
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+       sll     $1,$20,$7       # then shift the final four limbs already in $1..$4
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       srl     $2,$19,$22
+       stq     $5,16($16)
+       or      $7,$24,$7
+       stq     $6,24($16)
+       or      $8,$21,$8
+       sll     $3,$20,$5
+       srl     $3,$19,$23
+       sll     $4,$20,$6
+       srl     $4,$19,$24
+ # cool down phase 2/2
+       stq     $7,32($16)
+       or      $5,$22,$5
+       stq     $8,40($16)
+       or      $6,$23,$6
+       stq     $5,48($16)
+       stq     $6,56($16)
+ # cool down phase 2/3
+       stq     $24,64($16)     # highest limb: low part only, zeros shifted in
+       ret     $31,($26),1
+
+ # cool down phase 1/1
+Lcool1:        sll     $1,$20,$7       # exactly four limbs in $1..$4, nothing pending except $24
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       srl     $2,$19,$22
+       sll     $3,$20,$5
+       or      $7,$24,$7
+       srl     $3,$19,$23
+       or      $8,$21,$8
+       sll     $4,$20,$6
+       srl     $4,$19,$24
+ # cool down phase 1/2
+       stq     $7,0($16)
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+       stq     $5,16($16)
+       stq     $6,24($16)
+       stq     $24,32($16)     # highest limb: low part only, zeros shifted in
+       ret     $31,($26),1
+
+Lend:  stq     $24,0($16)      # highest limb: low part only, zeros shifted in
+       ret     $31,($26),1
+       .end    __mpn_rshift
diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s

new file mode 100644 (file)

index 0000000..c284349
--- /dev/null
+++ b/sysdeps/alpha/lshift.s
@@ -0,0 +1,108 @@
+ # Alpha 21064 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:                  # res = s1 << cnt over size limbs, highest limb first; shifted-out bits returned in $0
+       .frame  $30,0,$26,0
+
+       s8addq  $18,$17,$17     # make r17 point at end of s1
+       ldq     $4,-8($17)      # load first limb
+       subq    $17,8,$17       # r17 now points at the limb just loaded
+       subq    $31,$19,$7      # $7 = -cnt; as a shift count it acts as 64-cnt
+       s8addq  $18,$16,$16     # make r16 point at end of RES
+       subq    $18,1,$18       # size - 1: the lowest limb is stored separately at Lend
+       and     $18,4-1,$20     # number of limbs in first loop
+       srl     $4,$7,$0        # compute function result
+
+       beq     $20,L0
+       subq    $18,$20,$18     # $18 = limbs left for the 4-way unrolled loop
+
+       .align  3
+Loop0:
+       ldq     $3,-8($17)      # next lower limb
+       subq    $16,8,$16
+       subq    $17,8,$17
+       subq    $20,1,$20
+       sll     $4,$19,$5       # high part: current limb << cnt
+       srl     $3,$7,$6        # low part: bits moved up from the next lower limb
+       bis     $3,$3,$4        # the next lower limb becomes the current limb
+       bis     $5,$6,$8
+       stq     $8,0($16)
+       bne     $20,Loop0
+
+L0:    beq     $18,Lend        # nothing left but the lowest limb
+
+       .align  3
+Loop:  ldq     $3,-8($17)      # four limbs per iteration; $3 and $4 alternate as current limb
+       subq    $16,32,$16
+       subq    $18,4,$18
+       sll     $4,$19,$5
+       srl     $3,$7,$6
+
+       ldq     $4,-16($17)
+       sll     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,24($16)      # highest result limb of this group
+       srl     $4,$7,$2
+
+       ldq     $3,-24($17)
+       sll     $4,$19,$5
+       bis     $1,$2,$8
+       stq     $8,16($16)
+       srl     $3,$7,$6
+
+       ldq     $4,-32($17)
+       sll     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,8($16)
+       srl     $4,$7,$2
+
+       subq    $17,32,$17      # s1_ptr moves down four limbs
+       bis     $1,$2,$8
+       stq     $8,0($16)       # lowest result limb of this group
+
+       bgt     $18,Loop
+
+Lend:  sll     $4,$19,$8       # lowest limb: high part only, zeros shifted in
+       stq     $8,-8($16)
+       ret     $31,($26),1
+       .end    __mpn_lshift
diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s

new file mode 100644 (file)

index 0000000..3ef194d
--- /dev/null
+++ b/sysdeps/alpha/mul_1.s
@@ -0,0 +1,84 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_mul_1
+       .ent    __mpn_mul_1 2
+__mpn_mul_1:                   # res[] = s1[] * s2_limb over size limbs; final carry limb is left in $0
+       .frame  $30,0,$26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       bic     $31,$31,$4      # clear cy_limb
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,Lend1       # jump if size was == 1
+       ldq     $2,8($17)       # $2 = s1_limb
+       subq    $18,1,$18       # size--
+       stq     $3,0($16)       # store first result limb (no carry in yet)
+       beq     $18,Lend2       # jump if size was == 2
+
+       .align  3
+Loop:  mulq    $2,$19,$3       # $3 = prod_low
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,16($17)      # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       stq     $3,8($16)       # store result limb (res_ptr still lags by one limb)
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $16,8,$16       # res_ptr++
+       bne     $18,Loop
+
+Lend2: mulq    $2,$19,$3       # $3 = prod_low
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       stq     $3,8($16)       # store last result limb
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1
+Lend1: stq     $3,0($16)       # size == 1: store prod_low; $0 already holds prod_high
+       ret     $31,($26),1
+
+       .end    __mpn_mul_1
diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s

new file mode 100644 (file)

index 0000000..74eab04
--- /dev/null
+++ b/sysdeps/alpha/rshift.s
@@ -0,0 +1,106 @@
+ # Alpha 21064 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+      
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:                  # res = s1 >> cnt over size limbs, lowest limb first; shifted-out bits returned in $0
+       .frame  $30,0,$26,0
+
+       ldq     $4,0($17)       # load first limb
+       addq    $17,8,$17       # r17 now points at the next higher limb
+       subq    $31,$19,$7      # $7 = -cnt; as a shift count it acts as 64-cnt
+       subq    $18,1,$18       # size - 1: the highest limb is stored separately at Lend
+       and     $18,4-1,$20     # number of limbs in first loop
+       sll     $4,$7,$0        # compute function result
+
+       beq     $20,L0
+       subq    $18,$20,$18     # $18 = limbs left for the 4-way unrolled loop
+
+       .align  3
+Loop0:
+       ldq     $3,0($17)       # next higher limb
+       addq    $16,8,$16
+       addq    $17,8,$17
+       subq    $20,1,$20
+       srl     $4,$19,$5       # low part: current limb >> cnt
+       sll     $3,$7,$6        # high part: bits moved down from the next higher limb
+       bis     $3,$3,$4        # the next higher limb becomes the current limb
+       bis     $5,$6,$8
+       stq     $8,-8($16)
+       bne     $20,Loop0
+
+L0:    beq     $18,Lend        # nothing left but the highest limb
+
+       .align  3
+Loop:  ldq     $3,0($17)       # four limbs per iteration; $3 and $4 alternate as current limb
+       addq    $16,32,$16
+       subq    $18,4,$18
+       srl     $4,$19,$5
+       sll     $3,$7,$6
+
+       ldq     $4,8($17)
+       srl     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,-32($16)     # lowest result limb of this group
+       sll     $4,$7,$2
+
+       ldq     $3,16($17)
+       srl     $4,$19,$5
+       bis     $1,$2,$8
+       stq     $8,-24($16)
+       sll     $3,$7,$6
+
+       ldq     $4,24($17)
+       srl     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,-16($16)
+       sll     $4,$7,$2
+
+       addq    $17,32,$17      # s1_ptr moves up four limbs
+       bis     $1,$2,$8
+       stq     $8,-8($16)      # highest result limb of this group
+
+       bgt     $18,Loop
+
+Lend:  srl     $4,$19,$8       # highest limb: low part only, zeros shifted in
+       stq     $8,0($16)
+       ret     $31,($26),1
+       .end    __mpn_rshift
diff --git a/sysdeps/alpha/sub_n.s b/sysdeps/alpha/sub_n.s

new file mode 100644 (file)

index 0000000..5200025
--- /dev/null
+++ b/sysdeps/alpha/sub_n.s
@@ -0,0 +1,119 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:                   # ($16)[i] = ($17)[i] - ($18)[i] - borrow; final borrow is left in $0
+       .frame  $30,0,$26,0
+
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # the last limb is handled separately at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # $0 = borrow, initially 0
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # $19 = limbs left for the 4-way unrolled loop
+
+.Loop0:        subq    $2,1,$2
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + incoming borrow
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = 1 if that addition wrapped
+       subq    $3,$4,$4        # difference limb
+       cmpult  $3,$4,$0        # $0 = 1 if the subtraction borrowed
+       stq     $4,0($16)
+       or      $0,$1,$0        # outgoing borrow = either of the two
+
+       addq    $17,8,$17
+       addq    $18,8,$18
+       bis     $5,$5,$3        # preloaded limbs become the current pair
+       bis     $6,$6,$4
+       addq    $16,8,$16
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # nothing left for the unrolled loop
+
+       .align  3
+.Loop: subq    $19,4,$19       # four limbs per pass; pairs $3/$4 and $5/$6 alternate
+
+       ldq     $5,8($17)
+       addq    $4,$0,$4
+       ldq     $6,8($18)
+       cmpult  $4,$0,$1
+       subq    $3,$4,$4
+       cmpult  $3,$4,$0
+       stq     $4,0($16)       # first result limb of this pass
+       or      $0,$1,$0
+
+       ldq     $3,16($17)
+       addq    $6,$0,$6
+       ldq     $4,16($18)
+       cmpult  $6,$0,$1
+       subq    $5,$6,$6
+       cmpult  $5,$6,$0
+       stq     $6,8($16)       # second result limb
+       or      $0,$1,$0
+
+       ldq     $5,24($17)
+       addq    $4,$0,$4
+       ldq     $6,24($18)
+       cmpult  $4,$0,$1
+       subq    $3,$4,$4
+       cmpult  $3,$4,$0
+       stq     $4,16($16)      # third result limb
+       or      $0,$1,$0
+
+       ldq     $3,32($17)      # preload the pair for the next pass (or for .Lend)
+       addq    $6,$0,$6
+       ldq     $4,32($18)
+       cmpult  $6,$0,$1
+       subq    $5,$6,$6
+       cmpult  $5,$6,$0
+       stq     $6,24($16)      # fourth result limb
+       or      $0,$1,$0
+
+       addq    $17,32,$17
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # last limb, same borrow sequence as above
+       cmpult  $4,$0,$1
+       subq    $3,$4,$4
+       cmpult  $3,$4,$0
+       stq     $4,0($16)
+       or      $0,$1,$0        # $0 = final borrow
+       ret     $31,($26),1
+
+       .end    __mpn_sub_n
diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s

new file mode 100644 (file)

index 0000000..acaa11c
--- /dev/null
+++ b/sysdeps/alpha/submul_1.s
@@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_submul_1
+       .ent    __mpn_submul_1 2
+__mpn_submul_1:                        # ($16)[i] -= ($17)[i] * $19 for $18 limbs; carry-out limb left in $0
+       .frame  $30,0,$26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,Lend1       # jump if size was == 1
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       subq    $5,$3,$3        # $3 = *res_ptr - prod_low
+       cmpult  $5,$3,$4        # $4 = borrow from that subtraction
+       stq     $3,0($16)       # store first result limb
+       addq    $16,8,$16       # res_ptr++
+       beq     $18,Lend2       # jump if size was == 2
+
+       .align  3
+Loop:  mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       subq    $5,$3,$3        # $3 = *res_ptr - (cy_limb + prod_low)
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)       # store result limb
+       addq    $16,8,$16       # res_ptr++
+       addq    $5,$0,$0        # combine carries
+       bne     $18,Loop
+
+Lend2: mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       subq    $5,$3,$3        # $3 = *res_ptr - (cy_limb + prod_low)
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)       # store last result limb
+       addq    $5,$0,$0        # combine carries
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1
+Lend1: subq    $5,$3,$3        # single-limb case: $3 = *res_ptr - prod_low
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)
+       addq    $0,$5,$0        # $0 = prod_high + borrow
+       ret     $31,($26),1
+
+       .end    __mpn_submul_1
diff --git a/sysdeps/hppa/add_n.s b/sysdeps/hppa/add_n.s

new file mode 100644 (file)

index 0000000..7f3e323
--- /dev/null
+++ b/sysdeps/hppa/add_n.s
@@ -0,0 +1,57 @@
+; HP-PA  __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s1_ptr       gr25
+; s2_ptr       gr24
+; size         gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+       .code
+       .export         __mpn_add_n
+__mpn_add_n                            ; (%r26)[i] = (%r25)[i] + (%r24)[i] for %r23 limbs; carry out left in %r28
+       .proc
+       .callinfo       frame=0,no_calls
+       .entry
+
+       ldws,ma         4(0,%r25),%r20  ; first s1 limb, s1_ptr += 4
+       ldws,ma         4(0,%r24),%r19  ; first s2 limb, s2_ptr += 4
+
+       addib,=         -1,%r23,L$end   ; check for (SIZE == 1)
+        add            %r20,%r19,%r28  ; add first limbs ignoring cy
+
+L$loop ldws,ma         4(0,%r25),%r20  ; next s1 limb
+       ldws,ma         4(0,%r24),%r19  ; next s2 limb
+       stws,ma         %r28,4(0,%r26)  ; store the previous sum, res_ptr += 4
+       addib,<>        -1,%r23,L$loop  ; size--, loop while non-zero
+        addc           %r20,%r19,%r28  ; delay slot: add with the carry left by the previous add/addc
+
+L$end  stws            %r28,0(0,%r26)  ; store the last sum limb
+       bv              0(%r2)          ; return through %r2
+        addc           %r0,%r0,%r28    ; delay slot: %r28 = 0 + 0 + carry
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/addmul_1.s b/sysdeps/hppa/hppa1.1/addmul_1.s

new file mode 100644 (file)

index 0000000..a9dfdd1
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/addmul_1.s
@@ -0,0 +1,101 @@
+; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r26
+; s1_ptr       r25
+; size         r24
+; s2_limb      r23
+
+; This runs at 11 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+       .code
+       .export         __mpn_addmul_1
+__mpn_addmul_1                         ; (%r26)[i] += (%r25)[i] * %r23 for %r24 limbs; carry-out limb left in %r28
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldo             64(%r30),%r30           ; reserve 64 bytes; -16(%r30) is the int<->FP transfer slot
+       fldws,ma        4(%r25),%fr5            ; first s1 limb, s1_ptr += 4
+       stw             %r23,-16(%r30)          ; move s2_limb ...
+       addib,=         -1,%r24,L$just_one_limb ; size--, branch if size was 1
+        fldws          -16(%r30),%fr4          ; ... into fr4
+       add             %r0,%r0,%r0             ; clear carry
+       xmpyu           %fr4,%fr5,%fr6          ; first product
+       fldws,ma        4(%r25),%fr7            ; second s1 limb
+       fstds           %fr6,-16(%r30)          ; spill first product so integer loads can read it
+       xmpyu           %fr4,%fr7,%fr8          ; second product
+       ldw             -12(%r30),%r19          ; least significant limb in product
+       ldw             -16(%r30),%r28          ; most significant limb in product
+
+       fstds           %fr8,-16(%r30)          ; spill second product
+       addib,=         -1,%r24,L$end           ; size--, branch if size was 2
+        ldw            -12(%r30),%r1           ; delay slot: low word of second product
+
+; Main loop
+L$loop ldws            0(%r26),%r29            ; current result limb
+       fldws,ma        4(%r25),%fr5            ; next s1 limb
+       add             %r29,%r19,%r19          ; add pending product limb to it
+       stws,ma         %r19,4(%r26)            ; store, res_ptr += 4
+       addc            %r28,%r1,%r19           ; next pending limb = previous high + new low + carry
+       xmpyu           %fr4,%fr5,%fr6          ; start the next product
+       ldw             -16(%r30),%r28          ; high word of the product spilled last
+       fstds           %fr6,-16(%r30)          ; spill the new product
+       addc            %r0,%r28,%r28           ; fold the carry into the high word
+       addib,<>        -1,%r24,L$loop          ; size--, loop while non-zero
+        ldw            -12(%r30),%r1           ; delay slot: low word of the new product
+
+L$end  ldw             0(%r26),%r29            ; drain: two result limbs are still pending
+       add             %r29,%r19,%r19
+       stws,ma         %r19,4(%r26)
+       addc            %r28,%r1,%r19           ; last pending limb
+       ldw             -16(%r30),%r28          ; high word of the last product
+       ldws            0(%r26),%r29
+       addc            %r0,%r28,%r28           ; fold the carry into the high word
+       add             %r29,%r19,%r19
+       stws,ma         %r19,4(%r26)            ; store the last result limb
+       addc            %r0,%r28,%r28           ; %r28 = high word + final carry
+       bv              0(%r2)                  ; return through %r2
+        ldo            -64(%r30),%r30          ; delay slot: release the 64 bytes
+
+L$just_one_limb
+       xmpyu           %fr4,%fr5,%fr6          ; the only product
+       ldw             0(%r26),%r29            ; the only result limb
+       fstds           %fr6,-16(%r30)
+       ldw             -12(%r30),%r1           ; low word of product
+       ldw             -16(%r30),%r28          ; high word of product
+       add             %r29,%r1,%r19
+       stw             %r19,0(%r26)
+       addc            %r0,%r28,%r28           ; %r28 = high word + carry
+       bv              0(%r2)
+        ldo            -64(%r30),%r30          ; delay slot: release the 64 bytes
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/mul_1.s b/sysdeps/hppa/hppa1.1/mul_1.s

new file mode 100644 (file)

index 0000000..ebf0778
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/mul_1.s
@@ -0,0 +1,97 @@
+; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r26
+; s1_ptr       r25
+; size         r24
+; s2_limb      r23
+
+; This runs at 9 cycles/limb on a PA7000.  With the used instructions, it can
+; not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu.  Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively.  We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue.  If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early!  (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+       .code
+       .export         __mpn_mul_1
+__mpn_mul_1                            ; (%r26)[i] = low limbs of (%r25)[i] * %r23 for %r24 limbs; carry-out limb left in %r28
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldo             64(%r30),%r30           ; reserve 64 bytes; -16(%r30) is the int<->FP transfer slot
+       fldws,ma        4(%r25),%fr5            ; first s1 limb, s1_ptr += 4
+       stw             %r23,-16(%r30)          ; move s2_limb ...
+       addib,=         -1,%r24,L$just_one_limb ; size--, branch if size was 1
+        fldws          -16(%r30),%fr4          ; ... into fr4
+       add             %r0,%r0,%r0             ; clear carry
+       xmpyu           %fr4,%fr5,%fr6          ; first product
+       fldws,ma        4(%r25),%fr7            ; second s1 limb
+       fstds           %fr6,-16(%r30)          ; spill first product so integer loads can read it
+       xmpyu           %fr4,%fr7,%fr8          ; second product
+       ldw             -12(%r30),%r19          ; least significant limb in product
+       ldw             -16(%r30),%r28          ; most significant limb in product
+
+       fstds           %fr8,-16(%r30)          ; spill second product
+       addib,=         -1,%r24,L$end           ; size--, branch if size was 2
+        ldw            -12(%r30),%r1           ; delay slot: low word of second product
+
+; Main loop
+L$loop fldws,ma        4(%r25),%fr5            ; next s1 limb
+       stws,ma         %r19,4(%r26)            ; store finished result limb, res_ptr += 4
+       addc            %r28,%r1,%r19           ; next result limb = previous high + new low + carry
+       xmpyu           %fr4,%fr5,%fr6          ; start the next product
+       ldw             -16(%r30),%r28          ; high word of the product spilled last
+       fstds           %fr6,-16(%r30)          ; spill the new product
+       addib,<>        -1,%r24,L$loop          ; size--, loop while non-zero
+        ldw            -12(%r30),%r1           ; delay slot: low word of the new product
+
+L$end  stws,ma         %r19,4(%r26)            ; drain: store the pending result limb
+       addc            %r28,%r1,%r19           ; last result limb
+       ldw             -16(%r30),%r28          ; high word of the last product
+       stws,ma         %r19,4(%r26)
+       addc            %r0,%r28,%r28           ; %r28 = high word + final carry
+       bv              0(%r2)                  ; return through %r2
+        ldo            -64(%r30),%r30          ; delay slot: release the 64 bytes
+
+L$just_one_limb
+       xmpyu           %fr4,%fr5,%fr6          ; the only product
+       fstds           %fr6,-16(%r30)
+       ldw             -16(%r30),%r28          ; %r28 = high word of product
+       ldo             -64(%r30),%r30          ; release the 64 bytes
+       bv              0(%r2)
+        fstws          %fr6R,0(%r26)           ; delay slot: store low word of product as the result limb
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/submul_1.s b/sysdeps/hppa/hppa1.1/submul_1.s

new file mode 100644 (file)

index 0000000..44cabf4
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/submul_1.s
@@ -0,0 +1,110 @@
+; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r26
+; s1_ptr       r25
+; size         r24
+; s2_limb      r23
+
+; This runs at 12 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+; It seems possible to make this run as fast as __mpn_addmul_1, if we use
+;      sub,>>= %r29,%r19,%r22
+;      addi    1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+       .code
+       .export         __mpn_submul_1
+__mpn_submul_1                         ; (%r26)[i] -= (%r25)[i] * %r23 for %r24 limbs; carry-out limb left in %r28
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+
+       ldo             64(%r30),%r30           ; reserve 64 bytes; -16(%r30) is the int<->FP transfer slot
+       fldws,ma        4(%r25),%fr5            ; first s1 limb, s1_ptr += 4
+       stw             %r23,-16(%r30)          ; move s2_limb ...
+       addib,=         -1,%r24,L$just_one_limb ; size--, branch if size was 1
+        fldws          -16(%r30),%fr4          ; ... into fr4
+       add             %r0,%r0,%r0             ; clear carry
+       xmpyu           %fr4,%fr5,%fr6          ; first product
+       fldws,ma        4(%r25),%fr7            ; second s1 limb
+       fstds           %fr6,-16(%r30)          ; spill first product so integer loads can read it
+       xmpyu           %fr4,%fr7,%fr8          ; second product
+       ldw             -12(%r30),%r19          ; least significant limb in product
+       ldw             -16(%r30),%r28          ; most significant limb in product
+
+       fstds           %fr8,-16(%r30)          ; spill second product
+       addib,=         -1,%r24,L$end           ; size--, branch if size was 2
+        ldw            -12(%r30),%r1           ; delay slot: low word of second product
+
+; Main loop
+L$loop ldws            0(%r26),%r29            ; current result limb
+       fldws,ma        4(%r25),%fr5            ; next s1 limb
+       sub             %r29,%r19,%r22          ; subtract pending product limb from it
+       add             %r22,%r19,%r0           ; result discarded: carry is set iff the sub wrapped
+       stws,ma         %r22,4(%r26)            ; store, res_ptr += 4
+       addc            %r28,%r1,%r19           ; next pending limb = previous high + new low + carry
+       xmpyu           %fr4,%fr5,%fr6          ; start the next product
+       ldw             -16(%r30),%r28          ; high word of the product spilled last
+       fstds           %fr6,-16(%r30)          ; spill the new product
+       addc            %r0,%r28,%r28           ; fold the carry into the high word
+       addib,<>        -1,%r24,L$loop          ; size--, loop while non-zero
+        ldw            -12(%r30),%r1           ; delay slot: low word of the new product
+
+L$end  ldw             0(%r26),%r29            ; drain: two result limbs are still pending
+       sub             %r29,%r19,%r22
+       add             %r22,%r19,%r0           ; regenerate the borrow as a carry
+       stws,ma         %r22,4(%r26)
+       addc            %r28,%r1,%r19           ; last pending limb
+       ldw             -16(%r30),%r28          ; high word of the last product
+       ldws            0(%r26),%r29
+       addc            %r0,%r28,%r28           ; fold the carry into the high word
+       sub             %r29,%r19,%r22
+       add             %r22,%r19,%r0           ; regenerate the borrow as a carry
+       stws,ma         %r22,4(%r26)            ; store the last result limb
+       addc            %r0,%r28,%r28           ; %r28 = high word + final carry
+       bv              0(%r2)                  ; return through %r2
+        ldo            -64(%r30),%r30          ; delay slot: release the 64 bytes
+
+L$just_one_limb
+       xmpyu           %fr4,%fr5,%fr6          ; the only product
+       ldw             0(%r26),%r29            ; the only result limb
+       fstds           %fr6,-16(%r30)
+       ldw             -12(%r30),%r1           ; low word of product
+       ldw             -16(%r30),%r28          ; high word of product
+       sub             %r29,%r1,%r22
+       add             %r22,%r1,%r0            ; regenerate the borrow as a carry
+       stw             %r22,0(%r26)
+       addc            %r0,%r28,%r28           ; %r28 = high word + carry
+       bv              0(%r2)
+        ldo            -64(%r30),%r30          ; delay slot: release the 64 bytes
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/hppa1.1/udiv_qrnnd.s b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s

new file mode 100644 (file)

index 0000000..4ffef3a
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s
@@ -0,0 +1,74 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr      gr26
+; n1           gr25
+; n0           gr24
+; d            gr23
+
+       .code
+L$0000 .word           0x43f00000              ; with the next word: IEEE double 2^64
+       .word           0x0
+       .export         __udiv_qrnnd
+__udiv_qrnnd                           ; divide n1:n0 by d via FP; quotient in %r28, remainder stored at (%r26)
+       .proc
+       .callinfo       frame=64,no_calls
+       .entry
+       ldo             64(%r30),%r30           ; reserve 64 bytes; -16(%r30) is the int<->FP transfer slot
+
+       stws            %r25,-16(0,%r30)        ; n_hi
+       stws            %r24,-12(0,%r30)        ; n_lo
+       ldil            L'L$0000,%r19           ; %r19 = address of the 2^64 constant (left part)
+       ldo             R'L$0000(%r19),%r19     ; ... plus right part
+       fldds           -16(0,%r30),%fr5        ; fr5 = n as a 64-bit integer
+       stws            %r23,-12(0,%r30)        ; d, reloaded into fr6R below
+       comib,<=        0,%r25,L$1              ; skip the fix-up when n_hi is non-negative as signed
+       fcnvxf,dbl,dbl  %fr5,%fr5               ; delay slot: convert n (taken as signed) to double
+       fldds           0(0,%r19),%fr4          ; fr4 = 2^64
+       fadd,dbl        %fr4,%fr5,%fr5          ; n had its top bit set: add 2^64 for the unsigned value
+L$1
+       fcpy,sgl        %fr0,%fr6L              ; left half of fr6 from fr0 (presumably zero) so fr6 holds d as 64 bits
+       fldws           -12(0,%r30),%fr6R       ; right half of fr6 = d
+       fcnvxf,dbl,dbl  %fr6,%fr4               ; fr4 = d as double
+
+       fdiv,dbl        %fr5,%fr4,%fr5          ; fr5 = n / d
+
+       fcnvfx,dbl,dbl  %fr5,%fr4               ; quotient back to a 64-bit integer
+       fstws           %fr4R,-16(%r30)         ; low word of the quotient
+       xmpyu           %fr4R,%fr6R,%fr6        ; fr6 = q * d
+       ldws            -16(%r30),%r28          ; %r28 = q
+       fstds           %fr6,-16(0,%r30)        ; spill q * d
+       ldws            -12(0,%r30),%r21        ; low word of q * d
+       ldws            -16(0,%r30),%r20        ; high word of q * d
+       sub             %r24,%r21,%r22          ; remainder low = n_lo - low(q * d)
+       subb            %r25,%r20,%r19          ; remainder high, with borrow
+       comib,=         0,%r19,L$2              ; high word zero: q is already correct
+       ldo             -64(%r30),%r30          ; delay slot: release the 64 bytes
+
+       add             %r22,%r23,%r22          ; otherwise correct by one: remainder += d
+       ldo             -1(%r28),%r28           ; ... and q -= 1
+L$2    bv              0(%r2)                  ; return through %r2
+       stws            %r22,0(0,%r26)          ; delay slot: *rem_ptr = remainder
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/lshift.s b/sysdeps/hppa/lshift.s

new file mode 100644 (file)

index 0000000..0479f4a
--- /dev/null
+++ b/sysdeps/hppa/lshift.s
@@ -0,0 +1,65 @@
+; HP-PA  __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s_ptr                gr25
+; size         gr24
+; cnt          gr23
+
+       .code
+       .export         __mpn_lshift
+__mpn_lshift                           ; shift the %r24 limbs at (%r25) left by %r23 bits into (%r26); shifted-out bits left in %r28
+       .proc
+       .callinfo       frame=64,no_calls       ; NOTE(review): declares frame=64 but %r30 is never touched here -- confirm
+       .entry
+
+       sh2add          %r24,%r25,%r25          ; s_ptr += 4*size: walk from the top limb downwards
+       sh2add          %r24,%r26,%r26          ; res_ptr += 4*size
+       ldws,mb         -4(0,%r25),%r22         ; pre-decrement, load the most significant limb
+       subi            32,%r23,%r1             ; %r1 = 32 - cnt
+       mtsar           %r1                     ; shift amount register used by every vshd below
+       addib,=         -1,%r24,L$0004          ; size--, branch if size was 1
+       vshd            %r0,%r22,%r28           ; compute carry out limb
+       ldws,mb         -4(0,%r25),%r29         ; next lower limb
+       addib,=         -1,%r24,L$0002          ; size--, branch if size was 2
+       vshd            %r22,%r29,%r20          ; delay slot: result limb from the pair %r22:%r29
+
+L$loop ldws,mb         -4(0,%r25),%r22         ; two limbs per pass; %r22 and %r29 alternate
+       stws,mb         %r20,-4(0,%r26)         ; store the pending result limb
+       addib,=         -1,%r24,L$0003          ; size--, exit with %r22 as the lowest limb
+       vshd            %r29,%r22,%r20          ; delay slot: result limb from the pair %r29:%r22
+       ldws,mb         -4(0,%r25),%r29
+       stws,mb         %r20,-4(0,%r26)
+       addib,<>        -1,%r24,L$loop          ; size--, loop while non-zero
+       vshd            %r22,%r29,%r20          ; delay slot: result limb from the pair %r22:%r29
+
+L$0002 stws,mb         %r20,-4(0,%r26)         ; exit with %r29 as the lowest limb
+       vshd            %r29,%r0,%r20           ; lowest result limb: zeros shifted in
+       bv              0(%r2)                  ; return through %r2
+       stw             %r20,-4(0,%r26)         ; delay slot: store it
+L$0003 stws,mb         %r20,-4(0,%r26)
+L$0004 vshd            %r22,%r0,%r20           ; lowest result limb: zeros shifted in
+       bv              0(%r2)                  ; return through %r2
+       stw             %r20,-4(0,%r26)         ; delay slot: store it
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/rshift.s b/sysdeps/hppa/rshift.s

new file mode 100644 (file)

index 0000000..18d33f2
--- /dev/null
+++ b/sysdeps/hppa/rshift.s
@@ -0,0 +1,62 @@
+; HP-PA  __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s_ptr                gr25
+; size         gr24
+; cnt          gr23
+
+       .code
+       .export         __mpn_rshift
+__mpn_rshift                           ; shift the %r24 limbs at (%r25) right by %r23 bits into (%r26); shifted-out bits left in %r28
+       .proc
+       .callinfo       frame=64,no_calls       ; NOTE(review): declares frame=64 but %r30 is never touched here -- confirm
+       .entry
+
+       ldws,ma         4(0,%r25),%r22          ; least significant limb, s_ptr += 4
+       mtsar           %r23                    ; shift amount register = cnt, used by every vshd below
+       addib,=         -1,%r24,L$0004          ; size--, branch if size was 1
+       vshd            %r22,%r0,%r28           ; compute carry out limb
+       ldws,ma         4(0,%r25),%r29          ; next higher limb
+       addib,=         -1,%r24,L$0002          ; size--, branch if size was 2
+       vshd            %r29,%r22,%r20          ; delay slot: result limb from the pair %r29:%r22
+
+L$loop ldws,ma         4(0,%r25),%r22          ; two limbs per pass; %r22 and %r29 alternate
+       stws,ma         %r20,4(0,%r26)          ; store the pending result limb
+       addib,=         -1,%r24,L$0003          ; size--, exit with %r22 as the highest limb
+       vshd            %r22,%r29,%r20          ; delay slot: result limb from the pair %r22:%r29
+       ldws,ma         4(0,%r25),%r29
+       stws,ma         %r20,4(0,%r26)
+       addib,<>        -1,%r24,L$loop          ; size--, loop while non-zero
+       vshd            %r29,%r22,%r20          ; delay slot: result limb from the pair %r29:%r22
+
+L$0002 stws,ma         %r20,4(0,%r26)          ; exit with %r29 as the highest limb
+       vshd            %r0,%r29,%r20           ; highest result limb: zeros shifted in
+       bv              0(%r2)                  ; return through %r2
+       stw             %r20,0(0,%r26)          ; delay slot: store it
+L$0003 stws,ma         %r20,4(0,%r26)
+L$0004 vshd            %r0,%r22,%r20           ; highest result limb: zeros shifted in
+       bv              0(%r2)                  ; return through %r2
+       stw             %r20,0(0,%r26)          ; delay slot: store it
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/sub_n.s b/sysdeps/hppa/sub_n.s

new file mode 100644 (file)

index 0000000..daae46e
--- /dev/null
+++ b/sysdeps/hppa/sub_n.s
@@ -0,0 +1,58 @@
+; HP-PA  __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      gr26
+; s1_ptr       gr25
+; s2_ptr       gr24
+; size         gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+       .code
+       .export         __mpn_sub_n
+__mpn_sub_n                            ; (%r26)[i] = (%r25)[i] - (%r24)[i] for %r23 limbs; borrow out left in %r28
+       .proc
+       .callinfo       frame=0,no_calls
+       .entry
+
+       ldws,ma         4(0,%r25),%r20  ; first s1 limb, s1_ptr += 4
+       ldws,ma         4(0,%r24),%r19  ; first s2 limb, s2_ptr += 4
+
+       addib,=         -1,%r23,L$end   ; check for (SIZE == 1)
+        sub            %r20,%r19,%r28  ; subtract first limbs ignoring cy
+
+L$loop ldws,ma         4(0,%r25),%r20  ; next s1 limb
+       ldws,ma         4(0,%r24),%r19  ; next s2 limb
+       stws,ma         %r28,4(0,%r26)  ; store the previous difference, res_ptr += 4
+       addib,<>        -1,%r23,L$loop  ; size--, loop while non-zero
+        subb           %r20,%r19,%r28  ; delay slot: subtract with the borrow left by the previous sub/subb
+
+L$end  stws            %r28,0(0,%r26)  ; store the last difference limb
+       addc            %r0,%r0,%r28    ; %r28 = carry bit after the last subtract
+       bv              0(%r2)          ; return through %r2
+        subi           1,%r28,%r28     ; delay slot: %r28 = 1 - carry (presumably carry is the inverted borrow)
+
+       .exit
+       .procend
diff --git a/sysdeps/hppa/udiv_qrnnd.s b/sysdeps/hppa/udiv_qrnnd.s

new file mode 100644 (file)

index 0000000..0b069bf
--- /dev/null
+++ b/sysdeps/hppa/udiv_qrnnd.s
@@ -0,0 +1,285 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr      gr26
+; n1           gr25
+; n0           gr24
+; d            gr23
+
+; The code size is a bit excessive.  We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down.  The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+       .code
+       .export         __udiv_qrnnd
+__udiv_qrnnd
+       .proc
+       .callinfo       frame=0,no_calls
+       .entry
+
+       comb,<          %r23,0,L$largedivisor   ; d negative as signed (top bit set): take the halved-divisor paths
+        sub            %r0,%r23,%r1            ; clear cy as side-effect
+       ds              %r0,%r1,%r0             ; presumably primes the divide-step state for the ds sequence below
+       addc            %r24,%r24,%r24          ; shift n0 left; each later addc also shifts in a quotient bit
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r28          ; from here the quotient is kept in r28
+       ds              %r25,%r23,%r25
+       comclr,>=       %r25,%r0,%r0            ; skip the correction when r25 >= 0
+       addl            %r25,%r23,%r25          ; otherwise add d back into the partial remainder
+       stws            %r25,0(0,%r26)          ; store r25 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r28,%r28,%r28          ; (delay slot) shift in the last quotient bit
+
+L$largedivisor
+       extru           %r24,31,1,%r19          ; r19 = n0 & 1
+       bb,<            %r23,31,L$odd
+        extru          %r23,30,31,%r22         ; r22 = d >> 1
+       shd             %r25,%r24,1,%r24        ; r24 = new n0
+       extru           %r25,30,31,%r25         ; r25 = new n1
+       sub             %r0,%r22,%r21
+       ds              %r0,%r21,%r0
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       comclr,>=       %r25,%r0,%r0            ; skip the correction when r25 >= 0
+       addl            %r25,%r22,%r25          ; otherwise add (d >> 1) back
+       sh1addl         %r25,%r19,%r25          ; r25 = 2 * r25 + (n0 & 1)
+       stws            %r25,0(0,%r26)          ; store r25 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r24,%r24,%r28          ; (delay slot) final quotient bit, result in r28
+
+L$odd  addib,sv,n      1,%r22,L$FF..           ; r22 = (d / 2 + 1)
+       shd             %r25,%r24,1,%r24        ; r24 = new n0
+       extru           %r25,30,31,%r25         ; r25 = new n1
+       sub             %r0,%r22,%r21
+       ds              %r0,%r21,%r0
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r28
+       comclr,>=       %r25,%r0,%r0            ; skip the correction when r25 >= 0
+       addl            %r25,%r22,%r25          ; otherwise add (d / 2 + 1) back
+       sh1addl         %r25,%r19,%r25          ; r25 = 2 * r25 + (n0 & 1)
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+       add,nuv         %r28,%r25,%r25
+       addl            %r25,%r1,%r25
+       addc            %r0,%r28,%r28
+       sub,<<          %r25,%r23,%r0
+       addl            %r25,%r1,%r25
+       stws            %r25,0(0,%r26)          ; store r25 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r0,%r28,%r28           ; (delay slot) add the pending carry into r28
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF.. add,uv          %r25,%r24,%r24
+       sub,<<          %r24,%r23,%r0
+       ldo             1(%r24),%r24
+       stws            %r24,0(0,%r26)          ; store r24 through rem_ptr
+       bv              0(%r2)                  ; return through r2
+        addc           %r0,%r25,%r28           ; (delay slot) r28 = r25 + carry
+
+       .exit
+       .procend
diff --git a/sysdeps/i960/add_n.s b/sysdeps/i960/add_n.s

new file mode 100644 (file)

index 0000000..6031f6d
--- /dev/null
+++ b/sysdeps/i960/add_n.s
@@ -0,0 +1,21 @@
+.text
+       .align 4
+       .globl ___mpn_add_n
+___mpn_add_n:                  # per the loop comments: g0 = res_ptr, g1 = s1_ptr, g2 = s2_ptr; g3 = limb count
+       mov     0,g6            # clear carry-save register
+       cmpo    1,0             # clear cy
+
+Loop:  subo    1,g3,g3         # update loop counter
+       ld      (g1),g5         # load from s1_ptr
+       addo    4,g1,g1         # s1_ptr++
+       ld      (g2),g4         # load from s2_ptr
+       addo    4,g2,g2         # s2_ptr++
+       cmpo    g6,1            # restore cy from g6, relies on cy being 0
+       addc    g4,g5,g4        # main add
+       subc    0,0,g6          # save cy in g6; NOTE(review): the cmpo above needs g6 == 1 for a set cy -- confirm subc 0,0 yields that
+       st      g4,(g0)         # store result to res_ptr
+       addo    4,g0,g0         # res_ptr++
+       cmpobne 0,g3,Loop       # when branch is taken, clears C bit
+
+       mov     g6,g0           # hand back the saved carry word in g0 (presumably the return register)
+       ret
diff --git a/sysdeps/i960/addmul_1.s b/sysdeps/i960/addmul_1.s

new file mode 100644 (file)

index 0000000..1a3de95
--- /dev/null
+++ b/sysdeps/i960/addmul_1.s
@@ -0,0 +1,26 @@
+.text
+       .align  4
+       .globl  ___mpn_addmul_1
+___mpn_addmul_1:               # presumably g0 = res_ptr, g1 = s1_ptr, g2 = size, g3 = s2_limb; carry limb returned in g0
+       subo    g2,0,g2         # g2 = 0 - g2: negated count, runs up to 0
+       shlo    2,g2,g4         # g4 = negated count * 4 (byte offset)
+       subo    g4,g1,g1        # g1 = g1 - g4: s1 pointer moved past its last limb
+       subo    g4,g0,g13       # g13 = g0 - g4: result pointer moved past its last limb
+       mov     0,g0            # g0 now holds the running carry limb, initially 0
+
+       cmpo    1,0             # clear C bit on AC.cc
+
+Loop:  ld      (g1)[g2*4],g5   # g5 = next source limb
+       emul    g3,g5,g6        # g6 = low word, g7 = high word of g3 * g5
+       ld      (g13)[g2*4],g5  # g5 = current result limb to accumulate into
+
+       addc    g0,g6,g6        # relies on that C bit is clear
+       addc    0,g7,g7         # propagate carry into the high product word
+       addc    g5,g6,g6        # relies on that C bit is clear
+       st      g6,(g13)[g2*4]  # write back the updated result limb
+       addc    0,g7,g0         # new carry limb = high word + carry
+
+       addo    g2,1,g2         # step the negated index towards 0
+       cmpobne 0,g2,Loop       # when branch is taken, clears C bit
+
+       ret
diff --git a/sysdeps/i960/mul_1.s b/sysdeps/i960/mul_1.s

new file mode 100644 (file)

index 0000000..e75ea42
--- /dev/null
+++ b/sysdeps/i960/mul_1.s
@@ -0,0 +1,23 @@
+.text
+       .align  4
+       .globl  ___mpn_mul_1
+___mpn_mul_1:                  # presumably g0 = res_ptr, g1 = s1_ptr, g2 = size, g3 = s2_limb; carry limb returned in g0
+       subo    g2,0,g2         # g2 = 0 - g2: negated count, runs up to 0
+       shlo    2,g2,g4         # g4 = negated count * 4 (byte offset)
+       subo    g4,g1,g1        # g1 = g1 - g4: s1 pointer moved past its last limb
+       subo    g4,g0,g13       # g13 = g0 - g4: result pointer moved past its last limb
+       mov     0,g0            # g0 now holds the running carry limb, initially 0
+
+       cmpo    1,0             # clear C bit on AC.cc
+
+Loop:  ld      (g1)[g2*4],g5   # g5 = next source limb
+       emul    g3,g5,g6        # g6 = low word, g7 = high word of g3 * g5
+
+       addc    g0,g6,g6        # relies on that C bit is clear
+       st      g6,(g13)[g2*4]  # store the product limb
+       addc    0,g7,g0         # new carry limb = high word + carry
+
+       addo    g2,1,g2         # step the negated index towards 0
+       cmpobne 0,g2,Loop       # when branch is taken, clears C bit
+
+       ret
diff --git a/sysdeps/i960/sub_n.s b/sysdeps/i960/sub_n.s

new file mode 100644 (file)

index 0000000..13ebbfa
--- /dev/null
+++ b/sysdeps/i960/sub_n.s
@@ -0,0 +1,21 @@
+.text
+       .align 4
+       .globl ___mpn_sub_n
+___mpn_sub_n:                  # per the loop comments: g0 = res_ptr, g1 = s1_ptr, g2 = s2_ptr; g3 = limb count
+       mov     1,g6            # set carry-save register
+       cmpo    1,0             # clear cy
+
+Loop:  subo    1,g3,g3         # update loop counter
+       ld      (g1),g5         # load from s1_ptr
+       addo    4,g1,g1         # s1_ptr++
+       ld      (g2),g4         # load from s2_ptr
+       addo    4,g2,g2         # s2_ptr++
+       cmpo    g6,1            # restore cy from g6, relies on cy being 0
+       subc    g4,g5,g4        # main subtract
+       subc    0,0,g6          # save cy in g6; NOTE(review): the cmpo above needs g6 == 1 for a set cy -- confirm subc 0,0 yields that
+       st      g4,(g0)         # store result to res_ptr
+       addo    4,g0,g0         # res_ptr++
+       cmpobne 0,g3,Loop       # when branch is taken, cy will be 0
+
+       mov     g6,g0           # hand back the saved carry word in g0 (presumably the return register)
+       ret
diff --git a/sysdeps/m88k/m88100/add_n.s b/sysdeps/m88k/m88100/add_n.s

new file mode 100644 (file)

index 0000000..7e4cccc
--- /dev/null
+++ b/sysdeps/m88k/m88100/add_n.s
@@ -0,0 +1,103 @@
+; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; s2_ptr       r4
+; size         r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+       text
+       align    16
+       global   ___mpn_add_n
+___mpn_add_n:
+       ld      r6,r3,0                 ; read first limb from s1_ptr
+       extu    r10,r5,3                ; r10 = size >> 3, the loop count
+       ld      r7,r4,0                 ; read first limb from s2_ptr
+
+       subu.co r5,r0,r5                ; (clear carry as side effect)
+       mak     r5,r5,3<4>              ; r5 = (low 3 bits of -size) << 4, byte offset into the unrolled body
+       bcnd    eq0,r5,Lzero            ; size is a multiple of 8: start with a full pass
+
+       or      r12,r0,lo16(Lbase)
+       or.u    r12,r12,hi16(Lbase)
+       addu    r12,r12,r5              ; r12 is address for entering in loop
+
+       extu    r5,r5,2                 ; divide by 4
+       subu    r2,r2,r5                ; adjust res_ptr
+       subu    r3,r3,r5                ; adjust s1_ptr
+       subu    r4,r4,r5                ; adjust s2_ptr
+
+       or      r8,r6,r0                ; copy first s1 limb to r8: the entry point may use r6/r7 or r8/r9
+
+       jmp.n   r12                     ; enter the unrolled body at the computed position
+        or     r9,r7,r0                ; (delay slot) copy first s2 limb to r9
+
+Loop:  addu    r3,r3,32                ; advance s1_ptr by 8 limbs
+       st      r8,r2,28                ; store the sum left over from the previous pass
+       addu    r4,r4,32                ; advance s2_ptr by 8 limbs
+       ld      r6,r3,0
+       addu    r2,r2,32                ; advance res_ptr by 8 limbs
+       ld      r7,r4,0
+Lzero: subu    r10,r10,1               ; add 0 + 8r limbs (adj loop cnt)
+Lbase: ld      r8,r3,4
+       addu.cio r6,r6,r7
+       ld      r9,r4,4
+       st      r6,r2,0
+       ld      r6,r3,8                 ; add 7 + 8r limbs
+       addu.cio r8,r8,r9
+       ld      r7,r4,8
+       st      r8,r2,4
+       ld      r8,r3,12                ; add 6 + 8r limbs
+       addu.cio r6,r6,r7
+       ld      r9,r4,12
+       st      r6,r2,8
+       ld      r6,r3,16                ; add 5 + 8r limbs
+       addu.cio r8,r8,r9
+       ld      r7,r4,16
+       st      r8,r2,12
+       ld      r8,r3,20                ; add 4 + 8r limbs
+       addu.cio r6,r6,r7
+       ld      r9,r4,20
+       st      r6,r2,16
+       ld      r6,r3,24                ; add 3 + 8r limbs
+       addu.cio r8,r8,r9
+       ld      r7,r4,24
+       st      r8,r2,20
+       ld      r8,r3,28                ; add 2 + 8r limbs
+       addu.cio r6,r6,r7
+       ld      r9,r4,28
+       st      r6,r2,24
+       bcnd.n  ne0,r10,Loop            ; add 1 + 8r limbs
+        addu.cio r8,r8,r9
+
+       st      r8,r2,28                ; store most significant limb
+
+       jmp.n    r1
+        addu.ci r2,r0,r0               ; return carry-out from most sign. limb
diff --git a/sysdeps/m88k/m88100/mul_1.s b/sysdeps/m88k/m88100/mul_1.s

new file mode 100644 (file)

index 0000000..35c238d
--- /dev/null
+++ b/sysdeps/m88k/m88100/mul_1.s
@@ -0,0 +1,128 @@
+; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; size         r4
+; s2_limb      r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention.  It might be tempting to move the
+;    ld instruction in the loops down to save 2 cycles (less WB contention),
+;    but that looses because the ultimate value will be read from outside
+;    the allocated space.  But if we handle the ultimate multiplication in
+;    the tail, we can do this.
+; 3. Make the multiplication with less instructions.  I think the code for
+;    (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb.  (Assuming infinite unrolling.)
+
+       text
+       align    16
+       global   ___mpn_mul_1
+___mpn_mul_1:
+
+       ; Make S1_PTR and RES_PTR point at the end of their blocks
+       ; and negate SIZE.
+       lda      r3,r3[r4]
+       lda      r6,r2[r4]              ; RES_PTR in r6 since r2 is retval
+       subu     r4,r0,r4
+
+       addu.co  r2,r0,r0               ; r2 = cy = 0
+       ld       r9,r3[r4]              ; r9 = first s1 limb
+       mask     r7,r5,0xffff           ; r7 = lo(S2_LIMB)
+       extu     r8,r5,16               ; r8 = hi(S2_LIMB)
+       bcnd.n   eq0,r8,Lsmall          ; jump if (hi(S2_LIMB) == 0)
+        subu    r6,r6,4                ; (delay slot) bias r6 by one limb; the index is already advanced when a limb is stored
+
+; General code for any value of S2_LIMB.
+
+       ; Make a stack frame and save r25 and r26
+       subu     r31,r31,16
+       st.d     r25,r31,8
+
+       ; Enter the loop in the middle
+       br.n    L1
+       addu     r4,r4,1                ; (delay slot) advance the negated index
+
+Loop:
+       ld       r9,r3[r4]              ; r9 = next s1 limb
+       st       r26,r6[r4]             ; store the previous product limb
+; bcnd ne0,r0,0                        ; bubble
+       addu     r4,r4,1
+L1:    mul      r26,r9,r5              ; low word of product   mul_1   WB ld
+       mask     r12,r9,0xffff          ; r12 = lo(s1_limb)     mask_1
+       mul      r11,r12,r7             ; r11 =  prod_0         mul_2   WB mask_1
+       mul      r10,r12,r8             ; r10 = prod_1a         mul_3
+       extu     r13,r9,16              ; r13 = hi(s1_limb)     extu_1  WB mul_1
+       mul      r12,r13,r7             ; r12 = prod_1b         mul_4   WB extu_1
+       mul      r25,r13,r8             ; r25  = prod_2         mul_5   WB mul_2
+       extu     r11,r11,16             ; r11 = hi(prod_0)      extu_2  WB mul_3
+       addu     r10,r10,r11            ;                       addu_1  WB extu_2
+; bcnd ne0,r0,0                        ; bubble                        WB addu_1
+       addu.co  r10,r10,r12            ;                               WB mul_4
+       mask.u   r10,r10,0xffff         ; move the 16 most significant bits...
+       addu.ci  r10,r10,r0             ; ...to the low half of the word...
+       rot      r10,r10,16             ; ...and put carry in pos 16.
+       addu.co  r26,r26,r2             ; add old carry limb
+       bcnd.n   ne0,r4,Loop
+        addu.ci r2,r25,r10             ; compute new carry limb
+
+       st       r26,r6[r4]             ; store the most significant product limb
+       ld.d     r25,r31,8              ; restore r25 and r26
+       jmp.n    r1
+        addu    r31,r31,16             ; (delay slot) pop the stack frame
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+       ; Enter the loop in the middle
+       br.n    SL1
+       addu     r4,r4,1                ; (delay slot) advance the negated index
+
+SLoop:
+       ld       r9,r3[r4]              ;
+       st       r8,r6[r4]              ;
+       addu     r4,r4,1                ;
+SL1:   mul      r8,r9,r5               ; low word of product
+       mask     r12,r9,0xffff          ; r12 = lo(s1_limb)
+       extu     r13,r9,16              ; r13 = hi(s1_limb)
+       mul      r11,r12,r7             ; r11 =  prod_0
+       mul      r12,r13,r7             ; r12 = prod_1b
+       addu.cio r8,r8,r2               ; add old carry limb
+       extu     r10,r11,16             ; r10 = hi(prod_0)
+       addu     r10,r10,r12            ;
+       bcnd.n   ne0,r4,SLoop
+       extu     r2,r10,16              ; r2 = new carry limb
+
+       jmp.n    r1                     ; NOTE(review): the carry flag from the last addu.cio is not added into r2 here -- confirm intended
+       st       r8,r6[r4]              ; (delay slot) store the most significant product limb
diff --git a/sysdeps/m88k/m88100/sub_n.s b/sysdeps/m88k/m88100/sub_n.s

new file mode 100644 (file)

index 0000000..3963cd5
--- /dev/null
+++ b/sysdeps/m88k/m88100/sub_n.s
@@ -0,0 +1,104 @@
+; mc88100 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; s2_ptr       r4
+; size         r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+       text
+       align    16
+       global   ___mpn_sub_n
+___mpn_sub_n:
+       ld      r6,r3,0                 ; read first limb from s1_ptr
+       extu    r10,r5,3                ; r10 = size >> 3, the loop count
+       ld      r7,r4,0                 ; read first limb from s2_ptr
+
+       subu    r5,r0,r5                ; r5 = -size (carry is set up separately below)
+       mak     r5,r5,3<4>              ; r5 = (low 3 bits of -size) << 4, byte offset into the unrolled body
+       bcnd.n  eq0,r5,Lzero            ; size is a multiple of 8: start with a full pass
+        subu.co r0,r0,r0               ; (delay slot) 0 - 0 cannot borrow: carry = 1 = "no borrow" for the first subu.cio
+
+       or      r12,r0,lo16(Lbase)
+       or.u    r12,r12,hi16(Lbase)
+       addu    r12,r12,r5              ; r12 is address for entering in loop
+
+       extu    r5,r5,2                 ; divide by 4
+       subu    r2,r2,r5                ; adjust res_ptr
+       subu    r3,r3,r5                ; adjust s1_ptr
+       subu    r4,r4,r5                ; adjust s2_ptr
+
+       or      r8,r6,r0                ; copy first s1 limb to r8: the entry point may use r6/r7 or r8/r9
+       jmp.n   r12                     ; enter the unrolled body at the computed position
+        or     r9,r7,r0                ; (delay slot) copy first s2 limb to r9
+
+Loop:  addu    r3,r3,32                ; advance s1_ptr by 8 limbs
+       st      r8,r2,28                ; store the difference left over from the previous pass
+       addu    r4,r4,32                ; advance s2_ptr by 8 limbs
+       ld      r6,r3,0
+       addu    r2,r2,32                ; advance res_ptr by 8 limbs
+       ld      r7,r4,0
+Lzero: subu    r10,r10,1               ; subtract 0 + 8r limbs (adj loop cnt)
+Lbase: ld      r8,r3,4
+       subu.cio r6,r6,r7
+       ld      r9,r4,4
+       st      r6,r2,0
+       ld      r6,r3,8                 ; subtract 7 + 8r limbs
+       subu.cio r8,r8,r9
+       ld      r7,r4,8
+       st      r8,r2,4
+       ld      r8,r3,12                ; subtract 6 + 8r limbs
+       subu.cio r6,r6,r7
+       ld      r9,r4,12
+       st      r6,r2,8
+       ld      r6,r3,16                ; subtract 5 + 8r limbs
+       subu.cio r8,r8,r9
+       ld      r7,r4,16
+       st      r8,r2,12
+       ld      r8,r3,20                ; subtract 4 + 8r limbs
+       subu.cio r6,r6,r7
+       ld      r9,r4,20
+       st      r6,r2,16
+       ld      r6,r3,24                ; subtract 3 + 8r limbs
+       subu.cio r8,r8,r9
+       ld      r7,r4,24
+       st      r8,r2,20
+       ld      r8,r3,28                ; subtract 2 + 8r limbs
+       subu.cio r6,r6,r7
+       ld      r9,r4,28
+       st      r6,r2,24
+       bcnd.n  ne0,r10,Loop            ; subtract 1 + 8r limbs
+        subu.cio r8,r8,r9
+
+       st      r8,r2,28                ; store most significant limb
+
+       addu.ci r2,r0,r0                ; return carry-out from most sign. limb
+       jmp.n    r1
+        xor    r2,r2,1                 ; (delay slot) invert: carry set means no borrow, so r2 = borrow
diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s

new file mode 100644 (file)

index 0000000..08c3ca0
--- /dev/null
+++ b/sysdeps/m88k/m88110/mul_1.s
@@ -0,0 +1,84 @@
+; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr      r2
+; s1_ptr       r3
+; size         r4
+; s2_limb      r5
+
+       text
+       align   16
+       global  ___mpn_mul_1
+___mpn_mul_1:
+       ; Make S1_PTR and RES_PTR point at the end of their blocks
+       ; and negate SIZE.
+       lda      r3,r3[r4]
+       lda      r8,r2[r4]              ; RES_PTR in r8 since r2 is retval
+       subu     r4,r0,r4
+
+       addu.co  r2,r0,r0               ; r2 = cy = 0
+
+       ld       r6,r3[r4]              ; r6 = first s1 limb
+       addu     r4,r4,1
+       mulu.d   r10,r6,r5              ; double-word product: r11 is added into the result limb, r10 carried onward
+       bcnd.n   eq0,r4,Lend            ; single limb: skip the loop
+        subu    r8,r8,8                ; (delay slot) bias r8 by two limbs to match the store offsets below
+
+Loop:  ld       r6,r3[r4]              ; r6 = next s1 limb
+       addu.cio r9,r11,r2              ; r9 = r11 + previous carry limb + carry flag
+       or       r2,r10,r0              ; could be avoided if unrolled
+       addu     r4,r4,1
+       mulu.d   r10,r6,r5
+       bcnd.n   ne0,r4,Loop
+        st      r9,r8[r4]              ; (delay slot) store the finished result limb
+
+Lend:  addu.cio r9,r11,r2              ; last limb: r11 + carry limb + carry flag
+       st       r9,r8,4                ; store the most significant result limb
+       jmp.n    r1
+        addu.ci r2,r10,r0              ; (delay slot) return r10 + carry flag in r2
+
+; This is the Right Way to do this on '110.  4 cycles / 64-bit limb.
+;      ld.d    r10,
+;      mulu.d
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d  ,r11,r5
+;      ld.d    r12,
+;      mulu.d  ,r10,r5
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d
+;      ld.d    r10,
+;      mulu.d
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d
+;      ld.d    r10,
+;      mulu.d
+;      addu.cio
+;      addu.cio
+;      st.d
+;      mulu.d
diff --git a/sysdeps/mips/add_n.s b/sysdeps/mips/add_n.s

new file mode 100644 (file)

index 0000000..c829108
--- /dev/null
+++ b/sysdeps/mips/add_n.s
@@ -0,0 +1,119 @@
+ # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+       lw      $10,0($5)       # $10 = first s1 limb
+       lw      $11,0($6)       # $11 = first s2 limb
+
+       addiu   $7,$7,-1        # $7 = size-1; the final limb is always handled at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        move   $2,$0           # (delay slot) $2 = carry = 0
+
+       subu    $7,$7,$9        # $7 = limbs left for the 4-way unrolled loop
+
+.Loop0:        addiu   $9,$9,-1
+       lw      $12,4($5)       # preload next s1 limb
+       addu    $11,$11,$2      # s2 limb + carry-in
+       lw      $13,4($6)       # preload next s2 limb
+       sltu    $8,$11,$2       # $8 = carry out of (s2 limb + carry-in)
+       addu    $11,$10,$11     # sum = s1 limb + (s2 limb + carry-in)
+       sltu    $2,$11,$10      # $2 = carry out of that addition
+       sw      $11,0($4)       # store sum limb
+       or      $2,$2,$8        # merge the two partial carries into the carry-out
+
+       addiu   $5,$5,4
+       addiu   $6,$6,4
+       move    $10,$12         # preloaded limbs become the current pair
+       move    $11,$13
+       bne     $9,$0,.Loop0
+        addiu  $4,$4,4         # (delay slot) advance res_ptr
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: addiu   $7,$7,-4        # four limbs per pass; current pair alternates $10/$11 and $12/$13
+
+       lw      $12,4($5)       # limb 0: add $10/$11 while preloading $12/$13
+       addu    $11,$11,$2
+       lw      $13,4($6)
+       sltu    $8,$11,$2
+       addu    $11,$10,$11
+       sltu    $2,$11,$10
+       sw      $11,0($4)
+       or      $2,$2,$8
+
+       lw      $10,8($5)       # limb 1: add $12/$13 while preloading $10/$11
+       addu    $13,$13,$2
+       lw      $11,8($6)
+       sltu    $8,$13,$2
+       addu    $13,$12,$13
+       sltu    $2,$13,$12
+       sw      $13,4($4)
+       or      $2,$2,$8
+
+       lw      $12,12($5)      # limb 2: add $10/$11 while preloading $12/$13
+       addu    $11,$11,$2
+       lw      $13,12($6)
+       sltu    $8,$11,$2
+       addu    $11,$10,$11
+       sltu    $2,$11,$10
+       sw      $11,8($4)
+       or      $2,$2,$8
+
+       lw      $10,16($5)      # limb 3: add $12/$13 while preloading $10/$11
+       addu    $13,$13,$2
+       lw      $11,16($6)
+       sltu    $8,$13,$2
+       addu    $13,$12,$13
+       sltu    $2,$13,$12
+       sw      $13,12($4)
+       or      $2,$2,$8
+
+       addiu   $5,$5,16
+       addiu   $6,$6,16
+
+       bne     $7,$0,.Loop
+        addiu  $4,$4,16        # (delay slot) advance res_ptr
+
+.Lend: addu    $11,$11,$2      # final limb: the pair already held in $10/$11
+       sltu    $8,$11,$2
+       addu    $11,$10,$11
+       sltu    $2,$11,$10
+       sw      $11,0($4)
+       j       $31             # return through $31
+       or      $2,$2,$8        # (delay slot) $2 = final carry-out
+
+       .end    __mpn_add_n
diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s

new file mode 100644 (file)

index 0000000..abc2fb8
--- /dev/null
+++ b/sysdeps/mips/addmul_1.s
@@ -0,0 +1,96 @@
+ # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align   4
+       .globl   __mpn_addmul_1
+       .ent    __mpn_addmul_1
+__mpn_addmul_1:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+ # warm up phase 0
+       lw      $8,0($5)        # $8 = first s1 limb
+
+ # warm up phase 1
+       addiu   $5,$5,4
+       multu   $8,$7           # start first product; result is read later with mflo/mfhi
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC0      # size == 1: only the final cool down phase is needed
+        move   $2,$0           # zero cy2
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC1      # size == 2: skip the steady-state loop
+       lw      $8,0($5)        # load new s1 limb as early as possible (this is the beq delay slot)
+
+Loop:  lw      $10,0($4)       # $10 = current res limb
+       mflo    $3              # $3 = low limb of the pending product
+       mfhi    $9              # $9 = high limb of the pending product
+       addiu   $5,$5,4
+       addu    $3,$3,$2        # add old carry limb to low product limb
+       multu   $8,$7           # start the next product
+       lw      $8,0($5)        # load new s1 limb as early as possible
+       addiu   $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       addu    $3,$10,$3       # add res limb
+       sltu    $10,$3,$10      # $10 = carry from adding the res limb
+       addu    $2,$2,$10       # total carry out of the low limb
+       sw      $3,0($4)        # store updated res limb
+       addiu   $4,$4,4
+       bne     $6,$0,Loop      # should be "bnel"
+        addu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  lw      $10,0($4)       # same as a loop pass, but no further s1 limb is loaded
+       mflo    $3
+       mfhi    $9
+       addu    $3,$3,$2
+       sltu    $2,$3,$2
+       multu   $8,$7           # start the last product
+       addu    $3,$10,$3
+       sltu    $10,$3,$10
+       addu    $2,$2,$10
+       sw      $3,0($4)
+       addiu   $4,$4,4
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  lw      $10,0($4)       # consume the last product; no new multiply is started
+       mflo    $3
+       mfhi    $9
+       addu    $3,$3,$2
+       sltu    $2,$3,$2
+       addu    $3,$10,$3
+       sltu    $10,$3,$10
+       addu    $2,$2,$10
+       sw      $3,0($4)
+       j       $31             # return through $31
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_addmul_1
diff --git a/sysdeps/mips/lshift.s b/sysdeps/mips/lshift.s

new file mode 100644 (file)

index 0000000..ce33e7c
--- /dev/null
+++ b/sysdeps/mips/lshift.s
@@ -0,0 +1,94 @@
+ # MIPS2 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+       sll     $2,$6,2         # $2 = size in bytes
+       addu    $5,$5,$2        # make r5 point at end of src
+       lw      $10,-4($5)      # load first limb
+       subu    $13,$0,$7       # $13 = -cnt, used as the complementary shift count (assumes 0 < cnt < 32 -- TODO confirm)
+       addu    $4,$4,$2        # make r4 point at end of res
+       addiu   $6,$6,-1        # $6 = size-1; the lowest limb is finished at .Lend
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        srl    $2,$10,$13      # compute function result
+
+       subu    $6,$6,$9        # $6 = limbs left for the 4-way unrolled loop
+
+.Loop0:        lw      $3,-8($5)       # $3 = next lower src limb
+       addiu   $4,$4,-4
+       addiu   $5,$5,-4
+       addiu   $9,$9,-1
+       sll     $11,$10,$7      # upper part comes from the current limb
+       srl     $12,$3,$13      # lower part comes from the top bits of the next limb
+       move    $10,$3          # next limb becomes the current limb
+       or      $8,$11,$12
+       bne     $9,$0,.Loop0
+        sw     $8,0($4)        # (delay slot) store result limb
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: lw      $3,-8($5)       # four limbs per pass; current limb alternates between $10 and $3
+       addiu   $4,$4,-16
+       addiu   $6,$6,-4
+       sll     $11,$10,$7
+       srl     $12,$3,$13
+
+       lw      $10,-12($5)
+       sll     $14,$3,$7
+       or      $8,$11,$12
+       sw      $8,12($4)
+       srl     $9,$10,$13
+
+       lw      $3,-16($5)
+       sll     $11,$10,$7
+       or      $8,$14,$9
+       sw      $8,8($4)
+       srl     $12,$3,$13
+
+       lw      $10,-20($5)
+       sll     $14,$3,$7
+       or      $8,$11,$12
+       sw      $8,4($4)
+       srl     $9,$10,$13
+
+       addiu   $5,$5,-16
+       or      $8,$14,$9
+       bgtz    $6,.Loop
+        sw     $8,0($4)        # (delay slot) store fourth result limb
+
+.Lend: sll     $8,$10,$7       # lowest limb: only the shifted current limb, nothing shifted in
+       j       $31             # return through $31
+       sw      $8,-4($4)       # (delay slot) store lowest result limb
+       .end    __mpn_lshift
diff --git a/sysdeps/mips/mips3/add_n.s b/sysdeps/mips/mips3/add_n.s

new file mode 100644 (file)

index 0000000..b525780
--- /dev/null
+++ b/sysdeps/mips/mips3/add_n.s
@@ -0,0 +1,119 @@
+ # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+       ld      $10,0($5)       # $10 = first s1 limb (64-bit)
+       ld      $11,0($6)       # $11 = first s2 limb (64-bit)
+
+       daddiu  $7,$7,-1        # $7 = size-1; the final limb is always handled at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        move   $2,$0           # (delay slot) $2 = carry = 0
+
+       dsubu   $7,$7,$9        # $7 = limbs left for the 4-way unrolled loop
+
+.Loop0:        daddiu  $9,$9,-1
+       ld      $12,8($5)       # preload next s1 limb
+       daddu   $11,$11,$2      # s2 limb + carry-in
+       ld      $13,8($6)       # preload next s2 limb
+       sltu    $8,$11,$2       # $8 = carry out of (s2 limb + carry-in)
+       daddu   $11,$10,$11     # sum = s1 limb + (s2 limb + carry-in)
+       sltu    $2,$11,$10      # $2 = carry out of that addition
+       sd      $11,0($4)       # store sum limb
+       or      $2,$2,$8        # merge the two partial carries into the carry-out
+
+       daddiu  $5,$5,8
+       daddiu  $6,$6,8
+       move    $10,$12         # preloaded limbs become the current pair
+       move    $11,$13
+       bne     $9,$0,.Loop0
+        daddiu $4,$4,8         # (delay slot) advance res_ptr
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: daddiu  $7,$7,-4        # four limbs per pass; current pair alternates $10/$11 and $12/$13
+
+       ld      $12,8($5)       # limb 0: add $10/$11 while preloading $12/$13
+       daddu   $11,$11,$2
+       ld      $13,8($6)
+       sltu    $8,$11,$2
+       daddu   $11,$10,$11
+       sltu    $2,$11,$10
+       sd      $11,0($4)
+       or      $2,$2,$8
+
+       ld      $10,16($5)      # limb 1: add $12/$13 while preloading $10/$11
+       daddu   $13,$13,$2
+       ld      $11,16($6)
+       sltu    $8,$13,$2
+       daddu   $13,$12,$13
+       sltu    $2,$13,$12
+       sd      $13,8($4)
+       or      $2,$2,$8
+
+       ld      $12,24($5)      # limb 2: add $10/$11 while preloading $12/$13
+       daddu   $11,$11,$2
+       ld      $13,24($6)
+       sltu    $8,$11,$2
+       daddu   $11,$10,$11
+       sltu    $2,$11,$10
+       sd      $11,16($4)
+       or      $2,$2,$8
+
+       ld      $10,32($5)      # limb 3: add $12/$13 while preloading $10/$11
+       daddu   $13,$13,$2
+       ld      $11,32($6)
+       sltu    $8,$13,$2
+       daddu   $13,$12,$13
+       sltu    $2,$13,$12
+       sd      $13,24($4)
+       or      $2,$2,$8
+
+       daddiu  $5,$5,32
+       daddiu  $6,$6,32
+
+       bne     $7,$0,.Loop
+        daddiu $4,$4,32        # (delay slot) advance res_ptr
+
+.Lend: daddu   $11,$11,$2      # final limb: the pair already held in $10/$11
+       sltu    $8,$11,$2
+       daddu   $11,$10,$11
+       sltu    $2,$11,$10
+       sd      $11,0($4)
+       j       $31             # return through $31
+       or      $2,$2,$8        # (delay slot) $2 = final carry-out
+
+       .end    __mpn_add_n
diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s

new file mode 100644 (file)

index 0000000..7af0172
--- /dev/null
+++ b/sysdeps/mips/mips3/addmul_1.s
@@ -0,0 +1,96 @@
+ # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align  4
+       .globl  __mpn_addmul_1
+       .ent    __mpn_addmul_1
+__mpn_addmul_1:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+ # warm up phase 0
+       ld      $8,0($5)        # $8 = first s1 limb (64-bit)
+
+ # warm up phase 1
+       daddiu  $5,$5,8
+       dmultu  $8,$7           # start first product; result is read later with mflo/mfhi
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC0      # size == 1: only the final cool down phase is needed
+        move   $2,$0           # zero cy2
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC1      # size == 2: skip the steady-state loop
+       ld      $8,0($5)        # load new s1 limb as early as possible (this is the beq delay slot)
+
+Loop:  ld      $10,0($4)       # $10 = current res limb
+       mflo    $3              # $3 = low limb of the pending product
+       mfhi    $9              # $9 = high limb of the pending product
+       daddiu  $5,$5,8
+       daddu   $3,$3,$2        # add old carry limb to low product limb
+       dmultu  $8,$7           # start the next product
+       ld      $8,0($5)        # load new s1 limb as early as possible
+       daddiu  $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       daddu   $3,$10,$3       # add res limb
+       sltu    $10,$3,$10      # $10 = carry from adding the res limb
+       daddu   $2,$2,$10       # total carry out of the low limb
+       sd      $3,0($4)        # store updated res limb
+       daddiu  $4,$4,8
+       bne     $6,$0,Loop      # should be "bnel"
+        daddu  $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  ld      $10,0($4)       # same as a loop pass, but no further s1 limb is loaded
+       mflo    $3
+       mfhi    $9
+       daddu   $3,$3,$2
+       sltu    $2,$3,$2
+       dmultu  $8,$7           # start the last product
+       daddu   $3,$10,$3
+       sltu    $10,$3,$10
+       daddu   $2,$2,$10
+       sd      $3,0($4)
+       daddiu  $4,$4,8
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  ld      $10,0($4)       # consume the last product; no new multiply is started
+       mflo    $3
+       mfhi    $9
+       daddu   $3,$3,$2
+       sltu    $2,$3,$2
+       daddu   $3,$10,$3
+       sltu    $10,$3,$10
+       daddu   $2,$2,$10
+       sd      $3,0($4)
+       j       $31             # return through $31
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_addmul_1
diff --git a/sysdeps/mips/mips3/lshift.s b/sysdeps/mips/mips3/lshift.s

new file mode 100644 (file)

index 0000000..c05dcaf
--- /dev/null
+++ b/sysdeps/mips/mips3/lshift.s
@@ -0,0 +1,94 @@
+ # MIPS3 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+       dsll    $2,$6,3         # $2 = size in bytes (8-byte limbs)
+       daddu   $5,$5,$2        # make r5 point at end of src
+       ld      $10,-8($5)      # load first limb
+       dsubu   $13,$0,$7       # $13 = -cnt, used as the complementary shift count (assumes 0 < cnt < 64 -- TODO confirm)
+       daddu   $4,$4,$2        # make r4 point at end of res
+       daddiu  $6,$6,-1        # $6 = size-1; the lowest limb is finished at .Lend
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        dsrl   $2,$10,$13      # compute function result
+
+       dsubu   $6,$6,$9        # $6 = limbs left for the 4-way unrolled loop
+
+.Loop0:        ld      $3,-16($5)      # $3 = next lower src limb
+       daddiu  $4,$4,-8
+       daddiu  $5,$5,-8
+       daddiu  $9,$9,-1
+       dsll    $11,$10,$7      # upper part comes from the current limb
+       dsrl    $12,$3,$13      # lower part comes from the top bits of the next limb
+       move    $10,$3          # next limb becomes the current limb
+       or      $8,$11,$12
+       bne     $9,$0,.Loop0
+        sd     $8,0($4)        # (delay slot) store result limb
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: ld      $3,-16($5)      # four limbs per pass; current limb alternates between $10 and $3
+       daddiu  $4,$4,-32
+       daddiu  $6,$6,-4
+       dsll    $11,$10,$7
+       dsrl    $12,$3,$13
+
+       ld      $10,-24($5)
+       dsll    $14,$3,$7
+       or      $8,$11,$12
+       sd      $8,24($4)
+       dsrl    $9,$10,$13
+
+       ld      $3,-32($5)
+       dsll    $11,$10,$7
+       or      $8,$14,$9
+       sd      $8,16($4)
+       dsrl    $12,$3,$13
+
+       ld      $10,-40($5)
+       dsll    $14,$3,$7
+       or      $8,$11,$12
+       sd      $8,8($4)
+       dsrl    $9,$10,$13
+
+       daddiu  $5,$5,-32
+       or      $8,$14,$9
+       bgtz    $6,.Loop
+        sd     $8,0($4)        # (delay slot) store fourth result limb
+
+.Lend: dsll    $8,$10,$7       # lowest limb: only the shifted current limb, nothing shifted in
+       j       $31             # return through $31
+       sd      $8,-8($4)       # (delay slot) store lowest result limb
+       .end    __mpn_lshift
diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s

new file mode 100644 (file)

index 0000000..87954e5
--- /dev/null
+++ b/sysdeps/mips/mips3/mul_1.s
@@ -0,0 +1,84 @@
+ # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align  4
+       .globl  __mpn_mul_1
+       .ent    __mpn_mul_1
+__mpn_mul_1:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+ # warm up phase 0
+       ld      $8,0($5)        # $8 = first s1 limb (64-bit)
+
+ # warm up phase 1
+       daddiu  $5,$5,8
+       dmultu  $8,$7           # start first product; result is read later with mflo/mfhi
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC0      # size == 1: only the final cool down phase is needed
+        move   $2,$0           # zero cy2
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC1      # size == 2: skip the steady-state loop
+       ld      $8,0($5)        # load new s1 limb as early as possible (this is the beq delay slot)
+
+Loop:  mflo    $10             # $10 = low limb of the pending product
+       mfhi    $9              # $9 = high limb of the pending product
+       daddiu  $5,$5,8
+       daddu   $10,$10,$2      # add old carry limb to low product limb
+       dmultu  $8,$7           # start the next product
+       ld      $8,0($5)        # load new s1 limb as early as possible
+       daddiu  $6,$6,-1        # decrement loop counter
+       sltu    $2,$10,$2       # carry from previous addition -> $2
+       sd      $10,0($4)       # store result limb
+       daddiu  $4,$4,8
+       bne     $6,$0,Loop      # should be "bnel"
+        daddu  $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  mflo    $10             # same as a loop pass, but no further s1 limb is loaded
+       mfhi    $9
+       daddu   $10,$10,$2
+       sltu    $2,$10,$2
+       dmultu  $8,$7           # start the last product
+       sd      $10,0($4)
+       daddiu  $4,$4,8
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  mflo    $10             # consume the last product; no new multiply is started
+       mfhi    $9
+       daddu   $10,$10,$2
+       sltu    $2,$10,$2
+       sd      $10,0($4)
+       j       $31             # return through $31
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_mul_1
diff --git a/sysdeps/mips/mips3/rshift.s b/sysdeps/mips/mips3/rshift.s

new file mode 100644 (file)

index 0000000..e0e2ca2
--- /dev/null
+++ b/sysdeps/mips/mips3/rshift.s
@@ -0,0 +1,91 @@
+ # MIPS3 __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+       ld      $10,0($5)       # load first limb
+       dsubu   $13,$0,$7       # $13 = -cnt, used as the complementary shift count (assumes 0 < cnt < 64 -- TODO confirm)
+       daddiu  $6,$6,-1        # $6 = size-1; the highest limb is finished at .Lend
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        dsll   $2,$10,$13      # compute function result
+
+       dsubu   $6,$6,$9        # $6 = limbs left for the 4-way unrolled loop
+
+.Loop0:        ld      $3,8($5)        # $3 = next higher src limb
+       daddiu  $4,$4,8
+       daddiu  $5,$5,8
+       daddiu  $9,$9,-1
+       dsrl    $11,$10,$7      # lower part comes from the current limb
+       dsll    $12,$3,$13      # upper part comes from the low bits of the next limb
+       move    $10,$3          # next limb becomes the current limb
+       or      $8,$11,$12
+       bne     $9,$0,.Loop0
+        sd     $8,-8($4)       # (delay slot) store result limb; $4 was already advanced
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: ld      $3,8($5)        # four limbs per pass; current limb alternates between $10 and $3
+       daddiu  $4,$4,32
+       daddiu  $6,$6,-4
+       dsrl    $11,$10,$7
+       dsll    $12,$3,$13
+
+       ld      $10,16($5)
+       dsrl    $14,$3,$7
+       or      $8,$11,$12
+       sd      $8,-32($4)
+       dsll    $9,$10,$13
+
+       ld      $3,24($5)
+       dsrl    $11,$10,$7
+       or      $8,$14,$9
+       sd      $8,-24($4)
+       dsll    $12,$3,$13
+
+       ld      $10,32($5)
+       dsrl    $14,$3,$7
+       or      $8,$11,$12
+       sd      $8,-16($4)
+       dsll    $9,$10,$13
+
+       daddiu  $5,$5,32
+       or      $8,$14,$9
+       bgtz    $6,.Loop
+        sd     $8,-8($4)       # (delay slot) store fourth result limb
+
+.Lend: dsrl    $8,$10,$7       # highest limb: only the shifted current limb, nothing shifted in
+       j       $31             # return through $31
+       sd      $8,0($4)        # (delay slot) store highest result limb
+       .end    __mpn_rshift
diff --git a/sysdeps/mips/mips3/sub_n.s b/sysdeps/mips/mips3/sub_n.s

new file mode 100644 (file)

index 0000000..9a45ffd
--- /dev/null
+++ b/sysdeps/mips/mips3/sub_n.s
@@ -0,0 +1,119 @@
+ # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+       ld      $10,0($5)       # $10 = first s1 limb (minuend)
+       ld      $11,0($6)       # $11 = first s2 limb (subtrahend)
+
+       daddiu  $7,$7,-1        # $7 = size-1; the final limb is always handled at .Lend
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        move   $2,$0           # (delay slot) $2 = borrow = 0
+
+       dsubu   $7,$7,$9        # $7 = limbs left for the 4-way unrolled loop
+
+.Loop0:        daddiu  $9,$9,-1
+       ld      $12,8($5)       # preload next s1 limb
+       daddu   $11,$11,$2      # s2 limb + borrow-in
+       ld      $13,8($6)       # preload next s2 limb
+       sltu    $8,$11,$2       # $8 = carry out of (s2 limb + borrow-in)
+       dsubu   $11,$10,$11     # diff = s1 limb - (s2 limb + borrow-in)
+       sltu    $2,$10,$11      # $2 = borrow: set when diff is larger than the s1 limb
+       sd      $11,0($4)       # store difference limb
+       or      $2,$2,$8        # merge the two partial borrows into the borrow-out
+
+       daddiu  $5,$5,8
+       daddiu  $6,$6,8
+       move    $10,$12         # preloaded limbs become the current pair
+       move    $11,$13
+       bne     $9,$0,.Loop0
+        daddiu $4,$4,8         # (delay slot) advance res_ptr
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: daddiu  $7,$7,-4        # four limbs per pass; current pair alternates $10/$11 and $12/$13
+
+       ld      $12,8($5)       # limb 0: subtract $11 from $10 while preloading $12/$13
+       daddu   $11,$11,$2
+       ld      $13,8($6)
+       sltu    $8,$11,$2
+       dsubu   $11,$10,$11
+       sltu    $2,$10,$11
+       sd      $11,0($4)
+       or      $2,$2,$8
+
+       ld      $10,16($5)      # limb 1: subtract $13 from $12 while preloading $10/$11
+       daddu   $13,$13,$2
+       ld      $11,16($6)
+       sltu    $8,$13,$2
+       dsubu   $13,$12,$13
+       sltu    $2,$12,$13
+       sd      $13,8($4)
+       or      $2,$2,$8
+
+       ld      $12,24($5)      # limb 2: subtract $11 from $10 while preloading $12/$13
+       daddu   $11,$11,$2
+       ld      $13,24($6)
+       sltu    $8,$11,$2
+       dsubu   $11,$10,$11
+       sltu    $2,$10,$11
+       sd      $11,16($4)
+       or      $2,$2,$8
+
+       ld      $10,32($5)      # limb 3: subtract $13 from $12 while preloading $10/$11
+       daddu   $13,$13,$2
+       ld      $11,32($6)
+       sltu    $8,$13,$2
+       dsubu   $13,$12,$13
+       sltu    $2,$12,$13
+       sd      $13,24($4)
+       or      $2,$2,$8
+
+       daddiu  $5,$5,32
+       daddiu  $6,$6,32
+
+       bne     $7,$0,.Loop
+        daddiu $4,$4,32        # (delay slot) advance res_ptr
+
+.Lend: daddu   $11,$11,$2      # final limb: the pair already held in $10/$11
+       sltu    $8,$11,$2
+       dsubu   $11,$10,$11
+       sltu    $2,$10,$11
+       sd      $11,0($4)
+       j       $31             # return through $31
+       or      $2,$2,$8        # (delay slot) $2 = final borrow-out
+
+       .end    __mpn_sub_n
diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s

new file mode 100644 (file)

index 0000000..f28c6a5
--- /dev/null
+++ b/sysdeps/mips/mips3/submul_1.s
@@ -0,0 +1,96 @@
+ # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align  4
+       .globl  __mpn_submul_1
+       .ent    __mpn_submul_1
+__mpn_submul_1:
+       .set    noreorder       # branch delay slots below are filled by hand
+       .set    nomacro
+
+ # warm up phase 0
+       ld      $8,0($5)        # $8 = first s1 limb (64-bit)
+
+ # warm up phase 1
+       daddiu  $5,$5,8
+       dmultu  $8,$7           # start first product; result is read later with mflo/mfhi
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC0      # size == 1: only the final cool down phase is needed
+        move   $2,$0           # zero cy2
+
+       daddiu  $6,$6,-1
+       beq     $6,$0,$LC1      # size == 2: skip the steady-state loop
+       ld      $8,0($5)        # load new s1 limb as early as possible (this is the beq delay slot)
+
+Loop:  ld      $10,0($4)       # $10 = current res limb
+       mflo    $3              # $3 = low limb of the pending product
+       mfhi    $9              # $9 = high limb of the pending product
+       daddiu  $5,$5,8
+       daddu   $3,$3,$2        # add old carry limb to low product limb
+       dmultu  $8,$7           # start the next product
+       ld      $8,0($5)        # load new s1 limb as early as possible
+       daddiu  $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       dsubu   $3,$10,$3       # res limb - (low product limb + carry)
+       sgtu    $10,$3,$10      # $10 = borrow: set when the difference exceeds the old res limb
+       daddu   $2,$2,$10       # total carry/borrow out of the low limb
+       sd      $3,0($4)        # store updated res limb
+       daddiu  $4,$4,8
+       bne     $6,$0,Loop      # should be "bnel"
+        daddu  $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  ld      $10,0($4)       # same as a loop pass, but no further s1 limb is loaded
+       mflo    $3
+       mfhi    $9
+       daddu   $3,$3,$2
+       sltu    $2,$3,$2
+       dmultu  $8,$7           # start the last product
+       dsubu   $3,$10,$3
+       sgtu    $10,$3,$10
+       daddu   $2,$2,$10
+       sd      $3,0($4)
+       daddiu  $4,$4,8
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  ld      $10,0($4)       # consume the last product; no new multiply is started
+       mflo    $3
+       mfhi    $9
+       daddu   $3,$3,$2
+       sltu    $2,$3,$2
+       dsubu   $3,$10,$3
+       sgtu    $10,$3,$10
+       daddu   $2,$2,$10
+       sd      $3,0($4)
+       j       $31             # return through $31
+       daddu   $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_submul_1
diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s

new file mode 100644 (file)

index 0000000..01327e2
--- /dev/null
+++ b/sysdeps/mips/mul_1.s
@@ -0,0 +1,84 @@
+ # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align   4
+       .globl   __mpn_mul_1
+       .ent    __mpn_mul_1
+__mpn_mul_1:
+       .set    noreorder       # keep this exact order; branch delay slots are filled by hand
+       .set    nomacro
+
+ # warm up phase 0
+       lw      $8,0($5)        # fetch the first s1 limb
+
+ # warm up phase 1
+       addiu   $5,$5,4         # advance s1_ptr by one 4-byte limb
+       multu   $8,$7           # start first multiply: HI:LO = limb * s2_limb
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only the final stage remains
+        move   $2,$0           # zero cy2
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       lw      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  mflo    $10             # low half of the pending product
+       mfhi    $9              # high half of the pending product
+       addiu   $5,$5,4
+       addu    $10,$10,$2      # add old carry limb to low product limb
+       multu   $8,$7           # start the next multiply
+       lw      $8,0($5)        # load new s1 limb as early as possible
+       addiu   $6,$6,-1        # decrement loop counter
+       sltu    $2,$10,$2       # carry from previous addition -> $2
+       sw      $10,0($4)       # store the result limb
+       addiu   $4,$4,4
+       bne     $6,$0,Loop      # should be "bnel"
+        addu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  mflo    $10             # same step as the loop body, without a further load
+       mfhi    $9
+       addu    $10,$10,$2      # add old carry limb to low product limb
+       sltu    $2,$10,$2       # carry from that addition -> $2
+       multu   $8,$7           # start the last multiply
+       sw      $10,0($4)
+       addiu   $4,$4,4
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  mflo    $10             # final limb: consume the last product
+       mfhi    $9
+       addu    $10,$10,$2      # add old carry limb to low product limb
+       sltu    $2,$10,$2       # carry from that addition -> $2
+       sw      $10,0($4)
+       j       $31             # return; the next instruction is the delay slot
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_mul_1
diff --git a/sysdeps/mips/rshift.s b/sysdeps/mips/rshift.s

new file mode 100644 (file)

index 0000000..6941691
--- /dev/null
+++ b/sysdeps/mips/rshift.s
@@ -0,0 +1,91 @@
+ # MIPS2 __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # src_ptr     $5
+ # size                $6
+ # cnt         $7
+
+       .text
+       .align  2
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:
+       .set    noreorder       # keep this exact order; branch delay slots are filled by hand
+       .set    nomacro
+
+       lw      $10,0($5)       # load first limb
+       subu    $13,$0,$7       # $13 = -cnt, used as the complementary left-shift count
+       addiu   $6,$6,-1        # $6 = size - 1
+       and     $9,$6,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        sll    $2,$10,$13      # compute function result
+
+       subu    $6,$6,$9        # $6 = limbs left for the 4-way unrolled loop
+
+.Loop0:        lw      $3,4($5)        # next higher limb
+       addiu   $4,$4,4
+       addiu   $5,$5,4
+       addiu   $9,$9,-1
+       srl     $11,$10,$7      # current limb shifted right by cnt
+       sll     $12,$3,$13      # next limb shifted left by $13
+       move    $10,$3          # next limb becomes the current limb
+       or      $8,$11,$12      # merge the two parts into one result limb
+       bne     $9,$0,.Loop0
+        sw     $8,-4($4)       # delay slot: store result ($4 already advanced)
+
+.L0:   beq     $6,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: lw      $3,4($5)        # four result limbs per pass
+       addiu   $4,$4,16
+       addiu   $6,$6,-4
+       srl     $11,$10,$7
+       sll     $12,$3,$13
+
+       lw      $10,8($5)
+       srl     $14,$3,$7
+       or      $8,$11,$12
+       sw      $8,-16($4)
+       sll     $9,$10,$13
+
+       lw      $3,12($5)
+       srl     $11,$10,$7
+       or      $8,$14,$9
+       sw      $8,-12($4)
+       sll     $12,$3,$13
+
+       lw      $10,16($5)
+       srl     $14,$3,$7
+       or      $8,$11,$12
+       sw      $8,-8($4)
+       sll     $9,$10,$13
+
+       addiu   $5,$5,16
+       or      $8,$14,$9
+       bgtz    $6,.Loop
+        sw     $8,-4($4)       # delay slot: store the fourth result limb
+
+.Lend: srl     $8,$10,$7       # last limb: only the right-shifted part
+       j       $31             # return; the next instruction is the delay slot
+       sw      $8,0($4)
+       .end    __mpn_rshift
diff --git a/sysdeps/mips/sub_n.s b/sysdeps/mips/sub_n.s

new file mode 100644 (file)

index 0000000..63f3b55
--- /dev/null
+++ b/sysdeps/mips/sub_n.s
@@ -0,0 +1,119 @@
+ # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # s2_ptr      $6
+ # size                $7
+
+       .text
+       .align  2
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:
+       .set    noreorder       # keep this exact order; branch delay slots are filled by hand
+       .set    nomacro
+
+       lw      $10,0($5)       # first s1 limb
+       lw      $11,0($6)       # first s2 limb
+
+       addiu   $7,$7,-1        # $7 = size - 1
+       and     $9,$7,4-1       # number of limbs in first loop
+       beq     $9,$0,.L0       # if size-1 is a multiple of 4, skip first loop
+        move   $2,$0           # delay slot: clear the incoming borrow
+
+       subu    $7,$7,$9        # $7 = limbs left for the 4-way unrolled loop
+
+.Loop0:        addiu   $9,$9,-1
+       lw      $12,4($5)       # preload next s1 limb
+       addu    $11,$11,$2      # s2 limb + incoming borrow
+       lw      $13,4($6)       # preload next s2 limb
+       sltu    $8,$11,$2       # $8 = 1 if that addition wrapped
+       subu    $11,$10,$11     # difference limb
+       sltu    $2,$10,$11      # $2 = 1 if the subtraction borrowed
+       sw      $11,0($4)
+       or      $2,$2,$8        # outgoing borrow
+
+       addiu   $5,$5,4
+       addiu   $6,$6,4
+       move    $10,$12         # preloaded limbs become the current limbs
+       move    $11,$13
+       bne     $9,$0,.Loop0
+        addiu  $4,$4,4         # delay slot: advance res_ptr
+
+.L0:   beq     $7,$0,.Lend     # nothing left for the unrolled loop
+        nop
+
+.Loop: addiu   $7,$7,-4        # four limbs per pass
+
+       lw      $12,4($5)
+       addu    $11,$11,$2      # s2 limb + incoming borrow
+       lw      $13,4($6)
+       sltu    $8,$11,$2       # wrap of that addition
+       subu    $11,$10,$11     # difference limb
+       sltu    $2,$10,$11      # borrow of the subtraction
+       sw      $11,0($4)
+       or      $2,$2,$8        # outgoing borrow
+
+       lw      $10,8($5)       # same step; register pairs $10/$11 and $12/$13 alternate
+       addu    $13,$13,$2
+       lw      $11,8($6)
+       sltu    $8,$13,$2
+       subu    $13,$12,$13
+       sltu    $2,$12,$13
+       sw      $13,4($4)
+       or      $2,$2,$8
+
+       lw      $12,12($5)
+       addu    $11,$11,$2
+       lw      $13,12($6)
+       sltu    $8,$11,$2
+       subu    $11,$10,$11
+       sltu    $2,$10,$11
+       sw      $11,8($4)
+       or      $2,$2,$8
+
+       lw      $10,16($5)
+       addu    $13,$13,$2
+       lw      $11,16($6)
+       sltu    $8,$13,$2
+       subu    $13,$12,$13
+       sltu    $2,$12,$13
+       sw      $13,12($4)
+       or      $2,$2,$8
+
+       addiu   $5,$5,16
+       addiu   $6,$6,16
+
+       bne     $7,$0,.Loop
+        addiu  $4,$4,16        # delay slot: advance res_ptr
+
+.Lend: addu    $11,$11,$2      # last limb: s2 limb + incoming borrow
+       sltu    $8,$11,$2
+       subu    $11,$10,$11
+       sltu    $2,$10,$11
+       sw      $11,0($4)
+       j       $31             # return; the next instruction is the delay slot
+       or      $2,$2,$8        # final borrow left in $2
+
+       .end    __mpn_sub_n
diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s

new file mode 100644 (file)

index 0000000..616dd1b
--- /dev/null
+++ b/sysdeps/mips/submul_1.s
@@ -0,0 +1,96 @@
+ # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $4
+ # s1_ptr      $5
+ # size                $6
+ # s2_limb     $7
+
+       .text
+       .align   4
+       .globl   __mpn_submul_1
+       .ent    __mpn_submul_1
+__mpn_submul_1:
+       .set    noreorder       # keep this exact order; branch delay slots are filled by hand
+       .set    nomacro
+
+ # warm up phase 0
+       lw      $8,0($5)        # fetch the first s1 limb
+
+ # warm up phase 1
+       addiu   $5,$5,4         # advance s1_ptr by one 4-byte limb
+       multu   $8,$7           # start first multiply: HI:LO = limb * s2_limb
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC0      # size was 1: only the final stage remains
+        move   $2,$0           # zero cy2
+
+       addiu   $6,$6,-1
+       beq     $6,$0,$LC1      # size was 2: skip the loop
+       lw      $8,0($5)        # load new s1 limb as early as possible
+
+Loop:  lw      $10,0($4)       # $10 = current result limb
+       mflo    $3              # low half of the pending product
+       mfhi    $9              # high half of the pending product
+       addiu   $5,$5,4
+       addu    $3,$3,$2        # add old carry limb to low product limb
+       multu   $8,$7           # start the next multiply
+       lw      $8,0($5)        # load new s1 limb as early as possible
+       addiu   $6,$6,-1        # decrement loop counter
+       sltu    $2,$3,$2        # carry from previous addition -> $2
+       subu    $3,$10,$3       # result limb - (low product + carry)
+       sgtu    $10,$3,$10      # $10 = 1 if the subtraction wrapped (borrow)
+       addu    $2,$2,$10       # combine carry and borrow
+       sw      $3,0($4)        # store the updated result limb
+       addiu   $4,$4,4
+       bne     $6,$0,Loop      # should be "bnel"
+        addu   $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:  lw      $10,0($4)       # same step as the loop body, without a further load
+       mflo    $3
+       mfhi    $9
+       addu    $3,$3,$2        # add old carry limb to low product limb
+       sltu    $2,$3,$2        # carry from that addition -> $2
+       multu   $8,$7           # start the last multiply
+       subu    $3,$10,$3
+       sgtu    $10,$3,$10      # borrow from the subtraction
+       addu    $2,$2,$10
+       sw      $3,0($4)
+       addiu   $4,$4,4
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:  lw      $10,0($4)       # final limb: consume the last product
+       mflo    $3
+       mfhi    $9
+       addu    $3,$3,$2        # add old carry limb to low product limb
+       sltu    $2,$3,$2        # carry from that addition -> $2
+       subu    $3,$10,$3
+       sgtu    $10,$3,$10      # borrow from the subtraction
+       addu    $2,$2,$10
+       sw      $3,0($4)
+       j       $31             # return; the next instruction is the delay slot
+       addu    $2,$9,$2        # add high product limb and carry from addition
+
+       .end    __mpn_submul_1
diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s

new file mode 100644 (file)

index 0000000..34ad9e1
--- /dev/null
+++ b/sysdeps/rs6000/add_n.s
@@ -0,0 +1,54 @@
+# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# s2_ptr       r5
+# size         r6
+
+       .toc
+       .extern __mpn_add_n[DS]
+       .extern .__mpn_add_n
+.csect [PR]
+       .align 2
+       .globl __mpn_add_n
+       .globl .__mpn_add_n
+       .csect __mpn_add_n[DS]
+__mpn_add_n:
+       .long .__mpn_add_n, TOC[tc0], 0 # descriptor: code address, TOC address, 0
+       .csect [PR]
+.__mpn_add_n:
+       mtctr   6               # copy size into CTR
+       l       8,0(4)          # load least significant s1 limb
+       l       0,0(5)          # load least significant s2 limb
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       a       7,0,8           # add least significant limbs, set cy
+       bdz     Lend            # If done, skip loop
+Loop:  lu      8,4(4)          # load s1 limb and update s1_ptr
+       lu      0,4(5)          # load s2 limb and update s2_ptr
+       stu     7,4(3)          # store previous limb in load latency slot
+       ae      7,0,8           # add new limbs with cy, set cy
+       bdn     Loop            # decrement CTR and loop back
+Lend:  st      7,4(3)          # store ultimate result limb
+       lil     3,0             # load cy into ...
+       aze     3,3             # ... return value register
+       br                      # return to caller
diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s

new file mode 100644 (file)

index 0000000..862b613
--- /dev/null
+++ b/sysdeps/rs6000/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# size         r5
+# s2_limb      r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+       .toc
+       .csect .__mpn_addmul_1[PR]
+       .align 2
+       .globl __mpn_addmul_1
+       .globl .__mpn_addmul_1
+       .csect __mpn_addmul_1[DS]
+__mpn_addmul_1:
+       .long .__mpn_addmul_1[PR], TOC[tc0], 0 # descriptor: code address, TOC address, 0
+       .csect .__mpn_addmul_1[PR]
+.__mpn_addmul_1:
+
+       cal     3,-4(3)         # bias res_ptr by -4; all accesses below use offset 4
+       l       0,0(4)          # load first s1 limb
+       cmpi    0,6,0           # test sign of s2_limb for the blt below
+       mtctr   5               # copy size into CTR
+       mul     9,0,6           # signed multiply: high word -> r9, low word -> MQ
+       srai    7,0,31          # r7 = all ones if the s1 limb is negative, else 0
+       and     7,7,6           # r7 = s2_limb if the s1 limb is negative, else 0
+       mfmq    8               # r8 = low product word
+       cax     9,9,7           # adjust high limb for negative limb from s1
+       l       7,4(3)          # load res limb
+       a       8,8,7           # add res_limb
+       blt     Lneg            # s2_limb negative: use the loops that also add the s1 limb
+Lpos:  bdz     Lend            # only one limb: done
+
+Lploop:        lu      0,4(4)          # load next s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb and update res_ptr
+       cmpi    0,0,0           # test sign of the s1 limb for the bge below
+       mul     10,0,6          # high word -> r10, low word -> MQ
+       mfmq    0               # r0 = low product word
+       ae      8,0,9           # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res limb
+       aze     10,10           # propagate cy to new cy_limb
+       a       8,8,7           # add res_limb
+       bge     Lp0             # s1 limb not negative: no adjustment
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Lp0:   bdz     Lend0           # done; the carry limb is in r10
+       lu      0,4(4)          # second copy of the step, with r9 and r10 swapped
+       stu     8,4(3)
+       cmpi    0,0,0
+       mul     9,0,6
+       mfmq    0
+       ae      8,0,10          # low limb + old_cy_limb + old cy
+       l       7,4(3)
+       aze     9,9             # propagate cy to new cy_limb
+       a       8,8,7           # add res_limb
+       bge     Lp1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Lp1:   bdn     Lploop          # decrement CTR and loop back
+
+       b       Lend            # done; the carry limb is in r9
+
+Lneg:  cax     9,9,0           # adjust high limb for negative s2_limb (add the s1 limb)
+       bdz     Lend            # only one limb: done
+Lnloop:        lu      0,4(4)          # load next s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb and update res_ptr
+       cmpi    0,0,0           # test sign of the s1 limb for the bge below
+       mul     10,0,6          # high word -> r10, low word -> MQ
+       mfmq    7               # low word to r7; r0 keeps the s1 limb
+       ae      8,7,9           # low limb + old_cy_limb + old cy
+       l       7,4(3)          # load res limb
+       ae      10,10,0         # propagate cy to new cy_limb
+       a       8,8,7           # add res_limb
+       bge     Ln0             # s1 limb not negative: no adjustment
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Ln0:   bdz     Lend0           # done; the carry limb is in r10
+       lu      0,4(4)          # second copy of the step, with r9 and r10 swapped
+       stu     8,4(3)
+       cmpi    0,0,0
+       mul     9,0,6
+       mfmq    7
+       ae      8,7,10          # low limb + old_cy_limb + old cy
+       l       7,4(3)
+       ae      9,9,0           # propagate cy to new cy_limb
+       a       8,8,7           # add res_limb
+       bge     Ln1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Ln1:   bdn     Lnloop          # decrement CTR and loop back
+       b       Lend            # done; the carry limb is in r9
+
+Lend0: cal     9,0(10)         # copy the carry limb from r10 to r9
+Lend:  st      8,4(3)          # store the last result limb
+       aze     3,9             # return value = carry limb + cy
+       br                      # return to caller
diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s

new file mode 100644 (file)

index 0000000..69c7502
--- /dev/null
+++ b/sysdeps/rs6000/lshift.s
@@ -0,0 +1,58 @@
+# IBM POWER __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s_ptr                r4
+# size         r5
+# cnt          r6
+
+       .toc
+       .extern __mpn_lshift[DS]
+       .extern .__mpn_lshift
+.csect [PR]
+       .align 2
+       .globl __mpn_lshift
+       .globl .__mpn_lshift
+       .csect __mpn_lshift[DS]
+__mpn_lshift:
+       .long .__mpn_lshift, TOC[tc0], 0 # descriptor: code address, TOC address, 0
+       .csect [PR]
+.__mpn_lshift:
+       sli     0,5,2           # r0 = size * 4, the vector length in bytes
+       cax     9,3,0           # r9 = res_ptr + length; stores walk downwards
+       cax     4,4,0           # r4 = s_ptr + length; loads walk downwards
+       sfi     8,6,32          # r8 = 32 - cnt
+       mtctr   5               # put limb count in CTR loop register
+       lu      0,-4(4)         # read most significant limb
+       sre     3,0,8           # compute carry out limb, and init MQ register
+       bdz     Lend2           # if just one limb, skip loop
+       lu      0,-4(4)         # read 2:nd most significant limb
+       sreq    7,0,8           # compute most significant limb of result
+       bdz     Lend            # if just two limb, skip loop
+Loop:  lu      0,-4(4)         # load next lower limb
+       stu     7,-4(9)         # store previous result during read latency
+       sreq    7,0,8           # compute result limb
+       bdn     Loop            # loop back until CTR is zero
+Lend:  stu     7,-4(9)         # store 2:nd least significant limb
+Lend2: sle     7,0,6           # compute least significant limb
+       st      7,-4(9)         # store it
+       br                      # return to caller
diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s

new file mode 100644 (file)

index 0000000..f4fa894
--- /dev/null
+++ b/sysdeps/rs6000/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# size         r5
+# s2_limb      r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+       .toc
+       .csect .__mpn_mul_1[PR]
+       .align 2
+       .globl __mpn_mul_1
+       .globl .__mpn_mul_1
+       .csect __mpn_mul_1[DS]
+__mpn_mul_1:
+       .long .__mpn_mul_1[PR], TOC[tc0], 0 # descriptor: code address, TOC address, 0
+       .csect .__mpn_mul_1[PR]
+.__mpn_mul_1:
+
+       cal     3,-4(3)         # bias res_ptr by -4; all stores below use offset 4
+       l       0,0(4)          # load first s1 limb
+       cmpi    0,6,0           # test sign of s2_limb for the blt below
+       mtctr   5               # copy size into CTR
+       mul     9,0,6           # signed multiply: high word -> r9, low word -> MQ
+       srai    7,0,31          # r7 = all ones if the s1 limb is negative, else 0
+       and     7,7,6           # r7 = s2_limb if the s1 limb is negative, else 0
+       mfmq    8               # r8 = low product word
+       ai      0,0,0           # reset carry
+       cax     9,9,7           # adjust high limb for negative limb from s1
+       blt     Lneg            # s2_limb negative: use the loops that also add the s1 limb
+Lpos:  bdz     Lend            # only one limb: done
+Lploop:        lu      0,4(4)          # load next s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb and update res_ptr
+       cmpi    0,0,0           # test sign of the s1 limb for the bge below
+       mul     10,0,6          # high word -> r10, low word -> MQ
+       mfmq    0               # r0 = low product word
+       ae      8,0,9           # low limb + old cy_limb + old cy
+       bge     Lp0             # s1 limb not negative: no adjustment
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Lp0:   bdz     Lend0           # done; the carry limb is in r10
+       lu      0,4(4)          # second copy of the step, with r9 and r10 swapped
+       stu     8,4(3)
+       cmpi    0,0,0
+       mul     9,0,6
+       mfmq    0
+       ae      8,0,10          # low limb + old cy_limb + old cy
+       bge     Lp1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Lp1:   bdn     Lploop          # decrement CTR and loop back
+       b       Lend            # done; the carry limb is in r9
+
+Lneg:  cax     9,9,0           # adjust high limb for negative s2_limb
+       bdz     Lend            # only one limb: done
+Lnloop:        lu      0,4(4)          # load next s1 limb and update s1_ptr
+       stu     8,4(3)          # store previous result limb and update res_ptr
+       cmpi    0,0,0           # test sign of the s1 limb for the bge below
+       mul     10,0,6          # high word -> r10, low word -> MQ
+       cax     10,10,0         # adjust high limb for negative s2_limb
+       mfmq    0               # r0 = low product word
+       ae      8,0,9           # low limb + old cy_limb + old cy
+       bge     Ln0             # s1 limb not negative: no adjustment
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Ln0:   bdz     Lend0           # done; the carry limb is in r10
+       lu      0,4(4)          # second copy of the step, with r9 and r10 swapped
+       stu     8,4(3)
+       cmpi    0,0,0
+       mul     9,0,6
+       cax     9,9,0           # adjust high limb for negative s2_limb
+       mfmq    0
+       ae      8,0,10          # low limb + old cy_limb + old cy
+       bge     Ln1
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Ln1:   bdn     Lnloop          # decrement CTR and loop back
+       b       Lend            # done; the carry limb is in r9
+
+Lend0: cal     9,0(10)         # copy the carry limb from r10 to r9
+Lend:  st      8,4(3)          # store the last result limb
+       aze     3,9             # return value = carry limb + cy
+       br                      # return to caller
diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s

new file mode 100644 (file)

index 0000000..6056acc
--- /dev/null
+++ b/sysdeps/rs6000/rshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s_ptr                r4
+# size         r5
+# cnt          r6
+
+       .toc
+       .extern __mpn_rshift[DS]
+       .extern .__mpn_rshift
+.csect [PR]
+       .align 2
+       .globl __mpn_rshift
+       .globl .__mpn_rshift
+       .csect __mpn_rshift[DS]
+__mpn_rshift:
+       .long .__mpn_rshift, TOC[tc0], 0 # descriptor: code address, TOC address, 0
+       .csect [PR]
+.__mpn_rshift:
+       sfi     8,6,32          # r8 = 32 - cnt
+       mtctr   5               # put limb count in CTR loop register
+       l       0,0(4)          # read least significant limb
+       ai      9,3,-4          # adjust res_ptr since it's offset in the stu:s
+       sle     3,0,8           # compute carry limb, and init MQ register
+       bdz     Lend2           # if just one limb, skip loop
+       lu      0,4(4)          # read 2:nd least significant limb
+       sleq    7,0,8           # compute least significant limb of result
+       bdz     Lend            # if just two limb, skip loop
+Loop:  lu      0,4(4)          # load next higher limb
+       stu     7,4(9)          # store previous result during read latency
+       sleq    7,0,8           # compute result limb
+       bdn     Loop            # loop back until CTR is zero
+Lend:  stu     7,4(9)          # store 2:nd most significant limb
+Lend2: sre     7,0,6           # compute most significant limb
+       st      7,4(9)          # store it
+       br                      # return to caller
diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s

new file mode 100644 (file)

index 0000000..402fdce
--- /dev/null
+++ b/sysdeps/rs6000/sub_n.s
@@ -0,0 +1,55 @@
+# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# s2_ptr       r5
+# size         r6
+
+       .toc
+       .extern __mpn_sub_n[DS]
+       .extern .__mpn_sub_n
+.csect [PR]
+       .align 2
+       .globl __mpn_sub_n
+       .globl .__mpn_sub_n
+       .csect __mpn_sub_n[DS]
+__mpn_sub_n:
+       .long .__mpn_sub_n, TOC[tc0], 0 # descriptor: code address, TOC address, 0
+       .csect [PR]
+.__mpn_sub_n:
+       mtctr   6               # copy size into CTR
+       l       8,0(4)          # load least significant s1 limb
+       l       0,0(5)          # load least significant s2 limb
+       cal     3,-4(3)         # offset res_ptr, it's updated before used
+       sf      7,0,8           # subtract least significant limbs (s1 - s2), set cy
+       bdz     Lend            # If done, skip loop
+Loop:  lu      8,4(4)          # load s1 limb and update s1_ptr
+       lu      0,4(5)          # load s2 limb and update s2_ptr
+       stu     7,4(3)          # store previous limb in load latency slot
+       sfe     7,0,8           # subtract new limbs with cy, set cy
+       bdn     Loop            # decrement CTR and loop back
+Lend:  st      7,4(3)          # store ultimate result limb
+       sfe     3,0,0           # load !cy into ...
+       sfi     3,3,0           # ... return value register
+       br                      # return to caller
diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s

new file mode 100644 (file)

index 0000000..2526332
--- /dev/null
+++ b/sysdeps/rs6000/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      r3
+# s1_ptr       r4
+# size         r5
+# s2_limb      r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+       .toc
+       .csect .__mpn_submul_1[PR]
+       .align 2
+       .globl __mpn_submul_1
+       .globl .__mpn_submul_1
+       .csect __mpn_submul_1[DS]
+__mpn_submul_1:                # descriptor words: code entry, TOC anchor, zero
+       .long .__mpn_submul_1[PR], TOC[tc0], 0
+       .csect .__mpn_submul_1[PR]
+.__mpn_submul_1:               # code entry: res -= s1 * s2_limb, returns carry limb in r3
+
+       cal     3,-4(3)         # bias res_ptr by -4; all accesses below use 4(3)
+       l       0,0(4)          # r0 = least significant s1 limb
+       cmpi    0,6,0           # cr0 = sign of s2_limb, consumed by blt below
+       mtctr   5               # copy size into CTR
+       mul     9,0,6           # r9 = high word of signed product, low word in MQ
+       srai    7,0,31          # r7 = all ones if s1 limb is negative, else 0
+       and     7,7,6           # r7 = s2_limb if s1 limb is negative, else 0
+       mfmq    11              # r11 = low word of product
+       cax     9,9,7           # adjust high limb for negative limb from s1
+       l       7,4(3)          # r7 = res_limb
+       sf      8,11,7          # r8 = res_limb - low limb, set cy
+       a       11,8,11         # invert cy (r11 is junk)
+       blt     Lneg            # s2_limb negative: use the Lneg/Lnloop variant
+Lpos:  bdz     Lend            # single limb: cy_limb already in r9
+
+Lploop:        lu      0,4(4)          # r0 = next s1 limb, update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # cr0 = sign of s1 limb, consumed by bge below
+       mul     10,0,6          # r10 = high word of signed product, low word in MQ
+       mfmq    0               # r0 = low word of product
+       ae      11,0,9          # low limb + old_cy_limb + old cy
+       l       7,4(3)          # r7 = res_limb
+       aze     10,10           # propagate cy to new cy_limb
+       sf      8,11,7          # r8 = res_limb - low limb, set cy
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Lp0             # s1 limb non-negative: no adjustment needed
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Lp0:   bdz     Lend0           # done with cy_limb in r10; Lend0 moves it to r9
+       lu      0,4(4)          # second unrolled half: same steps, r9/r10 swapped
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # cr0 = sign of s1 limb
+       mul     9,0,6           # r9 = high word of signed product, low word in MQ
+       mfmq    0               # r0 = low word of product
+       ae      11,0,10         # low limb + old_cy_limb + old cy
+       l       7,4(3)          # r7 = res_limb
+       aze     9,9             # propagate cy to new cy_limb
+       sf      8,11,7          # r8 = res_limb - low limb, set cy
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Lp1             # s1 limb non-negative: no adjustment needed
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Lp1:   bdn     Lploop          # decrement CTR and loop back
+
+       b       Lend            # done with cy_limb in r9
+
+Lneg:  cax     9,9,0           # s2_limb negative: add first s1 limb to high limb
+       bdz     Lend            # single limb: cy_limb already in r9
+Lnloop:        lu      0,4(4)          # r0 = next s1 limb, update s1_ptr
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # cr0 = sign of s1 limb, consumed by bge below
+       mul     10,0,6          # r10 = high word of signed product, low word in MQ
+       mfmq    7               # r7 = low word of product (r0 is still needed)
+       ae      11,7,9          # low limb + old_cy_limb + old cy
+       l       7,4(3)          # r7 = res_limb
+       ae      10,10,0         # add s1 limb (s2_limb < 0) and propagate cy to new cy_limb
+       sf      8,11,7          # r8 = res_limb - low limb, set cy
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Ln0             # s1 limb non-negative: no adjustment needed
+       cax     10,10,6         # adjust high limb for negative limb from s1
+Ln0:   bdz     Lend0           # done with cy_limb in r10; Lend0 moves it to r9
+       lu      0,4(4)          # second unrolled half: same steps, r9/r10 swapped
+       stu     8,4(3)          # store previous result limb, update res_ptr
+       cmpi    0,0,0           # cr0 = sign of s1 limb
+       mul     9,0,6           # r9 = high word of signed product, low word in MQ
+       mfmq    7               # r7 = low word of product (r0 is still needed)
+       ae      11,7,10         # low limb + old_cy_limb + old cy
+       l       7,4(3)          # r7 = res_limb
+       ae      9,9,0           # add s1 limb (s2_limb < 0) and propagate cy to new cy_limb
+       sf      8,11,7          # r8 = res_limb - low limb, set cy
+       a       11,8,11         # invert cy (r11 is junk)
+       bge     Ln1             # s1 limb non-negative: no adjustment needed
+       cax     9,9,6           # adjust high limb for negative limb from s1
+Ln1:   bdn     Lnloop          # decrement CTR and loop back
+       b       Lend            # done with cy_limb in r9
+
+Lend0: cal     9,0(10)         # copy cy_limb from r10 into r9
+Lend:  st      8,4(3)          # store ultimate result limb
+       aze     3,9             # return value = cy_limb + cy
+       br
diff --git a/sysdeps/vax/add_n.s b/sysdeps/vax/add_n.s

new file mode 100644 (file)

index 0000000..c89b226
--- /dev/null
+++ b/sysdeps/vax/add_n.s
@@ -0,0 +1,47 @@
+# VAX __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (sp + 4)
+# s1_ptr       (sp + 8)
+# s2_ptr       (sp + 12)
+# size         (sp + 16)
+
+.text
+       .align 1
+.globl ___mpn_add_n
+___mpn_add_n:                  # res = s1 + s2 over `size' limbs; final carry left in r0
+       .word   0x0             # entry mask: no registers saved
+       movl    16(ap),r0       # r0 = size (loop counter)
+       movl    12(ap),r1       # r1 = s2_ptr
+       movl    8(ap),r2        # r2 = s1_ptr
+       movl    4(ap),r3        # r3 = res_ptr
+       subl2   r4,r4           # r4 - r4: clears carry before the first adwc
+
+Loop:
+       movl    (r2)+,r4        # r4 = next s1 limb
+       adwc    (r1)+,r4        # r4 += next s2 limb + carry
+       movl    r4,(r3)+        # store sum limb
+       jsobgtr r0,Loop         # --r0, loop while > 0; carry must survive to next adwc
+
+       adwc    r0,r0           # r0 is 0 here, so r0 = 0 + 0 + carry
+       ret
diff --git a/sysdeps/vax/addmul_1.s b/sysdeps/vax/addmul_1.s

new file mode 100644 (file)

index 0000000..8e83204
--- /dev/null
+++ b/sysdeps/vax/addmul_1.s
@@ -0,0 +1,125 @@
+# VAX __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (sp + 4)
+# s1_ptr       (sp + 8)
+# size         (sp + 12)
+# s2_limb      (sp + 16)
+
+.text
+       .align 1
+.globl ___mpn_addmul_1
+___mpn_addmul_1:               # res += s1 * s2_limb; final carry limb left in r0
+       .word   0xfc0           # entry mask: bits 6-11 set (r6-r11)
+       movl    12(ap),r4       # r4 = size
+       movl    8(ap),r8        # r8 = s1_ptr
+       movl    4(ap),r9        # r9 = res_ptr
+       movl    16(ap),r6       # r6 = s2_limb; N flag = its sign
+       jlss    s2_big          # s2_limb has msb set: use the Loop2 variant
+
+       clrl    r3              # carry limb into L1 when entering mid-loop
+       incl    r4              # r4 = size + 1
+       ashl    $-1,r4,r7       # r7 = (size + 1) / 2 = trips through the 2-limb loop
+       jlbc    r4,L1           # size odd: start with the second half of the loop
+       clrl    r11             # carry limb into Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl    (r8)+,r1        # r1 = next s1 limb; N flag = its sign
+       jlss    L1n0            # limb has msb set: take the compensating path
+       emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    $0,r3           # propagate carry into high word
+       addl2   r2,(r9)+        # res limb += low word
+       adwc    $0,r3           # propagate carry; r3 = outgoing carry limb
+L1:    movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jlss    L1n1            # limb has msb set: take the compensating path
+L1p1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    $0,r11          # propagate carry into high word
+       addl2   r10,(r9)+       # res limb += low word
+       adwc    $0,r11          # propagate carry; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop1        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+L1n0:  emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6 (r1 has msb set)
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r6,r3           # high word += carry + s2_limb (sign compensation)
+       addl2   r2,(r9)+        # res limb += low word
+       adwc    $0,r3           # propagate carry; r3 = outgoing carry limb
+       movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jgeq    L1p1            # msb clear: rejoin the plain path
+L1n1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6 (r1 has msb set)
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r6,r11          # high word += carry + s2_limb (sign compensation)
+       addl2   r10,(r9)+       # res limb += low word
+       adwc    $0,r11          # propagate carry; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop1        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+
+s2_big:        clrl    r3              # carry limb into L2 when entering mid-loop
+       incl    r4              # r4 = size + 1
+       ashl    $-1,r4,r7       # r7 = (size + 1) / 2 = trips through the 2-limb loop
+       jlbc    r4,L2           # size odd: start with the second half of the loop
+       clrl    r11             # carry limb into Loop2
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl    (r8)+,r1        # r1 = next s1 limb; N flag = its sign
+       jlss    L2n0            # limb has msb set: take the doubly compensating path
+       emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r1,r3           # high word += carry + s1 limb (s2_limb has msb set)
+       addl2   r2,(r9)+        # res limb += low word
+       adwc    $0,r3           # propagate carry; r3 = outgoing carry limb
+L2:    movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jlss    L2n1            # limb has msb set: take the doubly compensating path
+L2p1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r1,r11          # high word += carry + s1 limb (s2_limb has msb set)
+       addl2   r10,(r9)+       # res limb += low word
+       adwc    $0,r11          # propagate carry; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop2        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+L2n0:  emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6 (both have msb set)
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r6,r3           # high word += carry + s2_limb
+       addl2   r2,(r9)+        # res limb += low word
+       adwc    r1,r3           # high word += carry + s1 limb; r3 = outgoing carry limb
+       movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jgeq    L2p1            # msb clear: rejoin the singly compensating path
+L2n1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6 (both have msb set)
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r6,r11          # high word += carry + s2_limb
+       addl2   r10,(r9)+       # res limb += low word
+       adwc    r1,r11          # high word += carry + s1 limb; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop2        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
diff --git a/sysdeps/vax/mul_1.s b/sysdeps/vax/mul_1.s

new file mode 100644 (file)

index 0000000..3fe375b
--- /dev/null
+++ b/sysdeps/vax/mul_1.s
@@ -0,0 +1,122 @@
+# VAX __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (sp + 4)
+# s1_ptr       (sp + 8)
+# size         (sp + 12)
+# s2_limb      (sp + 16)
+
+.text
+       .align 1
+.globl ___mpn_mul_1
+___mpn_mul_1:                  # res = s1 * s2_limb; final carry limb left in r0
+       .word   0xfc0           # entry mask: bits 6-11 set (r6-r11)
+       movl    12(ap),r4       # r4 = size
+       movl    8(ap),r8        # r8 = s1_ptr
+       movl    4(ap),r9        # r9 = res_ptr
+       movl    16(ap),r6       # r6 = s2_limb; N flag = its sign
+       jlss    s2_big          # s2_limb has msb set: use the Loop2 variant
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests.  (VAX 3600)
+
+       clrl    r3              # carry limb into L1 when entering mid-loop
+       incl    r4              # r4 = size + 1
+       ashl    $-1,r4,r7       # r7 = (size + 1) / 2 = trips through the 2-limb loop
+       jlbc    r4,L1           # size odd: start with the second half of the loop
+       clrl    r11             # carry limb into Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl    (r8)+,r1        # r1 = next s1 limb; N flag = its sign
+       jlss    L1n0            # limb has msb set: take the compensating path
+       emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    $0,r3           # propagate carry; r3 = outgoing carry limb
+       movl    r2,(r9)+        # store result limb
+L1:    movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jlss    L1n1            # limb has msb set: take the compensating path
+L1p1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    $0,r11          # propagate carry; r11 = outgoing carry limb
+       movl    r10,(r9)+       # store result limb
+
+       jsobgtr r7,Loop1        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+L1n0:  emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6 (r1 has msb set)
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r6,r3           # high word += carry + s2_limb (sign compensation)
+       movl    r2,(r9)+        # store result limb
+       movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jgeq    L1p1            # msb clear: rejoin the plain path
+L1n1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6 (r1 has msb set)
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r6,r11          # high word += carry + s2_limb (sign compensation)
+       movl    r10,(r9)+       # store result limb
+
+       jsobgtr r7,Loop1        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+
+s2_big:        clrl    r3              # carry limb into L2 when entering mid-loop
+       incl    r4              # r4 = size + 1
+       ashl    $-1,r4,r7       # r7 = (size + 1) / 2 = trips through the 2-limb loop
+       jlbc    r4,L2           # size odd: start with the second half of the loop
+       clrl    r11             # carry limb into Loop2
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl    (r8)+,r1        # r1 = next s1 limb; N flag = its sign
+       jlss    L2n0            # limb has msb set: take the doubly compensating path
+       emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r1,r3           # high word += carry + s1 limb (s2_limb has msb set)
+       movl    r2,(r9)+        # store result limb
+L2:    movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jlss    L2n1            # limb has msb set: take the doubly compensating path
+L2p1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r1,r11          # high word += carry + s1 limb (s2_limb has msb set)
+       movl    r10,(r9)+       # store result limb
+
+       jsobgtr r7,Loop2        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+L2n0:  emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6 (both have msb set)
+       addl2   r1,r3           # high word += s1 limb
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r6,r3           # high word += carry + s2_limb; r3 = outgoing carry limb
+       movl    r2,(r9)+        # store result limb
+       movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jgeq    L2p1            # msb clear: rejoin the singly compensating path
+L2n1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6 (both have msb set)
+       addl2   r1,r11          # high word += s1 limb
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r6,r11          # high word += carry + s2_limb; r11 = outgoing carry limb
+       movl    r10,(r9)+       # store result limb
+
+       jsobgtr r7,Loop2        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
diff --git a/sysdeps/vax/sub_n.s b/sysdeps/vax/sub_n.s

new file mode 100644 (file)

index 0000000..300b4de
--- /dev/null
+++ b/sysdeps/vax/sub_n.s
@@ -0,0 +1,47 @@
+# VAX __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (sp + 4)
+# s1_ptr       (sp + 8)
+# s2_ptr       (sp + 12)
+# size         (sp + 16)
+
+.text
+       .align 1
+.globl ___mpn_sub_n
+___mpn_sub_n:                  # res = s1 - s2 over `size' limbs; final borrow left in r0
+       .word   0x0             # entry mask: no registers saved
+       movl    16(ap),r0       # r0 = size (loop counter)
+       movl    12(ap),r1       # r1 = s2_ptr
+       movl    8(ap),r2        # r2 = s1_ptr
+       movl    4(ap),r3        # r3 = res_ptr
+       subl2   r4,r4           # r4 - r4: clears carry (borrow) before the first sbwc
+
+Loop:
+       movl    (r2)+,r4        # r4 = next s1 limb
+       sbwc    (r1)+,r4        # r4 -= next s2 limb + borrow
+       movl    r4,(r3)+        # store difference limb
+       jsobgtr r0,Loop         # --r0, loop while > 0; borrow must survive to next sbwc
+
+       adwc    r0,r0           # r0 is 0 here, so r0 = 0 + 0 + borrow
+       ret
diff --git a/sysdeps/vax/submul_1.s b/sysdeps/vax/submul_1.s

new file mode 100644 (file)

index 0000000..875cbfd
--- /dev/null
+++ b/sysdeps/vax/submul_1.s
@@ -0,0 +1,125 @@
+# VAX __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr      (sp + 4)
+# s1_ptr       (sp + 8)
+# size         (sp + 12)
+# s2_limb      (sp + 16)
+
+.text
+       .align 1
+.globl ___mpn_submul_1
+___mpn_submul_1:               # res -= s1 * s2_limb; final carry limb left in r0
+       .word   0xfc0           # entry mask: bits 6-11 set (r6-r11)
+       movl    12(ap),r4       # r4 = size
+       movl    8(ap),r8        # r8 = s1_ptr
+       movl    4(ap),r9        # r9 = res_ptr
+       movl    16(ap),r6       # r6 = s2_limb; N flag = its sign
+       jlss    s2_big          # s2_limb has msb set: use the Loop2 variant
+
+       clrl    r3              # carry limb into L1 when entering mid-loop
+       incl    r4              # r4 = size + 1
+       ashl    $-1,r4,r7       # r7 = (size + 1) / 2 = trips through the 2-limb loop
+       jlbc    r4,L1           # size odd: start with the second half of the loop
+       clrl    r11             # carry limb into Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl    (r8)+,r1        # r1 = next s1 limb; N flag = its sign
+       jlss    L1n0            # limb has msb set: take the compensating path
+       emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    $0,r3           # propagate carry into high word
+       subl2   r2,(r9)+        # res limb -= low word
+       adwc    $0,r3           # add the borrow; r3 = outgoing carry limb
+L1:    movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jlss    L1n1            # limb has msb set: take the compensating path
+L1p1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    $0,r11          # propagate carry into high word
+       subl2   r10,(r9)+       # res limb -= low word
+       adwc    $0,r11          # add the borrow; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop1        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+L1n0:  emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6 (r1 has msb set)
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r6,r3           # high word += carry + s2_limb (sign compensation)
+       subl2   r2,(r9)+        # res limb -= low word
+       adwc    $0,r3           # add the borrow; r3 = outgoing carry limb
+       movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jgeq    L1p1            # msb clear: rejoin the plain path
+L1n1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6 (r1 has msb set)
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r6,r11          # high word += carry + s2_limb (sign compensation)
+       subl2   r10,(r9)+       # res limb -= low word
+       adwc    $0,r11          # add the borrow; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop1        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+
+s2_big:        clrl    r3              # carry limb into L2 when entering mid-loop
+       incl    r4              # r4 = size + 1
+       ashl    $-1,r4,r7       # r7 = (size + 1) / 2 = trips through the 2-limb loop
+       jlbc    r4,L2           # size odd: start with the second half of the loop
+       clrl    r11             # carry limb into Loop2
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl    (r8)+,r1        # r1 = next s1 limb; N flag = its sign
+       jlss    L2n0            # limb has msb set: take the doubly compensating path
+       emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r1,r3           # high word += carry + s1 limb (s2_limb has msb set)
+       subl2   r2,(r9)+        # res limb -= low word
+       adwc    $0,r3           # add the borrow; r3 = outgoing carry limb
+L2:    movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jlss    L2n1            # limb has msb set: take the doubly compensating path
+L2p1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r1,r11          # high word += carry + s1 limb (s2_limb has msb set)
+       subl2   r10,(r9)+       # res limb -= low word
+       adwc    $0,r11          # add the borrow; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop2        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
+
+L2n0:  emul    r1,r6,$0,r2     # r3:r2 = signed r1 * r6 (both have msb set)
+       addl2   r11,r2          # low word += incoming carry limb
+       adwc    r6,r3           # high word += carry + s2_limb
+       subl2   r2,(r9)+        # res limb -= low word
+       adwc    r1,r3           # high word += borrow + s1 limb; r3 = outgoing carry limb
+       movl    (r8)+,r1        # second half: r1 = next s1 limb
+       jgeq    L2p1            # msb clear: rejoin the singly compensating path
+L2n1:  emul    r1,r6,$0,r10    # r11:r10 = signed r1 * r6 (both have msb set)
+       addl2   r3,r10          # low word += incoming carry limb
+       adwc    r6,r11          # high word += carry + s2_limb
+       subl2   r10,(r9)+       # res limb -= low word
+       adwc    r1,r11          # high word += borrow + s1 limb; r11 = outgoing carry limb
+
+       jsobgtr r7,Loop2        # --r7, loop while > 0
+       movl    r11,r0          # leave final carry limb in r0
+       ret
diff --git a/sysdeps/z8000/add_n.s b/sysdeps/z8000/add_n.s

new file mode 100644 (file)

index 0000000..21efaf5
--- /dev/null
+++ b/sysdeps/z8000/add_n.s
@@ -0,0 +1,52 @@
+! Z8000 __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr      r7
+! s1_ptr       r6
+! s2_ptr       r5
+! size         r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp.  We'd
+! then add 2x the number of words written to r7...
+
+       unseg
+       .text
+       even
+       global ___mpn_add_n
+___mpn_add_n:                  ! res = s1 + s2 over `size' limbs; final carry left in r2
+       pop     r0,@r6          ! r0 = least significant s1 limb, advance s1_ptr
+       pop     r1,@r5          ! r1 = least significant s2_limb, advance s2_ptr
+       add     r0,r1           ! add least significant limbs, set cy
+       ld      @r7,r0          ! store first result limb
+       dec     r4              ! one limb done; cy must survive this
+       jr      eq,Lend         ! size was 1: skip loop
+Loop:  pop     r0,@r6          ! r0 = next s1 limb, advance s1_ptr
+       pop     r1,@r5          ! r1 = next s2 limb, advance s2_ptr
+       adc     r0,r1           ! add limbs with cy, set cy
+       inc     r7,#2           ! advance res_ptr; cy must survive this
+       ld      @r7,r0          ! store result limb
+       dec     r4              ! count down; cy must survive to the next adc
+       jr      ne,Loop         ! loop until all limbs are done
+Lend:  ld      r2,r4           ! use 0 already in r4
+       adc     r2,r2           ! r2 = 0 + 0 + cy
+       ret     t
diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s

new file mode 100644 (file)

index 0000000..2075225
--- /dev/null
+++ b/sysdeps/z8000/mul_1.s
@@ -0,0 +1,67 @@
+! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr      r7
+! s1_ptr       r6
+! size         r5
+! s2_limb      r4
+
+       unseg
+       .text
+       even
+       global ___mpn_mul_1
+___mpn_mul_1:                  ! res = s1 * s2_limb; final carry limb left in r2
+       sub     r2,r2           ! zero carry limb
+       and     r4,r4           ! set sign flag from s2_limb
+       jr      mi,Lneg         ! s2_limb msb set: use the Lneg loop
+
+Lpos:  pop     r1,@r6          ! r1 = next s1 limb, advance s1_ptr
+       ld      r9,r1           ! multiplicand goes in the low half of rr8
+       mult    rr8,r4          ! rr8 = signed r9 * r4 (r8 = hi, r9 = lo)
+       and     r1,r1           ! set sign flag from msb of loaded limb
+       jr      pl,Lp           ! msb clear: signed product needs no fix-up
+       add     r8,r4           ! hi_limb += sign_comp2 (s2_limb, as s1 limb msb is set)
+Lp:    add     r9,r2           ! lo_limb += cy_limb
+       xor     r2,r2           ! clear r2; cy from the add above must survive
+       adc     r2,r8           ! new cy_limb = hi_limb + cy
+       ld      @r7,r9          ! store result limb
+       inc     r7,#2           ! advance res_ptr
+       dec     r5              ! count down size
+       jr      ne,Lpos         ! loop until all limbs are done
+       ret t
+
+Lneg:  pop     r1,@r6          ! r1 = next s1 limb, advance s1_ptr
+       ld      r9,r1           ! multiplicand goes in the low half of rr8
+       mult    rr8,r4          ! rr8 = signed r9 * r4 (r8 = hi, r9 = lo)
+       add     r8,r1           ! hi_limb += sign_comp1
+       and     r1,r1           ! set sign flag from msb of loaded limb
+       jr      pl,Ln           ! msb clear: no second fix-up needed
+       add     r8,r4           ! hi_limb += sign_comp2 (s2_limb, as s1 limb msb is set)
+Ln:    add     r9,r2           ! lo_limb += cy_limb
+       xor     r2,r2           ! clear r2; cy from the add above must survive
+       adc     r2,r8           ! new cy_limb = hi_limb + cy
+       ld      @r7,r9          ! store result limb
+       inc     r7,#2           ! advance res_ptr
+       dec     r5              ! count down size
+       jr      ne,Lneg         ! loop until all limbs are done
+       ret t
diff --git a/sysdeps/z8000/sub_n.s b/sysdeps/z8000/sub_n.s

new file mode 100644 (file)

index 0000000..f75ef22
--- /dev/null
+++ b/sysdeps/z8000/sub_n.s
@@ -0,0 +1,53 @@
+! Z8000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr      r7
+! s1_ptr       r6
+! s2_ptr       r5
+! size         r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp.  We'd
+! then add 2x the number of words written to r7...
+
+       unseg
+       .text
+       even
+       global ___mpn_sub_n
+___mpn_sub_n:                  ! res = s1 - s2 over `size' limbs; final borrow left in r2
+       pop     r0,@r6          ! r0 = least significant s1 limb, advance s1_ptr
+       pop     r1,@r5          ! r1 = least significant s2 limb, advance s2_ptr
+       sub     r0,r1           ! subtract least significant limbs, set cy (borrow)
+       ld      @r7,r0          ! store first result limb
+       dec     r4              ! one limb done; cy must survive this
+       jr      eq,Lend         ! size was 1: skip loop
+Loop:  pop     r0,@r6          ! r0 = next s1 limb, advance s1_ptr
+       pop     r1,@r5          ! r1 = next s2 limb, advance s2_ptr
+       sbc     r0,r1           ! subtract limbs with cy (borrow), set cy
+       inc     r7,#2           ! advance res_ptr; cy must survive this
+       ld      @r7,r0          ! store result limb
+       dec     r4              ! count down; cy must survive to the next sbc
+       jr      ne,Loop         ! loop until all limbs are done
+Lend:  ld      r2,r4           ! use 0 already in r4
+       adc     r2,r2           ! r2 = 0 + 0 + cy (borrow)
+       ret     t
author	Roland McGrath <roland@gnu.org>
	Mon, 16 Oct 1995 01:18:40 +0000 (01:18 +0000)
committer	Roland McGrath <roland@gnu.org>
	Mon, 16 Oct 1995 01:18:40 +0000 (01:18 +0000)
sysdeps/alpha/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/alphaev5/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/alphaev5/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/alphaev5/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/alpha/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/hppa1.1/udiv_qrnnd.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/hppa/udiv_qrnnd.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/i960/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88100/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88100/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88100/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/m88k/m88110/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mips3/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/mips/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/lshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/rshift.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/rs6000/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/addmul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/sub_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/vax/submul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/z8000/add_n.s	[new file with mode: 0644]	patch \| blob
sysdeps/z8000/mul_1.s	[new file with mode: 0644]	patch \| blob
sysdeps/z8000/sub_n.s	[new file with mode: 0644]	patch \| blob