Imported Upstream version 6.0.0
[platform/upstream/gmp.git] / mpn / pa64 / sqr_diagonal.asm
1 dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
2
3 dnl  Copyright 2001-2003 Free Software Foundation, Inc.
4
5 dnl  This file is part of the GNU MP Library.
6 dnl
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of either:
9 dnl
10 dnl    * the GNU Lesser General Public License as published by the Free
11 dnl      Software Foundation; either version 3 of the License, or (at your
12 dnl      option) any later version.
13 dnl
14 dnl  or
15 dnl
16 dnl    * the GNU General Public License as published by the Free Software
17 dnl      Foundation; either version 2 of the License, or (at your option) any
18 dnl      later version.
19 dnl
20 dnl  or both in parallel, as here.
21 dnl
22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 dnl  for more details.
26 dnl
27 dnl  You should have received copies of the GNU General Public License and the
28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29 dnl  see https://www.gnu.org/licenses/.
30
31
32 dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
33 dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
34 dnl  for optimization.
35
36 include(`../config.m4')
37
38 C INPUT PARAMETERS
39 define(`rp',`%r26')          C result pointer; two limbs are stored per input limb
40 define(`up',`%r25')          C source pointer; advanced by 8 on every limb load
41 define(`n',`%r24')           C limb count; decremented by addib until it reaches zero
42
C Integer scratch registers used when the cross product is added into a square.
43 define(`p00',`%r28')         C low limb of the square, reloaded from rp
44 define(`p32',`%r29')         C cross product, reloaded from a stack slot
45 define(`p64',`%r31')         C high limb of the square, reloaded from rp
46 define(`t0',`%r19')          C shifted cross product, part added to p00
47 define(`t1',`%r20')          C shifted cross product, part added to p64
48
C Emit .level 2.0w when HAVE_ABI_2_0w is defined, otherwise .level 2.0.
49 ifdef(`HAVE_ABI_2_0w',
50 `       .level  2.0w
51 ',`     .level  2.0
52 ')
C mpn_sqr_diagonal (rp, up, n)
C
C For each of the n 64-bit limbs u read from up, store the 128-bit square u^2
C as two limbs at rp (low limb first) and advance rp by 16 bytes.
C
C Each limb is split into its 32-bit halves h (left) and l (right).  xmpyu
C forms l*l, h*h and h*l in the FP registers.  l*l and h*h are stored straight
C to 0(rp) and 8(rp); h*l is passed through a stack slot into an integer
C register, and 2*h*l*2^32 is then added into the stored limb pair with carry.
C
C The loop is unrolled twice and software pipelined: the multiplies for one
C limb are issued before the integer fix-up of the previous limb.  The two
C halves alternate between %fr8 with stack slot -120(%r30) and %fr4 with stack
C slot -128(%r30).
C
C n must be at least 1: the first limb is loaded before n is tested.
C The instruction after each addib and bve sits in the branch delay slot.
53 PROLOGUE(mpn_sqr_diagonal)
54         ldo             128(%r30),%r30          C reserve 128 bytes of stack; -128 and -120 are used
55
56         fldds,ma        8(up),%fr8              C fr8 = first limb, up += 8
57         addib,=         -1,n,L(end1)            C n -= 1; a single limb in total goes to end1
58         nop                                     C delay slot
59         fldds,ma        8(up),%fr4              C fr4 = second limb, up += 8
60         xmpyu           %fr8l,%fr8r,%fr10       C h*l of the first limb
61         fstd            %fr10,-120(%r30)        C park the cross product in the -120 slot
62         xmpyu           %fr8r,%fr8r,%fr9        C l*l
63         fstd            %fr9,0(rp)              C   stored as the low result limb
64         xmpyu           %fr8l,%fr8l,%fr11       C h*h
65         fstd            %fr11,8(rp)             C   stored as the high result limb
66         addib,=         -1,n,L(end2)            C n -= 1; exactly two limbs in total goes to end2
67         ldo             16(rp),rp               C delay slot: rp += 16
68
69 LDEF(loop)                                      C here: fr4 is loaded, cross product of the limb before it waits at -120
70         fldds,ma        8(up),%fr8              C load next up limb
71         xmpyu           %fr4l,%fr4r,%fr6        C h*l of the limb in fr4
72         fstd            %fr6,-128(%r30)         C park the cross product in the -128 slot
73         xmpyu           %fr4r,%fr4r,%fr5        C multiply in fp regs
74         fstd            %fr5,0(rp)              C l*l stored as the low result limb
75         xmpyu           %fr4l,%fr4l,%fr7        C h*h
76         fstd            %fr7,8(rp)              C   stored as the high result limb
77         ldd             -120(%r30),p32          C p32 = cross product of the previous limb
78         ldd             -16(rp),p00             C accumulate in int regs
79         ldd             -8(rp),p64              C p00, p64 = low, high limb of the previous square
80         depd,z          p32,30,31,t0            C t0 = low 64 bits of p32 * 2^33
81         add             t0,p00,p00              C low limb += t0, producing a carry
82         std             p00,-16(rp)
83         extrd,u         p32,32,33,t1            C t1 = p32 >> 31, the bits shifted out of t0
84         add,dc          t1,p64,p64              C high limb += t1 + carry from the add above
85         std             p64,-8(rp)
86         addib,=         -1,n,L(exit)            C n -= 1; if zero, fr8 holds the last limb
87         ldo             16(rp),rp               C delay slot: rp += 16
88
C Second half of the unrolled loop: same steps with fr8/-120 and fr4/-128 swapped.
89         fldds,ma        8(up),%fr4              C load the limb for the next first half
90         xmpyu           %fr8l,%fr8r,%fr10       C h*l of the limb in fr8
91         fstd            %fr10,-120(%r30)        C park the cross product in the -120 slot
92         xmpyu           %fr8r,%fr8r,%fr9        C l*l
93         fstd            %fr9,0(rp)
94         xmpyu           %fr8l,%fr8l,%fr11       C h*h
95         fstd            %fr11,8(rp)
96         ldd             -128(%r30),p32          C p32 = cross product of the previous limb
97         ldd             -16(rp),p00
98         ldd             -8(rp),p64
99         depd,z          p32,30,31,t0            C t0 = low 64 bits of p32 * 2^33
100         add             t0,p00,p00
101         std             p00,-16(rp)
102         extrd,u         p32,32,33,t1            C t1 = p32 >> 31
103         add,dc          t1,p64,p64              C high limb += t1 + carry
104         std             p64,-8(rp)
105         addib,<>        -1,n,L(loop)            C n -= 1; loop while limbs remain
106         ldo             16(rp),rp               C delay slot: rp += 16
107
108 LDEF(end2)                                     C fr4 holds the last limb; cross product of the limb before it waits at -120
109         xmpyu           %fr4l,%fr4r,%fr6        C h*l of the last limb
110         fstd            %fr6,-128(%r30)
111         xmpyu           %fr4r,%fr4r,%fr5        C l*l
112         fstd            %fr5,0(rp)
113         xmpyu           %fr4l,%fr4l,%fr7        C h*h
114         fstd            %fr7,8(rp)
115         ldd             -120(%r30),p32          C fix up the second to last square
116         ldd             -16(rp),p00
117         ldd             -8(rp),p64
118         depd,z          p32,30,31,t0            C t0 = low 64 bits of p32 * 2^33
119         add             t0,p00,p00
120         std             p00,-16(rp)
121         extrd,u         p32,32,33,t1            C t1 = p32 >> 31
122         add,dc          t1,p64,p64
123         std             p64,-8(rp)
124         ldo             16(rp),rp               C step past the last square so -16/-8 address it
125         ldd             -128(%r30),p32          C fix up the last square
126         ldd             -16(rp),p00
127         ldd             -8(rp),p64
128         depd,z          p32,30,31,t0
129         add             t0,p00,p00
130         std             p00,-16(rp)
131         extrd,u         p32,32,33,t1
132         add,dc          t1,p64,p64
133         std             p64,-8(rp)
134         bve             (%r2)                   C return through %r2
135         ldo             -128(%r30),%r30         C delay slot: release the stack area
136
137 LDEF(exit)                                     C fr8 holds the last limb; cross product of the limb before it waits at -128
138         xmpyu           %fr8l,%fr8r,%fr10       C h*l of the last limb
139         fstd            %fr10,-120(%r30)
140         xmpyu           %fr8r,%fr8r,%fr9        C l*l
141         fstd            %fr9,0(rp)
142         xmpyu           %fr8l,%fr8l,%fr11       C h*h
143         fstd            %fr11,8(rp)
144         ldd             -128(%r30),p32          C fix up the second to last square
145         ldd             -16(rp),p00
146         ldd             -8(rp),p64
147         depd,z          p32,31,32,t0            C t0 = p32 << 32
148         add             t0,p00,p00
149         extrd,u         p32,31,32,t1            C t1 = p32 >> 32
150         add,dc          t1,p64,p64
151         add             t0,p00,p00              C the t1:t0 pair is added a second time, giving the same
152         add,dc          t1,p64,p64              C   2*h*l*2^32 as the single add of p32 * 2^33 in the loop
153         std             p00,-16(rp)
154         std             p64,-8(rp)
155         ldo             16(rp),rp               C step past the last square so -16/-8 address it
156         ldd             -120(%r30),p32          C fix up the last square
157         ldd             -16(rp),p00
158         ldd             -8(rp),p64
159         depd,z          p32,31,32,t0            C t0 = p32 << 32
160         add             t0,p00,p00
161         extrd,u         p32,31,32,t1            C t1 = p32 >> 32
162         add,dc          t1,p64,p64
163         add             t0,p00,p00              C second add of the t1:t0 pair
164         add,dc          t1,p64,p64
165         std             p00,-16(rp)
166         std             p64,-8(rp)
167         bve             (%r2)                   C return through %r2
168         ldo             -128(%r30),%r30         C delay slot: release the stack area
169
170 LDEF(end1)                                     C reached when n was 1: fr8 holds the only limb
171         xmpyu           %fr8l,%fr8r,%fr10       C h*l
172         fstd            %fr10,-128(%r30)
173         xmpyu           %fr8r,%fr8r,%fr9        C l*l
174         fstd            %fr9,0(rp)
175         xmpyu           %fr8l,%fr8l,%fr11       C h*h
176         fstd            %fr11,8(rp)
177         ldo             16(rp),rp               C step past the square so -16/-8 address it
178         ldd             -128(%r30),p32          C p32 = cross product just stored
179         ldd             -16(rp),p00
180         ldd             -8(rp),p64
181         depd,z          p32,31,32,t0            C t0 = p32 << 32
182         add             t0,p00,p00
183         extrd,u         p32,31,32,t1            C t1 = p32 >> 32
184         add,dc          t1,p64,p64
185         add             t0,p00,p00              C second add of the t1:t0 pair, so 2*h*l*2^32 in total
186         add,dc          t1,p64,p64
187         std             p00,-16(rp)
188         std             p64,-8(rp)
189         bve             (%r2)                   C return through %r2
190         ldo             -128(%r30),%r30         C delay slot: release the stack area
191 EPILOGUE(mpn_sqr_diagonal)