Tizen 2.1 base
[external/gmp.git] / mpn / pa64 / sqr_diagonal.asm
1 dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
2
3 dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
4
5 dnl  This file is part of the GNU MP Library.
6
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of the GNU Lesser General Public License as published
9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
10 dnl  your option) any later version.
11
12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 dnl  License for more details.
16
17 dnl  You should have received a copy of the GNU Lesser General Public License
18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20
21 dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
22 dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
23 dnl  for optimization.
24
25 include(`../config.m4')
26
27 C INPUT PARAMETERS
28 define(`rp',`%r26')                C destination; two 64-bit words are stored per source limb
29 define(`up',`%r25')                C source limbs, read 8 bytes at a time with post-increment
30 define(`n',`%r24')                 C count of source limbs, decremented down to zero
31
C Integer registers used to fold the cross product into the stored squares.
32 define(`p00',`%r28')               C low result word, reloaded from -16(rp)
33 define(`p32',`%r29')               C cross product (weight 2^32), reloaded from the stack scratch slot
34 define(`p64',`%r31')               C high result word, reloaded from -8(rp)
35 define(`t0',`%r19')                C part of the shifted cross product added to p00
36 define(`t1',`%r20')                C part of the shifted cross product added to p64 with carry
37
C Assembler level: 2.0w when HAVE_ABI_2_0w is defined, otherwise 2.0.
38 ifdef(`HAVE_ABI_2_0w',
39 `       .level  2.0w
40 ',`     .level  2.0
41 ')
C mpn_sqr_diagonal: for each of the n 64-bit limbs read from up, store the
C 128-bit square of that limb as two consecutive 64-bit words at rp (less
C significant word first), so 2n words are written in total.
C
C Method: a limb u is loaded into an FP register and treated as
C u = h*2^32 + l, where the r (right) half of the register acts as l and the
C l (left) half acts as h.  Then u^2 = h*h*2^64 + 2*h*l*2^32 + l*l.  xmpyu
C forms the three 32x32->64 products.  l*l and h*h are stored directly at
C 0(rp) and 8(rp).  The cross product h*l is passed through a stack scratch
C slot to an integer register, where it is doubled, shifted by 32 bits and
C added into the two stored words with carry propagation.
C
C The loop is unrolled twice and software pipelined: while the FP side
C multiplies the current limb, the integer side fixes up the previous one
C at -16(rp)/-8(rp).  The two halves alternate between %fr8-%fr11 with
C scratch slot -120(%r30) and %fr4-%fr7 with scratch slot -128(%r30).
C
C Paths by limb count:
C   n = 1          -> L(end1)
C   n even         -> L(end2)  (directly for n = 2, else by falling out of the loop)
C   n odd, n >= 3  -> L(exit)
C n = 0 is not handled: a limb is loaded and n decremented before the first
C test.  NOTE(review): presumably callers guarantee n >= 1; confirm.
C
C The instruction following each branch sits in its delay slot; none of the
C branches here use the ,n completer, so the slot instruction is relied upon
C to execute on both the taken and the not-taken path.
42 PROLOGUE(mpn_sqr_diagonal)
43         ldo             128(%r30),%r30          C reserve 128 bytes; only the slots at -128 and -120 are used
44
45         fldds,ma        8(up),%fr8              C load limb 0, up += 8
46         addib,=         -1,n,L(end1)            C n -= 1; single limb case
47         nop                                     C delay slot
48         fldds,ma        8(up),%fr4              C load limb 1, up += 8
49         xmpyu           %fr8l,%fr8r,%fr10       C cross product h*l of limb 0
50         fstd            %fr10,-120(%r30)        C park it for the integer side
51         xmpyu           %fr8r,%fr8r,%fr9        C l*l
52         fstd            %fr9,0(rp)              C becomes the low result word
53         xmpyu           %fr8l,%fr8l,%fr11       C h*h
54         fstd            %fr11,8(rp)             C becomes the high result word
55         addib,=         -1,n,L(end2)            C n -= 1; two limb case
56         ldo             16(rp),rp               C delay slot: rp += 16, previous result now at -16(rp)
57
58 LDEF(loop)
C First half: multiply the limb in %fr4, fix up the limb whose cross product is at -120.
59         fldds,ma        8(up),%fr8              C load next up limb
60         xmpyu           %fr4l,%fr4r,%fr6
61         fstd            %fr6,-128(%r30)
62         xmpyu           %fr4r,%fr4r,%fr5        C multiply in fp regs
63         fstd            %fr5,0(rp)
64         xmpyu           %fr4l,%fr4l,%fr7
65         fstd            %fr7,8(rp)
66         ldd             -120(%r30),p32          C cross product of the previous limb
67         ldd             -16(rp),p00             C accumulate in int regs
68         ldd             -8(rp),p64
69         depd,z          p32,30,31,t0            C t0 = p32 << 33, low 64 bits of 2*p32*2^32
70         add             t0,p00,p00
71         std             p00,-16(rp)
72         extrd,u         p32,32,33,t1            C t1 = p32 >> 31, the bits shifted out above
73         add,dc          t1,p64,p64              C add with the carry left by the add of t0
74         std             p64,-8(rp)
75         addib,=         -1,n,L(exit)            C n -= 1; the limb just loaded into %fr8 is the last
76         ldo             16(rp),rp               C delay slot: rp += 16
77
C Second half: same work with the register sets and scratch slots swapped.
78         fldds,ma        8(up),%fr4
79         xmpyu           %fr8l,%fr8r,%fr10
80         fstd            %fr10,-120(%r30)
81         xmpyu           %fr8r,%fr8r,%fr9
82         fstd            %fr9,0(rp)
83         xmpyu           %fr8l,%fr8l,%fr11
84         fstd            %fr11,8(rp)
85         ldd             -128(%r30),p32          C cross product parked by the first half
86         ldd             -16(rp),p00
87         ldd             -8(rp),p64
88         depd,z          p32,30,31,t0            C t0 = p32 << 33
89         add             t0,p00,p00
90         std             p00,-16(rp)
91         extrd,u         p32,32,33,t1            C t1 = p32 >> 31
92         add,dc          t1,p64,p64
93         std             p64,-8(rp)
94         addib,<>        -1,n,L(loop)            C n -= 1; loop while limbs remain
95         ldo             16(rp),rp               C delay slot: rp += 16
96
97 LDEF(end2)
C Wind-down with the last limb in %fr4 and one cross product pending at -120.
98         xmpyu           %fr4l,%fr4r,%fr6
99         fstd            %fr6,-128(%r30)
100         xmpyu           %fr4r,%fr4r,%fr5
101         fstd            %fr5,0(rp)
102         xmpyu           %fr4l,%fr4l,%fr7
103         fstd            %fr7,8(rp)
104         ldd             -120(%r30),p32          C fix up the second to last limb
105         ldd             -16(rp),p00
106         ldd             -8(rp),p64
107         depd,z          p32,30,31,t0
108         add             t0,p00,p00
109         std             p00,-16(rp)
110         extrd,u         p32,32,33,t1
111         add,dc          t1,p64,p64
112         std             p64,-8(rp)
113         ldo             16(rp),rp               C step to the last limb result
114         ldd             -128(%r30),p32          C fix up the last limb
115         ldd             -16(rp),p00
116         ldd             -8(rp),p64
117         depd,z          p32,30,31,t0
118         add             t0,p00,p00
119         std             p00,-16(rp)
120         extrd,u         p32,32,33,t1
121         add,dc          t1,p64,p64
122         std             p64,-8(rp)
123         bve             (%r2)                   C return through %r2
124         ldo             -128(%r30),%r30         C delay slot: release the scratch area
125
126 LDEF(exit)
C Wind-down with the last limb in %fr8 and one cross product pending at -128.
C Here the doubling is done by adding the 32-bit-shifted cross product twice
C rather than by the single 33-bit shift used in the loop.
127         xmpyu           %fr8l,%fr8r,%fr10
128         fstd            %fr10,-120(%r30)
129         xmpyu           %fr8r,%fr8r,%fr9
130         fstd            %fr9,0(rp)
131         xmpyu           %fr8l,%fr8l,%fr11
132         fstd            %fr11,8(rp)
133         ldd             -128(%r30),p32          C fix up the second to last limb
134         ldd             -16(rp),p00
135         ldd             -8(rp),p64
136         depd,z          p32,31,32,t0            C t0 = p32 << 32
137         add             t0,p00,p00
138         extrd,u         p32,31,32,t1            C t1 = p32 >> 32
139         add,dc          t1,p64,p64
140         add             t0,p00,p00              C second addition doubles the cross term
141         add,dc          t1,p64,p64
142         std             p00,-16(rp)
143         std             p64,-8(rp)
144         ldo             16(rp),rp               C step to the last limb result
145         ldd             -120(%r30),p32          C fix up the last limb
146         ldd             -16(rp),p00
147         ldd             -8(rp),p64
148         depd,z          p32,31,32,t0
149         add             t0,p00,p00
150         extrd,u         p32,31,32,t1
151         add,dc          t1,p64,p64
152         add             t0,p00,p00
153         add,dc          t1,p64,p64
154         std             p00,-16(rp)
155         std             p64,-8(rp)
156         bve             (%r2)                   C return through %r2
157         ldo             -128(%r30),%r30         C delay slot: release the scratch area
158
159 LDEF(end1)
C Single limb: no pipelining, square the limb in %fr8 and fix it up at once.
160         xmpyu           %fr8l,%fr8r,%fr10
161         fstd            %fr10,-128(%r30)
162         xmpyu           %fr8r,%fr8r,%fr9
163         fstd            %fr9,0(rp)
164         xmpyu           %fr8l,%fr8l,%fr11
165         fstd            %fr11,8(rp)
166         ldo             16(rp),rp               C result just stored is now at -16(rp)
167         ldd             -128(%r30),p32
168         ldd             -16(rp),p00
169         ldd             -8(rp),p64
170         depd,z          p32,31,32,t0            C t0 = p32 << 32
171         add             t0,p00,p00
172         extrd,u         p32,31,32,t1            C t1 = p32 >> 32
173         add,dc          t1,p64,p64
174         add             t0,p00,p00              C second addition doubles the cross term
175         add,dc          t1,p64,p64
176         std             p00,-16(rp)
177         std             p64,-8(rp)
178         bve             (%r2)                   C return through %r2
179         ldo             -128(%r30),%r30         C delay slot: release the scratch area
180 EPILOGUE(mpn_sqr_diagonal)