1 dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.
3 dnl Copyright 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
22 C 16-byte coaligned unaligned
23 C cycles/limb cycles/limb
24 C 7400,7410 (G4): 0.5 0.64
25 C 744x,745x (G4+): 0.75 0.82
26 C 970 (G5): 0.78 1.02 (64-bit limbs)
29 C * Works for all sizes and alignments.
32 C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling
33 C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 c/l for 970.
35 C * Consider using VMX instructions also for head and tail, by using some
36 C read-modify-write tricks.
37 C * The VMX code is used from the smallest sizes it handles, but measurements
38 C show a large speed bump at the cutoff points. Small copying (perhaps
39 C using some read-modify-write technique) should be optimized.
40 C * Make a mpn_com based on this code.
C Size constants derived from GMP_LIMB_BITS:
C   GMP_LIMB_BYTES  bytes per limb
C   LIMBS_PER_VR    limbs that fit in 16 bytes (one vector register)
C   LIMBS_PER_2VR   limbs that fit in 32 bytes (two vector registers)
42 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
43 define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
44 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
C LIMB32 and LIMB64 wrap lines that are specific to one limb size.  Their
C definitions are selected by the conditional on GMP_LIMB_BITS that follows.
C NOTE(review): presumably the wrapper for the non-matching limb size expands
C to nothing -- confirm against the complete conditional.
47 ifelse(GMP_LIMB_BITS,32,`
48 define(`LIMB32',` $1')
52 define(`LIMB64',` $1')
C Compare n with a size cutoff (11 when limbs are 32 bits, 5 when they are
C 64 bits) and leave the result in cr7.
C NOTE(review): presumably this selects between the small-case code and the
C VMX code below -- confirm against the branch that consumes cr7.
66 LIMB32(`cmpi cr7, n, 11 ')
67 LIMB64(`cmpdi cr7, n, 5 ')
73 C Handle small cases with plain operations
C Copy one limb from up to rp through r0, advancing both pointers by
C GMP_LIMB_BYTES.
76 LIMB32(`lwz r0, 0(up) ')
77 LIMB64(`ld r0, 0(up) ')
78 addi up, up, GMP_LIMB_BYTES
79 LIMB32(`stw r0, 0(rp) ')
80 LIMB64(`std r0, 0(rp) ')
81 addi rp, rp, GMP_LIMB_BYTES
85 C Handle large cases with VMX operations
C r0 = r12 with the five most significant bits set (0xf800 in the upper
C halfword).
C NOTE(review): r12 presumably holds the VRSAVE value read on entry -- confirm.
88 oris r0, r12, 0xf800 C Set VRSAVE bit 0-4
C r7 = limb index of rp within its 16-byte block.  The record form also sets
C cr0 from the result.
91 LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4
92 LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2
C r7 = LIMBS_PER_VR - r7, the number of limbs from rp up to the next 16-byte
C boundary.
95 subfic r7, r7, LIMBS_PER_VR
C Head copy: move one limb from up to rp through r0 and advance both
C pointers.  With 32-bit limbs r7 is decremented and the code loops back to
C L(top0) until it reaches zero.  The 64-bit lines contain no loop, so a
C single limb is copied.
98 LIMB32(`lwz r0, 0(up) ')
99 LIMB64(`ld r0, 0(up) ')
100 addi up, up, GMP_LIMB_BYTES
101 LIMB32(`addic. r7, r7, -1 ')
102 LIMB32(`stw r0, 0(rp) ')
103 LIMB64(`std r0, 0(rp) ')
104 addi rp, rp, GMP_LIMB_BYTES
105 LIMB32(`bne L(top0) ')
C r0 = limb index of up within its 16-byte block.  cr0 is set from the
C result, so a zero result means up is 16-byte aligned.
C NOTE(review): presumably this chooses between the aligned and unaligned
C loops -- confirm against the branch that consumes cr0.
109 LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4
110 LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2
C r7 = n / LIMBS_PER_2VR (n >> 2 for 64-bit limbs, n >> 3 for 32-bit limbs).
112 LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
113 LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
114 mtctr r7 C copy loop count (r7) to count register
C Test the bit of n whose weight equals LIMBS_PER_VR (0x4 for 32-bit limbs,
C 0x2 for 64-bit limbs).  The result goes to r0 and cr0.
C NOTE(review): presumably a set bit means one 16-byte vector must be copied
C in addition to the 32-byte loop iterations -- confirm against the branch
C that follows.
122 LIMB32(`andi. r0, n, 0x4 ')
123 LIMB64(`andi. r0, n, 0x2 ')
C L(lpu): load a 16-byte vector from the address in up into v0.
C NOTE(review): the label name suggests this is the loop for an unaligned
C source -- confirm.
137 L(lpu): lvx v0, 0, up
C Same bit test of n as above, repeated for the second loop.
152 LIMB32(`andi. r0, n, 0x4 ')
153 LIMB64(`andi. r0, n, 0x2 ')
C L(lpa): load a 16-byte vector from the address in up into v0.
C NOTE(review): the label name suggests this is the loop for an aligned
C source -- confirm.
162 L(lpa): lvx v0, 0, up
C r7 = n mod LIMBS_PER_VR.  The mask keeps the low two bits of n for 32-bit
C limbs and the low bit for 64-bit limbs.  cr0 is set from the result.
172 LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
173 LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
C Tail copy through r0.  With 32-bit limbs, r10 is a byte offset applied to
C both up and rp.  It advances by GMP_LIMB_BYTES per limb while r7 counts down
C to zero, looping via L(top2).  With 64-bit limbs a single limb is copied
C from 0(up) to 0(rp).
C NOTE(review): the initial value of r10 is set elsewhere -- confirm it
C starts at zero.
177 LIMB32(`lwzx r0, r10, up ')
178 LIMB64(`ld r0, 0(up) ')
179 LIMB32(`addic. r7, r7, -1 ')
180 LIMB32(`stwx r0, r10, rp ')
181 LIMB64(`std r0, 0(rp) ')
182 LIMB32(`addi r10, r10, GMP_LIMB_BYTES')
183 LIMB32(`bne L(top2) ')
C Write r12 to special purpose register 256 (VRSAVE).
C NOTE(review): presumably this restores the VRSAVE value saved on entry --
C confirm.
185 L(ret): mtspr 256, r12