1 dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
3 dnl Copyright 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
22 C 16-byte coaligned unaligned
23 C cycles/limb cycles/limb
24 C 7400,7410 (G4): 0.5 0.64
25 C 744x,745x (G4+): 0.75 0.82
26 C 970 (G5): 0.78 1.02 (64-bit limbs)
29 C * Works for all sizes and alignments.
32 C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling
33 C   indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
34 C   c/l for 970.
35 C * Consider using VMX instructions also for head and tail, by using some
36 C read-modify-write tricks.
37 C * The VMX code is used from the smallest sizes it handles, but measurements
38 C show a large speed bump at the cutoff points. Small copying (perhaps
39 C using some read-modify-write technique) should be optimized.
40 C * Make a mpn_com based on this code.
dnl  Size of one limb in bytes (GMP_LIMB_BITS is 32 or 64, so this is 4 or 8).
42 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
dnl  Limbs held by one 16-byte VMX vector register (4 or 2).
43 define(`LIMBS_PER_VR',   eval(16/GMP_LIMB_BYTES))
dnl  Limbs held by a pair of VMX vector registers (8 or 4).
44 define(`LIMBS_PER_2VR',  eval(32/GMP_LIMB_BYTES))
47 ifelse(GMP_LIMB_BITS,32,`
48 define(`LIMB32',` $1')
52 define(`LIMB64',` $1')
C NOTE(review): this chunk of mpn_copyd is incomplete -- no PROLOGUE or
C EPILOGUE is visible, and several branch targets (L(top0), L(lpu),
C L(lpa), L(top2)) have partially or wholly missing bodies.  The comments
C below describe only the instructions that are present; confirm every
C claim against the complete file before relying on it.

C r0 = n scaled to a byte count (n << 2 for 32-bit limbs, n << 3 for
C 64-bit limbs); the dot forms also set cr0, presumably for an n == 0
C test whose branch is not visible here.
66 LIMB32(`slwi. r0, n, 2 ')
67 LIMB64(`sldi. r0, n, 3 ')
C Compare n against the scalar/VMX cutoff in cr7 (11 limbs when limbs
C are 32-bit, 5 limbs when 64-bit); the conditional branch using cr7 is
C not visible in this chunk.
71 LIMB32(`cmpi cr7, n, 11 ')
72 LIMB64(`cmpdi cr7, n, 5 ')
77 C Handle small cases with plain operations
C Scalar copy step, decreasing addresses (this is copyd): load the limb
C just below up, step up down one limb, store just below rp, step rp
C down.  The surrounding loop label/branch is not visible here.
80 LIMB32(`lwz r0, -4(up) ')
81 LIMB64(`ld r0, -8(up) ')
82 addi up, up, -GMP_LIMB_BYTES
83 LIMB32(`stw r0, -4(rp) ')
84 LIMB64(`std r0, -8(rp) ')
85 addi rp, rp, -GMP_LIMB_BYTES
89 C Handle large cases with VMX operations
C r12 presumably holds the caller's VRSAVE (the mfspr that loads it is
C not visible); or-in the top 5 bits to mark v0-v4 live, and L(ret)
C below restores r12 -- TODO confirm against the full file.
94 oris r0, r12, 0xf800 C Set VRSAVE bit 0-4
C r7 = misalignment of rp within a 16-byte vector, counted in limbs;
C zero means rp is already 16-byte aligned (rp is assumed limb-aligned).
97 LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4
98 LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2
C Alignment loop (target of the bne L(top0) below): copy one limb per
C iteration until rp is 16-byte aligned.  NOTE(review): the positive
C offsets 12/8 here, vs. -4/-8 in the scalar loop above, imply a pointer
C adjustment between the two sections that is not visible in this chunk.
103 LIMB32(`lwz r0, 12(up) ')
104 LIMB64(`ld r0, 8(up) ')
105 addi up, up, -GMP_LIMB_BYTES
106 LIMB32(`addic. r7, r7, -1 ')
107 LIMB32(`stw r0, 12(rp) ')
108 LIMB64(`std r0, 8(rp) ')
109 addi rp, rp, -GMP_LIMB_BYTES
110 LIMB32(`bne L(top0) ')
C r0 = misalignment of up in limbs; presumably selects between the
C coaligned loop L(lpa) and the unaligned loop L(lpu) -- the branch
C itself is not visible.
114 LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4
115 LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2
C r7 = number of 2-vector (32-byte) iterations, i.e. n / LIMBS_PER_2VR.
117 LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
118 LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
119 mtctr r7 C copy n to count register
C Test whether n has an odd 16-byte chunk beyond the 32-byte unrolled
C iterations (n & LIMBS_PER_VR).
128 LIMB32(`andi. r0, n, 0x4 ')
129 LIMB64(`andi. r0, n, 0x2 ')
C Unaligned VMX copy loop -- only its first load is visible here.
143 L(lpu): lvx v0, 0, up
C Same odd-16-byte-chunk test, repeated on the coaligned path.
157 LIMB32(`andi. r0, n, 0x4 ')
158 LIMB64(`andi. r0, n, 0x2 ')
C Coaligned VMX copy loop -- only its first load is visible here.
167 L(lpa): lvx v0, 0, up
C Tail: r7 = number of leftover limbs (n mod LIMBS_PER_VR), copied one
C limb at a time by the L(top2) loop below (loop label not visible).
177 LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
178 LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
C 32-bit path indexes through the tail limbs with r10 = 12, 8, 4, ...;
C 64-bit path has at most one tail limb, at offset 8.
180 LIMB32(`li r10, 12 ')
182 LIMB32(`lwzx r0, r10, up ')
183 LIMB64(`ld r0, 8(up) ')
184 LIMB32(`addic. r7, r7, -1 ')
185 LIMB32(`stwx r0, r10, rp ')
186 LIMB64(`std r0, 8(rp) ')
187 LIMB32(`addi r10, r10, -GMP_LIMB_BYTES')
188 LIMB32(`bne L(top2) ')
C Restore the caller's VRSAVE (SPR 256) from r12 before returning.
190 L(ret): mtspr 256, r12