1 dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
2 dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
3 dnl logical operations.
5 dnl Copyright 2006 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
25 C and,ior,andn,nior,xor iorn,xnor nand
26 C cycles/limb cycles/limb cycles/limb
27 C 7400,7410 (G4): 1.39 ? ?
28 C 744x,745x (G4+): 1.14 1.39 1.39
32 C * Works for all sizes and alignment for 32-bit limbs.
33 C * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
34 C * Current performance makes this pointless for 970
37 C * Might want to make variants when just one of the source operands needs
38 C vperm, and when neither needs it. The latter runs 50% faster on 7400.
39 C * Idea: If the source operands are equally aligned, we could do the logops
40 C first, then vperm before storing! That means we never need more than one
42 C * Perhaps align `rp' after initial alignment loop?
43 C * Instead of having scalar code in the beginning and end, consider using
44 C read-modify-write vector code.
45 C * Software pipeline? Hopefully not too important, this is hairy enough
47 C * At least be more clever about operand loading, i.e., load v operands before
48 C u operands, since v operands are sometimes negated.
dnl  Limb geometry: bytes per limb, and how many limbs fit in one 16-byte
dnl  vector register (VR) and in a pair of vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

dnl  Complement hooks used to synthesize operations VMX lacks:
dnl  `vnegb' complements the b source operand BEFORE the logop (iorn, xnor);
dnl  `vnega' complements the result AFTER the logop (nand).
dnl  Both default to empty for the operations with a direct VMX instruction.
define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null
dnl  Per-operation configuration.  The build system defines exactly one
dnl  OPERATION_* symbol; it selects the public entry name (`func'), the
dnl  scalar fixed-point instruction used in the alignment/fix-up loops
dnl  (`logopS'), and the VMX vector instruction used in the main loop
dnl  (`logop').  Operations without a direct VMX counterpart are composed
dnl  from an available vector op plus `vnegb'/`vnega' (see above).
ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
dnl  nand: VMX has no vnand, so use vand then complement the result.
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
dnl  iorn: VMX has no vorc, so complement b first, then vor.
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
dnl  xnor: VMX has no veqv, so complement b first, then vxor.
ifdef(`OPERATION_xnor_n',
`	define(`func',	`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
dnl  Limb-size line selectors: in the 32-bit-limb branch LIMB32 passes its
dnl  argument through; the visible LIMB64 pass-through define belongs to the
dnl  64-bit branch.
dnl  NOTE(review): the middle of this ifelse (the empty complementary
dnl  defines and the branch separator) is elided from this view -- the two
dnl  visible defines come from different branches; confirm against the full
dnl  file before editing.
ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`	$1')
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

C NOTE(review): this view of the routine is incomplete -- the PROLOGUE/entry
C line, several loop labels (L(top0), L(top2), the scalar-loop entry), and
C most of the vector-loop bodies are elided.  The comments below describe
C only the instructions that are visible here.

C Size dispatch: small operands (n < 8 limbs on 32-bit, n < 4 on 64-bit)
C presumably take the scalar path; the branch consuming this compare is not
C visible in this view.
LIMB32(`	cmpwi	cr0, n, 8	')
LIMB64(`	cmpdi	cr0, n, 4	')

C Scalar loop body (32-bit): one limb per iteration through logopS; bdz
C decrements CTR and exits when the count is exhausted (the mtctr that
C seeds CTR for this loop is elided from this view).
LIMB32(`	lwz	r8, 0(up)	')
LIMB32(`	lwz	r9, 0(vp)	')
LIMB32(`	logopS(	r0, r8, r9)	')
LIMB32(`	stw	r0, 0(rp)	')
LIMB32(`	bdz	L(endS)		')

C Advance one limb: load-with-update forms on 32-bit, explicit pointer
C bumps on 64-bit.
LIMB32(`	lwzu	r8, 4(up)	')
LIMB64(`	ld	r8, 0(up)	')
LIMB64(`	addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`	lwzu	r9, 4(vp)	')
LIMB64(`	ld	r9, 0(vp)	')
LIMB64(`	addi	vp, vp, GMP_LIMB_BYTES	')
LIMB32(`	stwu	r0, 4(rp)	')
LIMB64(`	std	r0, 0(rp)	')
LIMB64(`	addi	rp, rp, GMP_LIMB_BYTES	')

C Large-operand VMX path.  SPR 256 is VRSAVE: save the caller's value in
C r12 (restored at L(ret)) and mark the VRs this code uses as live.
L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C Set VRSAVE bits 0-13  FIXME

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
LIMB32(`	rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`	rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2

	subfic	r7, r0, LIMBS_PER_VR	C r7 = limbs needed to reach alignment
C Alignment-loop body (its L(top0) label and the logopS line are elided):
C combine one limb at a time, indexing through r10.
LIMB32(`	lwz	r8, 0(up)	')
LIMB64(`	ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`	lwz	r9, 0(vp)	')
LIMB64(`	ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`	addic.	r7, r7, -1	')
LIMB32(`	stwx	r0, r10, rp	')
LIMB64(`	std	r0, 0(rp)	')
LIMB32(`	addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`	bne	L(top0)		')

	addi	rp, rp, 16		C update rp, but preserve its alignment

C Vector-loop trip count: one iteration per 16 bytes, i.e. n/4 limbs
C (32-bit) or n/2 limbs (64-bit).
LIMB64(`	srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`	srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

L(gt1):	addi	up, up, 16
L(top):	lvx	v0, 0, up		C 16-byte aligned vector load from up

C Misaligned-source handling: vperm merges two aligned loads (v2, v0)
C through the permute mask `us' -- presumably computed earlier from up's
C misalignment (the lvsl/mask setup is elided from this view).
1:	vperm	v4, v2, v0, us

L(end):	andi.	r0, up, 15		C is up 16-byte aligned?
1:	vperm	v4, v0, v2, us

C Scalar fix-up for the trailing n mod LIMBS_PER_VR limbs (loop label
C L(top2) and the logopS line are elided from this view).
LIMB32(`	rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`	rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
LIMB32(`	rlwinm	rp, rp, 0,0,27	')	C round rp down to 16-byte boundary
LIMB64(`	rldicr	rp, rp, 0,59	')
LIMB32(`	lwzx	r8, r10, up	')
LIMB64(`	ldx	r8, r10, up	')
LIMB32(`	lwzx	r9, r10, vp	')
LIMB64(`	ldx	r9, r10, vp	')
LIMB32(`	addic.	r7, r7, -1	')
LIMB32(`	stwx	r0, r10, rp	')
LIMB64(`	std	r0, 0(rp)	')
LIMB32(`	addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`	bne	L(top2)		')

C Restore the caller's VRSAVE and return.
L(ret):	mtspr	256, r12
282 C This works for 64-bit PowerPC, since a limb ptr can only be aligned
283 C in 2 relevant ways, which means we can always find a pair of aligned
284 C pointers of rp, up, and vp.
285 C process words until rp is 16-byte aligned
286 C if (((up | vp) & 15) == 0)
287 C process with VMX without any vperm
288 C else if ((up & 15) != 0 && (vp & 15) != 0)
289 C process with VMX using vperm on store data
290 C else if ((up & 15) != 0)
C   process with VMX using vperm on up data
C else
C   process with VMX using vperm on vp data
C	rlwinm	r0, up, 0,28,31
296 C rlwinm r0, vp, 0,28,31
299 C crand cr0, cr0, cr7