1 dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
3 dnl Copyright 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C 7400,7410 (G4): 2.75
24 C 744x,745x (G4+): 2.25
28 C * Works for all sizes and alignments.
31 C * Tune the awkward huge n outer loop code.
32 C * Two lvx, two vperm, and two vxor could make us a similar hamdist.
33 C * For the 970, a combined VMX+intop approach might be best.
34 C * Compress cnsts table in 64-bit mode, only half the values are needed.
C Limb geometry: bytes per limb, and the number of limbs that fit in one
C 16-byte vector register and in two of them (32 bytes).
36 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
37 define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
38 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
C Operation selection: `func' is set to the entry point name matching the
C OPERATION_* symbol that is defined.
40 define(`OPERATION_popcount')
42 ifdef(`OPERATION_popcount',`
43 define(`func',`mpn_popcount')
48 ifdef(`OPERATION_hamdist',`
49 define(`func',`mpn_hamdist')
C Vector registers holding the bit masks.  Each name spells the repeating
C bit pattern of the mask: 01010101 = 0x55, 00110011 = 0x33, 00001111 = 0x0f.
56 define(`x01010101',`v2')
57 define(`x00110011',`v7')
58 define(`x00001111',`v10')
C LIMB32(insn) and LIMB64(insn) wrap instructions that are specific to
C 32-bit and 64-bit limbs respectively; the choice is made on GMP_LIMB_BITS.
63 ifelse(GMP_LIMB_BITS,32,`
64 define(`LIMB32',` $1')
68 define(`LIMB64',` $1')
71 C The inner loop handles up to 2^34 bits, i.e., 2^28 64-bit limbs, due to overflow
72 C in vsum4ubs. For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
C Both constants are used as `lis'/`addis' immediates and are therefore
C scaled by 2^16: the chunk size is 0x1000 << 16 = 2^28 limbs, and the
C threshold is 0x1001 << 16 limbs.
73 define(`LIMBS_PER_CHUNK', 0x1000)
74 define(`LIMBS_CHUNK_THRES', 0x1001)
C mpn_popcount -- count the set bits of a limb vector using VMX.
C
C `up' and `n' are register names defined outside the lines shown here;
C presumably up = pointer to the limbs and n = number of limbs -- confirm.
C
C Method: each pair of 16-byte vectors is reduced to per-byte bit counts by
C the usual mask-and-add steps (2-bit, 4-bit, then 8-bit accumulators), and
C vsum4ubs adds those bytes into the four 32-bit fields of v3.  v3 is
C finally stored just below the stack pointer.  With 64-bit limbs, operands
C above the chunk threshold are processed in chunks so that the 32-bit
C fields of v3 cannot overflow.
77 PROLOGUE(mpn_popcount)
79 oris r0, r10, 0xfffc C Set VRSAVE bit 0-13
82 ifdef(`HAVE_ABI_mode32',
83 ` rldicl n, n, 0, 32') C zero extend n
85 C Load various constants into vector registers
88 vspltisb cnt1, 1 C 0x0101...01 used as shift count
89 vspltisb cnt2, 2 C 0x0202...02 used as shift count
90 vspltisb cnt4, 4 C 0x0404...04 used as shift count
91 lvx x01010101, 0, r11 C 0x5555...55
92 lvx x00110011, r12, r11 C 0x3333...33
93 vspltisb x00001111, 15 C 0x0f0f...0f
95 LIMB64(`lis r0, LIMBS_CHUNK_THRES ') C r0 = LIMBS_CHUNK_THRES << 16
96 LIMB64(`cmpd cr7, n, r0 ') C cr7 = n vs. threshold, tested further down
100 rlwinm r6, up, 2,26,29 C r6 = 4 * (up mod 16)
104 LIMB32(`rlwinm r8, up, 30,30,31 ') C r8 = (up / 4) mod 4
105 LIMB64(`rlwinm r8, up, 29,31,31 ') C r8 = (up / 8) mod 2
106 add n, n, r8 C compensate n for rounded down `up'
109 li r8, 0 C grand total count
111 vxor v3, v3, v3 C zero total count
113 addic. n, n, -LIMBS_PER_VR C n -= LIMBS_PER_VR (record form, sets cr0)
116 addic. n, n, -LIMBS_PER_VR C n -= LIMBS_PER_VR (record form, sets cr0)
119 C For 64-bit machines, handle huge n that would overflow vsum4ubs
120 LIMB64(`ble cr7, L(small) ')
121 LIMB64(`addis r9, n, -LIMBS_PER_CHUNK ') C remaining n
122 LIMB64(`lis n, LIMBS_PER_CHUNK ')
126 LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
127 LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
129 mtctr r7 C copy n to count register
133 L(top): lvx v0, 0, up
134 li r7, 128 C prefetch distance
135 L(ent): lvx v1, r12, up
139 dcbt up, r7 C prefetch
140 vand v8, v4, x01010101
141 vand v9, v5, x01010101
142 vsububm v0, v0, v8 C 64 2-bit accumulators (0..2)
143 vsububm v1, v1, v9 C 64 2-bit accumulators (0..2)
146 vand v8, v0, x00110011
147 vand v9, v1, x00110011
148 vand v4, v4, x00110011
149 vand v5, v5, x00110011
150 vaddubm v0, v4, v8 C 32 4-bit accumulators (0..4)
151 vaddubm v1, v5, v9 C 32 4-bit accumulators (0..4)
152 vaddubm v8, v0, v1 C 32 4-bit accumulators (0..8)
154 vand v6, v8, x00001111
155 vand v9, v9, x00001111
156 vaddubm v6, v9, v6 C 16 8-bit accumulators (0..16)
157 vsum4ubs v3, v6, v3 C sum 4 x 4 bytes into 4 32-bit fields
160 andi. n, n, eval(LIMBS_PER_2VR-1) C n = n mod LIMBS_PER_2VR, sets cr0
165 cmpwi n, LIMBS_PER_VR
171 LIMB32(`rlwinm r6, n, 4,26,27 ') C r6 = 16 * (n mod 4)
172 LIMB64(`rlwinm r6, n, 5,26,26 ') C r6 = 32 * (n mod 2)
179 vand v8, v4, x01010101
180 vand v9, v5, x01010101
181 vsububm v0, v0, v8 C 64 2-bit accumulators (0..2)
182 vsububm v1, v1, v9 C 64 2-bit accumulators (0..2)
185 vand v8, v0, x00110011
186 vand v9, v1, x00110011
187 vand v4, v4, x00110011
188 vand v5, v5, x00110011
189 vaddubm v0, v4, v8 C 32 4-bit accumulators (0..4)
190 vaddubm v1, v5, v9 C 32 4-bit accumulators (0..4)
191 vaddubm v8, v0, v1 C 32 4-bit accumulators (0..8)
193 vand v6, v8, x00001111
194 vand v9, v9, x00001111
195 vaddubm v6, v9, v6 C 16 8-bit accumulators (0..16)
196 vsum4ubs v3, v6, v3 C sum 4 x 4 bytes into 4 32-bit fields
199 li r7, -16 C FIXME: do all ppc32 and ppc64 ABIs
200 stvx v3, r7, r1 C FIXME: ...support storing below sp?
211 C Handle outer loop for huge n. We inherit cr7 and r0 from above.
212 LIMB64(`ble cr7, L(ret)
213 vxor v3, v3, v3 C zero total count
217 addis r9, n, -LIMBS_PER_CHUNK C remaining n
218 lis n, LIMBS_PER_CHUNK
219 L(2): srdi r7, n, 2 C loop count corresponding to n
220 mtctr r7 C copy n to count register
C Constant table; every entry is 16 bytes (one vector register).
C Bit mask 0x55...55 (pattern 01010101)
230 .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
231 .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
C Bit mask 0x33...33 (pattern 00110011)
233 .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
234 .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
235 C Masks for high end of number
236 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
237 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
239 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
240 .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
242 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
243 .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
245 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
246 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
247 C Masks for low end of number
248 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
249 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
251 .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
252 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
254 .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
255 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
257 .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
258 .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff