mpn/powerpc64/vmx/popcount.asm

   1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
   2
   3 dnl  Copyright 2006 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22 C                   cycles/limb
  23 C 7400,7410 (G4):       2.75
  24 C 744x,745x (G4+):      2.25
  25 C 970 (G5):             5.3
  26
  27 C STATUS
  28 C  * Works for all sizes and alignments.
  29
  30 C TODO
  31 C  * Tune the awkward huge n outer loop code.
  32 C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
  33 C  * For the 970, a combined VMX+intop approach might be best.
  34 C  * Compress cnsts table in 64-bit mode, only half the values are needed.
  35
  36 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
  37 define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
  38 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
  39
  40 define(`OPERATION_popcount')
  41
  42 ifdef(`OPERATION_popcount',`
  43   define(`func',`mpn_popcount')
  44   define(`up',          `r3')
  45   define(`n',           `r4')
  46   define(`HAM',         `dnl')
  47 ')
  48 ifdef(`OPERATION_hamdist',`
  49   define(`func',`mpn_hamdist')
  50   define(`up',          `r3')
  51   define(`vp',          `r4')
  52   define(`n',           `r5')
  53   define(`HAM',         `$1')
  54 ')
  55
  56 define(`x01010101',`v2')
  57 define(`x00110011',`v7')
  58 define(`x00001111',`v10')
  59 define(`cnt1',`v11')
  60 define(`cnt2',`v12')
  61 define(`cnt4',`v13')
  62
  63 ifelse(GMP_LIMB_BITS,32,`
  64         define(`LIMB32',`       $1')
  65         define(`LIMB64',`')
  66 ',`
  67         define(`LIMB32',`')
  68         define(`LIMB64',`       $1')
  69 ')
  70
  71 C The inner loop handles up to 2^34 bits, i.e., 2^31 64-limbs, due to overflow
  72 C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
  73 define(`LIMBS_PER_CHUNK', 0x1000)
  74 define(`LIMBS_CHUNK_THRES', 0x1001)
  75
  76 ASM_START()
  77 PROLOGUE(mpn_popcount)
  78         mfspr   r10, 256
  79         oris    r0, r10, 0xfffc         C Set VRSAVE bit 0-13
  80         mtspr   256, r0
  81
  82 ifdef(`HAVE_ABI_mode32',
  83 `       rldicl  n, n, 0, 32')           C zero extend n
  84
  85 C Load various constants into vector registers
  86         LEAL(   r11, cnsts)
  87         li      r12, 16
  88         vspltisb cnt1, 1                C 0x0101...01 used as shift count
  89         vspltisb cnt2, 2                C 0x0202...02 used as shift count
  90         vspltisb cnt4, 4                C 0x0404...04 used as shift count
  91         lvx     x01010101, 0, r11       C 0x3333...33
  92         lvx     x00110011, r12, r11     C 0x5555...55
  93         vspltisb x00001111, 15          C 0x0f0f...0f
  94
  95 LIMB64(`lis     r0, LIMBS_CHUNK_THRES   ')
  96 LIMB64(`cmpd    cr7, n, r0              ')
  97
  98         lvx     v0, 0, up
  99         addi    r7, r11, 96
 100         rlwinm  r6, up, 2,26,29
 101         lvx     v8, r7, r6
 102         vand    v0, v0, v8
 103
 104 LIMB32(`rlwinm  r8, up, 30,30,31        ')
 105 LIMB64(`rlwinm  r8, up, 29,31,31        ')
 106         add     n, n, r8                C compensate n for rounded down `up'
 107
 108         vxor    v1, v1, v1
 109         li      r8, 0                   C grand total count
 110
 111         vxor    v3, v3, v3              C zero total count
 112
 113         addic.  n, n, -LIMBS_PER_VR
 114         ble     L(sum)
 115
 116         addic.  n, n, -LIMBS_PER_VR
 117         ble     L(lsum)
 118
 119 C For 64-bit machines, handle huge n that would overflow vsum4ubs
 120 LIMB64(`ble     cr7, L(small)           ')
 121 LIMB64(`addis   r9, n, -LIMBS_PER_CHUNK ') C remaining n
 122 LIMB64(`lis     n, LIMBS_PER_CHUNK      ')
 123 L(small):
 124
 125
 126 LIMB32(`srwi    r7, n, 3        ')      C loop count corresponding to n
 127 LIMB64(`srdi    r7, n, 2        ')      C loop count corresponding to n
 128         addi    r7, r7, 1
 129         mtctr   r7                      C copy n to count register
 130         b       L(ent)
 131
 132         ALIGN(8)
 133 L(top): lvx     v0, 0, up
 134         li      r7, 128                 C prefetch distance
 135 L(ent): lvx     v1, r12, up
 136         addi    up, up, 32
 137         vsr     v4, v0, cnt1
 138         vsr     v5, v1, cnt1
 139         dcbt    up, r7                  C prefetch
 140         vand    v8, v4, x01010101
 141         vand    v9, v5, x01010101
 142         vsububm v0, v0, v8              C 64 2-bit accumulators (0..2)
 143         vsububm v1, v1, v9              C 64 2-bit accumulators (0..2)
 144         vsr     v4, v0, cnt2
 145         vsr     v5, v1, cnt2
 146         vand    v8, v0, x00110011
 147         vand    v9, v1, x00110011
 148         vand    v4, v4, x00110011
 149         vand    v5, v5, x00110011
 150         vaddubm v0, v4, v8              C 32 4-bit accumulators (0..4)
 151         vaddubm v1, v5, v9              C 32 4-bit accumulators (0..4)
 152         vaddubm v8, v0, v1              C 32 4-bit accumulators (0..8)
 153         vsr     v9, v8, cnt4
 154         vand    v6, v8, x00001111
 155         vand    v9, v9, x00001111
 156         vaddubm v6, v9, v6              C 16 8-bit accumulators (0..16)
 157         vsum4ubs v3, v6, v3             C sum 4 x 4 bytes into 4 32-bit fields
 158         bdnz    L(top)
 159
 160         andi.   n, n, eval(LIMBS_PER_2VR-1)
 161         beq     L(rt)
 162
 163         lvx     v0, 0, up
 164         vxor    v1, v1, v1
 165         cmpwi   n, LIMBS_PER_VR
 166         ble     L(sum)
 167 L(lsum):
 168         vor     v1, v0, v0
 169         lvx     v0, r12, up
 170 L(sum):
 171 LIMB32(`rlwinm  r6, n, 4,26,27  ')
 172 LIMB64(`rlwinm  r6, n, 5,26,26  ')
 173         addi    r7, r11, 32
 174         lvx     v8, r7, r6
 175         vand    v0, v0, v8
 176
 177         vsr     v4, v0, cnt1
 178         vsr     v5, v1, cnt1
 179         vand    v8, v4, x01010101
 180         vand    v9, v5, x01010101
 181         vsububm v0, v0, v8              C 64 2-bit accumulators (0..2)
 182         vsububm v1, v1, v9              C 64 2-bit accumulators (0..2)
 183         vsr     v4, v0, cnt2
 184         vsr     v5, v1, cnt2
 185         vand    v8, v0, x00110011
 186         vand    v9, v1, x00110011
 187         vand    v4, v4, x00110011
 188         vand    v5, v5, x00110011
 189         vaddubm v0, v4, v8              C 32 4-bit accumulators (0..4)
 190         vaddubm v1, v5, v9              C 32 4-bit accumulators (0..4)
 191         vaddubm v8, v0, v1              C 32 4-bit accumulators (0..8)
 192         vsr     v9, v8, cnt4
 193         vand    v6, v8, x00001111
 194         vand    v9, v9, x00001111
 195         vaddubm v6, v9, v6              C 16 8-bit accumulators (0..16)
 196         vsum4ubs v3, v6, v3             C sum 4 x 4 bytes into 4 32-bit fields
 197
 198 L(rt):
 199         li      r7, -16                 C FIXME: does all ppc32 and ppc64 ABIs
 200         stvx    v3, r7, r1              C FIXME: ...support storing below sp?
 201
 202         lwz     r7, -16(r1)
 203         add     r8, r8, r7
 204         lwz     r7, -12(r1)
 205         add     r8, r8, r7
 206         lwz     r7, -8(r1)
 207         add     r8, r8, r7
 208         lwz     r7, -4(r1)
 209         add     r8, r8, r7
 210
 211 C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
 212 LIMB64(`ble     cr7, L(ret)
 213         vxor    v3, v3, v3              C zero total count
 214         mr      n, r9
 215         cmpd    cr7, n, r0
 216         ble     cr7, L(2)
 217         addis   r9, n, -LIMBS_PER_CHUNK C remaining n
 218         lis     n, LIMBS_PER_CHUNK
 219 L(2):   srdi    r7, n, 2                C loop count corresponding to n
 220         mtctr   r7                      C copy n to count register
 221         b       L(top)
 222 ')
 223
 224 L(ret): mr      r3, r8
 225         mtspr   256, r10
 226         blr
 227 EPILOGUE()
 228
 229 DEF_OBJECT(cnsts,16)
 230         .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
 231         .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
 232
 233         .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
 234         .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
 235 C Masks for high end of number
 236         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 237         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 238
 239         .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
 240         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 241
 242         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 243         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 244
 245         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 246         .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
 247 C Masks for low end of number
 248         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 249         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 250
 251         .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
 252         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 253
 254         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 255         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 256
 257         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 258         .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
 259 END_OBJECT(cnsts)
 260 ASM_END()