dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C 32-bit			 popcount	 hamdist
C				cycles/limb	cycles/limb
C P6 model 0-8,10-12		    -
C P6 model 9  (Banias)		    ?
C P6 model 13 (Dothan)		    4
C P4 model 0  (Willamette)	    ?
C P4 model 2  (Northwood)	    3.9
C P4 model 3  (Prescott)	    ?
C P4 model 4  (Nocona)		    ?

C 64-bit			 popcount	 hamdist
C				cycles/limb	cycles/limb
C P4 model 4  (Nocona)		    8
C * Make an mpn_hamdist based on this.  Alignment could either be handled by
C   using movdqu for one operand and movdqa for the other, or by painfully
C   shifting as we go.  Unfortunately, there seems to be no usable shift
C   instruction, except for one that takes an immediate count.  (See the
C   sketch after this list.)
C * It would probably be possible to cut a few cycles/limb using software
C   pipelining.
C * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C   needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C   integer-based popcount.  Such a combined loop would handle 6 limbs in
C   about 30 cycles on K8.
C * We could save a byte or two by using 32-bit operations on areg.
C * Check if using movdqa to a temp and then a register-based pand is faster.
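C A minimal sketch of the movdqu/movdqa hamdist idea, assuming a second
C source pointer `vp' (hypothetical; hamdist is the popcount of the xor
C of the two operands):
C	movdqa	(up), %xmm0
C	movdqu	(vp), %xmm1
C	pxor	%xmm1, %xmm0	C combine operands, then count bits as below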
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',` $1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rbx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',` $1')
')
define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
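C With 32-bit limbs this gives LIMBS_PER_XMM = 4 and LIMBS_PER_2XMM = 8;
C with 64-bit limbs, 2 and 4 respectively.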
undefine(`psadbw')			C override inherited m4 version

C Make cnsts global to work around Apple relocation bug.
define(`cnsts', MPN(popccnsts))
PROLOGUE(mpn_popcount)

LIMB32(`	mov	4(%esp), up	')
LIMB32(`	mov	8(%esp), n	')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`	pxor	zero, zero	')

LIMB32(`	mov	$cnsts, breg	')
LIMB64(`	movabs	$cnsts, breg	')
	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111
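C The cnsts label sits between the two halves of the constant table, so
C the three bit masks above are reached at negative offsets while the
C edge masks further down are reached at non-negative ones.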
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'
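C The pand used the low-end masks (at breg+64) to clear the limbs below
C `up' from the first aligned read; the shr converts areg from a byte
C offset into the limb count by which n must grow.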
	sub	$LIMBS_PER_XMM, n

	sub	$LIMBS_PER_XMM, n
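C The loop below implements the classic SWAR bit-count reduction; per
C 128-bit word, roughly, in C notation:
C	x = (x & 0x55..55) + ((x >> 1) & 0x55..55)	C 2-bit field sums
C	x = (x & 0x33..33) + ((x >> 2) & 0x33..33)	C 4-bit field sums
C	x = (x & 0x0f..0f) + ((x >> 4) & 0x0f..0f)	C 8-bit field sums
C after which psadbw against zero folds the sixteen byte sums into two
C 64-bit words that are accumulated into %xmm3.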
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4

	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
LIMB32(`	pxor	zero, zero	')

	sub	$LIMBS_PER_2XMM, n

	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1

	paddq	%xmm1, %xmm3		C add to grand total

	add	$LIMBS_PER_2XMM, n

	sub	$LIMBS_PER_XMM, n

	shl	$m4_log2(GMP_LIMB_BYTES), n
	pand	(breg,n,4), %xmm0
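C The pand above selects a high-end mask (at breg) to clear the limbs
C read past the operand's last limb.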
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4

	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5

LIMB32(`	pxor	zero, zero	')

	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1

	paddq	%xmm1, %xmm3		C add to grand total
C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00

C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
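C Each edge mask keeps a whole number of 4-byte words; with 64-bit limbs
C only the 8-byte-aligned entries are used.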