mpn/powerpc32/vmx/logops_n.asm

   1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
   2 dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
   3 dnl  logical operations.
   4
   5 dnl  Copyright 2006 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of the GNU Lesser General Public License as published
  11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  12 dnl  your option) any later version.
  13
  14 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  17 dnl  License for more details.
  18
  19 dnl  You should have received a copy of the GNU Lesser General Public License
  20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  21
  22 include(`../config.m4')
  23
  24
  25 C               and,ior,andn,nior,xor    iorn,xnor         nand
  26 C                   cycles/limb         cycles/limb    cycles/limb
  27 C 7400,7410 (G4):       1.39                 ?              ?
  28 C 744x,745x (G4+):      1.14                1.39           1.39
  29 C 970:                  1.7                 2.0            2.0
  30
  31 C STATUS
  32 C  * Works for all sizes and alignments for 32-bit limbs.
  33 C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
  34 C  * Current performance makes this pointless for 970
  35
  36 C TODO
  37 C  * Might want to make variants when just one of the source operands needs
  38 C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
  39 C  * Idea: If the source operands are equally aligned, we could do the logops
  40 C    first, then vperm before storing!  That means we never need more than one
  41 C    vperm, ever!
  42 C  * Perhaps align `rp' after initial alignment loop?
  43 C  * Instead of having scalar code in the beginning and end, consider using
  44 C    read-modify-write vector code.
  45 C  * Software pipeline?  Hopefully not too important, this is hairy enough
  46 C    already.
  47 C  * At least be more clever about operand loading, i.e., load v operands before
  48 C    u operands, since v operands are sometimes negated.
  49
  50 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))      C bytes per limb
  51 define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))     C limbs in one 16-byte vector register
  52 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))     C limbs in two vector registers (32 bytes)
  53
     C vnegb and vnega are optional one-instruction hooks around the vector logop:
     C vnegb is expanded on the vp-side vector before logop, vnega on the result
     C after it.  Both expand to nothing unless an OPERATION_xxx block redefines
     C them.
  54 define(`vnegb', `')             C default neg-before to null
  55 define(`vnega', `')             C default neg-after to null
  56
  57 ifdef(`OPERATION_and_n',
  58 `       define(`func',  `mpn_and_n')
  59         define(`logopS',`and    $1,$2,$3')
  60         define(`logop', `vand   $1,$2,$3')')         C up & vp
  61 ifdef(`OPERATION_andn_n',
  62 `       define(`func',  `mpn_andn_n')
  63         define(`logopS',`andc   $1,$2,$3')
  64         define(`logop', `vandc  $1,$2,$3')')         C up & ~vp
  65 ifdef(`OPERATION_nand_n',
  66 `       define(`func',  `mpn_nand_n')
  67         define(`logopS',`nand   $1,$2,$3')
  68         define(`logop', `vand   $1,$2,$3')
  69         define(`vnega', `vnor   $1,$2,$2')')         C ~(up & vp): vand, then complement the result
  70 ifdef(`OPERATION_ior_n',
  71 `       define(`func',  `mpn_ior_n')
  72         define(`logopS',`or     $1,$2,$3')
  73         define(`logop', `vor    $1,$2,$3')')         C up | vp
  74 ifdef(`OPERATION_iorn_n',
  75 `       define(`func',  `mpn_iorn_n')
  76         define(`logopS',`orc    $1,$2,$3')
  77         define(`vnegb', `vnor   $1,$2,$2')
  78         define(`logop', `vor    $1,$2,$3')')         C up | ~vp: complement the vp vector, then vor
  79 ifdef(`OPERATION_nior_n',
  80 `       define(`func',  `mpn_nior_n')
  81         define(`logopS',`nor    $1,$2,$3')
  82         define(`logop', `vnor   $1,$2,$3')')         C ~(up | vp)
  83 ifdef(`OPERATION_xor_n',
  84 `       define(`func',  `mpn_xor_n')
  85         define(`logopS',`xor    $1,$2,$3')
  86         define(`logop', `vxor   $1,$2,$3')')         C up ^ vp
  87 ifdef(`OPERATION_xnor_n',
  88 `       define(`func',`mpn_xnor_n')
  89         define(`logopS',`eqv    $1,$2,$3')
  90         define(`vnegb', `vnor   $1,$2,$2')
  91         define(`logop', `vxor   $1,$2,$3')')         C ~(up ^ vp): vector code computes up ^ ~vp
     C Each block above that is taken defines func (the entry point name), logopS
     C (the scalar instruction, used as logopS(dst, u, v)) and logop (the vector
     C instruction, used as logop(dst, u, v)).  vnor d,s,s is a vector complement,
     C which the nand, iorn and xnor variants plug into vnega or vnegb.
     C NOTE(review): presumably the build defines exactly one OPERATION_xxx per
     C assembly of this file -- nothing here checks that; confirm.
  92
  93 ifelse(GMP_LIMB_BITS,`32',`
  94         define(`LIMB32',`       $1')
  95         define(`LIMB64',`')
  96 ',`
  97         define(`LIMB32',`')
  98         define(`LIMB64',`       $1')
  99 ')
     C LIMB32(insn) expands to insn when GMP_LIMB_BITS is 32 and to nothing
     C otherwise; LIMB64(insn) is the reverse.  The code below uses them to keep
     C the 32-bit and 64-bit limb variants of each step side by side.
 100
 101 C INPUT PARAMETERS
 102 define(`rp',    `r3')           C destination limb pointer (stored to)
 103 define(`up',    `r4')           C first source limb pointer
 104 define(`vp',    `r5')           C second source limb pointer
 105 define(`n',     `r6')           C limb count
 106
 107 define(`us',    `v8')           C lvsl permute control for the up operand
 108 define(`vs',    `v9')           C lvsl permute control for the vp operand
 109
     C The list below names every func value the OPERATION_xxx blocks can select.
     C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; it presumably
     C also gets scanned by the build scripts, so keep it on one line with nothing
     C after the closing parenthesis -- confirm.
 110 MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
 111
 112 ASM_START()
 113 PROLOGUE(func)
     C func (rp, up, vp, n): combine the n-limb operands at up and vp limb by limb
     C with the selected logical operation and store the n result limbs at rp.
     C
     C Operands shorter than 8 limbs (32-bit limbs) or 4 limbs (64-bit limbs) are
     C handled by the scalar loop right below.  Longer operands go to L(big), which
     C   1. does scalar limbs until rp is 16-byte aligned,
     C   2. does n / LIMBS_PER_VR whole vectors, realigning the sources with
     C      lvsl + vperm,
     C   3. finishes the remaining n mod LIMBS_PER_VR limbs with scalar code.
     C Scratch registers: r0, r7-r10, r12, ctr, cr0, v0-v6, v8, v9.
     C NOTE(review): the scalar path loads ctr with n and decrements before
     C testing, so it assumes n >= 1 -- confirm callers guarantee this.
 114
 115 LIMB32(`cmpwi   cr0, n, 8       ')
 116 LIMB64(`cmpdi   cr0, n, 4       ')
 117         bge     L(big)                  C long enough for the vector path
 118
 119         mtctr   n                       C scalar path: ctr = limb count
 120
     C 32-bit limbs only: the first limb uses plain lwz/stw so that the loop can
     C use the update forms (lwzu/stwu), which advance the pointer by 4 first.
 121 LIMB32(`lwz     r8, 0(up)       ')
 122 LIMB32(`lwz     r9, 0(vp)       ')
 123 LIMB32(`logopS( r0, r8, r9)     ')
 124 LIMB32(`stw     r0, 0(rp)       ')
 125 LIMB32(`bdz     L(endS)         ')      C that was the only limb
 126
 127 L(topS):
 128 LIMB32(`lwzu    r8, 4(up)       ')
 129 LIMB64(`ld      r8, 0(up)       ')
 130 LIMB64(`addi    up, up, GMP_LIMB_BYTES  ')
 131 LIMB32(`lwzu    r9, 4(vp)       ')
 132 LIMB64(`ld      r9, 0(vp)       ')
 133 LIMB64(`addi    vp, vp, GMP_LIMB_BYTES  ')
 134         logopS( r0, r8, r9)
 135 LIMB32(`stwu    r0, 4(rp)       ')
 136 LIMB64(`std     r0, 0(rp)       ')
 137 LIMB64(`addi    rp, rp, GMP_LIMB_BYTES  ')
 138         bdnz    L(topS)
 139 L(endS):
 140         blr
 141
 142 L(big): mfspr   r12, 256                C r12 = VRSAVE on entry, restored at L(ret)
 143         oris    r0, r12, 0xfffc         C Set VRSAVE bit 0-13 FIXME
 144         mtspr   256, r0
 145
 146 C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
 147 C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
 148
 149 LIMB32(`rlwinm. r0, rp, 30,30,31')      C (rp >> 2) mod 4
 150 LIMB64(`rlwinm. r0, rp, 29,31,31')      C (rp >> 3) mod 2
 151         beq     L(aligned)
 152
 153         subfic  r7, r0, LIMBS_PER_VR    C r7 = limbs up to the next 16-byte boundary
 154 LIMB32(`li      r10, 0          ')      C r10 = byte offset from rp for the stores
 155         subf    n, r7, n                C n = limbs left after this loop
 156 L(top0):
 157 LIMB32(`lwz     r8, 0(up)       ')
 158 LIMB64(`ld      r8, 0(up)       ')
 159         addi    up, up, GMP_LIMB_BYTES
 160 LIMB32(`lwz     r9, 0(vp)       ')
 161 LIMB64(`ld      r9, 0(vp)       ')
 162         addi    vp, vp, GMP_LIMB_BYTES
 163 LIMB32(`addic.  r7, r7, -1      ')      C count down; cr0 is tested by the bne below
 164         logopS( r0, r8, r9)
 165 LIMB32(`stwx    r0, r10, rp     ')      C rp itself is not advanced in this loop
 166 LIMB64(`std     r0, 0(rp)       ')
 167 LIMB32(`addi    r10, r10, GMP_LIMB_BYTES')
 168 LIMB32(`bne     L(top0)         ')      C 64-bit limbs: no branch, exactly one limb is done
 169
 170         addi    rp, rp, 16              C update rp, but preserve its alignment
     C rp keeps its low 4 address bits from here on.  stvx ignores those bits, so
     C the vector stores below go to the 16-byte block following the limbs just
     C written.
 171
 172 L(aligned):
 173 LIMB64(`srdi    r7, n, 1        ')      C r7 = n / 2, whole vectors to process
 174 LIMB32(`srwi    r7, n, 2        ')      C r7 = n / 4, whole vectors to process
 175         mtctr   r7                      C ctr = vector count
 176
 177         li      r10, 16                 C r10 = byte offset of the following quadword
 178         lvsl    us, 0, up               C permute controls taken from the low 4
 179         lvsl    vs, 0, vp               C address bits of up and vp
 180
 181         lvx     v2, 0, up               C aligned quadwords holding the first limbs
 182         lvx     v3, 0, vp               C of up and vp (lvx ignores the low 4 bits)
 183         bdnz    L(gt1)                  C more than one vector to do
     C Exactly one vector.  This is reached only when the alignment loop ran and
     C left n in LIMBS_PER_VR+1 .. 2*LIMBS_PER_VR-1, so the quadwords at offset 16
     C still overlap the operands and are loaded without an alignment test.
 184         lvx     v0, r10, up
 185         lvx     v1, r10, vp
 186         vperm   v4, v2, v0, us          C v4 = 16 bytes starting at up
 187         vperm   v5, v3, v1, vs          C v5 = 16 bytes starting at vp
 188         vnegb(  v5, v5)
 189         logop(  v6, v4, v5)
 190         vnega(  v6, v6)
 191         stvx    v6, 0, rp
 192         addi    up, up, 16              C step past the source limbs just consumed
 193         addi    vp, vp, 16
 194         addi    rp, rp, 4               C L(tail) rounds rp up to a multiple of 16
 195         b       L(tail)
 196
 197 L(gt1): addi    up, up, 16              C up/vp now address the second quadwords
 198         addi    vp, vp, 16
 199
     C Main loop, two vectors per pass.  (v2,v3) and (v0,v1) take turns holding
     C the previously loaded quadwords, so each source quadword is loaded once.
 200 L(top): lvx     v0, 0, up
 201         lvx     v1, 0, vp
 202         vperm   v4, v2, v0, us
 203         vperm   v5, v3, v1, vs
 204         vnegb(  v5, v5)
 205         logop(  v6, v4, v5)
 206         vnega(  v6, v6)
 207         stvx    v6, 0, rp
 208         bdz     L(end)                  C one vector left, its first part is in v0/v1
 209         lvx     v2, r10, up
 210         lvx     v3, r10, vp
 211         vperm   v4, v0, v2, us
 212         vperm   v5, v1, v3, vs
 213         vnegb(  v5, v5)
 214         logop(  v6, v4, v5)
 215         vnega(  v6, v6)
 216         stvx    v6, r10, rp
 217         addi    up, up, 32
 218         addi    vp, vp, 32
 219         addi    rp, rp, 32
 220         bdnz    L(top)
 221
     C Loop fell through: one vector left, its first part is in v2/v3.  The
     C following quadword is loaded only if the pointer is not 16-byte aligned;
     C for an aligned pointer vperm takes all 16 bytes from the first source, so a
     C zeroed register is used instead.
     C NOTE(review): presumably this avoids reading a quadword that lies entirely
     C beyond the operand -- confirm.
 222         andi.   r0, up, 15
 223         vxor    v0, v0, v0
 224         beq     1f
 225         lvx     v0, 0, up
 226 1:      andi.   r0, vp, 15
 227         vxor    v1, v1, v1
 228         beq     1f
 229         lvx     v1, 0, vp
 230 1:      vperm   v4, v2, v0, us
 231         vperm   v5, v3, v1, vs
 232         vnegb(  v5, v5)
 233         logop(  v6, v4, v5)
 234         vnega(  v6, v6)
 235         stvx    v6, 0, rp
 236         addi    rp, rp, 4               C up/vp are already past the consumed limbs
 237         b       L(tail)
 238
     C Exit from the middle of the loop: same conditional load as above, but at
     C offset 16 because up/vp have not been advanced for this vector.
 239 L(end): andi.   r0, up, 15
 240         vxor    v2, v2, v2
 241         beq     1f
 242         lvx     v2, r10, up
 243 1:      andi.   r0, vp, 15
 244         vxor    v3, v3, v3
 245         beq     1f
 246         lvx     v3, r10, vp
 247 1:      vperm   v4, v0, v2, us
 248         vperm   v5, v1, v3, vs
 249         vnegb(  v5, v5)
 250         logop(  v6, v4, v5)
 251         vnega(  v6, v6)
 252         stvx    v6, r10, rp
 253
 254         addi    up, up, 16              C step past the source limbs just consumed
 255         addi    vp, vp, 16
 256         addi    rp, rp, 20              C 16 for the store at offset 16, plus 4 as above
 257
     C Tail: the n mod LIMBS_PER_VR limbs that do not fill a vector.
 258 L(tail):
 259 LIMB32(`rlwinm. r7, n, 0,30,31  ')      C r7 = n mod 4
 260 LIMB64(`rlwinm. r7, n, 0,31,31  ')      C r7 = n mod 2
 261         beq     L(ret)
 262         addi    rp, rp, 15              C round rp up to a multiple of 16, i.e. to
 263 LIMB32(`rlwinm  rp, rp, 0,0,27  ')      C the block after the last vector stored
 264 LIMB64(`rldicr  rp, rp, 0,59    ')
 265         li      r10, 0                  C r10 = byte offset of the current limb
 266 L(top2):
 267 LIMB32(`lwzx    r8, r10, up     ')
 268 LIMB64(`ldx     r8, r10, up     ')
 269 LIMB32(`lwzx    r9, r10, vp     ')
 270 LIMB64(`ldx     r9, r10, vp     ')
 271 LIMB32(`addic.  r7, r7, -1      ')
 272         logopS( r0, r8, r9)
 273 LIMB32(`stwx    r0, r10, rp     ')
 274 LIMB64(`std     r0, 0(rp)       ')
 275 LIMB32(`addi    r10, r10, GMP_LIMB_BYTES')
 276 LIMB32(`bne     L(top2)         ')      C 64-bit limbs: at most one limb, so no loop
 277
 278 L(ret): mtspr   256, r12                C restore the VRSAVE value saved at L(big)
 279         blr
 280 EPILOGUE()
 281
 282 C This works for 64-bit PowerPC, since a limb ptr can only be aligned
 283 C in 2 relevant ways, which means we can always find a pair of equally
 284 C aligned pointers among rp, up, and vp.
 285 C process words until rp is 16-byte aligned
 286 C if (((up | vp) & 15) == 0)
 287 C   process with VMX without any vperm
 288 C else if ((up & 15) != 0 && (vp & 15) != 0)
 289 C   process with VMX using vperm on store data
 290 C else if ((up & 15) != 0)
 291 C   process with VMX using vperm on up data
 292 C else
 293 C   process with VMX using vperm on vp data
 294 C
 295 C       rlwinm. r0, up, 0,28,31
 296 C       rlwinm  r0, vp, 0,28,31
 297 C       cmpwi   cr7, r0, 0
 298 C       cror    cr6, cr0, cr7
 299 C       crand   cr0, cr0, cr7