mpn/powerpc32/vmx/copyi.asm

   1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.
   2
   3 dnl  Copyright 2006 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22 C                16-byte coaligned      unaligned
  23 C                   cycles/limb        cycles/limb
  24 C 7400,7410 (G4):       0.5                0.64
  25 C 744x,745x (G4+):      0.75               0.82
  26 C 970 (G5):             0.78               1.02         (64-bit limbs)
  27
  28 C STATUS
  29 C  * Works for all sizes and alignments.
  30
  31 C TODO
  32 C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
  33 C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
  34 C    c/l for 970.
  35 C  * Consider using VMX instructions also for head and tail, by using some
  36 C    read-modify-write tricks.
  37 C  * The VMX code is used from the smallest sizes it handles, but measurements
  38 C    show a large speed bump at the cutoff points.  Small copying (perhaps
  39 C    using some read-modify-write technique) should be optimized.
   40 C  * Make an mpn_com based on this code.
  41
   42 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))      C bytes per limb
   43 define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))     C limbs per 16-byte vector register
   44 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))     C limbs per 32 bytes; not referenced in the code below
   45
   46
      C LIMB32(insn) emits insn only when GMP_LIMB_BITS is 32; LIMB64(insn) emits
      C insn only otherwise.  The other variant expands to nothing, so the body
      C below can list the 32-bit and 64-bit limb instructions side by side.
   47 ifelse(GMP_LIMB_BITS,32,`
   48         define(`LIMB32',`       $1')
   49         define(`LIMB64',`')
   50 ',`
   51         define(`LIMB32',`')
   52         define(`LIMB64',`       $1')
   53 ')
   54
   55 C INPUT PARAMETERS
   56 define(`rp',    `r3')           C destination pointer (target of the stores)
   57 define(`up',    `r4')           C source pointer (address of the loads)
   58 define(`n',     `r5')           C number of limbs to copy
   59
   60 define(`us',    `v4')           C permute control vector (written by lvsl, read by vperm)
  61
  62
   63 ASM_START()
   64 PROLOGUE(mpn_copyi)
      C mpn_copyi: copy n limbs from up to rp, walking both pointers upward.
      C
      C In:     rp (r3) = destination, up (r4) = source, n (r5) = limb count.
      C Uses:   r0, r7, r10, r12, ctr, cr0, cr7.  The VMX path also uses v0-v4
      C         and ORs bits into SPR 256; the incoming value is put back at
      C         L(ret).
      C Paths:  n below the cutoff (11 for 32-bit limbs, 5 for 64-bit limbs) is
      C         copied one limb at a time, and n = 0 returns immediately.
      C         Otherwise: a scalar head copies limbs until rp reaches a 16-byte
      C         boundary (assumes rp is limb-aligned), then 16-byte vector
      C         copies run (realigned through vperm when up is not on a 16-byte
      C         boundary), then a scalar tail copies what is left.
   65
   66 LIMB32(`cmpi    cr7, n, 11      ')      C VMX cutoff for 32-bit limbs
   67 LIMB64(`cmpdi   cr7, n, 5       ')      C VMX cutoff for 64-bit limbs
   68         bge     cr7, L(big)             C n >= cutoff: take the VMX path
   69
   70         or.     r0, n, n                C set cr0 from n
   71         beqlr   cr0                     C n = 0: nothing to copy
   72
   73 C Handle small cases with plain operations
   74         mtctr   n                       C one iteration per limb
   75 L(topS):
   76 LIMB32(`lwz     r0, 0(up)       ')
   77 LIMB64(`ld      r0, 0(up)       ')
   78         addi    up, up, GMP_LIMB_BYTES
   79 LIMB32(`stw     r0, 0(rp)       ')
   80 LIMB64(`std     r0, 0(rp)       ')
   81         addi    rp, rp, GMP_LIMB_BYTES
   82         bdnz    L(topS)
   83         blr
   84
   85 C Handle large cases with VMX operations
   86 L(big):
   87         mfspr   r12, 256                C keep the incoming SPR 256 value for L(ret)
   88         oris    r0, r12, 0xf800         C Set VRSAVE bits 0-4 (v0-v4 are used below)
   89         mtspr   256, r0
   90
   91 LIMB32(`rlwinm. r7, rp, 30,30,31')      C (rp >> 2) mod 4
   92 LIMB64(`rlwinm. r7, rp, 29,31,31')      C (rp >> 3) mod 2
   93         beq     L(rp_aligned)           C limb index within the 16-byte block is 0
   94
      C Scalar head: copy limbs until rp reaches the next 16-byte boundary.
   95         subfic  r7, r7, LIMBS_PER_VR    C r7 = number of head limbs
   96         subf    n, r7, n                C n = limbs left after the head
   97 L(top0):
   98 LIMB32(`lwz     r0, 0(up)       ')
   99 LIMB64(`ld      r0, 0(up)       ')
  100         addi    up, up, GMP_LIMB_BYTES
  101 LIMB32(`addic.  r7, r7, -1      ')      C sets cr0 for the bne below
  102 LIMB32(`stw     r0, 0(rp)       ')
  103 LIMB64(`std     r0, 0(rp)       ')
  104         addi    rp, rp, GMP_LIMB_BYTES
  105 LIMB32(`bne     L(top0)         ')      C 64-bit limbs: the head is exactly 1 limb, so no loop
  106
  107 L(rp_aligned):
  108
  109 LIMB32(`rlwinm. r0, up, 30,30,31')      C (up >> 2) mod 4
  110 LIMB64(`rlwinm. r0, up, 29,31,31')      C (up >> 3) mod 2
  111
  112 LIMB64(`srdi    r7, n, 2        ')      C loop count: n / 4, i.e. 32-byte chunks in n
  113 LIMB32(`srwi    r7, n, 3        ')      C loop count: n / 8, i.e. 32-byte chunks in n
  114         mtctr   r7                      C ctr = number of 32-byte loop iterations
  115
  116         li      r10, 16                 C index register for the second 16-byte block
  117
  118         beq     L(up_aligned)           C cr0 is still the rlwinm. result for up
  119
      C Unaligned source: each stored block is assembled by vperm from two
      C neighbouring 16-byte source blocks, using the control vector in us.
  120         lvsl    us, 0, up               C control vector from the low 4 bits of up
  121
  122 LIMB32(`andi.   r0, n, 0x4      ')      C odd number of 16-byte blocks in n?
  123 LIMB64(`andi.   r0, n, 0x2      ')      C odd number of 16-byte blocks in n?
  124         beq     L(1)
  125         lvx     v0, 0, up               C odd case: copy one block before the loop
  126         lvx     v2, r10, up
  127         vperm   v3, v0, v2, us
  128         stvx    v3, 0, rp
  129         addi    up, up, 32
  130         addi    rp, rp, 16
  131         b       L(lpu)
  132 L(1):   lvx     v2, 0, up               C even case: only prime v2 with the first source block
  133         addi    up, up, 16
  134         b       L(lpu)
  135
      C Loop invariant: v2 holds the source block just before the one at up.
      C Each iteration stores 32 bytes.
  136         ALIGN(32)
  137 L(lpu): lvx     v0, 0, up
  138         vperm   v3, v2, v0, us
  139         stvx    v3, 0, rp
  140         lvx     v2, r10, up
  141         addi    up, up, 32
  142         vperm   v3, v0, v2, us
  143         stvx    v3, r10, rp
  144         addi    rp, rp, 32
  145         bdnz    L(lpu)
  146
  147         addi    up, up, -16             C up ran one block ahead; step back to the first uncopied limb
  148         b       L(tail)
  149
  150 L(up_aligned):
  151
  152 LIMB32(`andi.   r0, n, 0x4      ')      C odd number of 16-byte blocks in n?
  153 LIMB64(`andi.   r0, n, 0x2      ')      C odd number of 16-byte blocks in n?
  154         beq     L(lpa)
  155         lvx     v0, 0,   up             C odd case: copy one block before the loop
  156         stvx    v0, 0,   rp
  157         addi    up, up, 16
  158         addi    rp, rp, 16
  159         b       L(lpa)
  160
      C Aligned source: plain 32-byte copy per iteration.
  161         ALIGN(32)
  162 L(lpa): lvx     v0, 0,   up
  163         lvx     v1, r10, up
  164         addi    up, up, 32
  165         nop                             C NOTE(review): presumably scheduling padding -- confirm
  166         stvx    v0, 0,   rp
  167         stvx    v1, r10, rp
  168         addi    rp, rp, 32
  169         bdnz    L(lpa)
  170
  171 L(tail):
  172 LIMB32(`rlwinm. r7, n, 0,30,31  ')      C r7 = n mod 4
  173 LIMB64(`rlwinm. r7, n, 0,31,31  ')      C r7 = n mod 2
  174         beq     L(ret)                  C no leftover limbs
  175 LIMB32(`li      r10, 0          ')      C r10 = byte offset for the indexed tail copy
  176 L(top2):
  177 LIMB32(`lwzx    r0, r10, up     ')
  178 LIMB64(`ld      r0, 0(up)       ')
  179 LIMB32(`addic.  r7, r7, -1      ')      C sets cr0 for the bne below
  180 LIMB32(`stwx    r0, r10, rp     ')
  181 LIMB64(`std     r0, 0(rp)       ')
  182 LIMB32(`addi    r10, r10, GMP_LIMB_BYTES')
  183 LIMB32(`bne     L(top2)         ')      C 64-bit limbs: at most 1 tail limb, so no loop
  184
  185 L(ret): mtspr   256, r12                C put back the SPR 256 value saved at L(big)
  186         blr
  187 EPILOGUE()