mpn/powerpc32/vmx/copyd.asm

   1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
   2
   3 dnl  Copyright 2006 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22 C                16-byte coaligned      unaligned
  23 C                   cycles/limb        cycles/limb
  24 C 7400,7410 (G4):       0.5                0.64
  25 C 744x,745x (G4+):      0.75               0.82
  26 C 970 (G5):             0.78               1.02         (64-bit limbs)
  27
  28 C STATUS
  29 C  * Works for all sizes and alignments.
  30
  31 C TODO
  32 C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
  33 C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
  34 C    c/l for 970.
  35 C  * Consider using VMX instructions also for head and tail, by using some
  36 C    read-modify-write tricks.
  37 C  * The VMX code is used from the smallest sizes it handles, but measurements
  38 C    show a large speed bump at the cutoff points.  Small copying (perhaps
  39 C    using some read-modify-write technique) should be optimized.
  40 C  * Make an mpn_com based on this code.
  41
  42 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))     C bytes per limb
  43 define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))    C limbs in 16 bytes (one VMX vector)
  44 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))    C limbs in 32 bytes (two VMX vectors)
     C NOTE(review): LIMBS_PER_VR and LIMBS_PER_2VR are not referenced by the
     C code visible in this file -- confirm before relying on or removing them.
  45
  46
     C LIMB32(insn) and LIMB64(insn) select code by limb size.  When
     C GMP_LIMB_BITS is 32, LIMB32 expands to its argument and LIMB64 expands to
     C nothing; for any other GMP_LIMB_BITS value the roles are swapped.
  47 ifelse(GMP_LIMB_BITS,32,`
  48         define(`LIMB32',`       $1')
  49         define(`LIMB64',`')
  50 ',`
  51         define(`LIMB32',`')
  52         define(`LIMB64',`       $1')
  53 ')
  54
  55 C INPUT PARAMETERS (symbolic names for the registers used by mpn_copyd)
  56 define(`rp',    `r3')       C destination pointer (all stores go through rp)
  57 define(`up',    `r4')       C source pointer (all loads go through up)
  58 define(`n',     `r5')       C number of limbs to copy
  59
     C Not an input: vector register that receives the lvsl result and is used
     C as the control operand of every vperm in the unaligned-source path.
  60 define(`us',    `v4')
  61
  62
  63 ASM_START()
     C mpn_copyd: copy n limbs from up to rp, starting with the limb at the
     C highest address and working downwards.
     C
     C Registers: rp = r3 (destination), up = r4 (source), n = r5 (limb count).
     C Scratch:   r0, r7, r10, r12, ctr, cr0, cr7 and, on the VMX path, v0-v4.
     C
     C Operands below the cutoff (11 limbs of 32 bits, 5 limbs of 64 bits) are
     C copied one limb at a time.  Longer operands are copied in three steps:
     C   1. single limbs until rp reaches a 16-byte boundary,
     C   2. 16-byte vectors, with plain lvx/stvx when up is then 16-byte aligned
     C      as well and with lvx/vperm/stvx otherwise,
     C   3. the remaining n mod 4 (32-bit limbs) or n mod 2 (64-bit limbs) limbs,
     C      again one at a time.
     C The VMX path saves VRSAVE in r12 on entry and restores it before returning.
  64 PROLOGUE(mpn_copyd)
  65
  66 LIMB32(`slwi.   r0, n, 2        ')      C r0 = n * 4 (bytes); cr0 = r0 compared with 0
  67 LIMB64(`sldi.   r0, n, 3        ')      C r0 = n * 8 (bytes); cr0 = r0 compared with 0
  68         add     rp, rp, r0              C rp = address just past the highest dst limb
  69         add     up, up, r0              C up = address just past the highest src limb
  70
  71 LIMB32(`cmpi    cr7, n, 11      ')      C cutoff for the VMX code: 11 limbs
  72 LIMB64(`cmpdi   cr7, n, 5       ')      C cutoff for the VMX code: 5 limbs
  73         bge     cr7, L(big)
  74
  75         beqlr   cr0                     C nothing to copy: return (cr0 is from the shift above)
  76
  77 C Handle small cases with plain operations
  78         mtctr   n                       C one loop iteration per limb
  79 L(topS):
  80 LIMB32(`lwz     r0, -4(up)      ')      C fetch the limb just below up
  81 LIMB64(`ld      r0, -8(up)      ')
  82         addi    up, up, -GMP_LIMB_BYTES
  83 LIMB32(`stw     r0, -4(rp)      ')      C store it just below rp
  84 LIMB64(`std     r0, -8(rp)      ')
  85         addi    rp, rp, -GMP_LIMB_BYTES
  86         bdnz    L(topS)
  87         blr
  88
  89 C Handle large cases with VMX operations
  90 L(big):
  91         addi    rp, rp, -16             C rp = start of the top 16 bytes of dst
  92         addi    up, up, -16             C up = start of the top 16 bytes of src
  93         mfspr   r12, 256                C r12 = caller VRSAVE, restored at L(ret)
  94         oris    r0, r12, 0xf800         C Set VRSAVE bits 0-4 (v0-v4 are used below)
  95         mtspr   256, r0
  96
     C r7 = number of limbs that must be copied singly before rp reaches a
     C 16-byte boundary (this presumes rp is limb aligned).
  97 LIMB32(`rlwinm. r7, rp, 30,30,31')      C (rp >> 2) mod 4
  98 LIMB64(`rlwinm. r7, rp, 29,31,31')      C (rp >> 3) mod 2
  99         beq     L(rp_aligned)           C r7 = 0: no head limbs to copy
 100
 101         subf    n, r7, n                C n -= number of head limbs
     C Head loop.  With 64-bit limbs r7 can only be 1 here, so the body runs
     C once; the loop control (addic./bne) is emitted for 32-bit limbs only.
 102 L(top0):
 103 LIMB32(`lwz     r0, 12(up)      ')      C highest remaining limb of the 16-byte window
 104 LIMB64(`ld      r0, 8(up)       ')
 105         addi    up, up, -GMP_LIMB_BYTES
 106 LIMB32(`addic.  r7, r7, -1      ')
 107 LIMB32(`stw     r0, 12(rp)      ')
 108 LIMB64(`std     r0, 8(rp)       ')
 109         addi    rp, rp, -GMP_LIMB_BYTES
 110 LIMB32(`bne     L(top0)         ')
 111
 112 L(rp_aligned):
 113
     C Test the alignment of up.  The cr0 result is consumed by the
     C beq L(up_aligned) further down; nothing in between writes cr0.
 114 LIMB32(`rlwinm. r0, up, 30,30,31')      C (up >> 2) mod 4
 115 LIMB64(`rlwinm. r0, up, 29,31,31')      C (up >> 3) mod 2
 116
 117 LIMB64(`srdi    r7, n, 2        ')      C loop count = n / 4 (32 bytes per iteration)
 118 LIMB32(`srwi    r7, n, 3        ')      C loop count = n / 8 (32 bytes per iteration)
 119         mtctr   r7                      C CTR = loop count computed from n
 120
 121         li      r10, -16                C index of the lower vector of each 32-byte pair
 122
 123         beq     L(up_aligned)
 124
     C Unaligned source.  Each stored vector is assembled by vperm from two
     C neighbouring quadwords fetched with lvx, using us as the control vector.
 125         lvsl    us, 0, up               C us = permute control built from the address in up
 126
 127         addi    up, up, 16              C step up to the quadword above the current one
 128 LIMB32(`andi.   r0, n, 0x4      ')      C is there a single 16-byte vector besides the pairs?
 129 LIMB64(`andi.   r0, n, 0x2      ')      C is there a single 16-byte vector besides the pairs?
 130         beq     L(1)
 131         lvx     v0, 0, up               C upper quadword
 132         lvx     v2, r10, up             C lower quadword, reused by the first loop pass
 133         vperm   v3, v2, v0, us
 134         stvx    v3, 0, rp               C store the single vector
 135         addi    up, up, -32
 136         addi    rp, rp, -16
 137         b       L(lpu)
 138 L(1):   lvx     v2, 0, up               C prime v2 with the quadword above the first loop load
 139         addi    up, up, -16
 140         b       L(lpu)                  C jump past the alignment padding
 141
     C Unaligned main loop, two vectors per pass.  On entry to every pass v2
     C holds the quadword that was loaded from 16 bytes above up.
 142         ALIGN(32)
 143 L(lpu): lvx     v0, 0, up
 144         vperm   v3, v0, v2, us
 145         stvx    v3, 0, rp
 146         lvx     v2, r10, up
 147         addi    up, up, -32
 148         vperm   v3, v2, v0, us
 149         stvx    v3, r10, rp
 150         addi    rp, rp, -32
 151         bdnz    L(lpu)
 152
 153         b       L(tail)
 154
     C Source and destination are both 16-byte aligned: plain vector copy.
 155 L(up_aligned):
 156
 157 LIMB32(`andi.   r0, n, 0x4      ')      C is there a single 16-byte vector besides the pairs?
 158 LIMB64(`andi.   r0, n, 0x2      ')      C is there a single 16-byte vector besides the pairs?
 159         beq     L(lpa)
 160         lvx     v0, 0,   up             C copy the single vector
 161         stvx    v0, 0,   rp
 162         addi    up, up, -16
 163         addi    rp, rp, -16
 164         b       L(lpa)                  C jump past the alignment padding
 165
     C Aligned main loop, two vectors (32 bytes) per pass.
 166         ALIGN(32)
 167 L(lpa): lvx     v0, 0,   up
 168         lvx     v1, r10, up
 169         addi    up, up, -32
 170         nop                             C NOTE(review): presumably scheduling padding -- confirm
 171         stvx    v0, 0,   rp
 172         stvx    v1, r10, rp
 173         addi    rp, rp, -32
 174         bdnz    L(lpa)
 175
     C Tail: copy the limbs left over after the vector loops.  up and rp are
     C not modified here.  With 32-bit limbs r10 steps through the offsets
     C 12, 8, 4 as needed; with 64-bit limbs at most one limb at offset 8 is
     C left, so no loop control is emitted.
 176 L(tail):
 177 LIMB32(`rlwinm. r7, n, 0,30,31  ')      C r7 = n mod 4
 178 LIMB64(`rlwinm. r7, n, 0,31,31  ')      C r7 = n mod 2
 179         beq     L(ret)
 180 LIMB32(`li      r10, 12         ')
 181 L(top2):
 182 LIMB32(`lwzx    r0, r10, up     ')
 183 LIMB64(`ld      r0, 8(up)       ')
 184 LIMB32(`addic.  r7, r7, -1      ')
 185 LIMB32(`stwx    r0, r10, rp     ')
 186 LIMB64(`std     r0, 8(rp)       ')
 187 LIMB32(`addi    r10, r10, -GMP_LIMB_BYTES')
 188 LIMB32(`bne     L(top2)         ')
 189
 190 L(ret): mtspr   256, r12                C restore the VRSAVE value saved in r12
 191         blr
 192 EPILOGUE()