mpn/x86/k7/mmx/copyi.asm

   1 dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
   2
   3 dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C    alignment dst/src, A=0mod8 N=4mod8
  24 C       A/A   A/N   N/A   N/N
  25 C K7    0.75  1.0   1.0   0.75
  26
  27
  28 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
  29 C
  30 C Copy src,size to dst,size.
  31 C
  32 C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
  33 C 1.33 c/l.
  34 C
  35 C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
  36 C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing
  37 C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
  38 C one cycle, so perhaps some scheduling is needed to ensure it's a
  39 C load+store in each cycle, not store+store.
  40 C
  41 C If both source and destination are unaligned then one limb is processed at
  42 C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
  43 C used unaligned it would be 1.5 c/l.
  44
  45 defframe(PARAM_SIZE,12)
  46 defframe(PARAM_SRC, 8)
  47 defframe(PARAM_DST, 4)
     dnl  The three offsets above are in C prototype order: dst at 4, src at
     dnl  8, size at 12.  Presumably these are byte offsets from the stack
     dnl  pointer at function entry (defframe is defined outside this file).
  48
  49 dnl  parameter space reused: the size argument is loaded into ecx first,
     dnl  after which its stack slot holds the saved ebx until the function
     dnl  returns
  50 define(SAVE_EBX,`PARAM_SIZE')
  51
  52 dnl  minimum 5 since the unrolled code can't handle less than 5: it may
     dnl  copy one limb to align the pointers and then always copies four
     dnl  limbs in the first pass of its loop
  53 deflit(UNROLL_THRESHOLD, 5)
  54
  55         TEXT
  56         ALIGN(32)
  57 PROLOGUE(mpn_copyi)
  58 deflit(`FRAME',0)
  59
             C Overview: sizes below UNROLL_THRESHOLD use a plain one limb per
             C iteration loop, larger sizes use an MMX loop moving four limbs
             C per iteration followed by a 0 to 3 limb tail.
             C
             C size is read into ecx before ebx is stored, because SAVE_EBX is
             C the same stack slot as PARAM_SIZE.
  60         movl    PARAM_SIZE, %ecx
  61         movl    %ebx, SAVE_EBX
  62
  63         movl    PARAM_SRC, %eax
  64         movl    PARAM_DST, %edx
  65
             C unsigned compare: size >= UNROLL_THRESHOLD takes the MMX path
  66         cmpl    $UNROLL_THRESHOLD, %ecx
  67         jae     L(unroll)
  68
             C size == 0 means nothing to copy, skip the loop entirely
  69         orl     %ecx, %ecx
  70         jz      L(simple_done)
  71
  72 L(simple):
  73         C eax   src, incrementing
  74         C ebx   scratch
  75         C ecx   counter
  76         C edx   dst, incrementing
  77         C
  78         C this loop is 2 cycles/limb
             C
             C The pointers are advanced with leal, which leaves the flags
             C alone, so the jnz still tests the result of the decl.
  79
  80         movl    (%eax), %ebx
  81         movl    %ebx, (%edx)
  82         decl    %ecx
  83         leal    4(%eax), %eax
  84         leal    4(%edx), %edx
  85         jnz     L(simple)
  86
  87 L(simple_done):
             C no MMX register has been touched on this path, so no emms here
  88         movl    SAVE_EBX, %ebx
  89         ret
  90
  91
  92 L(unroll):
             C Set up end-relative addressing: eax and edx are biased to point
             C 12 bytes before the end of src and dst, and ecx becomes the
             C negative limb count -(size-3), so that eax+4*ecx and edx+4*ecx
             C address the first limb of each operand.
             C
             C ebx gets src AND dst.  Bit 2 of that is set only when both
             C pointers have bit 2 set, which is the N/N case (both 4 mod 8)
             C of the alignment table at the top of the file.
  93         movl    %eax, %ebx
  94         leal    -12(%eax,%ecx,4), %eax  C src end - 12
  95         subl    $3, %ecx                C size-3
  96
  97         andl    %edx, %ebx
  98         leal    (%edx,%ecx,4), %edx     C dst end - 12
  99         negl    %ecx
 100
 101         testl   $4, %ebx   C testl to pad code closer to 16 bytes for L(top)
 102         jz      L(aligned)
 103
 104         C both src and dst unaligned, process one limb to align them
             C (with size >= 5 the counter is at most -1 after the incl, so
             C at least four limbs remain for the first pass of L(top))
 105         movl    (%eax,%ecx,4), %ebx
 106         movl    %ebx, (%edx,%ecx,4)
 107         incl    %ecx
 108 L(aligned):
 109
 110
 111         ALIGN(16)
 112 L(top):
 113         C eax   src end - 12
 114         C ebx   (not used in this loop)
 115         C ecx   counter, negative, limbs
 116         C edx   dst end - 12
             C
             C Four limbs are moved per iteration as two 64-bit loads and two
             C 64-bit stores.  The counter is advanced between the loads and
             C the stores, hence the -16 displacement on the stores.  The movq
             C stores do not change the flags, so the ja acts on the addl.
 117
 118         movq    (%eax,%ecx,4), %mm0
 119         movq    8(%eax,%ecx,4), %mm1
 120         addl    $4, %ecx
 121         movq    %mm0, -16(%edx,%ecx,4)
 122         movq    %mm1, -16+8(%edx,%ecx,4)
 123         ja      L(top)          C jump no carry and not zero
             C the addl carries or gives zero exactly when the counter leaves
             C the negative range, which ends the loop
 124
 125
 126         C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
 127
             C bit 1 clear (ecx 0 or 1) means 3 or 2 limbs remain: copy a pair
             C of limbs starting at limb index ecx
 128         testb   $2, %cl
 129         jnz     L(finish_not_two)
 130
 131         movq    (%eax,%ecx,4), %mm0
 132         movq    %mm0, (%edx,%ecx,4)
 133 L(finish_not_two):
 134
             C bit 0 clear (ecx 0 or 2) means an odd number of limbs remained:
             C copy the final limb, which sits at end - 4, that is 8 bytes
             C past the biased pointers
 135         testb   $1, %cl
 136         jnz     L(done)
 137
 138         movl    8(%eax), %ebx
 139         movl    %ebx, 8(%edx)
 140
 141 L(done):
             C restore ebx and clear the MMX state used by mm0 and mm1
 142         movl    SAVE_EBX, %ebx
 143         emms
 144         ret
 145
 146 EPILOGUE()