mpn/x86/k6/k62mmx/copyd.asm

   1 dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
   2
   3 dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6-2: 1.0 cycles/limb
  24
  25
  26 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
  27 C
  28 C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
  29 C cycle startup time, which amounts for instance to a 2x speedup at 15
  30 C limbs.
  31 C
  32 C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
  33 C processing one limb separately to make it aligned.  This and a final odd
  34 C limb are handled in a branch-free fashion, ending up re-copying if the
  35 C special case isn't needed.
  36 C
  37 C Alternatives:
  38 C
  39 C There used to be a big unrolled version of this, running at 0.56 c/l if
  40 C the destination was aligned, but that seemed rather excessive for the
  41 C relative importance of copyd.
  42 C
  43 C If the destination alignment is ignored and just left to run at 1.17 c/l
  44 C some code size and a fixed few cycles can be saved.  Considering how few
  45 C uses copyd finds, perhaps that should be favoured.  The current code has
  46 C the attraction of being no slower than a basic rep movsl though.
  47
  48 defframe(PARAM_SIZE,12)                 C size: number of limbs to copy
  49 defframe(PARAM_SRC, 8)                  C src: limb vector to read
  50 defframe(PARAM_DST, 4)                  C dst: limb vector to write
  51
  52 dnl  re-using parameter space; size is loaded before ebx is stored there
  53 define(SAVE_EBX,`PARAM_SIZE')
  54
  55         TEXT
  56         ALIGN(16)
  57
  58 PROLOGUE(mpn_copyd)
  59 deflit(`FRAME',0)
  60
  61         movl    PARAM_SIZE, %ecx        C ecx = size
  62         movl    %ebx, SAVE_EBX          C save ebx in the slot just read
  63
  64         movl    PARAM_SRC, %eax         C eax = src
  65         movl    PARAM_DST, %edx         C edx = dst
  66
  67         subl    $1, %ecx                C ecx = size-1; better code alignment than decl
  68         jb      L(zero)                 C borrow means size==0, nothing to copy
  69
  70         jz      L(one_more)             C size==1, only the low limb to copy
  71         leal    4(%edx,%ecx,4), %ebx    C ebx = &dst[size]
  72
  73 Zdisp(  movd,   0,(%eax,%ecx,4), %mm0)  C mm0 = src[size-1], the high limb
  74 Zdisp(  movd,   %mm0, 0,(%edx,%ecx,4))  C dst[size-1] = mm0; Zdisp for good code alignment
  75
  76         cmpl    $1, %ecx                C size==2, only the low limb remains
  77         je      L(one_more)
  78
  79         shrl    $2, %ebx                C move bit 2 of &dst[size] down to bit 0
  80         andl    $1, %ebx                C 1 if dst[size-2] unaligned (4mod8)
  81
  82         subl    %ebx, %ecx              C if so, start the pairs one limb lower
  83         nop                             C code alignment
  84
  85 L(top):
  86         C eax   src
  87         C ebx   (not used in the loop)
  88         C ecx   counter, index of the upper limb of the pair being copied
  89         C edx   dst
  90
  91         movq    -4(%eax,%ecx,4), %mm0   C mm0 = src[ecx-1], src[ecx]
  92         subl    $2, %ecx                C step down one pair, setting flags for ja
  93
  94         movq    %mm0, 4(%edx,%ecx,4)    C store pair at the old ecx-1; movq leaves flags alone
  95         ja      L(top)                  C loop while the old counter was above 2
  96
  97
  98 L(one_more):
  99         movd    (%eax), %mm0            C low limb src[0], possibly a re-copy
 100         movd    %mm0, (%edx)            C dst[0] = src[0]
 101
 102         movl    SAVE_EBX, %ebx          C restore the ebx saved at entry
 103         emms_or_femms                   C NOTE(review): presumably exits MMX state -- confirm macro
 104 L(zero):
 105         ret                             C size==0 arrives here with ebx and MMX untouched
 106
 107 EPILOGUE()