mpn/x86/p6/copyd.asm

   1 dnl  Intel P6 mpn_copyd -- copy limb vector backwards.
   2
   3 dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P6: 1.75 cycles/limb, or 0.75 if no overlap
  24
  25
  26 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
  27 C
  28 C An explicit loop is used because a decrementing rep movsl is a bit slow at
  29 C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
  30 C code here stands a chance of being faster if the branches predict well.
  31 C
  32 C The slightly strange loop form seems necessary for the claimed speed.
  33 C Maybe load/store ordering affects it.
  34 C
  35 C The source and destination are checked to see if they're actually
  36 C overlapping, since it might be possible to use an incrementing rep movsl
  37 C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
  38 C version.)
  39 C
  40 C Enhancements:
  41 C
  42 C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
  43 C one store each cycle.  Unrolling the loop below would approach 1.0, but
  44 C it'd be good to know why something like store/load/subl + store/load/jnz
  45 C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
  46 C cycles, but doesn't run that way.
  47
  48 defframe(PARAM_SIZE,12)
  49 defframe(PARAM_SRC, 8)
  50 defframe(PARAM_DST, 4)
  51 dnl  esi and edi are saved in parameter slots instead of being pushed;
  52 dnl  the code reads each parameter before its slot is overwritten.
  53 define(SAVE_ESI,`PARAM_SIZE')
  54 define(SAVE_EDI,`PARAM_SRC')
  55
  56         TEXT
  57         ALIGN(16)
  58
  59 PROLOGUE(mpn_copyd)
  60 deflit(`FRAME',0)
  61         C Copy size limbs from src to dst.  size==0 stores nothing.
  62         movl    PARAM_SIZE, %ecx
  63
  64         movl    %esi, SAVE_ESI                  C size slot is free, already read
  65         movl    PARAM_SRC, %esi
  66
  67         movl    %edi, SAVE_EDI                  C src slot is free, already read
  68         movl    PARAM_DST, %edi
  69
  70         subl    $1, %ecx
  71         jb      L(zero)                         C size==0
  72
  73         movl    (%esi,%ecx,4), %eax             C src[size-1]
  74         jz      L(one)                          C size==1, movl kept subl flags
  75
  76         movl    -4(%esi,%ecx,4), %edx           C src[size-2]
  77         subl    $2, %ecx
  78         jbe     L(done_loop)                    C 2 or 3 limbs only
  79
  80
  81         C The usual overlap is
  82         C
  83         C     high                   low
  84         C     +------------------+
  85         C     |               dst|
  86         C     +------------------+
  87         C           +------------------+
  88         C           |               src|
  89         C           +------------------+
  90         C
  91         C We can use an incrementing copy in the following circumstances.
  92         C
  93         C     src+4*size<=dst, since then the regions are disjoint
  94         C
  95         C     src==dst, clearly (though this shouldn't occur normally)
  96         C
  97         C     src>dst, since in that case it's a requirement of the
  98         C              parameters that src>=dst+size*4, and hence the
  99         C              regions are disjoint
 100         C
 101         C ecx is size-3 here, so the end of src is 12(%esi,%ecx,4).
 102         leal    12(%esi,%ecx,4), %edx   C src+4*size
 103         cmpl    %edi, %esi
 104         jae     L(use_movsl)            C src >= dst
 105
 106         cmpl    %edi, %edx              C src+4*size versus dst
 107         movl    4(%esi,%ecx,4), %edx    C src[size-2] again, flags untouched
 108         jbe     L(use_movsl)            C src+4*size <= dst
 109
 110
 111 L(top):
 112         C eax   prev high limb
 113         C ebx
 114         C ecx   counter, size-3 down to 0 or -1, inclusive, by 2s
 115         C edx   prev low limb
 116         C esi   src
 117         C edi   dst
 118         C ebp
 119
 120         movl    %eax, 8(%edi,%ecx,4)
 121         movl    (%esi,%ecx,4), %eax
 122
 123         movl    %edx, 4(%edi,%ecx,4)
 124         movl    -4(%esi,%ecx,4), %edx
 125
 126         subl    $2, %ecx
 127         jnbe    L(top)                  C loop while no borrow and ecx != 0
 128
 129
 130 L(done_loop):
 131         movl    %eax, 8(%edi,%ecx,4)    C the two limbs still held in registers
 132         movl    %edx, 4(%edi,%ecx,4)
 133
 134         C copy low limb (needed if size was odd, but will already have been
 135         C done in the loop if size was even)
 136         movl    (%esi), %eax
 137 L(one):
 138         movl    %eax, (%edi)
 139         movl    SAVE_EDI, %edi
 140         movl    SAVE_ESI, %esi
 141
 142         ret
 143
 144
 145 L(use_movsl):
 146         C eax
 147         C ebx
 148         C ecx   size-3
 149         C edx
 150         C esi   src
 151         C edi   dst
 152         C ebp
 153
 154         addl    $3, %ecx        C back to size, the rep count
 155
 156         cld             C better safe than sorry, see mpn/x86/README
 157
 158         rep
 159         movsl
 160
 161 L(zero):
 162         movl    SAVE_ESI, %esi
 163         movl    SAVE_EDI, %edi
 164
 165         ret
 166
 167 EPILOGUE()