mpn/x86/pentium/copyi.asm

   1 dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
   2
   3 dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P5: 1.25 cycles/limb
  24
  25
  26 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
  27 C
  28 C Destination prefetching is done to avoid repeated write-throughs on lines
  29 C not already in L1.
  30 C
  31 C At least one of the src or dst pointer needs to be incremented rather than
  32 C using indexing, so that there's somewhere to put the loop control without
  33 C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
  34 C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
  35 C subtracts in the finishup code.
  36 C
  37 C The block of finishup code is almost as big as the main loop itself, which
  38 C is unfortunate, but it's faster that way than with say rep movsl, by about
  39 C 10 cycles for instance on P55.
  40 C
  41 C There's nothing to be gained from MMX on P55, since it can do only one
  42 C movq load (or store) per cycle, so the throughput would be the same as the
  43 C code here (and even then only if src and dst have the same alignment mod
  44 C 8).
  45
  46 defframe(PARAM_SIZE,12)     C size, number of limbs to copy
  47 defframe(PARAM_SRC, 8)      C src, limbs are read from here
  48 defframe(PARAM_DST, 4)      C dst, limbs are written here
  49
  50         TEXT
  51         ALIGN(8)
  52 PROLOGUE(mpn_copyi)
  53 deflit(`FRAME',0)
  54
  55         movl    PARAM_SIZE, %ecx        C ecx = size
  56         movl    PARAM_DST, %edx         C edx = dst
  57
  58         pushl   %ebx    FRAME_pushl()   C ebx is used as scratch, restored at L(done)
  59         pushl   %esi    FRAME_pushl()   C esi holds the src pointer, restored at L(done)
  60
  61         leal    (%edx,%ecx,4), %edx     C &dst[size], one past the last limb
  62         xorl    $-1, %ecx               C -size-1
  63
  64         movl    PARAM_SRC, %esi
  65         addl    $8, %ecx                C -size+7, that is 7 minus limbs remaining
  66
  67         jns     L(end)                  C size <= 7, skip the unrolled loop
  68
  69         movl    -28(%edx,%ecx,4), %eax  C fetch destination cache line, dst[0]
  70         nop                             C NOTE(review): presumably padding for pairing or loop alignment, confirm
  71
  72 L(top):
  73         C eax   scratch
  74         C ebx   scratch
  75         C ecx   counter, limbs, negative, equal to 7 minus limbs remaining
  76         C edx   &dst[size], never modified, stores index back from it via ecx
  77         C esi   src, incrementing
  78         C edi
  79         C ebp
  80
  81         movl    (%edx,%ecx,4), %eax     C fetch destination cache line, touching the last limb of this block
  82         addl    $8, %ecx                C sets the sign flag tested by js below, movl and leal leave flags alone
  83
  84         movl    (%esi), %eax            C read words pairwise
  85         movl    4(%esi), %ebx
  86         movl    %eax, -60(%edx,%ecx,4)  C store words pairwise, first limb of this block
  87         movl    %ebx, -56(%edx,%ecx,4)
  88
  89         movl    8(%esi), %eax
  90         movl    12(%esi), %ebx
  91         movl    %eax, -52(%edx,%ecx,4)
  92         movl    %ebx, -48(%edx,%ecx,4)
  93
  94         movl    16(%esi), %eax
  95         movl    20(%esi), %ebx
  96         movl    %eax, -44(%edx,%ecx,4)
  97         movl    %ebx, -40(%edx,%ecx,4)
  98
  99         movl    24(%esi), %eax
 100         movl    28(%esi), %ebx
 101         movl    %eax, -36(%edx,%ecx,4)
 102         movl    %ebx, -32(%edx,%ecx,4)
 103
 104         leal    32(%esi), %esi          C src += 8 limbs, leal so the flags from the addl survive
 105         js      L(top)                  C ecx still negative, 8 or more limbs remain
 106
 107
 108 L(end):
 109         C ecx   0 to 7, representing respectively 7 to 0 limbs remaining
 110         C esi   src, next limb to read
 111         C edx   &dst[size], unchanged, the next store location is reached through ecx
 112
 113         subl    $4, %ecx                C ecx = 3 minus limbs remaining
 114         jns     L(no4)                  C fewer than 4 limbs remain
 115
 116         movl    (%esi), %eax
 117         movl    4(%esi), %ebx
 118         movl    %eax, -12(%edx,%ecx,4)  C ecx is -4 to -1 here, this is the next unwritten dst limb
 119         movl    %ebx, -8(%edx,%ecx,4)
 120
 121         movl    8(%esi), %eax
 122         movl    12(%esi), %ebx
 123         movl    %eax, -4(%edx,%ecx,4)
 124         movl    %ebx, (%edx,%ecx,4)
 125
 126         addl    $16, %esi               C src += 4 limbs
 127         addl    $4, %ecx                C ecx = 3 minus limbs remaining again, same as on the jump path
 128 L(no4):
 129
 130         subl    $2, %ecx                C ecx = 1 minus limbs remaining
 131         jns     L(no2)                  C fewer than 2 limbs remain, flags from the subl reach the jnz
 132
 133         movl    (%esi), %eax
 134         movl    4(%esi), %ebx
 135         movl    %eax, -4(%edx,%ecx,4)   C ecx is -2 or -1 here, this is the next unwritten dst limb
 136         movl    %ebx, (%edx,%ecx,4)
 137
 138         addl    $8, %esi                C src += 2 limbs
 139         addl    $2, %ecx                C ecx = 1 minus limbs remaining, zero flag set when one limb is left
 140 L(no2):
 141
 142         jnz     L(done)                 C ecx is 1, nothing left to copy
 143
 144         movl    (%esi), %eax            C ecx is 0, copy the final limb
 145         movl    %eax, -4(%edx,%ecx,4)   C dst[size-1], risk of cache bank clash here
 146
 147 L(done):
 148         popl    %esi
 149         popl    %ebx
 150
 151         ret
 152
 153 EPILOGUE()