mpn/x86/pentium4/sse2/rsh1add_n.asm

   1 dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
   2
   3 dnl  Copyright 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C        cycles/limb (approx)
  24 C      dst!=src1,2  dst==src1  dst==src2
  25 C P4:      4.5         6.5        6.5
  26
  27
  28 C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
  29 C                          mp_size_t size);
  30 C
  31 C The slightly strange combination of indexing and pointer incrementing
  32 C that's used seems to work best.  Not sure why, but for instance leal
  33 C incrementing on %esi is a 1 or 2 cycle slowdown.
  34 C
  35 C The dependent chain is paddq combining the carry and next (shifted) part,
  36 C plus psrlq to move the new carry down.  That, and just 4 mmx arithmetic
  37 C instructions in total (the movd loads and store aside), makes 4 c/l the
  38 C target speed, which is almost achieved for separate src/dst, but when
  39 C src==dst the write combining anomalies slow it down.
  40
  41 defframe(PARAM_SIZE, 16)
  42 defframe(PARAM_YP,   12)
  43 defframe(PARAM_XP,   8)
  44 defframe(PARAM_WP,   4)
  45 dnl  Above: names for the four stack arguments (wp, xp, yp, size).  defframe is defined outside this file; the numbers are presumably byte offsets from the stack pointer at entry.
  46 dnl  re-use parameter space: ebx and esi are saved in the xp and yp argument slots rather than pushed
  47 define(SAVE_EBX,`PARAM_XP')
  48 define(SAVE_ESI,`PARAM_YP')
  49 dnl  Each slot is overwritten only after its pointer has been loaded into a register, and ebx/esi are reloaded from the slots just before ret.
  50         TEXT
  51         ALIGN(8)
  52
  53 PROLOGUE(mpn_rsh1add_n)
  54 deflit(`FRAME',0)
  55         C wp[0..size-1] = (xp[] + yp[]) >> 1 over size limbs; eax returns the bit shifted out
  56         movl    PARAM_XP, %edx          C edx = xp
  57         movl    %ebx, SAVE_EBX          C stash ebx in the xp slot, which was just read
  58
  59         movl    PARAM_YP, %ebx          C ebx = yp
  60         movl    %esi, SAVE_ESI          C stash esi in the yp slot, which was just read
  61
  62         movl    PARAM_WP, %esi          C esi = wp
  63
  64         movd    (%edx), %mm0            C xp[0], zero extended into mm0
  65
  66         movd    (%ebx), %mm1            C yp[0], zero extended into mm1
  67         movl    PARAM_SIZE, %ecx        C ecx = size
  68
  69         movl    (%edx), %eax            C xp[0] again, in eax for the return bit
  70
  71         addl    (%ebx), %eax            C low 32 bits of xp[0]+yp[0]
  72
  73         paddq   %mm1, %mm0              C xp[0]+yp[0], full 33-bit sum
  74         leal    (%esi,%ecx,4), %esi     C wp end, ie. wp + size limbs of 4 bytes
  75         negl    %ecx                    C -size
  76
  77         psrlq   $1, %mm0                C (xp[0]+yp[0])/2, carry of the add now at bit 31
  78         and     $1, %eax                C return value, the bit of xp[0]+yp[0] that the shift drops
  79         addl    $1, %ecx                C -(size-1)
  80         jz      L(done)                 C size==1, mm0 already holds the only result limb
  81         C size==0 is not catered for: limb 0 has already been read above and
  82         C L(done) stores to wp[size-1] unconditionally.
  83 L(top):
  84         C eax   return value
  85         C ebx   &yp[i], a running pointer stepped one limb per pass (not the end of yp)
  86         C ecx   counter, limbs, -(size-1) to -1 inclusive
  87         C edx   &xp[i], a running pointer stepped one limb per pass (not the end of xp)
  88         C esi   wp end
  89         C mm0   wp[i] apart from the bit limb i+1 contributes at bit 31 (at most 32 bits)
  90         C where i = size-1+ecx, the limb stored by this pass
  91         movd    4(%edx), %mm1   C xp[i+1]
  92         movd    4(%ebx), %mm2   C yp[i+1]
  93         leal    4(%edx), %edx           C step xp pointer to limb i+1
  94         leal    4(%ebx), %ebx           C step yp pointer to limb i+1
  95         paddq   %mm2, %mm1              C xp[i+1]+yp[i+1]
  96         psllq   $31, %mm1               C low bit at 31, further 32 above
  97
  98         paddq   %mm1, %mm0              C merge with pending limb, any carry ripples into the high half
  99         movd    %mm0, -4(%esi,%ecx,4)   C low 32 bits are wp[i], complete, so store
 100
 101         psrlq   $32, %mm0               C high 32 bits become the pending part of wp[i+1]
 102
 103         addl    $1, %ecx                C count one limb done, towards zero
 104         jnz     L(top)
 105
 106         C mm0 is now the top result limb; its bit 31 is the carry out of the whole xp+yp sum
 107 L(done):
 108         movd    %mm0, -4(%esi)          C wp[size-1]
 109         movl    SAVE_EBX, %ebx          C restore ebx from the xp slot
 110
 111         movl    SAVE_ESI, %esi          C restore esi from the yp slot
 112         emms                            C clear MMX state before returning
 113         ret                             C eax = bit shifted out (0 or 1)
 114
 115 EPILOGUE()