mpn/x86_64/pentium4/aors_n.asm

   1 dnl  x86-64 mpn_add_n/mpn_sub_n optimized for Pentium 4.
   2
   3 dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C            cycles/limb
  24 C K8,K9:         2.8
  25 C K10:           2.8
  26 C P4:            4
  27 C P6-15:         3.6-5  (fluctuating)
  28
  29
  30 C INPUT PARAMETERS
     C The five arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8, which is the
     C System V AMD64 integer argument order.
  31 define(`rp',    `%rdi')     C destination limb pointer (stored to)
  32 define(`up',    `%rsi')     C first source limb pointer (loaded from)
  33 define(`vp',    `%rdx')     C second source limb pointer (loaded from)
  34 define(`n',     `%rcx')     C limb count
  35 define(`cy',    `%r8')      C carry-in for the _nc entry; the code below names %r8 directly
  36
     C Build-time choice of operation.  ADDSUB becomes the add or sub mnemonic
     C and func / func_nc become the matching entry point names.
     C NOTE(review): presumably the build defines exactly one of OPERATION_add_n
     C and OPERATION_sub_n; if neither is defined, ADDSUB, func and func_nc are
     C left undefined here.
  37 ifdef(`OPERATION_add_n', `
  38         define(ADDSUB,        add)
  39         define(func,          mpn_add_n)
  40         define(func_nc,       mpn_add_nc)')
  41 ifdef(`OPERATION_sub_n', `
  42         define(ADDSUB,        sub)
  43         define(func,          mpn_sub_n)
  44         define(func_nc,       mpn_sub_nc)')
  45
     C NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; presumably it tells
     C the build which functions this one source file can be assembled into.
  46 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
  47
  48 ASM_START()
  49
     C NOTE(review): TEXT and ALIGN are defined elsewhere; presumably they
     C select the code section and align the first entry point to 16 bytes.
  50         TEXT
  51         ALIGN(16)
  52
  53 PROLOGUE(func_nc)
     C Carry-in entry point (mpn_add_nc or mpn_sub_nc, per the OPERATION_*
     C define).  Takes the same rp, up, vp, n arguments as func, plus a
     C carry-in in %r8.  It jumps to L(ent) inside func, which lies just past
     C the instruction that clears %r8, so the shared body runs with the
     C caller-supplied carry instead of zero.
  54         jmp     L(ent)
  55 EPILOGUE()
  56
  57 PROLOGUE(func)
     C func is mpn_add_n or mpn_sub_n, per the OPERATION_* define.
     C Computes rp[i] = up[i] ADDSUB vp[i] for i = 0 .. n-1, lowest limb first
     C with carry (or borrow) propagated upward, and returns the carry out of
     C the top limb in %rax.
     C
     C Entering at L(ent) (the func_nc entry) skips the xor below, so %r8 then
     C holds the caller-supplied carry-in.
     C NOTE(review): the carry-in is presumably 0 or 1.  Only its low 32 bits
     C are copied, and the later setc into the same register relies on the
     C bits above the low byte being zero.
     C NOTE(review): n = 0 is not special-cased; presumably callers pass n >= 1.
     C
     C Carry scheme: the loop uses no adc/sbb.  Each u limb is first combined
     C with its v limb and setc saves that carry in %rax or %rbx (the two
     C alternate from limb to limb).  The carry saved for the previous limb is
     C then applied with a second ADDSUB.  If that second step itself carries
     C (a ripple), an out-of-line stub L(c0)..L(c3) sets the saved carry to 1.
     C NOTE(review): presumably this avoids adc/sbb because they are slow on
     C the Pentium 4 this file is tuned for - confirm against timing tables.
     C
     C Register use:
     C   %r9                    next v limb
     C   %r8 %r10 %r11 %r12     u limbs and partial results in flight
     C   %rax %rbx              alternating saved carries
     C %rbx and %r12 are callee-saved, hence the push/pop pair.
     C
     C The loop handles four limbs per pass and is entered at L(L00), L(L01),
     C L(L10) or L(L11) depending on n mod 4.  Each entry path biases rp, up
     C and vp so that the fixed offsets used inside the loop hit the right
     C limbs.
  58         xor     %r8, %r8        C plain entry: carry-in = 0
  59 L(ent): push    %rbx            C func_nc joins here with carry-in in %r8
  60         push    %r12
  61
  62         mov     (vp), %r9       C r9 = v[0]
  63
  64         mov     R32(n), R32(%rax)
  65         and     $3, R32(%rax)   C rax = n mod 4
  66         jne     L(n00)          C fall through for n = 0, 4, 8, ...
  67         mov     R32(%r8), R32(%rbx)     C rbx = carry-in
  68         mov     (up), %r8       C r8 = u[0]
  69         mov     8(up), %r10     C r10 = u[1]
  70         ADDSUB  %r9, %r8        C r8 = u[0] op v[0]
  71         mov     8(vp), %r9      C r9 = v[1]
  72         setc    R8(%rax)        C rax was 0 from the and, so rax = carry
  73         lea     -16(rp), rp     C L(L00) path stores this limb at 16(rp)
  74         jmp     L(L00)
  75
  76 L(n00): cmp     $2, R32(%rax)   C flags are tested here and again at L(n01)
  77         jnc     L(n01)          C fall through for n = 1, 5, 9, ...
  78         mov     (up), %r11      C r11 = u[0]
  79         mov     R32(%r8), R32(%rax)     C rax = carry-in
  80         xor     R32(%rbx), R32(%rbx)    C rbx = 0, ready for setc
  81         dec     n
  82         jnz     L(gt1)          C more than one limb
     C n = 1: finish the single limb here without entering the loop.
  83         ADDSUB  %r9, %r11       C r11 = u[0] op v[0]
  84         setc    R8(%rbx)        C carry from the limb operation
  85         ADDSUB  %rax, %r11      C apply carry-in
  86         adc     $0, R32(%rbx)   C add in any carry from the carry-in step
  87         mov     %r11, (rp)
  88         jmp     L(ret)
  89 L(gt1): mov     8(up), %r8      C r8 = u[1]
  90         ADDSUB  %r9, %r11       C r11 = u[0] op v[0]
  91         mov     8(vp), %r9      C r9 = v[1]
  92         setc    R8(%rbx)
  93         lea     -8(rp), rp      C L(L01) path stores this limb at 8(rp)
  94         lea     8(up), up       C step past limb 0 so the loads at L(rc0)
  95         lea     8(vp), vp       C   fetch u[2] and v[2]
  96         jmp     L(L01)
  97
  98 L(n01): jne     L(n10)          C fall through for n = 2, 6, 10, ...
  99         mov     (up), %r12      C r12 = u[0]
 100         mov     R32(%r8), R32(%rbx)     C rbx = carry-in
 101         mov     8(up), %r11     C r11 = u[1]
 102         ADDSUB  %r9, %r12       C r12 = u[0] op v[0]
 103         mov     8(vp), %r9      C r9 = v[1]
 104         setc    R8(%rax)        C rax held 2, so only its low byte was nonzero
 105         lea     -32(rp), rp     C L(rc3) adds 32 back before the first store
 106         lea     16(up), up      C step past the two limbs already loaded
 107         lea     16(vp), vp
 108         jmp     L(L10)
 109
 110 L(n10): mov     (up), %r10      C n = 3, 7, 11, ...
 111         mov     R32(%r8), R32(%rax)     C rax = carry-in
 112         xor     R32(%rbx), R32(%rbx)    C rbx = 0, ready for setc
 113         mov     8(up), %r12     C r12 = u[1]
 114         ADDSUB  %r9, %r10       C r10 = u[0] op v[0]
 115         mov     8(vp), %r9      C r9 = v[1]
 116         setc    R8(%rbx)
 117         lea     -24(rp), rp     C L(L11) path stores this limb at 24(rp)
 118         lea     -8(up), up      C so that 24(up) and 24(vp) at L(rc2)
 119         lea     -8(vp), vp      C   fetch u[2] and v[2]
 120         jmp     L(L11)
 121
     C Ripple stubs, kept out of line.  Each is reached when applying the
     C previous carry itself carried; it sets the saved carry for the current
     C limb to 1 and resumes in the loop.
 122 L(c0):  mov     $1, R8(%rbx)
 123         jmp     L(rc0)
 124 L(c1):  mov     $1, R8(%rax)
 125         jmp     L(rc1)
 126 L(c2):  mov     $1, R8(%rbx)
 127         jmp     L(rc2)
 128 L(c3):  mov     $1, R8(%rax)
 129         jmp     L(rc3)
 130
 131         ALIGN(16)
 132 L(top): mov     (up), %r8       C not on critical path
 133         ADDSUB  %r9, %r11       C not on critical path
 134         mov     (vp), %r9       C not on critical path
 135         setc    R8(%rbx)        C save carry out
 136         mov     %r12, (rp)      C store the limb finished at L(L10)
 137 L(L01): ADDSUB  %rax, %r11      C apply previous carry out
 138         jc      L(c0)           C jump if ripple
 139 L(rc0): mov     8(up), %r10
 140         ADDSUB  %r9, %r8
 141         mov     8(vp), %r9
 142         setc    R8(%rax)        C save carry out
 143         mov     %r11, 8(rp)     C store the limb finished at L(L01)
 144 L(L00): ADDSUB  %rbx, %r8       C apply previous carry out
 145         jc      L(c1)           C jump if ripple
 146 L(rc1): mov     16(up), %r12
 147         ADDSUB  %r9, %r10
 148         mov     16(vp), %r9
 149         setc    R8(%rbx)        C save carry out
 150         mov     %r8, 16(rp)     C store the limb finished at L(L00)
 151 L(L11): ADDSUB  %rax, %r10      C apply previous carry out
 152         jc      L(c2)           C jump if ripple
 153 L(rc2): mov     24(up), %r11
 154         ADDSUB  %r9, %r12
 155         lea     32(up), up      C lea advances the pointers without touching
 156         mov     24(vp), %r9     C   the carry flag that setc reads below
 157         lea     32(vp), vp
 158         setc    R8(%rax)        C save carry out
 159         mov     %r10, 24(rp)    C store the limb finished at L(L11)
 160 L(L10): ADDSUB  %rbx, %r12      C apply previous carry out
 161         jc      L(c3)           C jump if ripple
 162 L(rc3): lea     32(rp), rp
 163         sub     $4, n           C four limbs per pass
 164         ja      L(top)          C loop while n was above 4 before the sub
 165
     C Tail: r12 is a finished limb still to be stored; r11 is the top u limb,
     C which still needs its v limb (r9) and the previous carry (rax).
 166 L(end): ADDSUB  %r9, %r11
 167         setc    R8(%rbx)
 168         mov     %r12, (rp)
 169         ADDSUB  %rax, %r11      C apply previous carry out
 170         jnc     L(1)
 171         mov     $1, R8(%rbx)    C ripple: carry out of the top limb is 1
 172 L(1):   mov     %r11, 8(rp)
 173
 174 L(ret): mov     R32(%rbx), R32(%rax)    C return value = final carry; n = 1 path joins here
 175         pop     %r12            C restore callee-saved registers
 176         pop     %rbx
 177         ret
 178 EPILOGUE()