mpn/x86_64/pentium4/aorslsh1_n.asm

   1 dnl  AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1),
   2 dnl  optimized for Pentium 4.
   3
   4 dnl  Copyright 2008 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of the GNU Lesser General Public License as published
  10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  11 dnl  your option) any later version.
  12
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 dnl  License for more details.
  17
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23 C            cycles/limb
  24 C K8,K9:         3.8
  25 C K10:           4.8
  26 C P4:            5.8
  27 C P6-15:         ?
  28
  29
  30 C INPUT PARAMETERS
     C Symbolic names for the four argument registers used by the code below.
     C Going by the file header, the operation is rp[] = up[] +- (vp[] << 1);
     C the code uses n as the limb count.
  31 define(`rp',`%rdi')
  32 define(`up',`%rsi')
  33 define(`vp',`%rdx')
  34 define(`n', `%rcx')
  35
     C Pick the arithmetic instruction (ADDSUB) and the exported symbol (func)
     C according to which OPERATION_* symbol is defined.
     C NOTE(review): the OPERATION_* symbol is presumably supplied by the build
     C system; nothing in this file defines it - confirm.
  36 ifdef(`OPERATION_addlsh1_n', `
  37         define(ADDSUB,        add)
  38         define(func,          mpn_addlsh1_n)')
  39 ifdef(`OPERATION_sublsh1_n', `
  40         define(ADDSUB,        sub)
  41         define(func,          mpn_sublsh1_n)')
  42
     C Names both entry points that this one source file can produce.
     C NOTE(review): presumably read by the build machinery; its definition is
     C not visible here - confirm.
  43 MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
  44
  45 ASM_START()
  46         TEXT
  47         ALIGN(16)
     C ------------------------------------------------------------------------
     C func expands to mpn_addlsh1_n or mpn_sublsh1_n (chosen by the ifdef
     C blocks earlier in this file); ADDSUB expands to add or sub accordingly.
     C
     C For each of the n limbs (8 bytes apart) this computes
     C     rp[i] = up[i] ADDSUB ((vp[i] << 1) + top bit of vp[i-1])
     C and propagates the carry/borrow from limb to limb.
     C
     C Return value (eax): the final carry/borrow (0 or 1) plus the bit shifted
     C out of the most significant v limb, so 0, 1 or 2.
     C
     C Register roles:
     C   r9               doubled v limb for the current position
     C   ebp              upper 32 bits of a v limb; after shr $31 it holds
     C                    only that limb's top bit (0 or 1)
     C   r8 r10 r12 r11   rotating u limbs, turned in place into result limbs
     C   rax, rbx         alternating saved carry-out (0/1), written with setc
     C   rbx, r12, rbp    pushed on entry and popped before ret
     C
     C The main loop is unrolled four ways.  n mod 4 selects one of four
     C feed-in sequences; each preloads registers, biases rp/up/vp so that the
     C fixed offsets inside the loop line up, and enters the loop at L01, L00,
     C L11 or L10.
     C
     C Each loop stage loads the u limb needed by the next stage, ADDSUBs the
     C doubled v limb into the current limb, saves that carry with setc (the
     C intervening mov and lea leave the flags alone), stores the limb finished
     C by the previous stage, and finally applies the previous carry.
     C
     C NOTE(review): n = 0 would still load and store limbs; presumably callers
     C guarantee n >= 1 - confirm against the mpn interface contract.
     C ------------------------------------------------------------------------
  48 PROLOGUE(func)
  49         push    %rbx
  50         push    %r12
  51         push    %rbp
  52
  53         mov     (vp), %r9       C r9 = v[0]
  54         shl     %r9             C r9 = v[0] << 1, nothing shifted in
  55         mov     4(vp), R32(%rbp)        C upper half of v[0], for its top bit
  56
  57         xor     R32(%rbx), R32(%rbx)    C only bl is written later, keep rest 0
  58
  59         mov     R32(n), R32(%rax)
  60         and     $3, R32(%rax)   C eax = n mod 4, selects the feed-in
  61         jne     L(n00)          C fall through for n mod 4 = 0 (4, 8, ...)
  62
     C Feed-in for n mod 4 = 0: handle limb 0 here, enter the loop at L00.
  63         mov     (up), %r8
  64         mov     8(up), %r10
  65         shr     $31, R32(%rbp)  C ebp = top bit of v[0]
  66         ADDSUB  %r9, %r8
  67         mov     8(vp), %r9
  68         lea     (%rbp,%r9,2), %r9       C r9 = 2*v[1] + top bit of v[0]
  69         setc    R8(%rax)        C carry of the ADDSUB above
  70         mov     12(vp), R32(%rbp)       C upper half of v[1]
  71         lea     -16(rp), rp     C so that the 16(rp) store at L00 hits rp[0]
  72         jmp     L(L00)
  73
  74 L(n00): cmp     $2, R32(%rax)
  75         jnc     L(n01)          C fall through for n mod 4 = 1 (1, 5, 9, ...)
     C Feed-in for n mod 4 = 1: handle limb 0 here, enter the loop at L01.
  76         mov     (up), %r11
  77         lea     -8(rp), rp      C so that the 8(rp) stores hit rp[0]
  78         shr     $31, R32(%rbp)
  79         ADDSUB  %r9, %r11
  80         setc    R8(%rbx)
  81         dec     n               C n is now a multiple of 4
  82         jz      L(1)            C jump for n = 1
  83         mov     8(up), %r8
  84         mov     8(vp), %r9
  85         lea     (%rbp,%r9,2), %r9
  86         mov     12(vp), R32(%rbp)
  87         lea     8(up), up
  88         lea     8(vp), vp
  89         jmp     L(L01)
  90
  91 L(n01): jne     L(n10)          C flags still from cmp $2; fall through for n mod 4 = 2
     C Feed-in for n mod 4 = 2: handle limb 0 here, enter at the loop test L10.
  92         mov     (up), %r12
  93         mov     8(up), %r11
  94         shr     $31, R32(%rbp)
  95         ADDSUB  %r9, %r12
  96         mov     8(vp), %r9
  97         lea     (%rbp,%r9,2), %r9
  98         setc    R8(%rax)
  99         mov     12(vp), R32(%rbp)
 100         lea     16(up), up
 101         lea     16(vp), vp
 102         jmp     L(L10)
 103
     C Feed-in for n mod 4 = 3: handle limb 0 here, enter the loop at L11.
 104 L(n10): mov     (up), %r10
 105         mov     8(up), %r12
 106         shr     $31, R32(%rbp)
 107         ADDSUB  %r9, %r10
 108         mov     8(vp), %r9
 109         lea     (%rbp,%r9,2), %r9
 110         setc    R8(%rbx)
 111         mov     12(vp), R32(%rbp)
 112         lea     -24(rp), rp     C so that the 24(rp) store at L11 hits rp[0]
 113         lea     -8(up), up
 114         lea     -8(vp), vp
 115         jmp     L(L11)
 116
     C Out-of-line fix-ups, reached when applying the incoming carry itself
     C carried out: force the saved carry to 1 and resume after the branch.
 117 L(c0):  mov     $1, R8(%rbx)
 118         jmp     L(rc0)
 119 L(c1):  mov     $1, R8(%rax)
 120         jmp     L(rc1)
 121 L(c2):  mov     $1, R8(%rbx)
 122         jmp     L(rc2)
 123
 124         ALIGN(16)
     C Stage 0: finish r11, store r12; carry in from rax, carry out to rbx.
 125 L(top): mov     (up), %r8       C not on critical path
 126         shr     $31, R32(%rbp)
 127         ADDSUB  %r9, %r11       C not on critical path
 128         mov     (vp), %r9
 129         lea     (%rbp,%r9,2), %r9
 130         setc    R8(%rbx)        C save carry out
 131         mov     4(vp), R32(%rbp)
 132         mov     %r12, (rp)
 133         ADDSUB  %rax, %r11      C apply previous carry out
 134         jc      L(c0)           C jump if ripple
 135 L(rc0):
     C Stage 1: finish r8, store r11; carry in from rbx, carry out to rax.
 136 L(L01): mov     8(up), %r10
 137         shr     $31, R32(%rbp)
 138         ADDSUB  %r9, %r8
 139         mov     8(vp), %r9
 140         lea     (%rbp,%r9,2), %r9
 141         setc    R8(%rax)
 142         mov     12(vp), R32(%rbp)
 143         mov     %r11, 8(rp)
 144         ADDSUB  %rbx, %r8
 145         jc      L(c1)
 146 L(rc1):
     C Stage 2: finish r10, store r8; carry in from rax, carry out to rbx.
 147 L(L00): mov     16(up), %r12
 148         shr     $31, R32(%rbp)
 149         ADDSUB  %r9, %r10
 150         mov     16(vp), %r9
 151         lea     (%rbp,%r9,2), %r9
 152         setc    R8(%rbx)
 153         mov     20(vp), R32(%rbp)
 154         mov     %r8, 16(rp)
 155         ADDSUB  %rax, %r10
 156         jc      L(c2)
 157 L(rc2):
     C Stage 3: finish r12, store r10; carry in from rbx, carry out to rax.
     C up and vp advance here (lea keeps the flags for the setc that follows).
 158 L(L11): mov     24(up), %r11
 159         shr     $31, R32(%rbp)
 160         ADDSUB  %r9, %r12
 161         mov     24(vp), %r9
 162         lea     (%rbp,%r9,2), %r9
 163         lea     32(up), up
 164         lea     32(vp), vp
 165         setc    R8(%rax)
 166         mov     -4(vp), R32(%rbp)       C upper half of the limb just loaded (vp already advanced)
 167         mov     %r10, 24(rp)
 168         ADDSUB  %rbx, %r12
 169         jc      L(c3)
 170 L(rc3): lea     32(rp), rp
 171 L(L10): sub     $4, n
 172         ja      L(top)          C loop while more than 4 limbs remained before the sub
 173
     C Wind-down: r12 is finished and only needs storing; r11 still needs its
     C doubled v limb and the carry saved in rax.
 174 L(end):
 175         shr     $31, R32(%rbp)  C ebp = top bit of the last v limb
 176         ADDSUB  %r9, %r11
 177         setc    R8(%rbx)
 178         mov     %r12, (rp)
 179         ADDSUB  %rax, %r11
 180         jnc     L(1)
 181         mov     $1, R8(%rbx)
 182 L(1):   mov     %r11, 8(rp)     C store the most significant result limb
 183         lea     (%rbx,%rbp), R32(%rax)  C return carry + bit shifted out of v
 184         pop     %rbp
 185         pop     %r12
 186         pop     %rbx
     C NOTE(review): no MMX register is referenced anywhere in this routine, so
     C the emms below looks unnecessary - confirm before removing.
 187         emms
 188         ret
     C Fix-up for stage 3, same scheme as c0-c2; placed after ret.
 189 L(c3):  mov     $1, R8(%rax)
 190         jmp     L(rc3)
 191 EPILOGUE()
 192 ASM_END()