mpn/x86_64/sublsh1_n.asm

   1 dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
   2
   3 dnl  Copyright 2003, 2005, 2006, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C            cycles/limb
  24 C K8,K9:         2.2
  25 C K10:           2.2
  26 C P4:           12.75
  27 C P6 core2:      3.45
  28 C P6 corei7:     3.45
  29 C P6 atom:       ?
  30
  31
   32 C Sometimes speed degenerates, supposedly because some operand alignments
   33 C cause cache conflicts.
  34
   35 C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
   36 C in the 4-limb loop; at 3 instructions/cycle that is 26/3/4 = 2.167 c/l.
  37
   38 C INPUT PARAMETERS
   39 define(`rp',`%rdi')     C result limbs are stored through this pointer
   40 define(`up',`%rsi')     C limbs that are subtracted from (minuend)
   41 define(`vp',`%rdx')     C limbs that are doubled and then subtracted
   42 define(`n', `%rcx')     C number of limbs in each operand
   43
  43
   44 ASM_START()
   45         TEXT
   46         ALIGN(16)
     C mpn_sublsh1_n: rp[0..n-1] = up[0..n-1] - 2*vp[0..n-1], processed from the
     C least significant limb upwards.
     C
     C In:   rp, up, vp = limb pointers, n = limb count (see INPUT PARAMETERS).
     C       vp[0] is loaded unconditionally, so n is presumably required to be
     C       at least 1 -- TODO confirm against the mpn calling conventions.
     C Out:  eax = total borrow out of the top limb, 0, 1 or 2 (carry out of the
     C       doubling plus borrow out of the subtraction).
     C Regs: r8-r11 hold doubled vp limbs; rbx and rbp hold difference limbs and
     C       are saved on the stack around the function body.
     C
     C Two carry chains are interleaved.  "scy" is the carry of the doubling
     C chain and is parked in eax; "acy" is the borrow of the subtract chain
     C and is parked in ebp.  Each is saved as 0 or -1 with sbb reg,reg and
     C turned back into CF with add reg,reg.
     C
     C The three pointers are moved just past the last limb and n is negated,
     C so (ptr,n,8) walks the limbs while n counts up towards zero.  The n mod 4
     C leading limbs are handled at b01/b10/b11, then the 4-way unrolled loop
     C handles the rest.  When n mod 4 is zero the loop is entered at b00.
   47 PROLOGUE(mpn_sublsh1_n)
   48         push    %rbx
   49         push    %rbp
   50
   51         mov     (vp), %r8               C preload vp[0] before vp is advanced
   52         mov     R32(n), R32(%rax)
   53         lea     (rp,n,8), rp            C point rp, up, vp just past the last limb
   54         lea     (up,n,8), up
   55         lea     (vp,n,8), vp
   56         neg     n                       C negative index, counts up to zero
   57         xor     R32(%rbp), R32(%rbp)    C acy = 0
   58         and     $3, R32(%rax)           C eax = n mod 4; and leaves CF clear
   59         je      L(b00)                  C multiple of 4: eax = 0 doubles as scy = 0
   60         cmp     $2, R32(%rax)
   61         jc      L(b01)                  C n mod 4 = 1
   62         je      L(b10)                  C n mod 4 = 2
   63
     C n mod 4 = 3: handle three limbs, then join the loop.
   64 L(b11): add     %r8, %r8                C start of doubling chain, no carry in
   65         mov     8(vp,n,8), %r9
   66         adc     %r9, %r9
   67         mov     16(vp,n,8), %r10
   68         adc     %r10, %r10
   69         sbb     R32(%rax), R32(%rax)    C save scy
   70         mov     (up,n,8), %rbp
   71         mov     8(up,n,8), %rbx
   72         sub     %r8, %rbp               C start of subtract chain, no borrow in
   73         sbb     %r9, %rbx
   74         mov     %rbp, (rp,n,8)
   75         mov     %rbx, 8(rp,n,8)
   76         mov     16(up,n,8), %rbp
   77         sbb     %r10, %rbp
   78         mov     %rbp, 16(rp,n,8)
   79         sbb     R32(%rbp), R32(%rbp)    C save acy
   80         add     $3, n                   C sets the sign flag tested at ent
   81         jmp     L(ent)
   82
     C n mod 4 = 2: handle two limbs, then join the loop.
   83 L(b10): add     %r8, %r8                C start of doubling chain, no carry in
   84         mov     8(vp,n,8), %r9
   85         adc     %r9, %r9
   86         sbb     R32(%rax), R32(%rax)    C save scy
   87         mov     (up,n,8), %rbp
   88         mov     8(up,n,8), %rbx
   89         sub     %r8, %rbp               C start of subtract chain, no borrow in
   90         sbb     %r9, %rbx
   91         mov     %rbp, (rp,n,8)
   92         mov     %rbx, 8(rp,n,8)
   93         sbb     R32(%rbp), R32(%rbp)    C save acy
   94         add     $2, n                   C sets the sign flag tested at ent
   95         jmp     L(ent)
   96
     C n mod 4 = 1: handle one limb, then fall into ent.
   97 L(b01): add     %r8, %r8                C start of doubling chain, no carry in
   98         sbb     R32(%rax), R32(%rax)    C save scy
   99         mov     (up,n,8), %rbp
  100         sub     %r8, %rbp               C start of subtract chain, no borrow in
  101         mov     %rbp, (rp,n,8)
  102         sbb     R32(%rbp), R32(%rbp)    C save acy
  103         inc     n                       C sets the sign flag tested at ent
  104 L(ent): jns     L(end)                  C n reached zero: no limbs left for the loop
  105
     C Main loop, four limbs per iteration.
  106         ALIGN(16)
  107 L(top): add     R32(%rax), R32(%rax)    C restore scy
  108
  109         mov     (vp,n,8), %r8
     C Entry point for n mod 4 = 0: r8 = vp[0] from the preload, CF = 0, and
     C eax = ebp = 0 so both saved carries start out clear.
  110 L(b00): adc     %r8, %r8
  111         mov     8(vp,n,8), %r9
  112         adc     %r9, %r9
  113         mov     16(vp,n,8), %r10
  114         adc     %r10, %r10
  115         mov     24(vp,n,8), %r11
  116         adc     %r11, %r11
  117
  118         sbb     R32(%rax), R32(%rax)    C save scy
  119         add     R32(%rbp), R32(%rbp)    C restore acy
  120
  121         mov     (up,n,8), %rbp
  122         mov     8(up,n,8), %rbx
  123         sbb     %r8, %rbp
  124         sbb     %r9, %rbx
  125         mov     %rbp, (rp,n,8)
  126         mov     %rbx, 8(rp,n,8)
  127         mov     16(up,n,8), %rbp
  128         mov     24(up,n,8), %rbx
  129         sbb     %r10, %rbp
  130         sbb     %r11, %rbx
  131         mov     %rbp, 16(rp,n,8)
  132         mov     %rbx, 24(rp,n,8)
  133
  134         sbb     R32(%rbp), R32(%rbp)    C save acy
  135         add     $4, n
  136         js      L(top)                  C loop while the index is still negative
  137
     C eax and ebp are each 0 or -1; negating their sum gives the borrow count.
  138 L(end): add     R32(%rbp), R32(%rax)
  139         neg     R32(%rax)               C return value 0, 1 or 2
  140
  141         pop     %rbp
  142         pop     %rbx
  143         ret
  144 EPILOGUE()