mpn/x86_64/core2/aorslsh1_n.asm

   1 dnl  x86-64 mpn_addlsh1_n and mpn_sublsh1_n, optimized for "Core" 2.
   2
   3 dnl  Copyright 2008 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22 C            cycles/limb
  23 C K8,K9:         4.25
  24 C K10:           ?
  25 C P4:            ?
  26 C P6-15:         3
  27
  28 C INPUT PARAMETERS
  29 define(`rp',`%rdi')                  C base of the limbs the routine stores
  30 define(`up',`%rsi')                  C base of the limbs that are read unshifted
  31 define(`vp',`%rdx')                  C base of the limbs that are read and doubled
  32 define(`n', `%rcx')                  C limb count; negated and used as the index
  33
  34 ifdef(`OPERATION_addlsh1_n', `
  35         define(ADDSUB,  add)
  36         define(ADCSBB,  adc)
  37         define(func,    mpn_addlsh1_n)')
     C The block above takes effect when OPERATION_addlsh1_n is defined, the
     C one below when OPERATION_sublsh1_n is defined.  Neither symbol is set
     C in this file; presumably the build defines one of them per object
     C (TODO confirm).  ADDSUB is the plain add or sub that starts a carry
     C chain, ADCSBB the with-carry form that continues it, and func is the
     C name handed to PROLOGUE.
  38 ifdef(`OPERATION_sublsh1_n', `
  39         define(ADDSUB,  sub)
  40         define(ADCSBB,  sbb)
  41         define(func,    mpn_sublsh1_n)')
  42
     C The next line names both entry points this source can produce.  The
     C macro is defined outside this file; presumably it is read by the
     C build machinery (TODO confirm), so keep that line free of trailing text.
  43 MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
  44
  45 ASM_START()
  46         TEXT
  47         ALIGN(8)
     C ------------------------------------------------------------------
     C func is mpn_addlsh1_n or mpn_sublsh1_n, whichever the ifdef blocks
     C earlier in the file selected.
     C
     C For i = 0 .. n-1, lowest limb first, it stores
     C     rp[i] = up[i] ADDSUB (vp[i] shifted left by 1, with bit 63 of
     C                           vp[i-1] as the new bit 0)
     C with the carry or borrow propagated from limb to limb and a zero bit
     C entering below vp[0]; that is {up,n} plus or minus twice {vp,n}.
     C
     C In:    rp = %rdi, up = %rsi, vp = %rdx, n = %rcx (see the defines)
     C Out:   %eax = final carry or borrow + bit 63 of the last vp limb,
     C        so 0, 1 or 2
     C Saved: %rbx and %r12 are pushed on entry and popped before ret
     C Used:  %rax, %r8-%r11, and rp, up, vp, n themselves are overwritten
     C Note:  vp[0] is loaded unconditionally and a count of 0 would enter
     C        the four-limb loop body at L(b0), so n of at least 1 is required.
     C
     C Carry handling: the add to n that advances the index changes CF, so
     C each group of limbs ends with sbb eax,eax (eax = 0 or -1) and the
     C loop restores it with add eax,eax, which sets CF iff eax was -1.
     C ------------------------------------------------------------------
  48 PROLOGUE(func)
  49         push    %rbx                    C rbx and r12 are restored at L(end)
  50         push    %r12
  51
             C Copy of the count (n is %rcx); only its low 2 bits are used.
             C R32 is defined outside this file, presumably the 32-bit name.
  52         mov     R32(%rcx), R32(%rax)
             C Bias each pointer to 3 limbs past its end and negate n, so that
             C -24(p,n,8) is the current limb and n counts up towards zero.
  53         lea     24(up,n,8), up
  54         lea     24(vp,n,8), vp
  55         lea     24(rp,n,8), rp
  56         neg     n
  57
  58         xor     R32(%r11), R32(%r11)    C zero bit enters below vp[0]
  59
  60         mov     -24(vp,n,8), %r8        C do first limb early
  61         shrd    $63, %r8, %r11          C r11 = old r11 bit 63 | r8 doubled
  62
             C Dispatch on n mod 4 so the loop always sees a multiple of 4.
  63         and     $3, R32(%rax)
  64         je      L(b0)                   C 0: eax = 0 there, so no carry in
  65         cmp     $2, R32(%rax)
  66         jc      L(b1)                   C 1
  67         je      L(b2)                   C 2; otherwise 3, fall through
  68
     C n mod 4 = 3: three limbs ahead of the loop.
  69 L(b3):  mov     -16(vp,n,8), %r9
  70         shrd    $63, %r9, %r8
  71         mov     -8(vp,n,8), %r10
  72         shrd    $63, %r10, %r9
  73         mov     -24(up,n,8), %r12
  74         ADDSUB  %r11, %r12              C lowest limb: nothing carried in
  75         mov     %r12, -24(rp,n,8)
  76         mov     -16(up,n,8), %r12
  77         ADCSBB  %r8, %r12
  78         mov     %r12, -16(rp,n,8)
  79         mov     -8(up,n,8), %r12
  80         ADCSBB  %r9, %r12
  81         mov     %r12, -8(rp,n,8)
  82         mov     %r10, %r11              C r11 = last v limb consumed
  83         sbb     R32(%rax), R32(%rax)    C save cy
  84         add     $3, n
  85         js      L(top)                  C n still negative: limbs remain
  86         jmp     L(end)
  87
     C n mod 4 = 1: one limb ahead of the loop; r11 already holds it doubled.
  88 L(b1):  mov     -24(up,n,8), %r12
  89         ADDSUB  %r11, %r12
  90         mov     %r12, -24(rp,n,8)
  91         mov     %r8, %r11               C r11 = last v limb consumed
  92         sbb     R32(%rax), R32(%rax)    C save cy
  93         inc     n
  94         js      L(top)
  95         jmp     L(end)
  96
     C n mod 4 = 2: two limbs ahead of the loop.
  97 L(b2):  mov     -16(vp,n,8), %r9
  98         shrd    $63, %r9, %r8
  99         mov     -24(up,n,8), %r12
 100         ADDSUB  %r11, %r12
 101         mov     %r12, -24(rp,n,8)
 102         mov     -16(up,n,8), %r12
 103         ADCSBB  %r8, %r12
 104         mov     %r12, -16(rp,n,8)
 105         mov     %r9, %r11               C r11 = last v limb consumed
 106         sbb     R32(%rax), R32(%rax)    C save cy
 107         add     $2, n
 108         js      L(top)
 109         jmp     L(end)
 110
     C Main loop, four limbs per pass.
     C At L(top): r11 = last v limb already consumed, eax = saved carry
     C (0 or -1), n negative.  At L(b0): r8 = vp[0], r11 = vp[0] doubled,
     C eax = 0.
 111         ALIGN(16)
 112 L(top): mov     -24(vp,n,8), %r8
 113         shrd    $63, %r8, %r11
 114 L(b0):  mov     -16(vp,n,8), %r9
 115         shrd    $63, %r9, %r8
 116         mov     -8(vp,n,8), %r10
 117         shrd    $63, %r10, %r9
 118         mov     (vp,n,8), %rbx
 119         shrd    $63, %rbx, %r10
 120
             C shrd changes the flags, so all four shifts are done before the
             C carry is brought back; the mov loads and stores below leave
             C the flags alone.
 121         add     R32(%rax), R32(%rax)    C restore cy
 122
 123         mov     -24(up,n,8), %r12
 124         ADCSBB  %r11, %r12
 125         mov     %r12, -24(rp,n,8)
 126
 127         mov     -16(up,n,8), %r12
 128         ADCSBB  %r8, %r12
 129         mov     %r12, -16(rp,n,8)
 130
 131         mov     -8(up,n,8), %r12
 132         ADCSBB  %r9, %r12
 133         mov     %r12, -8(rp,n,8)
 134
 135         mov     (up,n,8), %r12
 136         ADCSBB  %r10, %r12
 137         mov     %r12, (rp,n,8)
 138
 139         mov     %rbx, %r11              C r11 = last v limb consumed
 140         sbb     R32(%rax), R32(%rax)    C save cy
 141
 142         add     $4, n
 143         js      L(top)
 144
     C Return value: saved carry or borrow plus the bit shifted out of the
     C top v limb.  The pops do not change CF between the add and the sbb.
 145 L(end): add     %r11, %r11              C CF = bit 63 of the last v limb
 146         pop     %r12
 147         pop     %rbx
 148         sbb     $0, R32(%rax)           C eax = -(saved cy) - CF
 149         neg     R32(%rax)               C eax = saved cy + top bit
 150         ret
 151 EPILOGUE()