mpn/x86_64/mul_2.asm

   1 dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
   2 dnl  store the result in a third limb vector.
   3
   4 dnl  Copyright 2008 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of the GNU Lesser General Public License as published
  10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  11 dnl  your option) any later version.
  12
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 dnl  License for more details.
  17
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23 C            cycles/limb
  24 C K8,K9:         2.275
  25 C K10:           2.275
  26 C P4:            ?
  27 C P6 core2:      4.0
  28 C P6 corei7:     3.8
  29
  30 C This code is the result of running a code generation and optimization tool
  31 C suite written by David Harvey and Torbjorn Granlund.
  32
  33 C TODO
  34 C  * Work on feed-in and wind-down code.
  35 C  * Convert "mov $0" to "xor".
  36 C  * Adjust initial lea to save some bytes.
  37 C  * Perhaps adjust n from n_param&3 value?
  38 C  * Replace with 2.25 c/l sequence.
  39
  40 C INPUT PARAMETERS
  41 define(`rp',     `%rdi')     C result pointer; the product limbs are stored through it
  42 define(`up',     `%rsi')     C source pointer; the n_param-limb multiplicand is read through it
  43 define(`n_param',`%rdx')     C limb count of up as passed in; %rdx is later overwritten by mul
  44 define(`vp',     `%rcx')     C pointer to the two multiplier limbs, read at (vp) and 8(vp)
  45
  46 define(`v0', `%r8')          C copy of the limb at (vp)
  47 define(`v1', `%r9')          C copy of the limb at 8(vp)
  48 define(`w0', `%rbx')         C accumulator limb; %rbx is pushed/popped by the function
  49 define(`w1', `%rcx')         C accumulator limb; shares %rcx with vp, so vp must be consumed first
  50 define(`w2', `%rbp')         C accumulator limb; %rbp is pushed/popped by the function
  51 define(`w3', `%r10')         C accumulator limb
  52 define(`n',  `%r11')         C negated limb index, stepped up toward zero by the loop
  53
  54 ASM_START()
  55         TEXT
  56         ALIGN(16)
  57 PROLOGUE(mpn_mul_2)                     C {rp, n+1} = low limbs of {up, n} * {vp, 2}; the top limb is returned in %rax
  58         push    %rbx                    C w0 lives in %rbx, restored before ret
  59         push    %rbp                    C w2 lives in %rbp, restored before ret
  60 C Fetch both multiplier limbs first: vp and w1 are the same register (%rcx).
  61         mov     (vp), v0
  62         mov     8(vp), v1
  63
  64         mov     (up), %rax              C rax = first source limb, read before up is repositioned
  65
  66         mov     n_param, n
  67         neg     n                       C n = -n_param, a negative index that the loop steps toward zero
  68         lea     -8(up,n_param,8), up    C up now points at the last source limb, so 8(up,n,8) is the first
  69         lea     -8(rp,n_param,8), rp    C rp repositioned the same way, to result limb n_param-1
  70 C Dispatch on n_param mod 4 to one of four feed-in sequences; each enters the 4-way unrolled loop at a different point.
  71         and     $3, R32(n_param)
  72         jz      L(m2p0)                 C n_param mod 4 == 0
  73         cmp     $2, R32(n_param)
  74         jc      L(m2p1)                 C below 2, i.e. n_param mod 4 == 1
  75         jz      L(m2p2)                 C n_param mod 4 == 2
  76 L(m2p3):                                C n_param mod 4 == 3 (fall through)
  77         mul     v0                      C rdx:rax = first limb * v0
  78         xor     R32(w3), R32(w3)
  79         mov     %rax, w1
  80         mov     %rdx, w2
  81         mov     8(up,n,8), %rax         C reload the first source limb for the v1 product
  82         add     $-1, n                  C bias the index so the loop offsets fit entry at L(m23)
  83         mul     v1
  84         add     %rax, w2                C the carry from this add is consumed by the adc at L(m23)
  85         jmp     L(m23)
  86 L(m2p0):                                C n_param mod 4 == 0
  87         mul     v0                      C rdx:rax = first limb * v0
  88         xor     R32(w2), R32(w2)
  89         mov     %rax, w0
  90         mov     %rdx, w1
  91         jmp     L(m20)                  C index needs no bias for this entry point
  92 L(m2p1):                                C n_param mod 4 == 1. NOTE(review): for n_param == 1 the loop is entered with n == 0 and would read and store past the operands; presumably callers guarantee n_param >= 2 -- confirm
  93         mul     v0                      C rdx:rax = first limb * v0, added into w3/w0 at the loop top
  94         xor     R32(w3), R32(w3)
  95         xor     R32(w0), R32(w0)
  96         xor     R32(w1), R32(w1)
  97         add     $1, n                   C bias the index so the loop offsets fit entry at L(m2top)
  98         jmp     L(m2top)
  99 L(m2p2):                                C n_param mod 4 == 2
 100         mul     v0                      C rdx:rax = first limb * v0
 101         xor     R32(w0), R32(w0)
 102         xor     R32(w1), R32(w1)
 103         mov     %rax, w2
 104         mov     %rdx, w3
 105         mov     8(up,n,8), %rax         C reload the first source limb for the v1 product
 106         add     $-2, n                  C bias the index so the loop offsets fit entry at L(m22)
 107         jmp     L(m22)
 108 C Main loop: each pass performs eight multiplies (four source limbs, each by v0 and v1) and stores four result limbs.
 109 C w0..w3 rotate as the accumulator: a register is stored to rp once complete, then zeroed and reused to catch carries.
 110         ALIGN(32)
 111 L(m2top):
 112         add     %rax, w3                C fold in the v0 product computed at the end of the previous pass (or in the feed-in)
 113         adc     %rdx, w0
 114         mov     0(up,n,8), %rax
 115         adc     $0, R32(w1)
 116         mov     $0, R32(w2)             C zero w2 so it can collect carries below
 117         mul     v1
 118         add     %rax, w0
 119         mov     w3, 0(rp,n,8)           C w3 is complete: store result limb
 120         adc     %rdx, w1                C mov leaves flags alone, so this still sees the carry from the add above
 121         mov     8(up,n,8), %rax
 122         mul     v0
 123         add     %rax, w0
 124         adc     %rdx, w1
 125         adc     $0, R32(w2)
 126 L(m20): mov     8(up,n,8), %rax         C entry point for n_param mod 4 == 0
 127         mul     v1
 128         add     %rax, w1
 129         adc     %rdx, w2
 130         mov     16(up,n,8), %rax
 131         mov     $0, R32(w3)             C zero w3 so it can collect carries below
 132         mul     v0
 133         add     %rax, w1
 134         mov     16(up,n,8), %rax        C same limb again, this time for the v1 product
 135         adc     %rdx, w2
 136         adc     $0, R32(w3)
 137         mul     v1
 138         add     %rax, w2
 139         mov     w0, 8(rp,n,8)           C w0 is complete: store result limb
 140 L(m23): adc     %rdx, w3                C entry point for n_param mod 4 == 3; consumes the carry of the preceding add
 141         mov     24(up,n,8), %rax
 142         mul     v0
 143         mov     $0, R32(w0)             C zero w0 so it can collect the carry below
 144         add     %rax, w2
 145         adc     %rdx, w3
 146         mov     w1, 16(rp,n,8)          C w1 is complete: store result limb
 147         mov     24(up,n,8), %rax        C same limb again, this time for the v1 product
 148         mov     $0, R32(w1)             C has to be mov rather than xor here: the carry from the adc above is still needed by the adc below
 149         adc     $0, R32(w0)
 150 L(m22): mul     v1                      C entry point for n_param mod 4 == 2
 151         add     %rax, w3
 152         mov     w2, 24(rp,n,8)          C w2 is complete: store result limb
 153         adc     %rdx, w0
 154         mov     32(up,n,8), %rax
 155         mul     v0                      C this product is consumed at the loop top or in the wind-down
 156         add     $4, n                   C advance four limbs
 157         js      L(m2top)                C keep looping while the index is still negative
 158 C Wind-down: fold in the pending v0 product, then add the last source limb times v1.
 159
 160         add     %rax, w3
 161         adc     %rdx, w0
 162         adc     $0, R32(w1)
 163         mov     (up), %rax              C up was repositioned to the last source limb
 164         mul     v1
 165         mov     w3, (rp)                C result limb n_param-1
 166         add     %rax, w0
 167         adc     %rdx, w1
 168         mov     w0, 8(rp)               C result limb n_param
 169         mov     w1, %rax                C the most significant limb is returned, not stored
 170
 171         pop     %rbp
 172         pop     %rbx
 173         ret
 174 EPILOGUE()