mpn/x86_64/fastsse/lshiftc.asm

   1 dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
   2
   3 dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
   4
   5 dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of the GNU Lesser General Public License as published
  11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  12 dnl  your option) any later version.
  13
  14 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  17 dnl  License for more details.
  18
  19 dnl  You should have received a copy of the GNU Lesser General Public License
  20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  21
  22 include(`../config.m4')
  23
  24
  25 C            cycles/limb             cycles/limb              good
  26 C          16-byte aligned         16-byte unaligned        for cpu?
  27 C AMD K8,K9      ?                       ?
  28 C AMD K10        1.85  (1.635)           1.9   (1.67)           Y
  29 C AMD bd1        1.82  (1.75)            1.82  (1.75)           Y
  30 C AMD bobcat     4.5                     4.5
  31 C Intel P4       3.6   (3.125)           3.6   (3.125)          Y
  32 C Intel core2    2.05  (1.67)            2.55  (1.75)
  33 C Intel NHM      2.05  (1.875)           2.6   (2.25)
  34 C Intel SBR      1.55  (1.44)            2     (1.57)           Y
  35 C Intel atom     ?                       ?
  36 C VIA nano       2.5   (2.5)             2.5   (2.5)            Y
  37
  38 C We try to do as many 16-byte operations as possible.  The top-most and
  39 C bottom-most writes might need 8-byte operations.  All other writes use
  40 C aligned 16-byte operations; we read with both 8-byte and 16-byte operations.
  41
  42 C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
  43 C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
  44 C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
  45
  46 C This is not yet great code:
  47 C   (1) The unaligned case makes too many reads.
  48 C   (2) We should do some unrolling, at least 2-way.
  49 C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
  50 C Nano.
  51
  52 C INPUT PARAMETERS
     C Register names for the arguments of the function below.  Memory is
     C addressed as (rp,n,8) and (ap,n,8), so n counts 8-byte limbs.  The code
     C below spells the count register as %rcx rather than using cnt.
  53 define(`rp',  `%rdi')                   C where result limbs are stored
  54 define(`ap',  `%rsi')                   C where operand limbs are loaded from
  55 define(`n',   `%rdx')                   C limb count; reduced as limbs are done
  56 define(`cnt', `%rcx')                   C shift count in bits
  57
  58 ASM_START()
  59         TEXT
  60         ALIGN(16)
     C mpn_lshiftc -- shift the n limbs at ap left by cnt bits, complement the
     C result and store it as n limbs at rp.
     C
     C For every limb i:  rp[i] = NOT ((ap[i] << cnt) | (ap[i-1] >> (64-cnt)))
     C with 0 used in place of ap[-1].  Limbs are produced from the most
     C significant end downwards.
     C
     C In:      rp, ap, n, cnt in the registers named by the define() lines
     C          above (%rdi, %rsi, %rdx, %rcx).
     C Out:     %rax = ap[n-1] >> (64-cnt), i.e. the bits shifted out at the
     C          top.  It is computed before any store and is not complemented.
     C Written: %rax, %rcx, %rdx, %xmm0, %xmm1, %xmm4, %xmm5, %xmm7, flags.
     C
     C NOTE(review): n >= 1 and 1 <= cnt <= 63 are presumably guaranteed by
     C the callers, nothing here checks them -- TODO confirm.
     C NOTE(review): only bit 3 of rp+8*n and ap+8*n is tested before movdqa
     C and 16-byte punpcklqdq memory operands are used, so rp and ap are
     C presumably 8-byte aligned -- TODO confirm.
     C NOTE(review): R32(), R8(), L(), PROLOGUE() etc. are defined in the
     C included m4 files, not in this file; R32/R8 presumably name the 32-bit
     C and 8-bit parts of the given register.
  61 PROLOGUE(mpn_lshiftc)
  62         movd    R32(%rcx), %xmm4        C xmm4 = cnt, the left-shift count
  63         mov     $64, R32(%rax)
  64         sub     R32(%rcx), R32(%rax)
  65         movd    R32(%rax), %xmm5        C xmm5 = 64-cnt, the right-shift count
  66
     C Value left in rax at every ret.  shr takes its count mod 64, so for cnt
     C in 1..63 the shift below is by 64-cnt.  rax is not written again.
  67         neg     R32(%rcx)               C ecx = -cnt
  68         mov     -8(ap,n,8), %rax        C rax = ap[n-1], the top limb
  69         shr     R8(%rcx), %rax          C rax = ap[n-1] >> (-cnt mod 64)
  70
  71         pcmpeqb %xmm7, %xmm7            C set to 111...111
  72
  73         cmp     $2, n
  74         jle     L(le2)                  C n <= 2 (signed): 8-byte path
  75
     C n > 2 from here on.  Only bit 3 of the address is tested, so a 32-bit
     C lea destination is enough.
  76         lea     (rp,n,8), R32(%rcx)     C address just above rp[n-1]
  77         test    $8, R8(%rcx)
  78         je      L(rp_aligned)           C bit 3 clear: no fix-up limb needed
  79
  80 C Do one initial limb in order to make rp aligned
  81         movq    -8(ap,n,8), %xmm0       C xmm0 = ap[n-1]
  82         movq    -16(ap,n,8), %xmm1      C xmm1 = ap[n-2]
  83         psllq   %xmm4, %xmm0            C ap[n-1] << cnt
  84         psrlq   %xmm5, %xmm1            C ap[n-2] >> (64-cnt)
  85         por     %xmm1, %xmm0
  86         pxor    %xmm7, %xmm0            C complement
  87         movq    %xmm0, -8(rp,n,8)       C 8-byte store of rp[n-1]
  88         dec     n                       C bit 3 of rp+8*n is now clear
  89
  90 L(rp_aligned):
     C Choose the loop from bit 3 of ap+8*n.  Both loops are entered at their
     C bottom, where n is first reduced by 2.
  91         lea     (ap,n,8), R32(%rcx)
  92         test    $8, R8(%rcx)
  93         je      L(aent)                 C bit 3 clear: ap = rp (mod 16)
  94         jmp     L(uent)                 C bit 3 set: ap != rp (mod 16)
  95 C *****************************************************************************
  96
  97 C Handle the case when ap != rp (mod 16).
     C In the register comments below, n is the limb count before the sub at
     C L(uent) and B marks the upper 64-bit half of a register (B = 2^64).
     C ap+8*n has bit 3 set in this loop, so the 16-byte accesses are made at
     C 8(ap,n,8) and -8(ap,n,8), where bit 3 is clear.
  98
  99         ALIGN(16)
 100 L(utop):movq    (ap,n,8), %xmm1         C xmm1 = ap[n-2]
 101         punpcklqdq  8(ap,n,8), %xmm1    C xmm1 = B*ap[n-1] + ap[n-2]
 102         movdqa  -8(ap,n,8), %xmm0       C xmm0 = B*ap[n-2] + ap[n-3]
 103         psllq   %xmm4, %xmm1            C both halves << cnt
 104         psrlq   %xmm5, %xmm0            C both halves >> (64-cnt)
 105         por     %xmm1, %xmm0
 106         pxor    %xmm7, %xmm0            C complement
 107         movdqa  %xmm0, (rp,n,8)         C store rp[n-1] and rp[n-2]
 108 L(uent):sub     $2, n
 109         ja      L(utop)                 C more than 2 limbs were left
 110
 111         jne     L(end8)                 C 1 limb was left: do rp[0] alone
 112
     C Exactly 2 limbs were left.  Same computation as in the loop, with 0 in
     C place of the limb below ap[0].
 113         movq    (ap), %xmm1             C xmm1 = ap[0]
 114         pxor    %xmm0, %xmm0
 115         punpcklqdq  %xmm1, %xmm0        C xmm0 = B*ap[0] + 0
 116         punpcklqdq  8(ap), %xmm1        C xmm1 = B*ap[1] + ap[0]
 117         psllq   %xmm4, %xmm1            C both halves << cnt
 118         psrlq   %xmm5, %xmm0            C both halves >> (64-cnt)
 119         por     %xmm1, %xmm0
 120         pxor    %xmm7, %xmm0            C complement
 121         movdqa  %xmm0, (rp)             C store rp[1] and rp[0]
 122         ret
 123 C *****************************************************************************
 124
 125 C Handle the case when ap = rp (mod 16).
     C In the register comments below, n is the limb count before the sub at
     C L(aent) and B marks the upper 64-bit half of a register (B = 2^64).
 126
 127         ALIGN(16)
 128 L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
 129         movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
 130         punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[n-2] + ap[n-3]
 131         psllq   %xmm4, %xmm0            C both halves << cnt
 132         psrlq   %xmm5, %xmm1            C both halves >> (64-cnt)
 133         por     %xmm1, %xmm0
 134         pxor    %xmm7, %xmm0            C complement
 135         movdqa  %xmm0, (rp,n,8)         C store rp[n-1] and rp[n-2]
 136 L(aent):sub     $2, n
 137         ja      L(atop)                 C more than 2 limbs were left
 138
 139         jne     L(end8)                 C 1 limb was left: do rp[0] alone
 140
     C Exactly 2 limbs were left.  Same computation as in the loop, with 0 in
     C place of the limb below ap[0].
 141         movdqa  (ap), %xmm0             C xmm0 = B*ap[1] + ap[0]
 142         pxor    %xmm1, %xmm1
 143         punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[0] + 0
 144         psllq   %xmm4, %xmm0            C both halves << cnt
 145         psrlq   %xmm5, %xmm1            C both halves >> (64-cnt)
 146         por     %xmm1, %xmm0
 147         pxor    %xmm7, %xmm0            C complement
 148         movdqa  %xmm0, (rp)             C store rp[1] and rp[0]
 149         ret
 150 C *****************************************************************************
 151
 152         ALIGN(16)
     C n <= 2.  The flags tested here are still those of the cmp with 2 done
     C before the jle that leads here.  Only 8-byte loads and stores are used.
 153 L(le2): jne     L(end8)                 C n < 2: only rp[0] to do
 154
 155         movq    8(ap), %xmm0            C xmm0 = ap[1]
 156         movq    (ap), %xmm1             C xmm1 = ap[0]
 157         psllq   %xmm4, %xmm0            C ap[1] << cnt
 158         psrlq   %xmm5, %xmm1            C ap[0] >> (64-cnt)
 159         por     %xmm1, %xmm0
 160         pxor    %xmm7, %xmm0            C complement
 161         movq    %xmm0, 8(rp)            C 8-byte store of rp[1]
 162
     C Lowest limb.  Reached by falling through from above and from the jne
     C branches that follow the two loops.
 163 L(end8):movq    (ap), %xmm0             C xmm0 = ap[0]
 164         psllq   %xmm4, %xmm0            C ap[0] << cnt, zeros shifted in
 165         pxor    %xmm7, %xmm0            C complement: low cnt bits become 1
 166         movq    %xmm0, (rp)             C 8-byte store of rp[0]
 167         ret
 168 EPILOGUE()