mpn/x86_64/fastsse/lshiftc.asm

   1 dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
   2
   3 dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
   4
   5 dnl  Copyright 2010-2012 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35
  36 C            cycles/limb             cycles/limb              good
  37 C          16-byte aligned         16-byte unaligned        for cpu?
  38 C AMD K8,K9      ?                       ?
  39 C AMD K10        1.85  (1.635)           1.9   (1.67)           Y
  40 C AMD bd1        1.82  (1.75)            1.82  (1.75)           Y
  41 C AMD bobcat     4.5                     4.5
  42 C Intel P4       3.6   (3.125)           3.6   (3.125)          Y
  43 C Intel core2    2.05  (1.67)            2.55  (1.75)
  44 C Intel NHM      2.05  (1.875)           2.6   (2.25)
  45 C Intel SBR      1.55  (1.44)            2     (1.57)           Y
  46 C Intel atom     ?                       ?
  47 C VIA nano       2.5   (2.5)             2.5   (2.5)            Y
  48
  49 C We try to do as many 16-byte operations as possible.  The top-most and
  50 C bottom-most writes might need 8-byte operations.  All other writes use
  51 C 16-byte operations; we read with both 8-byte and 16-byte operations.
  52
  53 C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
  54 C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
  55 C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
  56
  57 C This is not yet great code:
  58 C   (1) The unaligned case makes too many reads.
  59 C   (2) We should do some unrolling, at least 2-way.
  60 C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
  61 C Nano.
  62
  63 C INPUT PARAMETERS
  64 define(`rp',  `%rdi')              C destination limb pointer (all stores go here)
  65 define(`ap',  `%rsi')              C source limb pointer (all loads come from here)
  66 define(`n',   `%rdx')              C limb count, used as an index scaled by 8
  67 define(`cnt', `%rcx')              C shift count in bits; the code below names %rcx directly
  68 C mpn_lshiftc -- shift {ap,n} left by cnt bits and store the one's
     C complement of the result at {rp,n}.
     C
     C Each result limb is ~((ap[i] << cnt) | (ap[i-1] >> (64-cnt))); for limb 0
     C there is no ap[-1] term.  %rax is left holding ap[n-1] >> (64-cnt), the
     C bits shifted out at the top, not complemented.  Limbs are processed from
     C the most significant end downwards.
     C
     C Register roles: %xmm4 = cnt, %xmm5 = 64-cnt, %xmm7 = all ones (used for
     C the complement), %xmm0/%xmm1 = data, %rcx = scratch once cnt has been
     C copied out.
     C
     C NOTE(review): nothing here checks n or cnt; presumably callers guarantee
     C n >= 1 and 1 <= cnt <= 63 -- confirm against the mpn interface.
     C NOTE(review): only bit 3 of rp+8n and ap+8n is tested, so the 16-byte
     C accesses (movdqa, punpcklqdq from memory) assume rp and ap are 8-byte
     C aligned -- confirm.
     C NOTE(review): %xmm7 is overwritten and not restored; that is fine for the
     C System V ABI which the register defines suggest, but %xmm7 is callee-saved
     C in the Microsoft x64 ABI.
  69 ASM_START()
  70         TEXT
  71         ALIGN(16)
  72 PROLOGUE(mpn_lshiftc)
  73         movd    R32(%rcx), %xmm4        C xmm4 = cnt, the left shift count
  74         mov     $64, R32(%rax)
  75         sub     R32(%rcx), R32(%rax)
  76         movd    R32(%rax), %xmm5        C xmm5 = 64-cnt, the right shift count
  77
  78         neg     R32(%rcx)               C cl = -cnt
  79         mov     -8(ap,n,8), %rax        C rax = ap[n-1]
  80         shr     R8(%rcx), %rax          C rax = ap[n-1] >> (64-cnt), cl is used mod 64
  81
  82         pcmpeqb %xmm7, %xmm7            C set to 111...111
  83
  84         cmp     $2, n
  85         jle     L(le2)                  C n <= 2 is handled without the loops
  86
  87         lea     (rp,n,8), R32(%rcx)     C only the low address bits are needed
  88         test    $8, R8(%rcx)
  89         je      L(rp_aligned)           C bit 3 of rp+8n clear: no fix-up limb
  90
  91 C Do one initial limb in order to make rp aligned
  92         movq    -8(ap,n,8), %xmm0       C xmm0 = ap[n-1]
  93         movq    -16(ap,n,8), %xmm1      C xmm1 = ap[n-2], valid since n > 2 here
  94         psllq   %xmm4, %xmm0
  95         psrlq   %xmm5, %xmm1
  96         por     %xmm1, %xmm0
  97         pxor    %xmm7, %xmm0            C complement
  98         movq    %xmm0, -8(rp,n,8)       C 8-byte store of rp[n-1]
  99         dec     n                       C top limb is done
 100
 101 L(rp_aligned):
 102         lea     (ap,n,8), R32(%rcx)
 103         test    $8, R8(%rcx)
 104         je      L(aent)                 C bit 3 of ap+8n clear as well
 105         jmp     L(uent)
 106 C *****************************************************************************
 107
 108 C Handle the case when ap != rp (mod 16).
 109 C In the comments below n is the value before the sub at L(uent).
 110         ALIGN(16)
 111 L(utop):movq    (ap,n,8), %xmm1         C xmm1 = ap[n-2]
 112         punpcklqdq  8(ap,n,8), %xmm1    C xmm1 = B*ap[n-1] + ap[n-2]
 113         movdqa  -8(ap,n,8), %xmm0       C xmm0 = B*ap[n-2] + ap[n-3]
 114         psllq   %xmm4, %xmm1
 115         psrlq   %xmm5, %xmm0
 116         por     %xmm1, %xmm0
 117         pxor    %xmm7, %xmm0            C complement
 118         movdqa  %xmm0, (rp,n,8)         C store rp[n-2] and rp[n-1]
 119 L(uent):sub     $2, n
 120         ja      L(utop)                 C more than 2 limbs were left
 121
 122         jne     L(end8)                 C exactly 1 limb was left
 123
 124         movq    (ap), %xmm1             C exactly 2 limbs left: xmm1 = ap[0]
 125         pxor    %xmm0, %xmm0
 126         punpcklqdq  %xmm1, %xmm0        C xmm0 = B*ap[0], zero shifts into limb 0
 127         punpcklqdq  8(ap), %xmm1        C xmm1 = B*ap[1] + ap[0]
 128         psllq   %xmm4, %xmm1
 129         psrlq   %xmm5, %xmm0
 130         por     %xmm1, %xmm0
 131         pxor    %xmm7, %xmm0            C complement
 132         movdqa  %xmm0, (rp)             C store rp[0] and rp[1]
 133         ret
 134 C *****************************************************************************
 135
 136 C Handle the case when ap = rp (mod 16).
 137 C In the comments below n is the value before the sub at L(aent).
 138         ALIGN(16)
 139 L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
 140         movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
 141         punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[n-2] + ap[n-3]
 142         psllq   %xmm4, %xmm0
 143         psrlq   %xmm5, %xmm1
 144         por     %xmm1, %xmm0
 145         pxor    %xmm7, %xmm0            C complement
 146         movdqa  %xmm0, (rp,n,8)         C store rp[n-2] and rp[n-1]
 147 L(aent):sub     $2, n
 148         ja      L(atop)                 C more than 2 limbs were left
 149
 150         jne     L(end8)                 C exactly 1 limb was left
 151
 152         movdqa  (ap), %xmm0             C exactly 2 limbs left: xmm0 = B*ap[1] + ap[0]
 153         pxor    %xmm1, %xmm1
 154         punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[0], zero shifts into limb 0
 155         psllq   %xmm4, %xmm0
 156         psrlq   %xmm5, %xmm1
 157         por     %xmm1, %xmm0
 158         pxor    %xmm7, %xmm0            C complement
 159         movdqa  %xmm0, (rp)             C store rp[0] and rp[1]
 160         ret
 161 C *****************************************************************************
 162
 163         ALIGN(16)
 164 L(le2): jne     L(end8)                 C flags still from cmp $2, n: taken when n != 2
 165
 166         movq    8(ap), %xmm0            C n = 2: xmm0 = ap[1]
 167         movq    (ap), %xmm1             C xmm1 = ap[0]
 168         psllq   %xmm4, %xmm0
 169         psrlq   %xmm5, %xmm1
 170         por     %xmm1, %xmm0
 171         pxor    %xmm7, %xmm0            C complement
 172         movq    %xmm0, 8(rp)            C 8-byte store of rp[1], then fall through
 173
 174 L(end8):movq    (ap), %xmm0             C lowest limb: rp[0] = ~(ap[0] << cnt)
 175         psllq   %xmm4, %xmm0
 176         pxor    %xmm7, %xmm0
 177         movq    %xmm0, (rp)
 178         ret
 179 EPILOGUE()