mpn/x86_64/mod_34lsub1.asm

   1 dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
   2
   3 dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
   4 dnl  Inc.
   5 dnl
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or
   9 dnl  modify it under the terms of the GNU Lesser General Public License as
  10 dnl  published by the Free Software Foundation; either version 3 of the
  11 dnl  License, or (at your option) any later version.
  12 dnl
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 dnl  Lesser General Public License for more details.
  17 dnl
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23
  24 C            cycles/limb
  25 C K8,K9:         1.0
  26 C K10:           1.12
  27 C P4:            3.25
  28 C P6-15 (Core2): 1.5
  29 C P6-28 (Atom):  2.5
  30
  31
  32 C INPUT PARAMETERS
  33 C up    rdi
  34 C n     rsi
  35
  36 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
  37
  38 C TODO
  39 C  * Apply the movzwl tricks to the x86/k7 code
  40 C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
  41 C    sbb to placate Pentium4.
  42 C  * More unrolling and/or index addressing could bring time to under 1 c/l
  43 C    for Athlon64, approaching 0.67 c/l seems possible.
   44 C  * There are recurrences on the carry registers (r8, r9, r10) that might
  45 C    be the limiting factor for the Pentium4 speed.  Splitting these into 6
  46 C    registers would help.
  47 C  * For ultimate Athlon64 performance, a sequence like this might be best.
  48 C    It should reach 0.5 c/l (limited by L1 cache bandwidth).
  49 C
  50 C       add     (%rdi), %rax
  51 C       adc     8(%rdi), %rcx
  52 C       adc     16(%rdi), %rdx
  53 C       adc     $0, %r8
  54 C       add     24(%rdi), %rax
  55 C       adc     32(%rdi), %rcx
  56 C       adc     40(%rdi), %rdx
  57 C       adc     $0, %r8
  58 C       ...
  59
  60
   61 ASM_START()
   62         TEXT
   63         ALIGN(32)
   64 PROLOGUE(mpn_mod_34lsub1)
   65 C Folds {up,n} into rax, congruent mod 2^48-1 but not fully reduced.
   66         mov     $0x0000FFFFFFFFFFFF, %r11  C r11 = 2^48-1, low 48-bit mask
   67
   68         sub     $2, %rsi                C rsi = n-2, flags feed ja and jb
   69         ja      L(gt2)                  C n > 2
   70
   71         mov     (%rdi), %rax            C rax = src[0]; mov keeps the flags
   72         nop                             C no effect; presumably padding (TODO confirm)
   73         jb      L(1)                    C borrow from the sub: n < 2, return src[0]
   74 C n == 2.  src[1] has weight 2^64 = 2^16 mod 2^48-1.
   75         mov     8(%rdi), %rsi           C rsi = src[1]
   76         mov     %rax, %rdx
   77         shr     $48, %rax               C src[0] high 16 bits
   78
   79         and     %r11, %rdx              C src[0] low 48 bits
   80         add     %rdx, %rax
   81         mov     %esi, %edx              C src[1] low 32 bits, zero extended
   82
   83         shr     $32, %rsi               C src[1] high, weight 2^96 = 1
   84         add     %rsi, %rax
   85
   86         shl     $16, %rdx               C src[1] low, weight 2^16
   87         add     %rdx, %rax
   88
   89 L(1):   ret
   90 C n > 2.  rax, rcx, rdx sum the limbs at index 0, 1, 2 mod 3, and r10, r8,
   91 C r9 count the carries out of rax, rcx, rdx respectively.
   92         ALIGN(16)
   93 L(gt2): xor     %eax, %eax              C 32-bit xor clears all of rax
   94         xor     %ecx, %ecx
   95         xor     %edx, %edx
   96         xor     %r8, %r8
   97         xor     %r9, %r9
   98         xor     %r10, %r10
   99 C Six limbs per pass, with an exit test after each group of three.
  100 L(top): add     (%rdi), %rax
  101         adc     $0, %r10
  102         add     8(%rdi), %rcx
  103         adc     $0, %r8
  104         add     16(%rdi), %rdx
  105         adc     $0, %r9
  106
  107         sub     $3,%rsi                 C rsi = limbs left - 2
  108         jng     L(end)                  C at most 2 left, rdi not advanced
  109
  110         add     24(%rdi), %rax
  111         adc     $0, %r10
  112         add     32(%rdi), %rcx
  113         adc     $0, %r8
  114         add     40(%rdi), %rdx
  115         lea     48(%rdi), %rdi          C advance; lea leaves CF for the adc
  116         adc     $0, %r9
  117
  118         sub     $3,%rsi                 C rsi = limbs left - 2
  119         jg      L(top)                  C more than 2 left
  120 C rdi is already 48 ahead here; step back 24 so that 24(%rdi) is the next
  121 C unread limb, the same as when L(end) is reached by the jng.
  122         add     $-24, %rdi
  123 L(end): add     %r9, %rax               C carries out of rdx, 2^192 = 1
  124         adc     %r10, %rcx              C carries out of rax
  125         adc     %r8, %rdx               C carries out of rcx
  126 C CF stays live up to the sbb: inc/dec/mov keep it, each adc renews it.
  127         inc     %rsi                    C rsi = limbs left - 1
  128         mov     $0x1, %r10d             C weight of a carry out of rdx
  129         js      L(combine)              C no limbs left
  130
  131         mov     $0x10000, %r10d         C carry out of rax: 2^64 = 2^16
  132         adc     24(%rdi), %rax          C extra 0mod3 limb plus carry in
  133         dec     %rsi
  134         js      L(combine)              C exactly one limb was left
  135
  136         adc     32(%rdi), %rcx          C extra 1mod3 limb plus carry in
  137         mov     $0x100000000, %r10      C carry out of rcx: 2^128 = 2^32
  138 C Fold the sums: 0mod3 has weight 1, 1mod3 weight 2^16, 2mod3 weight 2^32.
  139 L(combine):
  140         sbb     %rsi, %rsi              C carry: rsi = -CF
  141         mov     %rax, %rdi              C 0mod3
  142         shr     $48, %rax               C 0mod3 high
  143
  144         and     %r10, %rsi              C carry masked to its weight
  145         and     %r11, %rdi              C 0mod3 low
  146         mov     %ecx, %r10d             C 1mod3 low 32 bits
  147
  148         add     %rsi, %rax              C apply carry
  149         shr     $32, %rcx               C 1mod3 high
  150
  151         add     %rdi, %rax              C apply 0mod3 low
  152         movzwl  %dx, %edi               C 2mod3 low 16 bits
  153         shl     $16, %r10               C 1mod3 low
  154
  155         add     %rcx, %rax              C apply 1mod3 high
  156         shr     $16, %rdx               C 2mod3 high
  157
  158         add     %r10, %rax              C apply 1mod3 low
  159         shl     $32, %rdi               C 2mod3 low
  160
  161         add     %rdx, %rax              C apply 2mod3 high
  162         add     %rdi, %rax              C apply 2mod3 low
  163
  164         ret
  165 EPILOGUE()