mpn/x86_64/mod_1_1.asm

   1 dnl  AMD64 mpn_mod_1_1p
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
   4
   5 dnl  Copyright 2009-2012 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C            cycles/limb
  36 C AMD K8,K9      6
  37 C AMD K10        6
  38 C Intel P4      26
  39 C Intel core2   12.5
  40 C Intel NHM     11.3
  41 C Intel SBR      8.4    (slowdown, old code took 8.0)
  42 C Intel atom    26
  43 C VIA nano      13
  44
  45 define(`B2mb',   `%r10')      C B2modb - b, computed just before the main loop
  46 define(`B2modb', `%r11')      C loaded from 24(pre) in mpn_mod_1_1p
  47 define(`ap',     `%rdi')      C first argument: pointer to the source limbs
  48 define(`n',      `%rsi')      C second argument: limb count, later the loop index
  49 define(`pre',    `%r8')       C copy of the fourth argument, the precomputed table
  50 define(`b',      `%rbx')      C copy of the third argument, the divisor
  51
  52 define(`r0',     `%rbp') C r1 kept in %rax
  53 define(`r2',     `%rcx')  C kept negated. Also used as shift count
  54 define(`t0',     `%r9')       C scratch register
  55
  56 C mp_limb_t
  57 C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
  58 C                       %rdi         %rsi         %rdx                %rcx
  59 C The pre array contains bi, cnt, B1modb, B2modb
  60 C Note: This implementation needs B1modb only when cnt > 0
  61
  62 C The iteration is almost as follows,
  63 C
  64 C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
  65 C
  66 C where r2 is a single bit represented as a mask. But to make sure that the
  67 C result fits in two limbs and a bit, carry from the addition
  68 C
  69 C   r_0 + r_2 B2modb
  70 C
  71 C is handled specially. On carry, we subtract b to cancel the carry,
  72 C and we use instead the value
  73 C
  74 C   r_0 + B2mb (mod B)
  75 C
  76 C This addition can be issued early since it doesn't depend on r2, and it is
  77 C the source of the cmov in the loop.
  78 C
  79 C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
  80
  81 ABI_SUPPORT(DOS64)
  82 ABI_SUPPORT(STD64)
  83
  84 ASM_START()
  85         TEXT
  86         ALIGN(16)
     C mpn_mod_1_1p: reduce the n-limb number at ap modulo b, using the values
     C precomputed in pre[] (bi, cnt, B1modb, B2modb), and return the remainder
     C in %rax.
     C
     C Register use: %rax carries r1, r0 is the low limb of the running value,
     C and r2 is a carry mask (0 or all ones).  %rcx, which holds r2 in the loop,
     C is reused for the shift count cnt from L(reduce_two) onwards.
     C
     C NOTE(review): b is never shifted in this function, yet the final
     C reduction works on a value shifted left by cnt, so callers presumably
     C pass b already normalized (b << cnt) -- confirm against the callers.
     C NOTE(review): with n = 1 the load from -16(ap, n, 8) reads below ap, so
     C n >= 2 appears to be a precondition -- confirm.
  87 PROLOGUE(mpn_mod_1_1p)
  88         FUNC_ENTRY(4)                   C entry macro, defined outside this file
  89         push    %rbp                    C %rbp is used as r0
  90         push    %rbx                    C %rbx is used as b
  91         mov     %rdx, b                 C frees %rdx for the mul results
  92         mov     %rcx, pre               C frees %rcx for r2 and the shift count
  93
  94         mov     -8(ap, n, 8), %rax      C r1 = ap[n-1], the most significant limb
  95         cmp     $3, n
  96         jnc     L(first)                C n >= 3: at least one folding step
  97         mov     -16(ap, n, 8), r0       C n < 3: r0 = ap[n-2], nothing to fold
  98         jmp     L(reduce_two)
  99
 100 L(first):
 101         C First iteration, no r2
 102         mov     24(pre), B2modb         C B2modb = pre[3]
 103         mul     B2modb                  C %rdx:%rax = ap[n-1] * B2modb
 104         mov     -24(ap, n, 8), r0       C r0 = ap[n-3]
 105         add     %rax, r0                C r0 += low product limb
 106         mov     -16(ap, n, 8), %rax     C load ap[n-2]; mov keeps the carry flag
 107         adc     %rdx, %rax              C r1 = ap[n-2] + high product limb + carry
 108         sbb     r2, r2                  C r2 = 0 or all ones, from the carry out
 109         sub     $4, n                   C n is now the index of the next limb
 110         jc      L(reduce_three)         C borrow: n was 3, no limbs left to fold
 111
 112         mov     B2modb, B2mb
 113         sub     b, B2mb                 C B2mb = B2modb - b
 114
 115         ALIGN(16)
 116 L(top): and     B2modb, r2              C r2 = B2modb if the mask was set, else 0
 117         lea     (B2mb, r0), t0          C t0 = r0 + B2mb, the value used on carry
 118         mul     B2modb                  C %rdx:%rax = r1 * B2modb
 119         add     r0, r2                  C r2 += r0; the carry feeds the cmovc below
 120         mov     (ap, n, 8), r0          C r0 = next lower limb; flags unchanged
 121         cmovc   t0, r2                  C on carry use r0 + B2mb instead
 122         add     %rax, r0                C r0 += low product limb
 123         mov     r2, %rax
 124         adc     %rdx, %rax              C r1 = r2 + high product limb + carry
 125         sbb     r2, r2                  C new r2 mask from the carry out
 126         sub     $1, n
 127         jnc     L(top)                  C loop until n drops below zero
 128
 129 L(reduce_three):
 130         C Eliminate r2
 131         and     b, r2                   C r2 = b if the mask was set, else 0
 132         sub     r2, %rax                C fold r2 into r1 by subtracting b
 133
 134 L(reduce_two):
 135         mov     8(pre), R32(%rcx)       C cnt = pre[1]
 136         test    R32(%rcx), R32(%rcx)
 137         jz      L(normalized)           C cnt = 0: no shifting needed
 138
 139         C Unnormalized, use B1modb to reduce to size < B (b+1)
 140         mulq    16(pre)                 C %rdx:%rax = r1 * B1modb, B1modb = pre[2]
 141         xor     t0, t0
 142         add     %rax, r0
 143         adc     %rdx, t0                C t0:r0 = r0 + r1 * B1modb
 144         mov     t0, %rax
 145
 146         C Left-shift to normalize
 147 ifdef(`SHLD_SLOW',`
     C Same result as the shld in the other branch, built from shl, shr and or.
     C The second neg restores cnt in %rcx for the final shr at L(ok).
 148         shl     R8(%rcx), %rax
 149         mov     r0, t0
 150         neg     R32(%rcx)
 151         shr     R8(%rcx), t0
 152         or      t0, %rax
 153         neg     R32(%rcx)
 154 ',`
 155         shld    R8(%rcx), r0, %rax
 156 ')
 157         shl     R8(%rcx), r0
 158         jmp     L(udiv)
 159
 160 L(normalized):
 161         mov     %rax, t0
 162         sub     b, t0
 163         cmovnc  t0, %rax                C r1 -= b when r1 >= b
 164
 165 L(udiv):
     C Reduce the two-limb value %rax:r0 modulo b using the inverse bi = pre[0].
 166         lea     1(%rax), t0             C t0 = r1 + 1
 167         mulq    (pre)                   C %rdx:%rax = r1 * bi
 168         add     r0, %rax
 169         adc     t0, %rdx                C %rdx:%rax += (r1 + 1):r0; %rdx = quotient estimate
 170         imul    b, %rdx
 171         sub     %rdx, r0                C r0 = r0 - q * b, the candidate remainder
 172         cmp     r0, %rax                C borrow when candidate > low quotient limb
 173         lea     (b, r0), %rax           C %rax = candidate + b; lea keeps the flags
 174         cmovnc  r0, %rax                C no borrow: keep the candidate unchanged
 175         cmp     b, %rax
 176         jnc     L(fix)                  C still >= b: subtract b once more
 177 L(ok):  shr     R8(%rcx), %rax          C undo the normalization shift by cnt
 178
 179         pop     %rbx
 180         pop     %rbp
 181         FUNC_EXIT()
 182         ret
 183 L(fix): sub     b, %rax
 184         jmp     L(ok)
 185 EPILOGUE()
 186
 187         ALIGN(16)
     C mpn_mod_1_1p_cps: fill the four-limb table read by mpn_mod_1_1p.
     C
     C   %rdi  pointer to the table (kept in %rbx across the call)
     C   %rsi  divisor b; bsr leaves cnt undefined for b = 0, so b must be nonzero
     C
     C Table layout written here:
     C    0(table)  bi      result of mpn_invert_limb (b << cnt)
     C    8(table)  cnt     number of leading zero bits of b
     C   16(table)  B1modb  written only when cnt > 0
     C   24(table)  B2modb
     C
     C The DOS64 ABI requires the caller to reserve 32 bytes of shadow space
     C above the return address of every call.  Without it the callee would be
     C free to overwrite the saved %r12, %rbx and %rbp sitting at the top of the
     C stack, so the space is reserved around the call below.  32 is a multiple
     C of 16, hence the stack alignment at the call is unchanged.
 188 PROLOGUE(mpn_mod_1_1p_cps)
 189         FUNC_ENTRY(2)
 190         push    %rbp
 191         bsr     %rsi, %rcx              C index of the highest set bit of b
 192         push    %rbx
 193         mov     %rdi, %rbx              C table pointer must survive the call
 194         push    %r12
 195         xor     $63, R32(%rcx)          C cnt = 63 - bit index
 196         mov     %rsi, %r12
 197         mov     R32(%rcx), R32(%rbp)    C cnt must survive the call
 198         sal     R8(%rcx), %r12          C %r12 = b << cnt, the normalized divisor
 199 IFSTD(` mov     %r12, %rdi      ')      C pass parameter
 200 IFDOS(` mov     %r12, %rcx      ')      C pass parameter
     IFDOS(` sub     $32, %rsp       ')      C reserve shadow space for the callee
 201         CALL(   mpn_invert_limb)
     IFDOS(` add     $32, %rsp       ')      C release the shadow space
 202         neg     %r12                    C %r12 = -(b << cnt)
 203         mov     %r12, %r8               C keep a copy for B1modb
 204         mov     %rax, (%rbx)            C store bi
 205         mov     %rbp, 8(%rbx)           C store cnt
 206         imul    %rax, %r12              C %r12 = -(b << cnt) * bi, low limb
 207         mov     %r12, 24(%rbx)          C store B2modb
 208         mov     R32(%rbp), R32(%rcx)
 209         test    R32(%rcx), R32(%rcx)
 210         jz      L(z)                    C cnt = 0: B1modb is not needed, skip it
 211
 212         mov     $1, R32(%rdx)
 213 ifdef(`SHLD_SLOW',`
 214         C Destroys %rax, unlike shld. Otherwise, we could do B1modb
 215         C before B2modb, and get rid of the move %r12, %r8 above.
 216
 217         shl     R8(%rcx), %rdx
 218         neg     R32(%rcx)
 219         shr     R8(%rcx), %rax
 220         or      %rax, %rdx
 221         neg     R32(%rcx)
 222 ',`
 223         shld    R8(%rcx), %rax, %rdx
 224 ')
     C Here %rdx = (1 << cnt) | (bi >> (64 - cnt)).
 225         imul    %rdx, %r8               C %r8 = -(b << cnt) * %rdx, low limb
 226         shr     R8(%rcx), %r8
 227         mov     %r8, 16(%rbx)           C store B1modb
 228 L(z):
 229         pop     %r12
 230         pop     %rbx
 231         pop     %rbp
 232         FUNC_EXIT()
 233         ret
 234 EPILOGUE()
 235 ASM_END()