mpn/x86_64/dive_1.asm

   1 dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C            cycles/limb
  24 C K8,K9:        10
  25 C K10:          10
  26 C P4:           33
  27 C P6 core2:     13.25
  28 C P6 corei7:    14
  29 C P6 atom:      42
  30
   31 C A quick adaptation of the 32-bit K7 code.
  32
  33
  34 C INPUT PARAMETERS
  35 C rp            rdi
  36 C up            rsi
  37 C n             rdx
  38 C divisor       rcx
  39
   40 ASM_START()
   41         TEXT
   42         ALIGN(16)
   43 PROLOGUE(mpn_divexact_1)
   44         push    %rbx                    C rbx is saved here and restored before each ret
   45         C {rp,n} = {up,n} / divisor, low limb first; the division is taken to be exact
   46         mov     %rcx, %rax              C rax = divisor
   47         xor     R32(%rcx), R32(%rcx)    C shift count, stays 0 unless divisor is even
   48         mov     %rdx, %r8               C r8 = n
   49
   50         bt      $0, R32(%rax)           C CF = low bit of divisor
   51         jnc     L(evn)                  C skip bsfq unless divisor is even
   52
   53 L(odd): mov     %rax, %rbx              C rbx = d, the divisor with any low zero bits shifted out
   54         shr     R32(%rax)               C eax = (low 32 bits of d) >> 1
   55         and     $127, R32(%rax)         C d/2, 7 bits
   56         C rdx = address of binvert_limb_table, loaded through the GOT in PIC builds
   57 ifdef(`PIC',`
   58         mov     binvert_limb_table@GOTPCREL(%rip), %rdx
   59 ',`
   60         movabs  $binvert_limb_table, %rdx
   61 ')
   62
   63         movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits
   64
   65         mov     %rbx, %r11              C d without twos
   66         C Three rounds of inv = 2*inv - inv*inv*d widen the inverse 8 -> 16 -> 32 -> 64 bits
   67         lea     (%rax,%rax), R32(%rdx)  C 2*inv
   68         imul    R32(%rax), R32(%rax)    C inv*inv
   69         imul    R32(%rbx), R32(%rax)    C inv*inv*d
   70         sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits
   71
   72         lea     (%rdx,%rdx), R32(%rax)  C 2*inv
   73         imul    R32(%rdx), R32(%rdx)    C inv*inv
   74         imul    R32(%rbx), R32(%rdx)    C inv*inv*d
   75         sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits
   76
   77         lea     (%rax,%rax), %r10       C 2*inv
   78         imul    %rax, %rax              C inv*inv
   79         imul    %rbx, %rax              C inv*inv*d
   80         sub     %rax, %r10              C inv = 2*inv - inv*inv*d, 64 bits
   81         C Point rsi and rdi at the array ends so the negative index in r8 can count up to zero
   82         lea     (%rsi,%r8,8), %rsi      C up end
   83         lea     -8(%rdi,%r8,8), %rdi    C rp end, address of the last rp limb
   84         neg     %r8                     C -n
   85
   86         mov     (%rsi,%r8,8), %rax      C up[0]
   87
   88         inc     %r8                     C r8 = 1-n
   89         jz      L(one)                  C n == 1, single limb, no loop needed
   90
   91         mov     (%rsi,%r8,8), %rdx      C up[1]
   92
   93         shrd    R8(%rcx), %rdx, %rax    C rax = low 64 bits of (up[1]:up[0]) >> shift
   94
   95         xor     R32(%rbx), R32(%rbx)    C carry bit = 0, d is no longer needed in rbx
   96         jmp     L(ent)                  C enter the loop at the multiply by the inverse
   97         C Even divisor. NOTE(review): bsf leaves rcx undefined for a zero source, so divisor is presumably nonzero -- confirm with callers
   98 L(evn): bsf     %rax, %rcx              C rcx = index of the lowest set bit of divisor
   99         shr     R8(%rcx), %rax          C rax = divisor >> shift
  100         jmp     L(odd)
  101
  102         ALIGN(8)
  103 L(top):
  104         C rax   q
  105         C rbx   carry bit, 0 or 1
  106         C rcx   shift
  107         C rdx   scratch, receives the carry limb (high half of q*d) from the mul
  108         C rsi   up end
  109         C rdi   rp end
  110         C r8    counter, limbs, negative
  111         C r10   d^(-1) mod 2^64
  112         C r11   d, shifted down
  113
  114         mul     %r11                    C carry limb in rdx     0 10
  115         mov     -8(%rsi,%r8,8), %rax    C up[n-1+r8], the limb being divided
  116         mov     (%rsi,%r8,8), %r9       C up[n+r8], next limb up, supplies the shifted-in bits
  117         shrd    R8(%rcx), %r9, %rax     C rax = low 64 bits of (r9:rax) >> shift
  118         nop                             C NOTE(review): no architectural effect, presumably scheduling padding
  119         sub     %rbx, %rax              C apply carry bit
  120         setc    %bl                     C bl = borrow out of the carry bit subtract
  121         sub     %rdx, %rax              C apply carry limb      5
  122         adc     $0, %rbx                C rbx += second borrow  6
  123 L(ent): imul    %r10, %rax              C q = rax * inverse     6
  124         mov     %rax, (%rdi,%r8,8)      C rp[n-1+r8] = q
  125         inc     %r8                     C advance to the next limb
  126         jnz     L(top)                  C loop until r8 reaches 0
  127         C Top limb: there is no higher limb to shift in, so a plain shr replaces the shrd
  128         mul     %r11                    C carry limb in rdx
  129         mov     -8(%rsi), %rax          C up high limb
  130         shr     R8(%rcx), %rax          C shift the high limb down, zeros come in at the top
  131         sub     %rbx, %rax              C apply carry bit
  132         sub     %rdx, %rax              C apply carry limb
  133         imul    %r10, %rax              C q for the high limb
  134         mov     %rax, (%rdi)            C rp[n-1] = q
  135         pop     %rbx                    C restore rbx
  136         ret
  137         C n == 1: no carry can come in, so the quotient is (up[0] >> shift) * inverse
  138 L(one): shr     R8(%rcx), %rax
  139         imul    %r10, %rax
  140         mov     %rax, (%rdi)            C rp[0] = q
  141         pop     %rbx                    C restore rbx
  142         ret
  143
  144 EPILOGUE()