mpn/x86_64/bdiv_q_1.asm

   1 dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
   2 dnl  1-limb divisor, returning quotient only.
   3
   4 dnl  Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
   5 dnl  Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of the GNU Lesser General Public License as published
  11 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  12 dnl  your option) any later version.
  13
  14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
  15 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  16 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  17 dnl  License for more details.
  18
  19 dnl  You should have received a copy of the GNU Lesser General Public License
  20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  21
  22 include(`../config.m4')
  23
  24
  25 C            cycles/limb
  26 C K8,K9:        10
  27 C K10:          10
  28 C P4:           33
  29 C P6 core2:     13.25
  30 C P6 corei7:    14
  31 C P6 atom:      42
  32
  33
  34 C INPUT PARAMETERS
  35 C rp            rdi
  36 C up            rsi
  37 C n             rdx
  38 C d             rcx
  39 C di            r8      just mpn_pi1_bdiv_q_1
  40 C shift         r9      just mpn_pi1_bdiv_q_1
  41
  42
  43 ASM_START()
  44         TEXT
  45         ALIGN(16)
  46 PROLOGUE(mpn_bdiv_q_1)
             C Entry point taking the raw divisor d in rcx.  This routine only
             C prepares state: it moves any low zero bits of d into a shift
             C count, computes an inverse of the odd part of d, and then jumps
             C to L(com) inside mpn_pi1_bdiv_q_1, which does the limb work,
             C pops rbx and returns.
             C
             C State handed to L(com):
             C   rdi   rp (untouched here)
             C   rsi   up (untouched here)
             C   rcx   shift count, 0 when d is odd
             C   r8    inverse of the odd part of d, 64 bits
             C   r10   n
             C   r11   odd part of d
             C
             C NOTE(review): bsf leaves its destination undefined when the
             C source is zero, so this presumably relies on d != 0 -- confirm
             C against the callers.
  47         push    %rbx                    C popped on the exit paths after L(com)
  48
  49         mov     %rcx, %rax              C d
  50         xor     R32(%rcx), R32(%rcx)    C shift count
  51         mov     %rdx, %r10              C n
  52
  53         bt      $0, R32(%rax)           C CF = bit 0 of d
  54         jnc     L(evn)                  C skip bsfq unless divisor is even
  55
             C Here rax is odd: either d itself or d with its twos shifted out.
  56 L(odd): mov     %rax, %rbx              C keep odd d for the Newton rounds
  57         shr     R32(%rax)
  58         and     $127, R32(%rax)         C d/2, 7 bits
  59
             C Put the address of binvert_limb_table (not defined in the code
             C shown here) in rdx: loaded through the GOT when PIC is defined,
             C as an absolute immediate otherwise.
  60 ifdef(`PIC',`
  61         mov     binvert_limb_table@GOTPCREL(%rip), %rdx
  62 ',`
  63         movabs  $binvert_limb_table, %rdx
  64 ')
  65
  66         movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits
  67
  68         mov     %rbx, %r11              C d without twos
  69
             C Three rounds of inv = 2*inv - inv*inv*d.  Per the comments on
             C the sub instructions each round doubles the number of valid
             C bits: 8 -> 16 -> 32 -> 64.  The first two rounds work in 32-bit
             C registers, the last one in the full 64 bits and leaves the
             C result in r8.
  70         lea     (%rax,%rax), R32(%rdx)  C 2*inv
  71         imul    R32(%rax), R32(%rax)    C inv*inv
  72         imul    R32(%rbx), R32(%rax)    C inv*inv*d
  73         sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits
  74
  75         lea     (%rdx,%rdx), R32(%rax)  C 2*inv
  76         imul    R32(%rdx), R32(%rdx)    C inv*inv
  77         imul    R32(%rbx), R32(%rdx)    C inv*inv*d
  78         sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits
  79
  80         lea     (%rax,%rax), %r8        C 2*inv
  81         imul    %rax, %rax              C inv*inv
  82         imul    %rbx, %rax              C inv*inv*d
  83         sub     %rax, %r8               C inv = 2*inv - inv*inv*d, 64 bits
  84
  85         jmp     L(com)                  C continue in mpn_pi1_bdiv_q_1
  86
             C Even divisor: rcx = index of the lowest set bit of d, then shift
             C those zero bits out so that rax is odd before rejoining L(odd).
  87 L(evn): bsf     %rax, %rcx
  88         shr     R8(%rcx), %rax
  89         jmp     L(odd)
  90 EPILOGUE()
  91
  92 PROLOGUE(mpn_pi1_bdiv_q_1)
             C Entry point for callers that already have the inverse and the
             C shift: d arrives in rcx, di in r8 and shift in r9 (see INPUT
             C PARAMETERS at the top of the file).  mpn_bdiv_q_1 joins at
             C L(com) with the same values already placed in r11, r8, rcx
             C and r10, and with rbx already pushed.
             C
             C For each limb, from low to high, the code forms the limb of
             C up >> shift, subtracts the carry bit and the carry limb left by
             C the previous step, multiplies by di and stores the product as
             C the quotient limb at rp.
             C
             C NOTE(review): d is used exactly as passed, so it is presumably
             C expected to be the odd divisor that di was computed for --
             C confirm against the callers.
             C NOTE(review): n = 0 would make the dec below wrap around, so
             C n >= 1 is presumably required -- confirm against the callers.
  93         push    %rbx
  94
  95         mov     %rcx, %r11              C d
  96         mov     %rdx, %r10              C n
  97         mov     %r9, %rcx               C shift
  98 L(com):
  99         mov     (%rsi), %rax            C up[0]
 100
 101         dec     %r10                    C n-1
 102         jz      L(one)                  C n was 1, single limb case
 103
 104         mov     8(%rsi), %rdx           C up[1]
 105         lea     (%rsi,%r10,8), %rsi     C up end, &up[n-1]
 106         lea     (%rdi,%r10,8), %rdi     C rp end, &rp[n-1]
 107         neg     %r10                    C -(n-1), counts up towards 0
 108
 109         shrd    R8(%rcx), %rdx, %rax    C low limb of up >> shift
 110
 111         xor     R32(%rbx), R32(%rbx)    C no carry bit yet
 112         jmp     L(ent)                  C first limb has nothing to subtract
 113
 114         ALIGN(8)
 115 L(top):
 116         C rax   q
 117         C rbx   carry bit, 0 or 1
 118         C rcx   shift
 119         C rdx   scratch, carry limb after the mul
 120         C rsi   up end
 121         C rdi   rp end
 122         C r10   counter, limbs, negative
             C r8    di
             C r9    scratch, limb above the current one
             C r11   d
 123
 124         mul     %r11                    C carry limb in rdx
 125         mov     (%rsi,%r10,8), %rax
 126         mov     8(%rsi,%r10,8), %r9
 127         shrd    R8(%rcx), %r9, %rax     C current limb of up >> shift
             C NOTE(review): the purpose of the nop is not evident from this
             C file, presumably scheduling or alignment padding.
 128         nop
 129         sub     %rbx, %rax              C apply carry bit
 130         setc    R8(%rbx)                C rbx = borrow from that sub
 131         sub     %rdx, %rax              C apply carry limb
 132         adc     $0, %rbx                C add in the borrow from that sub
 133 L(ent): imul    %r8, %rax               C q = limb * di
 134         mov     %rax, (%rdi,%r10,8)
 135         inc     %r10
 136         jnz     L(top)
 137
             C Most significant limb: there is no limb above it, so a plain
             C shr replaces the shrd, and the borrows of the two subs are not
             C recorded.
 138         mul     %r11                    C carry limb in rdx
 139         mov     (%rsi), %rax            C up high limb
 140         shr     R8(%rcx), %rax
 141         sub     %rbx, %rax              C apply carry bit
 142         sub     %rdx, %rax              C apply carry limb
 143         imul    %r8, %rax
 144         mov     %rax, (%rdi)
 145         pop     %rbx
 146         ret
 147
             C Single limb: rp[0] = (up[0] >> shift) * di.
 148 L(one): shr     R8(%rcx), %rax
 149         imul    %r8, %rax
 150         mov     %rax, (%rdi)
 151         pop     %rbx
 152         ret
 153 EPILOGUE()