mpn/x86/pentium4/sse2/dive_1.asm

   1 dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P4: 19.0 cycles/limb
  24
  25
  26 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  27 C                      mp_limb_t divisor);
  28 C
  29 C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
  30 C being on the dependent chain and there being plenty of cycles available,
  31 C using an unaligned movq on every second iteration measured about 23 c/l.
  32 C
  33 C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
  34 C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
  35 C that might be hidden by out-of-order execution, whereas divl is around 60.
  36 C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
  37 C faster.
  38
  39 defframe(PARAM_DIVISOR,16)
  40 defframe(PARAM_SIZE,   12)
  41 defframe(PARAM_SRC,    8)
  42 defframe(PARAM_DST,    4)
  43
  44         TEXT
  45
  46         ALIGN(16)
  47 PROLOGUE(mpn_divexact_1)
  48 deflit(`FRAME',0)
  49
  50         movl    PARAM_SIZE, %edx
  51
  52         movl    PARAM_SRC, %eax
  53
  54         movl    PARAM_DIVISOR, %ecx
  55         subl    $1, %edx
  56         jnz     L(two_or_more)
  57
  58         movl    (%eax), %eax
  59         xorl    %edx, %edx
  60
  61         divl    %ecx
  62         movl    PARAM_DST, %ecx
  63
  64         movl    %eax, (%ecx)
  65         ret
  66
  67
  68 L(two_or_more):
  69         C eax   src
  70         C ebx
  71         C ecx   divisor
  72         C edx   size-1
  73
  74         movl    %ecx, %eax
  75         bsfl    %ecx, %ecx              C trailing twos
  76
  77         shrl    %cl, %eax               C d = divisor without twos
  78         movd    %eax, %mm6
  79         movd    %ecx, %mm7              C shift
  80
  81         shrl    %eax                    C d/2
  82
  83         andl    $127, %eax              C d/2, 7 bits
  84
  85 ifdef(`PIC',`
  86         LEA(    binvert_limb_table, %ecx)
  87         movzbl  (%eax,%ecx), %eax               C inv 8 bits
  88 ',`
  89         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
  90 ')
  91
  92         C
  93
  94         movd    %eax, %mm5              C inv
  95
  96         movd    %eax, %mm0              C inv
  97
  98         pmuludq %mm5, %mm5              C inv*inv
  99
 100         C
 101
 102         pmuludq %mm6, %mm5              C inv*inv*d
 103         paddd   %mm0, %mm0              C 2*inv
 104
 105         C
 106
 107         psubd   %mm5, %mm0              C inv = 2*inv - inv*inv*d
 108         pxor    %mm5, %mm5
 109
 110         paddd   %mm0, %mm5
 111         pmuludq %mm0, %mm0              C inv*inv
 112
 113         pcmpeqd %mm4, %mm4
 114         psrlq   $32, %mm4               C 0x00000000FFFFFFFF
 115
 116         C
 117
 118         pmuludq %mm6, %mm0              C inv*inv*d
 119         paddd   %mm5, %mm5              C 2*inv
 120
 121         movl    PARAM_SRC, %eax
 122         movl    PARAM_DST, %ecx
 123         pxor    %mm1, %mm1              C initial carry limb
 124
 125         C
 126
 127         psubd   %mm0, %mm5              C inv = 2*inv - inv*inv*d
 128
 129         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 130         pushl   %eax    FRAME_pushl()
 131         movq    %mm6, %mm0
 132         pmuludq %mm5, %mm0
 133         movd    %mm0, %eax
 134         cmpl    $1, %eax
 135         popl    %eax    FRAME_popl()')
 136
 137         pxor    %mm0, %mm0              C initial carry bit
 138
 139
 140 C The dependent chain here is as follows.
 141 C
 142 C                                       latency
 143 C       psubq    s = (src-cbit) - climb    2
 144 C       pmuludq  q = s*inverse             8
 145 C       pmuludq  prod = q*divisor          8
 146 C       psrlq    climb = high(prod)        2
 147 C                                         --
 148 C                                         20
 149 C
 150 C Yet the loop measures 19.0 c/l, so obviously there's something gained
 151 C there over a straight reading of the chip documentation.
 152
 153 L(top):
 154         C eax   src, incrementing
 155         C ebx
 156         C ecx   dst, incrementing
 157         C edx   counter, size-1 iterations
 158         C
 159         C mm0   carry bit
 160         C mm1   carry limb
 161         C mm4   0x00000000FFFFFFFF
 162         C mm5   inverse
 163         C mm6   divisor
 164         C mm7   shift
 165
 166         movd    (%eax), %mm2
 167         movd    4(%eax), %mm3
 168         addl    $4, %eax
 169         punpckldq %mm3, %mm2
 170
 171         psrlq   %mm7, %mm2
 172         pand    %mm4, %mm2              C src
 173         psubq   %mm0, %mm2              C src - cbit
 174
 175         psubq   %mm1, %mm2              C src - cbit - climb
 176         movq    %mm2, %mm0
 177         psrlq   $63, %mm0               C new cbit
 178
 179         pmuludq %mm5, %mm2              C s*inverse
 180         movd    %mm2, (%ecx)            C q
 181         addl    $4, %ecx
 182
 183         movq    %mm6, %mm1
 184         pmuludq %mm2, %mm1              C q*divisor
 185         psrlq   $32, %mm1               C new climb
 186
 187         subl    $1, %edx
 188         jnz     L(top)
 189
 190
 191 L(done):
 192         movd    (%eax), %mm2
 193         psrlq   %mm7, %mm2              C src
 194         psubq   %mm0, %mm2              C src - cbit
 195
 196         psubq   %mm1, %mm2              C src - cbit - climb
 197
 198         pmuludq %mm5, %mm2              C s*inverse
 199         movd    %mm2, (%ecx)            C q
 200
 201         emms
 202         ret
 203
 204 EPILOGUE()