mpn/x86/pentium4/sse2/bdiv_q_1.asm

   1 dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
   4
   5 dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35
  36 C P4: 19.0 cycles/limb
  37
  38 C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
  39 C being on the dependent chain and there being plenty of cycles available,
  40 C using an unaligned movq on every second iteration measured about 23 c/l.
  41 C
  42
  43 defframe(PARAM_SHIFT,  24)
  44 defframe(PARAM_INVERSE,20)
  45 defframe(PARAM_DIVISOR,16)
  46 defframe(PARAM_SIZE,   12)
  47 defframe(PARAM_SRC,    8)
  48 defframe(PARAM_DST,    4)
  49
  50         TEXT
  51
  52 C mp_limb_t
  53 C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
  54 C                   mp_limb_t inverse, int shift)
  55         ALIGN(32)
  56 PROLOGUE(mpn_pi1_bdiv_q_1)
  57 deflit(`FRAME',0)
  58
  59         movl    PARAM_SIZE, %edx
  60
  61         movl    PARAM_SRC, %eax
  62
  63         movl    PARAM_DIVISOR, %ecx
  64
  65         movd    %ecx, %mm6
  66         movl    PARAM_SHIFT, %ecx
  67
  68         movd    %ecx, %mm7              C shift
  69
  70         C
  71
  72         movl    PARAM_INVERSE, %ecx
  73         movd    %ecx, %mm5              C inv
  74
  75         movl    PARAM_DST, %ecx
  76         pxor    %mm1, %mm1              C initial carry limb
  77         pxor    %mm0, %mm0              C initial carry bit
  78
  79         subl    $1, %edx
  80         jz      L(done)
  81
  82         pcmpeqd %mm4, %mm4
  83         psrlq   $32, %mm4               C 0x00000000FFFFFFFF
  84
  85 C The dependent chain here is as follows.
  86 C
  87 C                                       latency
  88 C       psubq    s = (src-cbit) - climb    2
  89 C       pmuludq  q = s*inverse             8
  90 C       pmuludq  prod = q*divisor          8
  91 C       psrlq    climb = high(prod)        2
  92 C                                         --
  93 C                                         20
  94 C
  95 C Yet the loop measures 19.0 c/l, so obviously there's something gained
  96 C there over a straight reading of the chip documentation.
  97
  98 L(top):
  99         C eax   src, incrementing
 100         C ebx
 101         C ecx   dst, incrementing
 102         C edx   counter, size-1 iterations
 103         C
 104         C mm0   carry bit
 105         C mm1   carry limb
 106         C mm4   0x00000000FFFFFFFF
 107         C mm5   inverse
 108         C mm6   divisor
 109         C mm7   shift
 110
 111         movd    (%eax), %mm2
 112         movd    4(%eax), %mm3
 113         addl    $4, %eax
 114         punpckldq %mm3, %mm2
 115
 116         psrlq   %mm7, %mm2
 117         pand    %mm4, %mm2              C src
 118         psubq   %mm0, %mm2              C src - cbit
 119
 120         psubq   %mm1, %mm2              C src - cbit - climb
 121         movq    %mm2, %mm0
 122         psrlq   $63, %mm0               C new cbit
 123
 124         pmuludq %mm5, %mm2              C s*inverse
 125         movd    %mm2, (%ecx)            C q
 126         addl    $4, %ecx
 127
 128         movq    %mm6, %mm1
 129         pmuludq %mm2, %mm1              C q*divisor
 130         psrlq   $32, %mm1               C new climb
 131
 132 L(entry):
 133         subl    $1, %edx
 134         jnz     L(top)
 135
 136 L(done):
 137         movd    (%eax), %mm2
 138         psrlq   %mm7, %mm2              C src
 139         psubq   %mm0, %mm2              C src - cbit
 140
 141         psubq   %mm1, %mm2              C src - cbit - climb
 142
 143         pmuludq %mm5, %mm2              C s*inverse
 144         movd    %mm2, (%ecx)            C q
 145
 146         emms
 147         ret
 148
 149 EPILOGUE()
 150
 151         ALIGN(16)
 152 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
 153 C                           mp_limb_t divisor);
 154 C
 155 PROLOGUE(mpn_bdiv_q_1)
 156 deflit(`FRAME',0)
 157
 158         movl    PARAM_SIZE, %edx
 159
 160         movl    PARAM_DIVISOR, %ecx
 161
 162         C eax   src
 163         C ebx
 164         C ecx   divisor
 165         C edx   size-1
 166
 167         movl    %ecx, %eax
 168         bsfl    %ecx, %ecx              C trailing twos
 169
 170         shrl    %cl, %eax               C d = divisor without twos
 171         movd    %eax, %mm6
 172         movd    %ecx, %mm7              C shift
 173
 174         shrl    %eax                    C d/2
 175
 176         andl    $127, %eax              C d/2, 7 bits
 177
 178 ifdef(`PIC',`
 179         LEA(    binvert_limb_table, %ecx)
 180         movzbl  (%eax,%ecx), %eax               C inv 8 bits
 181 ',`
 182         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
 183 ')
 184
 185         C
 186
 187         movd    %eax, %mm5              C inv
 188
 189         movd    %eax, %mm0              C inv
 190
 191         pmuludq %mm5, %mm5              C inv*inv
 192
 193         C
 194
 195         pmuludq %mm6, %mm5              C inv*inv*d
 196         paddd   %mm0, %mm0              C 2*inv
 197
 198         C
 199
 200         psubd   %mm5, %mm0              C inv = 2*inv - inv*inv*d
 201         pxor    %mm5, %mm5
 202
 203         paddd   %mm0, %mm5
 204         pmuludq %mm0, %mm0              C inv*inv
 205
 206         pcmpeqd %mm4, %mm4
 207         psrlq   $32, %mm4               C 0x00000000FFFFFFFF
 208
 209         C
 210
 211         pmuludq %mm6, %mm0              C inv*inv*d
 212         paddd   %mm5, %mm5              C 2*inv
 213
 214         movl    PARAM_SRC, %eax
 215         movl    PARAM_DST, %ecx
 216         pxor    %mm1, %mm1              C initial carry limb
 217
 218         C
 219
 220         psubd   %mm0, %mm5              C inv = 2*inv - inv*inv*d
 221
 222         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 223         pushl   %eax    FRAME_pushl()
 224         movq    %mm6, %mm0
 225         pmuludq %mm5, %mm0
 226         movd    %mm0, %eax
 227         cmpl    $1, %eax
 228         popl    %eax    FRAME_popl()')
 229
 230         pxor    %mm0, %mm0              C initial carry bit
 231         jmp     L(entry)
 232
 233 EPILOGUE()