mpn/x86/p6/dive_1.asm

   1 dnl  Intel P6 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C       odd  even  divisor
  24 C P6:  10.0  12.0  cycles/limb
  25
  26
  27 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  28 C                      mp_limb_t divisor);
  29 C
  30 C The odd case is basically the same as mpn_modexact_1_odd, just with an
  31 C extra store, and it runs at the same 10 cycles which is the dependent
  32 C chain.
  33 C
  34 C The shifts for the even case aren't on the dependent chain so in principle
  35 C it could run the same too, but nothing running at 10 has been found.
   36 C Perhaps there are too many uops (an extra 4 over the odd case).
  37
   38 defframe(PARAM_DIVISOR,16)       C divisor argument, later overwritten with its odd part
   39 defframe(PARAM_SIZE,   12)       C size argument, counted in limbs
   40 defframe(PARAM_SRC,     8)       C src argument
   41 defframe(PARAM_DST,     4)       C dst argument, later overwritten with &dst[size]
   42 C Negative offsets: four register save slots and one local variable.
   43 defframe(SAVE_EBX,     -4)
   44 defframe(SAVE_ESI,     -8)
   45 defframe(SAVE_EDI,    -12)
   46 defframe(SAVE_EBP,    -16)
   47 defframe(VAR_INVERSE, -20)       C inverse of the odd divisor, stored once before the loops
   48 deflit(STACK_SPACE, 20)          C bytes taken off esp on entry, covers slots -4 to -20
  49
   50         TEXT
   51
   52         ALIGN(16)
   53 PROLOGUE(mpn_divexact_1)
   54 deflit(`FRAME',0)
   55         C Setup: save registers, split the divisor into odd d and a shift count.
   56         movl    PARAM_DIVISOR, %eax
   57         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
   58
   59         movl    %esi, SAVE_ESI
   60         movl    PARAM_SRC, %esi
   61
   62         movl    %ebx, SAVE_EBX
   63         movl    PARAM_SIZE, %ebx
   64
   65         bsfl    %eax, %ecx              C trailing twos
   66
   67         movl    %ebp, SAVE_EBP
   68
   69         shrl    %cl, %eax               C d without twos
   70
   71         movl    %eax, %edx
   72         shrl    %eax                    C d/2 without twos
   73
   74         movl    %edx, PARAM_DIVISOR     C parameter slot now holds odd d
   75         andl    $127, %eax              C low 7 bits of d/2, the table index
   76
   77 ifdef(`PIC',`
   78         LEA(    binvert_limb_table, %ebp)
   79         movzbl  (%eax,%ebp), %ebp               C inv 8 bits
   80 ',`
   81         movzbl  binvert_limb_table(%eax), %ebp  C inv 8 bits
   82 ')
   83         C Refine the 8-bit table inverse with two inv = 2*inv - inv*inv*d steps.
   84         leal    (%ebp,%ebp), %eax       C 2*inv
   85
   86         imull   %ebp, %ebp              C inv*inv
   87
   88         movl    %edi, SAVE_EDI
   89         movl    PARAM_DST, %edi
   90
   91         leal    (%esi,%ebx,4), %esi     C src end
   92
   93         imull   PARAM_DIVISOR, %ebp     C inv*inv*d
   94
   95         subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
   96         leal    (%eax,%eax), %ebp       C 2*inv
   97
   98         imull   %eax, %eax              C inv*inv
   99
  100         leal    (%edi,%ebx,4), %edi     C dst end
  101         negl    %ebx                    C -size
  102
  103         movl    %edi, PARAM_DST         C keep &dst[size], the even loop uses edi as scratch
  104
  105         imull   PARAM_DIVISOR, %eax     C inv*inv*d
  106
  107         subl    %eax, %ebp              C inv = 2*inv - inv*inv*d
  108
  109         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
  110         movl    PARAM_DIVISOR, %eax
  111         imull   %ebp, %eax
  112         cmpl    $1, %eax')
  113
  114         movl    %ebp, VAR_INVERSE
  115         movl    (%esi,%ebx,4), %eax     C src[0]
  116
  117         orl     %ecx, %ecx              C nonzero shift count means an even divisor
  118         jnz     L(even)
  119
  120         C ecx initial carry is zero
  121         jmp     L(odd_entry)
  122
  123
  124 C The dependent chain here is
  125 C
  126 C       subl    %edx, %eax       1
  127 C       imull   %ebp, %eax       4
  128 C       mull    PARAM_DIVISOR    5
  129 C                              ----
  130 C       total                   10
  131 C
  132 C and this is the measured speed.  No special scheduling is necessary, out
  133 C of order execution hides the load latency.
  134 C (The code below multiplies by VAR_INVERSE from memory rather than by %ebp.)
  135 L(odd_top):
  136         C eax   scratch (src limb)
  137         C ebx   counter, limbs, negative
  138         C ecx   carry bit
  139         C edx   carry limb, high of last product
  140         C esi   &src[size]
  141         C edi   &dst[size]
  142         C ebp   inverse (not read here, the loop uses VAR_INVERSE)
  143
  144         mull    PARAM_DIVISOR           C edx:eax = q*d, q being the limb just stored
  145
  146         movl    (%esi,%ebx,4), %eax     C next src limb
  147         subl    %ecx, %eax              C less the carry bit
  148
  149         sbbl    %ecx, %ecx              C ecx = -borrow
  150         subl    %edx, %eax              C less the carry limb
  151
  152         sbbl    $0, %ecx                C ecx = -(borrows from both subtracts)
  153
  154 L(odd_entry):
  155         imull   VAR_INVERSE, %eax       C q, the quotient limb
  156
  157         movl    %eax, (%edi,%ebx,4)
  158         negl    %ecx                    C carry bit for the next limb
  159
  160         incl    %ebx
  161         jnz     L(odd_top)
  162
  163         C All limbs done, restore the saved registers and return.
  164         movl    SAVE_ESI, %esi
  165
  166         movl    SAVE_EDI, %edi
  167
  168         movl    SAVE_EBP, %ebp
  169
  170         movl    SAVE_EBX, %ebx
  171         addl    $STACK_SPACE, %esp
  172
  173         ret
  174
  175         C Even divisor: src is shifted right by ecx bits as it is read.
  176 L(even):
  177         C eax   src[0]
  178         C ebx   counter, limbs, negative
  179         C ecx   shift
  180         C edx   (cleared below)
  181         C esi   &src[size]
  182         C edi   &dst[size]
  183         C ebp   inverse, already in VAR_INVERSE (cleared below)
  184
  185         xorl    %ebp, %ebp              C initial carry bit
  186         xorl    %edx, %edx              C initial carry limb (for size==1)
  187
  188         incl    %ebx
  189         jz      L(even_one)             C size==1, no higher limb to shift in
  190
  191         movl    (%esi,%ebx,4), %edi     C src[1]
  192
  193         shrdl(  %cl, %edi, %eax)        C src[0] >> shift, top bits from src[1]
  194
  195         jmp     L(even_entry)
  196
  197
  198 L(even_top):
  199         C eax   scratch
  200         C ebx   counter, limbs, negative
  201         C ecx   shift
  202         C edx   scratch
  203         C esi   &src[size]
  204         C edi   &dst[size] and scratch
  205         C ebp   carry bit
  206
  207         movl    (%esi,%ebx,4), %edi     C higher limb of the pair
  208
  209         mull    PARAM_DIVISOR           C edx = carry limb, high of q*d
  210
  211         movl    -4(%esi,%ebx,4), %eax   C lower limb of the pair
  212         shrdl(  %cl, %edi, %eax)
  213
  214         subl    %ebp, %eax
  215
  216         sbbl    %ebp, %ebp
  217         subl    %edx, %eax
  218
  219         sbbl    $0, %ebp                C ebp = -(borrows from both subtracts)
  220
  221 L(even_entry):
  222         imull   VAR_INVERSE, %eax       C q, the quotient limb
  223
  224         movl    PARAM_DST, %edi         C &dst[size] again, edi was scratch
  225         negl    %ebp                    C carry bit for the next limb
  226
  227         movl    %eax, -4(%edi,%ebx,4)   C ebx is one limb ahead of the store
  228         incl    %ebx
  229         jnz     L(even_top)
  230
  231         C Top limb: nothing above it to shift in, so a plain shrl is used.
  232
  233         mull    PARAM_DIVISOR
  234
  235         movl    -4(%esi), %eax          C src[size-1]
  236
  237 L(even_one):
  238         shrl    %cl, %eax
  239         movl    SAVE_ESI, %esi
  240
  241         subl    %ebp, %eax              C less the carry bit
  242         movl    SAVE_EBP, %ebp
  243
  244         subl    %edx, %eax              C less the carry limb
  245         movl    SAVE_EBX, %ebx
  246
  247         imull   VAR_INVERSE, %eax
  248
  249         movl    %eax, -4(%edi)          C dst[size-1]
  250         movl    SAVE_EDI, %edi
  251         addl    $STACK_SPACE, %esp
  252
  253         ret
  254
  255 EPILOGUE()