mpn/x86/k7/mode1o.asm

   1 dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C          cycles/limb
  24 C Athlon:     11.0
  25 C Hammer:      7.0
  26
  27
  28 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  29 C                               mp_limb_t divisor);
  30 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  31 C                                mp_limb_t divisor, mp_limb_t carry);
  32 C
  33 C With the loop running at just 11 cycles it doesn't seem worth bothering to
  34 C check for high<divisor to save one step.
  35 C
  36 C Using a divl for size==1 measures slower than the modexact method, which
  37 C is not too surprising since for the latter it's only about 24 cycles to
  38 C calculate the modular inverse.
  39
  40 defframe(PARAM_CARRY,  16)      C carry argument, read only by the mpn_modexact_1c_odd entry
  41 defframe(PARAM_DIVISOR,12)      C divisor argument
  42 defframe(PARAM_SIZE,   8)       C size argument, counted in limbs
  43 defframe(PARAM_SRC,    4)       C src argument, address of the first limb
  44 C NOTE(review): offsets look relative to esp at entry, positive for arguments, negative for locals; confirm against defframe.
  45 defframe(SAVE_EBX,     -4)      C save slots for the registers the code below overwrites and later restores
  46 defframe(SAVE_ESI,     -8)
  47 defframe(SAVE_EDI,    -12)
  48 defframe(SAVE_EBP,    -16)
  49 C Size in bytes of the four 4-byte save slots; subtracted from esp on entry and added back before ret.
  50 deflit(STACK_SPACE, 16)
  51
  52         TEXT
  53
  54         ALIGN(16)
  55 PROLOGUE(mpn_modexact_1c_odd)
  56 deflit(`FRAME',0)
  57 C Variant with a caller-supplied initial carry; the rest of the work is the shared body in mpn_modexact_1_odd.
  58         movl    PARAM_CARRY, %ecx       C ecx = carry argument, in place of the zero the other entry sets
  59         jmp     L(start_1c)             C esp is still at its entry value, matching FRAME 0 at the target label
  60
  61 EPILOGUE()
  62
  63
  64         ALIGN(16)
  65 PROLOGUE(mpn_modexact_1_odd)
  66 deflit(`FRAME',0)
  67 C Entry with no carry argument: ecx starts at 0.  From L(start_1c) on the code is shared with mpn_modexact_1c_odd.
  68         xorl    %ecx, %ecx              C ecx = 0, initial carry
  69 L(start_1c):
  70         movl    PARAM_DIVISOR, %eax     C eax = d, used below to form the inverse table index
  71         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
  72 C esp now sits below the four save slots.  NOTE(review): FRAME_subl_esp presumably keeps PARAM/SAVE offsets valid; confirm.
  73         movl    %esi, SAVE_ESI
  74         movl    PARAM_DIVISOR, %esi     C esi = d, kept for the rest of the function
  75
  76         movl    %edi, SAVE_EDI
  77
  78         shrl    %eax                    C d/2
  79 C Keep 7 bits of d/2 as the table index, that is bits 1 to 7 of d; bit 0 of d is dropped by the shift.
  80         andl    $127, %eax
  81 C Fetch an 8-bit starting inverse from binvert_limb_table (not defined in this file); PIC builds go through the LEA macro.
  82 ifdef(`PIC',`
  83         LEA(    binvert_limb_table, %edi)
  84         movzbl  (%eax,%edi), %edi               C inv 8 bits
  85 ',`
  86         movzbl  binvert_limb_table(%eax), %edi  C inv 8 bits
  87 ')
  88 C Two rounds of inv = 2*inv - inv*inv*d refine the table value to a full-register inverse (checked by the ASSERT below).
  89         xorl    %edx, %edx              C initial extra carry
  90         leal    (%edi,%edi), %eax       C 2*inv
  91
  92         imull   %edi, %edi              C inv*inv
  93
  94         movl    %ebp, SAVE_EBP
  95         movl    PARAM_SIZE, %ebp        C ebp = size in limbs
  96
  97         movl    %ebx, SAVE_EBX
  98         movl    PARAM_SRC, %ebx         C ebx = src
  99
 100         imull   %esi, %edi              C inv*inv*d
 101
 102         subl    %edi, %eax              C inv = 2*inv - inv*inv*d
 103         leal    (%eax,%eax), %edi       C 2*inv
 104
 105         imull   %eax, %eax              C inv*inv
 106
 107         imull   %esi, %eax              C inv*inv*d
 108
 109         leal    (%ebx,%ebp,4), %ebx     C src end
 110         negl    %ebp                    C -size
 111 C With ebx at the end of src and ebp = -size, (%ebx,%ebp,4) visits the limbs first to last as ebp counts up to 0.
 112         subl    %eax, %edi              C inv = 2*inv - inv*inv*d
 113 C edi now holds the final inverse used by the loop.
 114         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 115         movl    %esi, %eax
 116         imull   %edi, %eax
 117         cmpl    $1, %eax')
 118
 119
 120 C The dependent chain here is
 121 C
 122 C                            cycles
 123 C       subl    %edx, %eax      1
 124 C       imull   %edi, %eax      4
 125 C       mull    %esi            6  (high limb)
 126 C                             ----
 127 C       total                  11
 128 C
 129 C Out of order execution hides the load latency for the source data, so no
 130 C special scheduling is required.
 131
 132 L(top):
 133         C eax   src limb
 134         C ebx   src end ptr
 135         C ecx   next carry bit, 0 or 1 (or initial carry param)
 136         C edx   carry limb, high of last product
 137         C esi   divisor
 138         C edi   inverse
 139         C ebp   counter, limbs, negative
 140
 141         movl    (%ebx,%ebp,4), %eax     C eax = next src limb
 142
 143         subl    %ecx, %eax              C apply carry bit
 144         movl    $0, %ecx                C clear ecx with movl so the borrow flag from the subl survives for setc
 145
 146         setc    %cl                     C new carry bit
 147
 148         subl    %edx, %eax              C apply carry limb
 149         adcl    $0, %ecx                C add the borrow from the carry limb subtract into the carry bit
 150
 151         imull   %edi, %eax              C eax = low 32 bits of eax * inverse
 152
 153         mull    %esi                    C edx:eax = eax * d; the high half in edx is the next carry limb
 154
 155         incl    %ebp                    C step the negative counter toward zero
 156         jnz     L(top)                  C repeat until ebp reaches 0, all limbs consumed
 157
 158 C Loop done: restore the four saved registers, release the save area, and return ecx + edx in eax.
 159         movl    SAVE_ESI, %esi
 160         movl    SAVE_EDI, %edi
 161         leal    (%ecx,%edx), %eax       C return value = final carry bit + final carry limb
 162
 163         movl    SAVE_EBX, %ebx
 164         movl    SAVE_EBP, %ebp
 165         addl    $STACK_SPACE, %esp      C give back the 16 bytes taken at L(start_1c)
 166
 167         ret
 168
 169 EPILOGUE()