mpn/x86/k6/mode1o.asm

   1 dnl  AMD K6 mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000, 2001, 2002, 2003, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6: 10.0 cycles/limb
  24
  25
  26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  27 C                               mp_limb_t divisor);
  28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  29 C                                mp_limb_t divisor, mp_limb_t carry);
  30 C
  31 C A special case for high<divisor at the end measured only about 4 cycles
  32 C faster, and so isn't used.
  33 C
  34 C A special case for size==1 using a divl rather than the inverse measured
  35 C only about 5 cycles faster, and so isn't used.  When size==1 and
  36 C high<divisor it can skip a division and be a full 24 cycles faster, but
  37 C this isn't an important case.
  38
  39 defframe(PARAM_CARRY,  16)
  40 defframe(PARAM_DIVISOR,12)
  41 defframe(PARAM_SIZE,   8)
  42 defframe(PARAM_SRC,    4)
  43
  44         TEXT
  45
  46         ALIGN(32)
  47 PROLOGUE(mpn_modexact_1c_odd)
  48 deflit(`FRAME',0)
  49
  50         movl    PARAM_DIVISOR, %ecx
  51         pushl   %esi            FRAME_pushl()
  52
  53         movl    PARAM_CARRY, %edx
  54         jmp     L(start_1c)
  55
  56 EPILOGUE()
  57
  58
  59         ALIGN(16)
  60 PROLOGUE(mpn_modexact_1_odd)
  61 deflit(`FRAME',0)
  62
  63         movl    PARAM_DIVISOR, %ecx
  64         pushl   %esi            FRAME_pushl()
  65
  66         xorl    %edx, %edx
  67 L(start_1c):
  68         pushl   %edi            FRAME_pushl()
  69
  70         shrl    %ecx                    C d/2
  71         movl    PARAM_DIVISOR, %esi
  72
  73         andl    $127, %ecx              C d/2, 7 bits
  74         pushl   %ebp            FRAME_pushl()
  75
  76 ifdef(`PIC',`
  77         LEA(    binvert_limb_table, %edi)
  78 Zdisp(  movzbl, 0,(%ecx,%edi), %edi)            C inv 8 bits
  79 ',`
  80         movzbl  binvert_limb_table(%ecx), %edi  C inv 8 bits
  81 ')
  82         leal    (%edi,%edi), %ecx       C 2*inv
  83
  84         imull   %edi, %edi              C inv*inv
  85
  86         movl    PARAM_SRC, %eax
  87         movl    PARAM_SIZE, %ebp
  88
  89         imull   %esi, %edi              C inv*inv*d
  90
  91         pushl   %ebx            FRAME_pushl()
  92         leal    (%eax,%ebp,4), %ebx     C src end
  93
  94         subl    %edi, %ecx              C inv = 2*inv - inv*inv*d
  95         leal    (%ecx,%ecx), %edi       C 2*inv
  96
  97         imull   %ecx, %ecx              C inv*inv
  98
  99         movl    (%eax), %eax            C src low limb
 100         negl    %ebp                    C -size
 101
 102         imull   %esi, %ecx              C inv*inv*d
 103
 104         subl    %ecx, %edi              C inv = 2*inv - inv*inv*d
 105
 106         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 107         pushl   %eax
 108         movl    %esi, %eax
 109         imull   %edi, %eax
 110         cmpl    $1, %eax
 111         popl    %eax')
 112
 113         jmp     L(entry)
 114
 115
 116 C Rotating the mul to the top of the loop saves 1 cycle, presumably by
 117 C hiding the loop control under the imul latency.
 118 C
 119 C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
 120 C only 8).  It's not clear how to get down to 9 cycles.
 121 C
 122 C The xor and rcl to handle the carry bit could be an sbb instead, with the
 123 C the carry bit add becoming a sub, but that doesn't save anything.
 124
 125 L(top):
 126         C eax   (low product)
 127         C ebx   src end
 128         C ecx   carry bit, 0 or 1
 129         C edx   (high product, being carry limb)
 130         C esi   divisor
 131         C edi   inverse
 132         C ebp   counter, limbs, negative
 133
 134         mull    %esi
 135
 136         movl    (%ebx,%ebp,4), %eax
 137         addl    %ecx, %edx              C apply carry bit to carry limb
 138
 139 L(entry):
 140         xorl    %ecx, %ecx
 141         subl    %edx, %eax              C apply carry limb
 142
 143         rcll    %ecx
 144
 145         imull   %edi, %eax
 146
 147         incl    %ebp
 148         jnz     L(top)
 149
 150
 151
 152         popl    %ebx
 153         popl    %ebp
 154
 155         mull    %esi
 156
 157         popl    %edi
 158         popl    %esi
 159
 160         leal    (%ecx,%edx), %eax
 161
 162         ret
 163
 164 EPILOGUE()