mpn/x86/p6/mode1o.asm

   1 dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P6: 10.0 cycles/limb
  24
  25
  26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  27 C                               mp_limb_t divisor);
  28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  29 C                                mp_limb_t divisor, mp_limb_t carry);
  30 C
  31 C It's not worth skipping a step at the end when high<divisor since the main
  32 C loop is only 10 cycles.
  33
  34 defframe(PARAM_CARRY,  16)     dnl  used only by the mpn_modexact_1c_odd entry
  35 defframe(PARAM_DIVISOR,12)
  36 defframe(PARAM_SIZE,   8)
  37 defframe(PARAM_SRC,    4)
  38
  39 dnl  Not enough room under modexact_1 to make these re-use the parameter
  40 dnl  space, unfortunately.
  41 defframe(SAVE_EBX,     -4)
  42 defframe(SAVE_ESI,     -8)
  43 defframe(SAVE_EDI,    -12)
  44 deflit(STACK_SPACE, 12)        dnl  room for the three 4-byte register saves above
  45
  46         TEXT
  47
  48         ALIGN(16)
  49 PROLOGUE(mpn_modexact_1c_odd)
  50 deflit(`FRAME',0)
  51 C Entry with an initial carry: load it into ecx then join the shared code.
  52         movl    PARAM_CARRY, %ecx       C initial carry
  53         jmp     L(start_1c)             C label is inside mpn_modexact_1_odd
  54
  55 EPILOGUE()
  56
  57         ALIGN(16)
  58 PROLOGUE(mpn_modexact_1_odd)
  59 deflit(`FRAME',0)
  60 C Entry with no carry argument; the code below is shared with the 1c entry.
  61         xorl    %ecx, %ecx              C initial carry = 0
  62 L(start_1c):
  63         movl    PARAM_DIVISOR, %eax
  64
  65         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
  66
  67         movl    %esi, SAVE_ESI
  68         movl    PARAM_SRC, %esi
  69
  70         shrl    %eax                    C d/2
  71         movl    %edi, SAVE_EDI
  72
  73         andl    $127, %eax              C 7 bit index into binvert_limb_table
  74
  75 ifdef(`PIC',`
  76         LEA(    binvert_limb_table, %edi)
  77         movzbl  (%eax,%edi), %edi               C inv 8 bits
  78 ',`
  79         movzbl  binvert_limb_table(%eax), %edi  C inv 8 bits
  80 ')
  81 C Two rounds of inv = 2*inv - inv*inv*d follow; the ASSERT checks the result.
  82         xorl    %edx, %edx              C initial extra carry
  83         leal    (%edi,%edi), %eax       C 2*inv
  84
  85         imull   %edi, %edi              C inv*inv
  86
  87         movl    %ebx, SAVE_EBX
  88         movl    PARAM_SIZE, %ebx
  89
  90         imull   PARAM_DIVISOR, %edi     C inv*inv*d
  91
  92         subl    %edi, %eax              C inv = 2*inv - inv*inv*d
  93         leal    (%eax,%eax), %edi       C 2*inv
  94
  95         imull   %eax, %eax              C inv*inv
  96
  97         imull   PARAM_DIVISOR, %eax     C inv*inv*d
  98
  99         leal    (%esi,%ebx,4), %esi     C src end
 100         negl    %ebx                    C -size
 101
 102         subl    %eax, %edi              C inv = 2*inv - inv*inv*d
 103
 104         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 105         movl    PARAM_DIVISOR, %eax
 106         imull   %edi, %eax
 107         cmpl    $1, %eax')
 108
 109
 110 C The dependent chain here is
 111 C
 112 C       subl    %edx, %eax       1
 113 C       imull   %edi, %eax       4
 114 C       mull    PARAM_DIVISOR    5
 115 C                              ----
 116 C       total                   10
 117 C
 118 C and this is the measured speed.  No special scheduling is necessary, out
 119 C of order execution hides the load latency.
 120 C NOTE(review): the body runs once before the counter is tested, so size >= 1 is presumably required -- confirm.
 121 L(top):
 122         C eax   scratch (src limb)
 123         C ebx   counter, limbs, negative
 124         C ecx   carry bit, 0 or 1 (first pass: the initial carry, PARAM_CARRY or 0)
 125         C edx   carry limb, high of last product
 126         C esi   &src[size]
 127         C edi   inverse
 128         C ebp
 129
 130         movl    (%esi,%ebx,4), %eax     C src limb
 131         subl    %ecx, %eax              C minus carry bit
 132
 133         sbbl    %ecx, %ecx              C ecx = -borrow
 134         subl    %edx, %eax              C minus carry limb
 135
 136         sbbl    $0, %ecx                C ecx = -(borrows from both subtracts)
 137
 138         imull   %edi, %eax              C q = low limb of (src - carries) * inverse
 139
 140         negl    %ecx                    C carry bit for the next limb
 141
 142         mull    PARAM_DIVISOR           C edx:eax = q * d, edx is the next carry limb
 143
 144         incl    %ebx
 145         jnz     L(top)
 146
 147
 148         movl    SAVE_ESI, %esi
 149         leal    (%ecx,%edx), %eax       C result = carry bit + carry limb
 150
 151         movl    SAVE_EDI, %edi
 152
 153         movl    SAVE_EBX, %ebx
 154         addl    $STACK_SPACE, %esp
 155
 156         ret
 157
 158 EPILOGUE()