mpn/x86/k6/mod_34lsub1.asm

   1 dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
   2
   3 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6: 2.66 cycles/limb
  24
  25
  26 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
  27 C
  28 C An attempt was made to use a loop like
  29 C
  30 C L(top):
  31 C       adcl    (%edx), %eax
  32 C       adcl    4(%edx), %ebx
  33 C       adcl    8(%edx), %esi
  34 C       leal    12(%edx), %edx
  35 C       loop    L(top)
  36 C
  37 C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
  38 C The form used instead can save about 6 cycles by not dividing by 3.
  39 C
  40 C In the code used, putting the "leal"s at the top of the loop is necessary
  41 C for the claimed speed, anywhere else costs an extra cycle per loop.
  42 C Perhaps a tight loop like this needs short decode instructions at the
  43 C branch target, which would explain the leal/loop form above taking 8
  44 C cycles instead of 7 too.
  45
  46 defframe(PARAM_SIZE, 8)
  47 defframe(PARAM_SRC,  4)
  48 dnl  the numbers are presumably byte offsets of the arguments from the entry stack pointer -- confirm against defframe
  49 dnl  re-use parameter space: ebx and esi are stored in the argument slots, after both arguments have been loaded into registers
  50 define(SAVE_EBX, `PARAM_SIZE')
  51 define(SAVE_ESI, `PARAM_SRC')
  52
  53         TEXT
  54         ALIGN(16)
  55 PROLOGUE(mpn_mod_34lsub1)
  56 deflit(`FRAME',0)
  57         C Return in eax a value congruent to {src,size} mod 2^24-1, the sum of pieces is not reduced below the modulus
  58         movl    PARAM_SIZE, %eax
  59         movl    PARAM_SRC, %edx
  60
  61         subl    $2, %eax                C eax = size-2, flags pick the case below
  62         ja      L(three_or_more)        C unsigned size > 2
  63
  64 Zdisp(  movl,   0,(%edx), %eax)         C avoid code cache line boundary
  65         jne     L(one)                  C movl keeps the subl flags: size is not 2, return src[0] as is
  66
  67         movl    %eax, %ecx
  68         movl    4(%edx), %edx           C edx = src[1]
  69
  70         shrl    $24, %eax               C src[0] high
  71         andl    $0x00FFFFFF, %ecx       C src[0] low
  72
  73         addl    %ecx, %eax
  74         movl    %edx, %ecx
  75
  76         shll    $8, %edx
  77         andl    $0x00FFFF00, %edx       C src[1] low 16 bits, moved up 8 since 2^32 == 2^8 mod 2^24-1
  78
  79         shrl    $16, %ecx               C src[1] high 16 bits, used as is since 2^48 == 1 mod 2^24-1
  80         addl    %ecx, %eax
  81
  82         addl    %edx, %eax
  83
  84 L(one):
  85         ret
  86
  87
  88 L(three_or_more):
  89         C eax   size-2
  90         C ebx
  91         C ecx
  92         C edx   src
  93
  94         movl    %ebx, SAVE_EBX          C both arguments are already in registers, their slots are free
  95         xorl    %ebx, %ebx
  96
  97         movl    %esi, SAVE_ESI
  98         pushl   %edi    FRAME_pushl()
  99
 100         xorl    %esi, %esi
 101         xorl    %edi, %edi              C and clear carry flag
 102
 103 L(top):
 104         C eax   counter, limbs
 105         C ebx   acc 0mod3
 106         C ecx
 107         C edx   src, incrementing
 108         C esi   acc 1mod3
 109         C edi   acc 2mod3
 110         C ebp
 111         C the carry flag is live across iterations: leal and decl leave it alone, carry out of edi feeds ebx on the next pass
 112         leal    -2(%eax), %eax
 113         leal    12(%edx), %edx
 114
 115         adcl    -12(%edx), %ebx
 116         adcl    -8(%edx), %esi
 117         adcl    -4(%edx), %edi
 118
 119         decl    %eax                    C with the leal above, eax drops by 3 per pass
 120         jg      L(top)
 121
 122
 123         C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
 124
 125         movb    $0, %cl                 C cl = bit position for the pending carry: 0, 8 or 16
 126         incl    %eax
 127
 128         js      L(combine)              C 0 more
 129
 130 Zdisp(  adcl,   0,(%edx), %ebx)         C avoid code cache line crossings
 131
 132         movb    $8, %cl
 133         decl    %eax
 134
 135         js      L(combine)              C 1 more
 136
 137         adcl    4(%edx), %esi
 138
 139         movb    $16, %cl
 140
 141
 142 L(combine):
 143         sbbl    %edx, %edx              C edx = 0 or -1 from the pending carry
 144
 145         shll    %cl, %edx               C carry
 146         movl    %ebx, %eax              C 0mod3
 147
 148         shrl    $24, %eax               C 0mod3 high
 149         andl    $0x00FFFFFF, %ebx       C 0mod3 low
 150
 151         subl    %edx, %eax              C apply carry, subtracting the negated value adds it
 152         movl    %esi, %ecx              C 1mod3
 153
 154         shrl    $16, %esi               C 1mod3 high
 155         addl    %ebx, %eax              C apply 0mod3 low
 156
 157         andl    $0x0000FFFF, %ecx
 158         addl    %esi, %eax              C apply 1mod3 high
 159
 160         shll    $8, %ecx                C 1mod3 low
 161         movl    %edi, %edx              C 2mod3
 162
 163         shrl    $8, %edx                C 2mod3 high
 164         addl    %ecx, %eax              C apply 1mod3 low
 165
 166         addl    %edx, %eax              C apply 2mod3 high
 167         andl    $0x000000FF, %edi
 168
 169         shll    $16, %edi               C 2mod3 low
 170         movl    SAVE_EBX, %ebx          C restore ebx
 171
 172         addl    %edi, %eax              C apply 2mod3 low
 173         movl    SAVE_ESI, %esi          C restore esi
 174
 175         popl    %edi
 176
 177         ret
 178
 179 EPILOGUE()