mpn/x86/mul_basecase.asm

   1 dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
   2 dnl  in a third limb vector.
   3
   4 dnl  Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
   5 dnl  Foundation, Inc.
   6 dnl
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or
  10 dnl  modify it under the terms of the GNU Lesser General Public License as
  11 dnl  published by the Free Software Foundation; either version 3 of the
  12 dnl  License, or (at your option) any later version.
  13 dnl
  14 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  15 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 dnl  Lesser General Public License for more details.
  18 dnl
  19 dnl  You should have received a copy of the GNU Lesser General Public License
  20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  21
  22 include(`../config.m4')
  23
  24
  25 C     cycles/crossproduct
  26 C P5:     15
  27 C P6:      7.5
  28 C K6:     12.5
  29 C K7:      5.5
  30 C P4:     24
  31
  32
  33 C void mpn_mul_basecase (mp_ptr wp,
  34 C                        mp_srcptr xp, mp_size_t xsize,
  35 C                        mp_srcptr yp, mp_size_t ysize);
  36 C
   37 C This was written in haste, since the Pentium-optimized code that was used
   38 C for all x86 machines was slow on the Pentium II.  This code would benefit
   39 C from some cleanup.
  40 C
   41 C To shave off some percentage of the run-time, one should make 4 variants
   42 C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
   43 C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
   44 C part of the function, but since it is not very large, that would be
   45 C acceptable.
  46 C
   47 C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
   48 C unknown.
  49
  50 defframe(PARAM_YSIZE,20)
  51 defframe(PARAM_YP,   16)
  52 defframe(PARAM_XSIZE,12)
  53 defframe(PARAM_XP,   8)
  54 defframe(PARAM_WP,   4)
  55
  56 defframe(VAR_MULTIPLIER, -4)
  57 defframe(VAR_COUNTER,    -8)
  58 deflit(VAR_STACK_SPACE,  8)
  59
  60         TEXT
  61         ALIGN(8)
  62
  63 PROLOGUE(mpn_mul_basecase)
  64 deflit(`FRAME',0)
  65
  66         subl    $VAR_STACK_SPACE,%esp
  67         pushl   %esi
  68         pushl   %ebp
  69         pushl   %edi
  70 deflit(`FRAME',eval(VAR_STACK_SPACE+12))
  71
  72         movl    PARAM_XP,%esi
  73         movl    PARAM_WP,%edi
  74         movl    PARAM_YP,%ebp
  75
  76         movl    (%esi),%eax             C load xp[0]
  77         mull    (%ebp)                  C multiply by yp[0]
  78         movl    %eax,(%edi)             C store to wp[0]
  79         movl    PARAM_XSIZE,%ecx        C xsize
  80         decl    %ecx                    C If xsize = 1, ysize = 1 too
  81         jz      L(done)
  82
  83         pushl   %ebx
  84 FRAME_pushl()
  85         movl    %edx,%ebx
  86
  87         leal    4(%esi),%esi
  88         leal    4(%edi),%edi
  89
  90 L(oopM):
  91         movl    (%esi),%eax             C load next limb at xp[j]
  92         leal    4(%esi),%esi
  93         mull    (%ebp)
  94         addl    %ebx,%eax
  95         movl    %edx,%ebx
  96         adcl    $0,%ebx
  97         movl    %eax,(%edi)
  98         leal    4(%edi),%edi
  99         decl    %ecx
 100         jnz     L(oopM)
 101
 102         movl    %ebx,(%edi)             C most significant limb of product
 103         addl    $4,%edi                 C increment wp
 104         movl    PARAM_XSIZE,%eax
 105         shll    $2,%eax
 106         subl    %eax,%edi
 107         subl    %eax,%esi
 108
 109         movl    PARAM_YSIZE,%eax        C ysize
 110         decl    %eax
 111         jz      L(skip)
 112         movl    %eax,VAR_COUNTER        C set index i to ysize
 113
 114 L(outer):
 115         movl    PARAM_YP,%ebp           C yp
 116         addl    $4,%ebp                 C make ebp point to next v limb
 117         movl    %ebp,PARAM_YP
 118         movl    (%ebp),%eax             C copy y limb ...
 119         movl    %eax,VAR_MULTIPLIER     C ... to stack slot
 120         movl    PARAM_XSIZE,%ecx
 121
 122         xorl    %ebx,%ebx
 123         andl    $3,%ecx
 124         jz      L(end0)
 125
 126 L(oop0):
 127         movl    (%esi),%eax
 128         mull    VAR_MULTIPLIER
 129         leal    4(%esi),%esi
 130         addl    %ebx,%eax
 131         movl    $0,%ebx
 132         adcl    %ebx,%edx
 133         addl    %eax,(%edi)
 134         adcl    %edx,%ebx               C propagate carry into cylimb
 135
 136         leal    4(%edi),%edi
 137         decl    %ecx
 138         jnz     L(oop0)
 139
 140 L(end0):
 141         movl    PARAM_XSIZE,%ecx
 142         shrl    $2,%ecx
 143         jz      L(endX)
 144
 145         ALIGN(8)
 146 L(oopX):
 147         movl    (%esi),%eax
 148         mull    VAR_MULTIPLIER
 149         addl    %eax,%ebx
 150         movl    $0,%ebp
 151         adcl    %edx,%ebp
 152
 153         movl    4(%esi),%eax
 154         mull    VAR_MULTIPLIER
 155         addl    %ebx,(%edi)
 156         adcl    %eax,%ebp       C new lo + cylimb
 157         movl    $0,%ebx
 158         adcl    %edx,%ebx
 159
 160         movl    8(%esi),%eax
 161         mull    VAR_MULTIPLIER
 162         addl    %ebp,4(%edi)
 163         adcl    %eax,%ebx       C new lo + cylimb
 164         movl    $0,%ebp
 165         adcl    %edx,%ebp
 166
 167         movl    12(%esi),%eax
 168         mull    VAR_MULTIPLIER
 169         addl    %ebx,8(%edi)
 170         adcl    %eax,%ebp       C new lo + cylimb
 171         movl    $0,%ebx
 172         adcl    %edx,%ebx
 173
 174         addl    %ebp,12(%edi)
 175         adcl    $0,%ebx         C propagate carry into cylimb
 176
 177         leal    16(%esi),%esi
 178         leal    16(%edi),%edi
 179         decl    %ecx
 180         jnz     L(oopX)
 181
 182 L(endX):
 183         movl    %ebx,(%edi)
 184         addl    $4,%edi
 185
 186         C we incremented wp and xp in the loop above; compensate
 187         movl    PARAM_XSIZE,%eax
 188         shll    $2,%eax
 189         subl    %eax,%edi
 190         subl    %eax,%esi
 191
 192         movl    VAR_COUNTER,%eax
 193         decl    %eax
 194         movl    %eax,VAR_COUNTER
 195         jnz     L(outer)
 196
 197 L(skip):
 198         popl    %ebx
 199         popl    %edi
 200         popl    %ebp
 201         popl    %esi
 202         addl    $8,%esp
 203         ret
 204
 205 L(done):
 206         movl    %edx,4(%edi)       C store to wp[1]
 207         popl    %edi
 208         popl    %ebp
 209         popl    %esi
 210         addl    $8,%esp
 211         ret
 212
 213 EPILOGUE()