1 dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
2 dnl in a third limb vector.
4 dnl Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or
10 dnl modify it under the terms of the GNU Lesser General Public License as
11 dnl published by the Free Software Foundation; either version 3 of the
12 dnl License, or (at your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful,
15 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 dnl Lesser General Public License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
33 C void mpn_mul_basecase (mp_ptr wp,
34 C mp_srcptr xp, mp_size_t xsize,
35 C mp_srcptr yp, mp_size_t ysize);
37 C This was written in haste, since the Pentium-optimized code that was used
38 C for all x86 machines was slow for the Pentium II. This code would benefit
41 C To shave off some percentage of the run-time, one should make 4 variants
42 C of the Louter loop, for the four different outcomes of xsize mod 4. That
43 C would avoid Loop0 altogether. Code expansion would be > 4-fold for that
44 C part of the function, but since it is not very large, that would be
47 C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
50 defframe(PARAM_YSIZE,20)
51 defframe(PARAM_YP, 16)
52 defframe(PARAM_XSIZE,12)
56 defframe(VAR_MULTIPLIER, -4)
57 defframe(VAR_COUNTER, -8)
58 deflit(VAR_STACK_SPACE, 8)
63 PROLOGUE(mpn_mul_basecase)
66 subl $VAR_STACK_SPACE,%esp
70 deflit(`FRAME',eval(VAR_STACK_SPACE+12))
76 movl (%esi),%eax C load xp[0]
77 mull (%ebp) C multiply by yp[0]
78 movl %eax,(%edi) C store to wp[0]
79 movl PARAM_XSIZE,%ecx C xsize
80 decl %ecx C If xsize = 1, ysize = 1 too
91 movl (%esi),%eax C load next limb at xp[j]
102 movl %ebx,(%edi) C most significant limb of product
103 addl $4,%edi C increment wp
104 movl PARAM_XSIZE,%eax
109 movl PARAM_YSIZE,%eax C ysize
112 movl %eax,VAR_COUNTER C set index i to ysize
115 movl PARAM_YP,%ebp C yp
116 addl $4,%ebp C make ebp point to next y limb
118 movl (%ebp),%eax C copy y limb ...
119 movl %eax,VAR_MULTIPLIER C ... to stack slot
120 movl PARAM_XSIZE,%ecx
134 adcl %edx,%ebx C propagate carry into cylimb
141 movl PARAM_XSIZE,%ecx
156 adcl %eax,%ebp C new lo + cylimb
163 adcl %eax,%ebx C new lo + cylimb
170 adcl %eax,%ebp C new lo + cylimb
175 adcl $0,%ebx C propagate carry into cylimb
186 C we incremented wp and xp in the loop above; compensate
187 movl PARAM_XSIZE,%eax
192 movl VAR_COUNTER,%eax
194 movl %eax,VAR_COUNTER
206 movl %edx,4(%edi) C store to wp[1]