1 dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3 dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
26 C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
27 C with the current carry handling scheme.
29 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
31 C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
32 C into 2mod3, but at that point going into a separate carries total so we
33 C don't keep the carry flag live across the loop control. Avoiding decl
34 C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
dnl Argument slots for mpn_mod_34lsub1 (src, size): src is declared at
dnl frame offset 4 and size at frame offset 8.
dnl NOTE(review): defframe is not defined in this file (presumably it comes
dnl in through config.m4) -- confirm the offsets are relative to the stack
dnl pointer at function entry.
37 defframe(PARAM_SIZE, 8)
38 defframe(PARAM_SRC, 4)
40 dnl re-use parameter space
dnl SAVE_EBX and SAVE_ESI expand to the two parameter slots above, so a
dnl store to either one overwrites the matching argument; each argument has
dnl to be read before its slot is written.
dnl NOTE(review): presumably these hold the caller values of ebx and esi
dnl while the routine uses those registers -- verify against the full body.
41 define(SAVE_EBX, `PARAM_SIZE')
42 define(SAVE_ESI, `PARAM_SRC')
46 PROLOGUE(mpn_mod_34lsub1)
52 subl $2, %ecx C size-2
53 movl (%edx), %eax C src[0]
59 movl 4(%edx), %ecx C src[1]
61 movl %eax, %edx C src[0]
62 shrl $24, %eax C src[0] high
64 andl $0xFFFFFF, %edx C src[0] low
67 movl %ecx, %edx C src[1]
68 shrl $16, %ecx C src[1] high
73 shll $8, %edx C src[1] low
81 C eax src[0], initial acc 0mod3
90 movl 4(%edx), %ebx C src[1], initial 1mod3
91 subl $3, %ecx C size-5
94 movl 8(%edx), %esi C src[2], initial 2mod3
96 pushl %edi FRAME_pushl()
97 movl $0, %edi C initial carries 0mod3
98 jng L(done) C if size < 6
107 C edi carries into 0mod3
117 jg L(top) C at least 3 more to process
121 C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
123 jl L(done_0) C if -2, meaning 0 more limbs
127 je L(done_1) C if -1, meaning 1 more limb only
130 addl 12(%edx), %eax C 0mod3
131 adcl %ecx, %ebx C 1mod3
132 adcl $0, %esi C 2mod3
133 adcl $0, %edi C carries 0mod3
144 movl %eax, %ecx C 0mod3
145 shrl $24, %eax C 0mod3 high initial total
147 andl $0xFFFFFF, %ecx C 0mod3 low
148 movl %edi, %edx C carries
149 shrl $24, %edi C carries high
151 addl %ecx, %eax C add 0mod3 low
152 andl $0xFFFFFF, %edx C carries 0mod3 low
153 movl %ebx, %ecx C 1mod3
155 shrl $16, %ebx C 1mod3 high
156 addl %edi, %eax C add carries high
157 addl %edx, %eax C add carries 0mod3 low
159 andl $0xFFFF, %ecx C 1mod3 low mask
160 addl %ebx, %eax C add 1mod3 high
163 shll $8, %ecx C 1mod3 low
164 movl %esi, %edx C 2mod3
165 popl %edi FRAME_popl()
167 shrl $8, %esi C 2mod3 high
168 andl $0xFF, %edx C 2mod3 low mask
169 addl %ecx, %eax C add 1mod3 low
171 shll $16, %edx C 2mod3 low
172 addl %esi, %eax C add 2mod3 high
175 addl %edx, %eax C add 2mod3 low