1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
3 dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
36 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
39 C * Apply the movzwl tricks to the x86/k7 code
40 C * Review feed-in and wind-down code. In particular, try to avoid adc and
41 C sbb to placate Pentium4.
42 C * More unrolling and/or index addressing could bring time to under 1 c/l
43 C for Athlon64; approaching 0.67 c/l seems possible.
44 C * There are recurrences on the carry registers (r8, r9, r10) that might
45 C be the limiting factor for the Pentium4 speed. Splitting these into 6
46 C registers would help.
47 C * For ultimate Athlon64 performance, a sequence like this might be best.
48 C It should reach 0.5 c/l (limited by L1 cache bandwidth).
64 PROLOGUE(mpn_mod_34lsub1)
66 mov $0x0000FFFFFFFFFFFF, %r11
77 shr $48, %rax C src[0] high
79 and %r11, %rdx C src[0] low
83 shr $32, %rsi C src[1] high
86 shl $16, %rdx C src[1] low
93 L(gt2): xor %eax, %eax
100 L(top): add (%rdi), %rax
123 L(end): add %r9, %rax
137 mov $0x100000000, %r10
140 sbb %rsi, %rsi C carry
141 mov %rax, %rdi C 0mod3
142 shr $48, %rax C 0mod3 high
144 and %r10, %rsi C carry masked
145 and %r11, %rdi C 0mod3 low
146 mov %ecx, %r10d C 1mod3
148 add %rsi, %rax C apply carry
149 shr $32, %rcx C 1mod3 high
151 add %rdi, %rax C apply 0mod3 low
152 movzwl %dx, %edi C 2mod3
153 shl $16, %r10 C 1mod3 low
155 add %rcx, %rax C apply 1mod3 high
156 shr $16, %rdx C 2mod3 high
158 add %r10, %rax C apply 1mod3 low
159 shl $32, %rdi C 2mod3 low
161 add %rdx, %rax C apply 2mod3 high
162 add %rdi, %rax C apply 2mod3 low