1 dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3 dnl Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C Pentium4: 1.0 cycles/limb
26 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
30 C There might be a couple of cycles to save by using plain integer code for
31 C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
32 C about 46 (inclusive of some function call overheads).
dnl Frame offsets of the two arguments, matching the order of the C prototype
dnl above: src is declared at offset 4 and size at offset 8.
34 defframe(PARAM_SIZE, 8)
35 defframe(PARAM_SRC, 4)
37 dnl re-use parameter space
dnl SAVE_EBX and SAVE_ESI are defined as aliases for the PARAM_SRC and
dnl PARAM_SIZE slots respectively, so no additional frame slots are declared
dnl for them.  NOTE(review): presumably these hold the caller's ebx and esi
dnl once the parameters have been loaded -- confirm against the function body.
38 define(SAVE_EBX, `PARAM_SRC')
39 define(SAVE_ESI, `PARAM_SIZE')
43 PROLOGUE(mpn_mod_34lsub1)
56 shrl $24, %eax C src[0] high
58 andl $0x00FFFFFF, %ecx C src[0] low
64 shrl $16, %ecx C src[1] low
67 andl $0x00FFFF00, %edx C src[1] high
80 psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits
83 psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits
88 C ecx counter, size-2 to 0, -1 or -2
89 C edx src, incrementing
97 C mm6 0x0000000000FFFFFF
98 C mm7 0x00000000FFFFFFFF
114 C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
117 js L(combine) C 0 more
122 jz L(combine) C 1 more
128 movq %mm7, %mm3 C low halves
137 psrlq $32, %mm0 C high halves
141 paddq %mm0, %mm4 C fold high halves to give 33 bits each
145 psllq $8, %mm4 C combine at respective offsets
148 paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits
150 pand %mm3, %mm6 C fold at 24 bits
156 ASSERT(z, C nothing left in high dword