1 /* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
7 Copyright 2000, 2001, 2003 Free Software Foundation, Inc.
9 This file is part of the GNU MP Library.
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
16 The GNU MP Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19 License for more details.
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
28 #include "mpn/sparc64/sparc64.h"
31 /* 64-bit divisor 32-bit divisor
32 cycles/limb cycles/limb
38 /* There are two key ideas here to reduce mulx's. Firstly when the divisor
39 is 32-bits the high of q*d can be calculated without the two 32x32->64
40 cross-products involving the high 32-bits of the divisor, that being zero
41 of course. Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
42 one mulx (each) knowing the low of q*d is equal to the input limb l.
44 For size==1, a simple udivx is used. This is faster than calculating an
47 For a 32-bit divisor and small sizes, an attempt was made at a simple
48 udivx loop (two per 64-bit limb), but it turned out to be slower than
49 mul-by-inverse. At size==2 the inverse is about 260 cycles total
50 compared to a udivx at 291. Perhaps the latter would suit when size==2
51 but the high 32-bits of the second limb is zero (saving one udivx), but
52 it doesn't seem worth a special case just for that. */
55 mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
57 mp_limb_t inverse, s, s_next, c, l, ls, q;
58 unsigned rshift, lshift;
59 mp_limb_t lshift_mask;
63 ASSERT (divisor != 0);
64 ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
65 ASSERT_MPN (src, size);
66 ASSERT_LIMB (divisor);
68 s = *src++; /* src low limb */
76 if ((divisor & 1) == 0)
78 count_trailing_zeros (rshift, divisor);
84 binvert_limb (inverse, divisor);
88 /* When rshift==0, lshift==64: shifting a limb by its full width is undefined in C, so the s_next term must be masked to zero in that case */
89 lshift_mask = (rshift == 0 ? 0 : MP_LIMB_T_MAX);
92 divisor_h = HIGH32 (divisor);
100 ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
103 SUBC_LIMB (c, l, ls, c);
108 umul_ppmm_half_lowequal (l, q, divisor, l);
123 mp_limb_t divisor_l = LOW32 (divisor);
127 ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
130 SUBC_LIMB (c, l, ls, c);
135 umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);