1 dnl AMD64 mpn_invert_limb -- Invert a normalized limb.
3 dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
5 dnl Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
25 C cycles/limb (approx) div
33 C rax rcx rdx rdi rsi r8
C -----------------------------------------------------------------------
C mpn_invert_limb -- compute the reciprocal v = floor((B^2 - 1)/d) - B
C of a normalized limb d (high bit set), B = 2^64: a table lookup gives
C an 11-bit estimate v0, refined by Newton-style steps v0 -> v1 -> v2
C -> v3.  SysV AMD64: d arrives in %rdi; the result is returned in %rax.
C The trailing "C n n n" fields are per-instruction cycle annotations
C for the three microarchitectures headed Kn/C2/Ci on the PROLOGUE line.
C
C NOTE(review): this extract is a sampled subset of the original file.
C Not visible here: the m4 conditionals picking exactly ONE of the three
C table-address loads below, the shift extracting the table index
C d9 = d >> 55 into %rax, several mul/shift steps of the refinement
C (something must write %rdx before the final "add %rdx, %rcx" and set
C CF before the "sbb"), and the EPILOGUE.  Do not assemble as-is.
C -----------------------------------------------------------------------
39 PROLOGUE(mpn_invert_limb) C Kn C2 Ci
40 mov %rdi, %rax C 0 0 0
C Three mutually exclusive ways to address approx_tab (PIC via GOT,
C PIC via RIP-relative lea, non-PIC absolute immediate); the m4
C selection lines are not visible in this extract.
44 mov approx_tab@GOTPCREL(%rip), %r8
47 lea -512+approx_tab(%rip), %r8
49 movabs $-512+approx_tab, %r8
C v0 = approx_tab[d9 - 256]: entries are 16 bits wide, and the -512
C bias above cancels the 2*256 byte offset of the smallest index.
51 movzwl (%r8,%rax,2), R32(%rcx) C %rcx = v0
53 C v1 = (v0 << 11) - ((v0*v0*d40) >> 40) - 1
54 mov %rdi, %rsi C 0 0 0
55 mov R32(%rcx), R32(%rax) C 4 5 5
56 imul R32(%rcx), R32(%rcx) C 4 5 5
59 imul %rsi, %rcx C 8 10 8
60 shr $40, %rcx C 12 15 11
61 sal $11, R32(%rax) C 5 6 6
63 sub R32(%rcx), R32(%rax) C %rax = v1
65 C v2 = (v1 << 13) + ((v1 * (2^60 - v1*d40)) >> 47)
66 mov $0x1000000000000000, %rcx
67 imul %rax, %rsi C 14 17 13
C NOTE(review): the sub/mul/shift steps completing the v2 formula above
C are missing from this extract; only the final add is visible.
72 add %rax, %rcx C %rcx = v2
74 C v3 = (v2 << 31) + ((v2 * (2^96 - v2*d63 + ((v2>>1) & mask))) >> 65)
75 mov %rdi, %rsi C 0 0 0
C NOTE(review): an instruction setting CF from the low bit of d
C (e.g. a right shift of %rsi) presumably sits before the sbb in the
C full file -- nothing visible here sets the CF that sbb consumes.
77 sbb %rax, %rax C -d0 = -(d mod 2)
78 sub %rax, %rsi C d63 = ceil(d/2)
C %rax is the all-ones/all-zeros mask -d0 here, so the AND yields
C v2 when d is odd and 0 when d is even.
80 and %rcx, %rax C v2 * d0
81 shr $1, %rax C (v2>>1) * d0
82 sub %rsi, %rax C (v2>>1) * d0 - v2 * d63
C NOTE(review): the widening multiply whose high half (%rdx) is added
C next is not visible in this extract.
86 add %rdx, %rcx C %rcx = v3
C -----------------------------------------------------------------------
C approx_tab: 256 16-bit entries indexed by the top nine bits
C d9 = d >> 55 of the normalized divisor (d9 in [256, 511]; the lookup
C code applies a -512 byte bias when addressing).  The values are
C consistent with approx_tab[d9 - 256] = floor((2^19 - 3*2^8) / d9),
C the 11-bit initial reciprocal estimate: both visible endpoints check,
C 523520/256 = 2045 = 0x7fd and floor(523520/511) = 1024 = 0x400.
C NOTE(review): the label/section line introducing this table is not
C visible in this extract.
C -----------------------------------------------------------------------
101 .value 0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
102 .value 0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
103 .value 0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
104 .value 0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
105 .value 0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
106 .value 0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
107 .value 0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
108 .value 0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
109 .value 0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
110 .value 0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
111 .value 0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
112 .value 0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
113 .value 0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
114 .value 0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
115 .value 0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
116 .value 0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
117 .value 0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
118 .value 0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
119 .value 0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
120 .value 0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
121 .value 0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
122 .value 0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
123 .value 0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
124 .value 0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
125 .value 0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
126 .value 0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
127 .value 0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
128 .value 0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
129 .value 0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
130 .value 0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
131 .value 0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
132 .value 0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400