[external/gmp.git] mpn/x86_64/invert_limb.asm
dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.

dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.

dnl  Copyright 2004, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C            cycles/limb (approx)       div
C K8,K9:         48                      71
C K10:           48                      77
C P4:           135                     161
C P6 core2:      69                     116
C P6 corei7:     55                      89
C P6 atom:      129                     191

C rax rcx rdx rdi rsi r8
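
C  Input:  d in %rdi, normalized (most significant bit set).
C  Output: %rax = floor((B^2 - 1)/d) - B, where B = 2^64.
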
ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_invert_limb)               C                       Kn      C2      Ci
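        C Kn, C2, Ci: approximate cycle counts on K8/K9, Core 2 and Core i7
        C (cf. the table above)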
        mov     %rdi, %rax              C                        0       0       0
        shr     $55, %rax               C                        1       1       1
ifdef(`PIC',`
ifdef(`DARWIN',`
        mov     approx_tab@GOTPCREL(%rip), %r8
        add     $-512, %r8
',`
        lea     -512+approx_tab(%rip), %r8
')',`
        movabs  $-512+approx_tab, %r8
')
        movzwl  (%r8,%rax,2), R32(%rcx) C       %rcx = v0
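        C v0 is an 11-bit initial approximation indexed by the 9 high bits
        C of d (d >> 55 is in [256,511] for normalized d); the -512 byte
        C offset above compensates for that index base with 2-byte entries.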

        C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
        mov     %rdi, %rsi              C                        0       0       0
        mov     R32(%rcx), R32(%rax)    C                        4       5       5
        imul    R32(%rcx), R32(%rcx)    C                        4       5       5
        shr     $24, %rsi               C                        1       1       1
        inc     %rsi                    C       %rsi = d40
        imul    %rsi, %rcx              C                        8      10       8
        shr     $40, %rcx               C                       12      15      11
        sal     $11, R32(%rax)          C                        5       6       6
        dec     R32(%rax)
        sub     R32(%rcx), R32(%rax)    C       %rax = v1

        C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
        mov     $0x1000000000000000, %rcx
        imul    %rax, %rsi              C                       14      17      13
        sub     %rsi, %rcx
        imul    %rax, %rcx
        sal     $13, %rax
        shr     $47, %rcx
        add     %rax, %rcx              C       %rcx = v2

        C v3 = (v2 << 31) + (v2 * (2^96 - v2*d63 + ((v2>>1) & mask)) >> 65)
        C      where d63 = ceil(d/2) and mask = -(d mod 2)
        mov     %rdi, %rsi              C                        0       0       0
        shr     $1, %rsi                C d/2
        sbb     %rax, %rax              C -d0 = -(d mod 2)
        sub     %rax, %rsi              C d63 = ceil(d/2)
        imul    %rcx, %rsi              C v2 * d63
        and     %rcx, %rax              C v2 * d0
        shr     $1, %rax                C (v2>>1) * d0
        sub     %rsi, %rax              C (v2>>1) * d0 - v2 * d63
        mul     %rcx
        sal     $31, %rcx
        shr     $1, %rdx
        add     %rdx, %rcx              C       %rcx = v3

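        C Final adjustment: v = v3 - ((v3 + 1) * d >> 64) - d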
        mov     %rdi, %rax
        mul     %rcx
        add     %rdi, %rax
        mov     %rcx, %rax
        adc     %rdi, %rdx
        sub     %rdx, %rax

        ret
EPILOGUE()

        RODATA
        ALIGN(2)
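C  The entries below appear to be floor((2^19 - 3*2^8) / i) for i = 256..511,
C  i.e. an 11-bit approximation of 2^19/i; e.g. the first entry is
C  0x7fd = floor(523520/256) and the last is 0x400 = floor(523520/511).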
approx_tab:
        .value  0x7fd,0x7f5,0x7ed,0x7e5,0x7dd,0x7d5,0x7ce,0x7c6
        .value  0x7bf,0x7b7,0x7b0,0x7a8,0x7a1,0x79a,0x792,0x78b
        .value  0x784,0x77d,0x776,0x76f,0x768,0x761,0x75b,0x754
        .value  0x74d,0x747,0x740,0x739,0x733,0x72c,0x726,0x720
        .value  0x719,0x713,0x70d,0x707,0x700,0x6fa,0x6f4,0x6ee
        .value  0x6e8,0x6e2,0x6dc,0x6d6,0x6d1,0x6cb,0x6c5,0x6bf
        .value  0x6ba,0x6b4,0x6ae,0x6a9,0x6a3,0x69e,0x698,0x693
        .value  0x68d,0x688,0x683,0x67d,0x678,0x673,0x66e,0x669
        .value  0x664,0x65e,0x659,0x654,0x64f,0x64a,0x645,0x640
        .value  0x63c,0x637,0x632,0x62d,0x628,0x624,0x61f,0x61a
        .value  0x616,0x611,0x60c,0x608,0x603,0x5ff,0x5fa,0x5f6
        .value  0x5f1,0x5ed,0x5e9,0x5e4,0x5e0,0x5dc,0x5d7,0x5d3
        .value  0x5cf,0x5cb,0x5c6,0x5c2,0x5be,0x5ba,0x5b6,0x5b2
        .value  0x5ae,0x5aa,0x5a6,0x5a2,0x59e,0x59a,0x596,0x592
        .value  0x58e,0x58a,0x586,0x583,0x57f,0x57b,0x577,0x574
        .value  0x570,0x56c,0x568,0x565,0x561,0x55e,0x55a,0x556
        .value  0x553,0x54f,0x54c,0x548,0x545,0x541,0x53e,0x53a
        .value  0x537,0x534,0x530,0x52d,0x52a,0x526,0x523,0x520
        .value  0x51c,0x519,0x516,0x513,0x50f,0x50c,0x509,0x506
        .value  0x503,0x500,0x4fc,0x4f9,0x4f6,0x4f3,0x4f0,0x4ed
        .value  0x4ea,0x4e7,0x4e4,0x4e1,0x4de,0x4db,0x4d8,0x4d5
        .value  0x4d2,0x4cf,0x4cc,0x4ca,0x4c7,0x4c4,0x4c1,0x4be
        .value  0x4bb,0x4b9,0x4b6,0x4b3,0x4b0,0x4ad,0x4ab,0x4a8
        .value  0x4a5,0x4a3,0x4a0,0x49d,0x49b,0x498,0x495,0x493
        .value  0x490,0x48d,0x48b,0x488,0x486,0x483,0x481,0x47e
        .value  0x47c,0x479,0x477,0x474,0x472,0x46f,0x46d,0x46a
        .value  0x468,0x465,0x463,0x461,0x45e,0x45c,0x459,0x457
        .value  0x455,0x452,0x450,0x44e,0x44b,0x449,0x447,0x444
        .value  0x442,0x440,0x43e,0x43b,0x439,0x437,0x435,0x432
        .value  0x430,0x42e,0x42c,0x42a,0x428,0x425,0x423,0x421
        .value  0x41f,0x41d,0x41b,0x419,0x417,0x414,0x412,0x410
        .value  0x40e,0x40c,0x40a,0x408,0x406,0x404,0x402,0x400
ASM_END()