Upload Tizen:Base source
[external/gmp.git] / mpn / x86 / invert_limb.asm
1  dnl  x86 mpn_invert_limb
2
3 dnl  Contributed to the GNU project by Niels Möller
4
5 dnl  Copyright 2009 Free Software Foundation, Inc.
6 dnl
7 dnl  This file is part of the GNU MP Library.
8 dnl
9 dnl  The GNU MP Library is free software; you can redistribute it and/or
10 dnl  modify it under the terms of the GNU Lesser General Public License as
11 dnl  published by the Free Software Foundation; either version 3 of the
12 dnl  License, or (at your option) any later version.
13 dnl
14 dnl  The GNU MP Library is distributed in the hope that it will be useful,
15 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
16 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 dnl  Lesser General Public License for more details.
18 dnl
19 dnl  You should have received a copy of the GNU Lesser General Public License
20 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22 include(`../config.m4')
23
24 C            cycles (approx)    div
25 C K7:            46             53
26
27 C Register usage:
28
29 C Input D in %edi
30 C Current approximation is in %eax and/or %ecx
31 C %ebx and %edx are temporaries.
32 C %esi and %ebp is unused.
33
34 defframe(PARAM_DIVISOR,4)
35
36 ASM_START()
37
38 C Make approx_tab global to work around Apple relocation bug.
39 ifdef(`DARWIN',`
40         define(`approx_tab', MPN(invert_limb_tab))
41         GLOBL   approx_tab')
42
43         TEXT
44         ALIGN(16)
45 PROLOGUE(mpn_invert_limb)
46 deflit(`FRAME', 0)
47         C Adding the push of %ebp and the corresponding pop seems to
48         C reduce running time from 46 to 43 cycles on K7. Don't know
49         C if this is a benchmark artefact or some alignment issue.
50
51         push    %ebx    FRAME_pushl()
52         C push  %ebp    FRAME_pushl()
53         push    %edi    FRAME_pushl()
54
55         mov     PARAM_DIVISOR, %edi
56         mov     %edi, %eax
57         shr     $22, %eax
58 ifdef(`PIC',`
59         LEA(    approx_tab, %ebx)
60         movzwl  -1024(%ebx, %eax, 2), %eax
61 ',`
62         movzwl  -1024+approx_tab(%eax, %eax), %eax      C %eax = v0
63 ')
64
65         C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
66         mov     %eax, %ecx
67         imul    %eax, %eax
68         mov     %edi, %ebx
69         shr     $11, %ebx
70         inc     %ebx
71         mul     %ebx
72         mov     %edi, %ebx                              C Prepare
73         shr     %ebx
74         sbb     %eax, %eax
75         sub     %eax, %ebx                              C %ebx = d_31, %eax = mask
76         shl     $4, %ecx
77         dec     %ecx
78         sub     %edx, %ecx                              C %ecx = v1
79
80         C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
81         imul    %ecx, %ebx
82         and     %ecx, %eax
83         shr     %eax
84         sub     %ebx, %eax
85         mul     %ecx
86         mov     %edi, %eax                              C Prepare for next mul
87         shl     $15, %ecx
88         shr     %edx
89         add     %edx, %ecx                              C %ecx = v2
90
91         mul     %ecx
92         add     %edi, %eax
93         mov     %ecx, %eax
94         adc     %edi, %edx
95         sub     %edx, %eax                              C %eax = v3
96
97         pop     %edi
98         C pop   %ebp
99         pop     %ebx
100
101         ret
102
103 EPILOGUE()
104
105 DEF_OBJECT(approx_tab,2)
106         .value  0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
107         .value  0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
108         .value  0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
109         .value  0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
110         .value  0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
111         .value  0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
112         .value  0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
113         .value  0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
114         .value  0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
115         .value  0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
116         .value  0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
117         .value  0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
118         .value  0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
119         .value  0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
120         .value  0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
121         .value  0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
122         .value  0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
123         .value  0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
124         .value  0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
125         .value  0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
126         .value  0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
127         .value  0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
128         .value  0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
129         .value  0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
130         .value  0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
131         .value  0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
132         .value  0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
133         .value  0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
134         .value  0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
135         .value  0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
136         .value  0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
137         .value  0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
138         .value  0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
139         .value  0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
140         .value  0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
141         .value  0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
142         .value  0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
143         .value  0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
144         .value  0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
145         .value  0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
146         .value  0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
147         .value  0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
148         .value  0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
149         .value  0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
150         .value  0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
151         .value  0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
152         .value  0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
153         .value  0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
154         .value  0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
155         .value  0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
156         .value  0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
157         .value  0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
158         .value  0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
159         .value  0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
160         .value  0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
161         .value  0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
162         .value  0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
163         .value  0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
164         .value  0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
165         .value  0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
166         .value  0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
167         .value  0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
168         .value  0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
169         .value  0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
170 END_OBJECT(approx_tab)