1 dnl x86 mpn_invert_limb
3 dnl Contributed to the GNU project by Niels Möller
5 dnl Copyright 2009 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or
10 dnl modify it under the terms of the GNU Lesser General Public License as
11 dnl published by the Free Software Foundation; either version 3 of the
12 dnl License, or (at your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful,
15 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 dnl Lesser General Public License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
30 C Current approximation is in %eax and/or %ecx
31 C %ebx and %edx are temporaries.
32 C %esi and %ebp is unused.
34 defframe(PARAM_DIVISOR,4)
38 C Make approx_tab global to work around Apple relocation bug.
40 define(`approx_tab', MPN(invert_limb_tab))
45 PROLOGUE(mpn_invert_limb)
47 C Adding the push of %ebp and the corresponding pop seems to
48 C reduce running time from 46 to 43 cycles on K7. Don't know
49 C if this is a benchmark artefact or some alignment issue.
51 push %ebx FRAME_pushl()
52 C push %ebp FRAME_pushl()
53 push %edi FRAME_pushl()
55 mov PARAM_DIVISOR, %edi
59 LEA( approx_tab, %ebx)
60 movzwl -1024(%ebx, %eax, 2), %eax
62 movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0
65 C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
72 mov %edi, %ebx C Prepare
75 sub %eax, %ebx C %ebx = d_31, %eax = mask
78 sub %edx, %ecx C %ecx = v1
80 C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
86 mov %edi, %eax C Prepare for next mul
89 add %edx, %ecx C %ecx = v2
95 sub %edx, %eax C %eax = v3
105 DEF_OBJECT(approx_tab,2)
106 .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
107 .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
108 .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
109 .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
110 .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
111 .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
112 .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
113 .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
114 .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
115 .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
116 .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
117 .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
118 .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
119 .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
120 .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
121 .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
122 .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
123 .value 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
124 .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
125 .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
126 .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
127 .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
128 .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
129 .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
130 .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
131 .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
132 .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
133 .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
134 .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
135 .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
136 .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
137 .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
138 .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
139 .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
140 .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
141 .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
142 .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
143 .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
144 .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
145 .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
146 .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
147 .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
148 .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
149 .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
150 .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
151 .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
152 .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
153 .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
154 .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
155 .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
156 .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
157 .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
158 .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
159 .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
160 .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
161 .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
162 .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
163 .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
164 .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
165 .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
166 .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
167 .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
168 .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
169 .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
170 END_OBJECT(approx_tab)