1 /* Function atan vectorized with AVX-512.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
31 /* Offsets for data table __svml_datan_data_internal_avx512
35 #define MaxThreshold 128
53 .section .text.evex512,"ax",@progbits
/*
 * _ZGVeN8v_atan_skx
 *
 * Reads packed doubles x from %zmm0 and leaves the result in %zmm0.
 * %zmm1-%zmm15, %k1 and %k2 are overwritten as scratch.
 *
 * Computation carried out below, per lane:
 *   c      = |x| rounded to a multiple of 0.25
 *   R      = (|x| - c) / (1.0 + c*|x|)     when |x| <  MaxThreshold
 *   R      = -1.0 / min(|x|, LargeX)       when |x| >= MaxThreshold
 *   base   = Tbl_H[4*c], or Pi2 when |x| >= MaxThreshold
 *   result = base + R + R^3 * P(R^2), with the sign bit of x xor-ed back in
 * where P is the degree-5 polynomial in R^2 built from coeff_1..coeff_6.
 * Every constant is addressed as an offset from
 * __svml_datan_data_internal_avx512.
 */
54 ENTRY(_ZGVeN8v_atan_skx)
/* zmm4 = Shifter, zmm3 = MaxThreshold, zmm9 = 1.0 (becomes Y below) */
55 vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4
56 vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3
57 vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9
59 /* saturate X range */
/* zmm7 = LargeX, the clamp applied to |x| in the large-argument lanes */
60 vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7
/* zmm8 = |x|. Offset 0 of the table is its first field, AbsMask */
61 vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8
/* zmm15 = .FLT_10 (1.0) in every lane, seed for the reciprocal error term */
64 vbroadcastsd .FLT_10(%rip), %zmm15
/* zmm2 = |x| + Shifter. The low mantissa bits now hold the index 4*c */
65 vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2
/* zmm1 = x xor |x|, i.e. only the sign bit of x, reapplied at the end */
66 vxorpd %zmm0, %zmm8, %zmm1
/* k2 = (|x| >= MaxThreshold). Predicate 29 is GE_OQ */
67 vcmppd $29, {sae}, %zmm3, %zmm8, %k2
69 /* round to 2 bits after binary point */
/* zmm6 = DiffX = |x| minus |x| rounded to 2 fraction bits */
70 vreducepd $40, {sae}, %zmm8, %zmm6
/* zmm5 = c = (|x| + Shifter) - Shifter, the rounded |x| itself */
71 vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5
74 * if|X|>=MaxThreshold, set DiffX=-1
75 * VMSUB(D, DiffX, LargeMask, Zero, One);
/* zmm10 = numerator: -1.0 in k2 lanes, DiffX elsewhere */
77 vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2}
/* zmm9 = Y = 1.0 + c*|x|, the denominator for the non-large lanes */
78 vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9
/* zmm5 reused: dIndexMed, the shifted value that corresponds to index 16 */
79 vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5
81 /* table lookup sequence */
/* zmm6 = Tbl_H entries 0..7 (first source of the low-half permute) */
82 vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6
/* Split the numerator: zmm14 = mantissa in [1,2), zmm11 = exponent */
83 vgetmantpd $0, {sae}, %zmm10, %zmm14
84 vgetexppd {sae}, %zmm10, %zmm11
/* zmm10 reused: coeff_5 */
85 vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10
88 * if|X|>=MaxThreshold, set Y=X
89 * VMADD(D, Y, LargeMask, X, Zero);
/* k2 lanes only (merge masking): Y = min(LargeX, |x|) */
91 vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2}
/* k1 = (index >= 16), selects the upper half of Tbl_H */
92 vcmppd $29, {sae}, %zmm5, %zmm2, %k1
/* zmm7 reused: Tbl_H entries 16..23. zmm8 reused: coeff_1 */
93 vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7
94 vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8
/* Split the denominator: zmm3 = mantissa of Y, zmm12 = exponent of Y */
95 vgetmantpd $0, {sae}, %zmm9, %zmm3
96 vgetexppd {sae}, %zmm9, %zmm12
/* zmm9 reused: coeff_3 */
97 vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9
/* zmm6 = Tbl_H[index & 15], chosen from entries 0..15 by the low bits of zmm2 */
98 vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6
/* zmm4 = exponent(numerator) - exponent(Y), applied by vscalefpd below */
99 vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4
/* zmm7 = Tbl_H[16 + (index & 15)], chosen from entries 16..31 */
100 vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7
/* zmm13 = r0, the approximate reciprocal of mantissa(Y) */
101 vrcp14pd %zmm3, %zmm13
/* zmm12 reused: coeff_4. zmm11 reused: coeff_6 */
102 vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12
103 vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11
/* zmm2 = table entry: upper-half value where k1 is set, lower-half otherwise */
104 vblendmpd %zmm7, %zmm6, %zmm2{%k1}
/* zmm0 = q0 = mantissa(numerator) * r0, first quotient estimate */
105 vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0
/* zmm15 = e = 1.0 - r0 * mantissa(Y) */
106 vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
/* zmm3 = residual = mantissa(numerator) - q0 * mantissa(Y) */
107 vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3
/* zmm15 = e + e*e */
108 vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15
/* zmm15 = r1 = r0 + r0 * (e + e*e), the refined reciprocal */
109 vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15
/* zmm3 = q0 + residual * r1, the corrected mantissa quotient */
110 vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3
/* zmm0 = R = quotient scaled by 2^(exponent difference in zmm4) */
111 vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0
113 /* set table value to Pi/2 for large X */
/* zmm3 = base term: Pi2 in k2 lanes, the Tbl_H entry elsewhere */
114 vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2}
/* zmm2 reused: coeff_2 */
115 vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2
117 /* polynomial evaluation */
/* zmm14 = R^2, zmm13 = R^4, zmm15 = R^3 */
118 vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14
119 vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13
120 vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15
/* Three independent pairs: zmm2 = coeff_1*R^2 + coeff_2 */
121 vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2
/* zmm12 = coeff_3*R^2 + coeff_4 */
122 vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12
/* zmm14 = coeff_5*R^2 + coeff_6 (R^2 itself is no longer needed) */
123 vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14
/* Combine the pairs with R^4: zmm2 = zmm2*R^4 + zmm12 */
124 vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2
/* zmm2 = zmm2*R^4 + zmm14 = P(R^2) */
125 vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2
/* zmm2 = R + R^3 * P(R^2) */
126 vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2
/* zmm0 = base term + polynomial term */
127 vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0
/* Put back the sign bit of x saved in zmm1 */
128 vxorpd %zmm1, %zmm0, %zmm0
131 END(_ZGVeN8v_atan_skx)
133 .section .rodata, "a"
/*
 * Layout of the constant table that follows. The typedef is seen by the
 * compiler only when __svml_datan_data_internal_avx512_typedef is defined.
 * Each [8][2] field is 8 doubles stored as pairs of 32-bit words, i.e. 64
 * bytes, and each field is declared with 64-byte alignment.
 * Byte offsets derived from the field sizes:
 *   AbsMask 0, Shifter 64, MaxThreshold 128, MOne 192, One 256,
 *   LargeX 320, Zero 384, Tbl_H 448, dIndexMed 704, Pi2 768, coeff 832.
 * NOTE(review): confirm these against the offset macros near the top of
 * the file. The MaxThreshold value matches the macro that defines it as 128.
 */
136 #ifdef __svml_datan_data_internal_avx512_typedef
137 typedef unsigned int VUINT32;
/* Mask that keeps everything except the sign bit */
139 __declspec(align(64)) VUINT32 AbsMask[8][2];
/* Added to and subtracted from |x| to round it and expose the table index */
140 __declspec(align(64)) VUINT32 Shifter[8][2];
/* Lanes with |x| at or above this value take the large-argument path */
141 __declspec(align(64)) VUINT32 MaxThreshold[8][2];
/* -1.0 and 1.0 */
142 __declspec(align(64)) VUINT32 MOne[8][2];
143 __declspec(align(64)) VUINT32 One[8][2];
/* Upper clamp for |x| on the large-argument path */
144 __declspec(align(64)) VUINT32 LargeX[8][2];
145 __declspec(align(64)) VUINT32 Zero[8][2];
/* 32 table entries (256 bytes), read as four 64-byte groups by the code */
146 __declspec(align(64)) VUINT32 Tbl_H[32][2];
/* Shifted value marking index 16, the split between the table halves */
147 __declspec(align(64)) VUINT32 dIndexMed[8][2];
/* Base term used instead of a Tbl_H entry on the large-argument path */
148 __declspec(align(64)) VUINT32 Pi2[8][2];
/* Six polynomial coefficient rows. The code loads them as coeff_1..coeff_6. */
/* NOTE(review): presumably row N-1 is coeff_N, confirm against the offset macros */
149 __declspec(align(64)) VUINT32 coeff[6][8][2];
150 } __svml_datan_data_internal_avx512;
/*
 * Constant table for _ZGVeN8v_atan_skx. Fields appear in the same order
 * as in the layout typedef. Every per-lane constant is repeated 8 times
 * so it can be loaded as one full 512-bit vector.
 */
152 __svml_datan_data_internal_avx512:
/* AbsMask: all bits except the sign bit */
154 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/* Shifter: 1.5 * 2^50. Adding it to |x| rounds |x| to a multiple of 0.25 */
/* and leaves 4 * (rounded |x|) in the low mantissa bits */
157 .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
158 /*== MaxThreshold ==*/
/* 7.875. Below it |x| rounds to at most 7.75, the last Tbl_H entry */
160 .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
/* MOne: -1.0 */
163 .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
/* One: 1.0 */
166 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/* LargeX: 2^128 */
169 .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
/* Zero: 0.0 */
172 .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
/* Tbl_H: 32 entries, two per line. Entry k is atan(k * 0.25), for example */
/* entry 0 is 0.0, entry 2 is atan(0.5) and entry 4 is pi/4 */
175 .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
176 .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
177 .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
178 .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
179 .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
180 .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
181 .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
182 .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
183 .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
184 .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
185 .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
186 .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
187 .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
188 .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
189 .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
190 .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
/* dIndexMed: the Shifter bit pattern plus 0x10, i.e. table index 16 */
193 .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
/* Pi2: pi/2 */
196 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
/* coeff: six polynomial coefficient rows, alternating in sign. */
/* The last row is close to -1/3 and the row before it is close to 1/5 */
199 .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
200 .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
201 .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
202 .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
203 .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
204 .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
/* Record the symbol type and the total size of the table */
206 .type __svml_datan_data_internal_avx512,@object
207 .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
211 .long 0x00000000,0x3ff00000
212 .type .FLT_10,@object