/* Function atanf vectorized with AVX-512.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 */
/* Byte offsets of the constant groups inside the data table
   __svml_satan_data_internal_avx512.  The table stores its groups back
   to back: each group is one 64-byte vector (16 x .long) except Tbl_H,
   which holds 32 entries (128 bytes).  */
#define AbsMask                         0
#define Shifter                         64
#define MaxThreshold                    128
#define MOne                            192
#define One                             256
#define LargeX                          320
#define Zero                            384
#define Tbl_H                           448
#define Pi2                             576
#define coeff_1                         640
#define coeff_2                         704
#define coeff_3                         768
/* _ZGVeN16v_atanf_skx: atanf on 16 packed single-precision lanes.

   In:       %zmm0 = x
   Out:      %zmm0 = atan(x), with the sign bit of x applied at the end
   Clobbers: %zmm1-%zmm15, %k1.  Leaf routine: no calls, no stack use.

   Method, as implemented below (all constants come from
   __svml_satan_data_internal_avx512):
     c      = |x| rounded to a multiple of 1/4
     DiffX  = |x| - c
     Y      = 1 + c*|x|
     R      = DiffX / Y        (rcp14 estimate plus one FMA correction)
     result = Tbl_H[4*c] + R + R^3*((coeff_1*R^2 + coeff_2)*R^2 + coeff_3)
   Lanes with |x| >= MaxThreshold instead use DiffX = -1,
   Y = min(|x|, LargeX) and Pi2 in place of the table value.

   NOTE(review): ENTRY/END are not defined in this file; presumably they
   come from glibc's <sysdep.h> -- confirm it is included.  */
        .section .text.exex512,"ax",@progbits
ENTRY(_ZGVeN16v_atanf_skx)
        /* zmm7 = |x|; the sign-clearing mask is the first vector of the
           data table (offset 0).  */
        vandps    __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
        vmovups   MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
        vmovups   One+__svml_satan_data_internal_avx512(%rip), %zmm8

        /* Round to 2 bits after the binary point: zmm5 = DiffX, i.e. |x|
           minus |x| rounded to a multiple of 1/4 (imm 40: keep 2 fraction
           bits, round to nearest, precision exception suppressed).  */
        vreduceps $40, {sae}, %zmm7, %zmm5

        /* Saturate X range: k1 marks lanes with |x| >= MaxThreshold
           (predicate 29 = GE_OQ).  */
        vmovups   LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
        vmovups   Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
        vcmpps    $29, {sae}, %zmm3, %zmm7, %k1

        /* Table lookup sequence.  zmm3 = first 16 entries of Tbl_H.  */
        vmovups   Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
        /* zmm4 = c = |x| - DiffX.  */
        vsubps    {rn-sae}, %zmm5, %zmm7, %zmm4
        /* zmm1 = |x| + Shifter; the low mantissa bits of the sum are the
           table index consumed by vpermt2ps below.  */
        vaddps    {rn-sae}, %zmm2, %zmm7, %zmm1
        /* zmm0 = x ^ |x| = sign bit of x, kept for the final step.  */
        vxorps    %zmm0, %zmm7, %zmm0
        /* zmm8 = Y = 1 + c*|x|.  */
        vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
        vmovups   coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4

        /* If |X| >= MaxThreshold, set DiffX = -1 (zmm9 = k1 ? -1 : DiffX).
           zmm5 is free afterwards and is reloaded with coeff_3.  */
        vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
        vmovups   coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5

        /* If |X| >= MaxThreshold, set Y = min(LargeX, |X|); other lanes
           keep Y (merge masking).  */
        vminps    {sae}, %zmm7, %zmm6, %zmm8{%k1}

        /* R = DiffX / Y, computed on mantissas and exponents separately.
           zmm12/zmm10 = mantissa/exponent of DiffX,
           zmm15/zmm11 = mantissa/exponent of Y.  */
        vgetmantps $0, {sae}, %zmm9, %zmm12
        vgetexpps {sae}, %zmm9, %zmm10
        /* zmm3 = table value picked from the 32 Tbl_H entries (zmm3 holds
           the first 16, the memory operand the second 16).  */
        vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
        vgetmantps $0, {sae}, %zmm8, %zmm15
        vgetexpps {sae}, %zmm8, %zmm11
        vmovups   coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1

        /* Set table value to Pi/2 for large X (zmm9 = k1 ? Pi2 : table).  */
        vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
        /* zmm13 ~= 1 / mant(Y).  */
        vrcp14ps  %zmm15, %zmm13
        /* zmm2 = exp(DiffX) - exp(Y).  */
        vsubps    {rn-sae}, %zmm11, %zmm10, %zmm2
        /* zmm14 = q0 = mant(DiffX) * rcp.  */
        vmulps    {rn-sae}, %zmm13, %zmm12, %zmm14
        /* zmm15 = residual = mant(DiffX) - q0 * mant(Y).  */
        vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
        /* zmm15 = q0 + rcp * residual (refined mantissa quotient).  */
        vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
        /* zmm7 = R = quotient scaled by 2^(exponent difference).  */
        vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7

        /* Polynomial evaluation: zmm8 = R^2, zmm6 = R^3.  */
        vmulps    {rn-sae}, %zmm7, %zmm7, %zmm8
        vmulps    {rn-sae}, %zmm7, %zmm8, %zmm6
        /* zmm4 = coeff_1*R^2 + coeff_2.  */
        vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
        /* zmm8 = zmm4*R^2 + coeff_3.  */
        vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
        /* zmm8 = R^3*zmm8 + R.  */
        vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
        /* Add the table value (or Pi2), then apply the saved sign of x.  */
        vaddps    {rn-sae}, %zmm9, %zmm8, %zmm10
        vxorps    %zmm0, %zmm10, %zmm0
        ret

END(_ZGVeN16v_atanf_skx)
/* Constant table for the AVX-512 atanf kernel.  The groups are stored
   back to back in the order listed in the typedef; every group is a
   multiple of 64 bytes, so aligning the start of the table keeps each
   group 64-byte aligned.  */
        .section .rodata, "a"
        .align 64

/* C view of the table layout.  It sits under an #ifdef so it is only
   compiled when __svml_satan_data_internal_avx512_typedef is defined.  */
#ifdef __svml_satan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(64)) VUINT32 AbsMask[16][1];
        __declspec(align(64)) VUINT32 Shifter[16][1];
        __declspec(align(64)) VUINT32 MaxThreshold[16][1];
        __declspec(align(64)) VUINT32 MOne[16][1];
        __declspec(align(64)) VUINT32 One[16][1];
        __declspec(align(64)) VUINT32 LargeX[16][1];
        __declspec(align(64)) VUINT32 Zero[16][1];
        __declspec(align(64)) VUINT32 Tbl_H[32][1];
        __declspec(align(64)) VUINT32 Pi2[16][1];
        __declspec(align(64)) VUINT32 coeff[3][16][1];
} __svml_satan_data_internal_avx512;
#endif
__svml_satan_data_internal_avx512:
        /*== AbsMask: clears the sign bit ==*/
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /*== Shifter: 2^21 ==*/
        .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
        /*== MaxThreshold: 7.75 ==*/
        .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
        /*== MOne: -1.0 ==*/
        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
        /*== One: 1.0 ==*/
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== LargeX: 2^32 ==*/
        .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
        /*== Zero ==*/
        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
        /*== Tbl_H: 32 entries, atan(k/4) for k = 0..31 ==*/
        .long 0x00000000, 0x3e7adbb0
        .long 0x3eed6338, 0x3f24bc7d
        .long 0x3f490fdb, 0x3f6563e3
        .long 0x3f7b985f, 0x3f869c79
        .long 0x3f8db70d, 0x3f93877b
        .long 0x3f985b6c, 0x3f9c6b53
        .long 0x3f9fe0bb, 0x3fa2daa4
        .long 0x3fa57088, 0x3fa7b46f
        .long 0x3fa9b465, 0x3fab7b7a
        .long 0x3fad1283, 0x3fae809e
        .long 0x3fafcb99, 0x3fb0f836
        .long 0x3fb20a6a, 0x3fb30581
        .long 0x3fb3ec43, 0x3fb4c10a
        .long 0x3fb585d7, 0x3fb63c64
        .long 0x3fb6e62c, 0x3fb78478
        .long 0x3fb81868, 0x3fb8a2f5
        /*== Pi2: pi/2 ==*/
        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
        /*== coeff: coeff_1, coeff_2, coeff_3 (one 16-lane vector each) ==*/
        .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
        .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
        .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
        .type __svml_satan_data_internal_avx512,@object
        .size __svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512