From: Sunil K Pandey
Date: Wed, 12 Jan 2022 19:02:19 +0000 (-0800)
Subject: x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]
X-Git-Tag: glibc-2.35~101
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=49e2bf58d57758df244eb621d63cedd2ab6d1971;p=platform%2Fupstream%2Fglibc.git

x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

This patch fixes SSE4.2 libmvec atan2 function accuracy for the
following inputs to less than 4 ulps.

  {0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
  {0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu
---

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
index 4983051..138ff2f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
@@ -65,7 +65,7 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
 	subq	$88, %rsp
 	cfi_def_cfa_offset(96)
-	movaps	%xmm0, %xmm8
+	movaps	%xmm1, %xmm11
 /*
  * #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
  * Cannot be replaced by VQRCP(D, dR0, dB);
  * Argument Absolute values
  */
-	movups	dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+	movups	dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+	movaps	%xmm0, %xmm10
 	movaps	%xmm1, %xmm9
-	movaps	%xmm4, %xmm1
-	andps	%xmm8, %xmm4
-	andps	%xmm9, %xmm1
-	movaps	%xmm4, %xmm2
-	cmpnltpd %xmm1, %xmm2
+	andps	%xmm10, %xmm1
+	andps	%xmm11, %xmm9
+	movaps	%xmm1, %xmm4
+	cmpnltpd %xmm9, %xmm4
 /* Argument signs */
-	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
-	movaps	%xmm2, %xmm0
-	movups	dPIO2+__svml_datan2_data_internal(%rip), %xmm5
-	movaps	%xmm3, %xmm7
-	movaps	%xmm3, %xmm6
+	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+	movaps	%xmm4, %xmm0
+	movaps	%xmm5, %xmm8
+	movaps	%xmm5, %xmm7
 /*
  * 1) If y<x then a= y, b=x, PIO2=0
  * 2) If y>x then a=-x, b=y, PIO2=Pi/2
  */
-	orps	%xmm1, %xmm3
-	movaps	%xmm2, %xmm10
-	andps	%xmm2, %xmm5
-	andnps	%xmm4, %xmm0
-	andps	%xmm2, %xmm3
-	andnps	%xmm1, %xmm10
-	andps	%xmm4, %xmm2
-	orps	%xmm3, %xmm0
-	orps	%xmm2, %xmm10
-	divpd	%xmm10, %xmm0
-	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-
-/* if x<0, dPI = Pi, else dPI =0 */
-	movaps	%xmm9, %xmm3
+	orps	%xmm9, %xmm5
+	andnps	%xmm1, %xmm0
+	andps	%xmm4, %xmm5
+	andps	%xmm11, %xmm8
+	movups	dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+	orps	%xmm5, %xmm0
+	movaps	%xmm4, %xmm5
+	andps	%xmm4, %xmm6
+	andnps	%xmm9, %xmm5
+	andps	%xmm1, %xmm4
+	orps	%xmm4, %xmm5
+	andps	%xmm10, %xmm7
+	divpd	%xmm5, %xmm0
+	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+	xorl	%edx, %edx
 /* Check if y and x are on main path. */
-	pshufd	$221, %xmm1, %xmm12
-	andps	%xmm9, %xmm7
-	psubd	%xmm11, %xmm12
-	andps	%xmm8, %xmm6
-	movq	iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
-	xorl	%edx, %edx
-	movups	%xmm4, 16(%rsp)
+	pshufd	$221, %xmm9, %xmm3
 	xorl	%eax, %eax
-	pshufd	$221, %xmm4, %xmm14
-	movdqa	%xmm12, %xmm4
-	pcmpgtd	%xmm13, %xmm4
-	pcmpeqd	%xmm13, %xmm12
-	por	%xmm12, %xmm4
+	pshufd	$221, %xmm1, %xmm13
+	psubd	%xmm2, %xmm3
+	psubd	%xmm2, %xmm13
+	movdqa	%xmm3, %xmm4
+	movq	iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+	movdqa	%xmm13, %xmm14
+	pcmpgtd	%xmm12, %xmm4
+	pcmpeqd	%xmm12, %xmm3
+	pcmpgtd	%xmm12, %xmm14
+	pcmpeqd	%xmm12, %xmm13
 /* Polynomial. */
 	movaps	%xmm0, %xmm12
+	por	%xmm3, %xmm4
 	mulpd	%xmm0, %xmm12
-	cmplepd	dZERO+__svml_datan2_data_internal(%rip), %xmm3
-	psubd	%xmm11, %xmm14
-	movdqa	%xmm14, %xmm15
-	pcmpeqd	%xmm13, %xmm14
-	pcmpgtd	%xmm13, %xmm15
-	por	%xmm14, %xmm15
-	movaps	%xmm12, %xmm14
-	mulpd	%xmm12, %xmm14
-	por	%xmm15, %xmm4
-	movaps	%xmm14, %xmm15
-	mulpd	%xmm14, %xmm15
-	movmskps %xmm4, %ecx
-	movups	%xmm10, (%rsp)
-	movups	dA19+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm10
-	movups	dA18+__svml_datan2_data_internal(%rip), %xmm13
-	movups	dA17+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm15, %xmm10
-	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm15, %xmm10
-	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm15, %xmm10
-	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm12, %xmm10
-	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	%xmm10, %xmm13
-	mulpd	%xmm11, %xmm12
-	mulpd	%xmm13, %xmm14
-	movups	dA16+__svml_datan2_data_internal(%rip), %xmm2
-	mulpd	%xmm15, %xmm2
-	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm2
-	mulpd	%xmm15, %xmm2
-	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm2
-	mulpd	%xmm15, %xmm2
-	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm2
-
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
-	mulpd	%xmm2, %xmm15
-	addpd	%xmm12, %xmm15
-	addpd	%xmm14, %xmm15
+
+/* P = A19*R2 + A18 */
+	movups	dA19+__svml_datan2_data_internal(%rip), %xmm15
+	movaps	%xmm11, %xmm2
+	mulpd	%xmm12, %xmm15
+	addpd	dA18+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A17 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA17+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A16 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA16+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A15 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A14 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A13 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A12 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A11 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A10 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A09 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A08 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A07 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A06 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A05 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A04 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A03 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A02 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A01 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 */
+	mulpd	%xmm15, %xmm12
 /*
  * Reconstruction.
  * dP=(R+R*dP) + dPIO2
  */
-	mulpd	%xmm0, %xmm15
-	addpd	%xmm15, %xmm0
-	addpd	%xmm5, %xmm0
-	andps	__svml_datan2_data_internal(%rip), %xmm3
+	mulpd	%xmm0, %xmm12
+	addpd	%xmm12, %xmm0
+
+/* if x<0, dPI = Pi, else dPI =0 */
+	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm3
+	por	%xmm13, %xmm14
+	cmplepd	%xmm3, %xmm2
+	addpd	%xmm6, %xmm0
+	andps	__svml_datan2_data_internal(%rip), %xmm2
+	orps	%xmm8, %xmm0
+	addpd	%xmm2, %xmm0
+	por	%xmm14, %xmm4
 	orps	%xmm7, %xmm0
-	addpd	%xmm3, %xmm0
+	movmskps %xmm4, %ecx
 /* Special branch for fast (vector) processing of zero arguments */
-	movups	16(%rsp), %xmm11
-	orps	%xmm6, %xmm0
 	testb	$3, %cl
 /* Go to auxilary branch */
 	jne	L(AUX_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
 /* Return from auxilary branch
  * for out of main path inputs
  */
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
 /* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 /* Restore registers
  * and exit the function
  */
@@ -237,8 +264,8 @@ L(EXIT):
  */
 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm8, 32(%rsp)
-	movups	%xmm9, 48(%rsp)
+	movups	%xmm10, 32(%rsp)
+	movups	%xmm11, 48(%rsp)
 	movups	%xmm0, 64(%rsp)
 	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
  */
 L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
-	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm2
-
 /* Check if both X & Y are not NaNs: iXYnotNAN */
-	movaps	%xmm9, %xmm12
-	movaps	%xmm8, %xmm10
-	cmpordpd %xmm9, %xmm12
-	cmpordpd %xmm8, %xmm10
-	cmpeqpd	%xmm2, %xmm1
-	cmpeqpd	%xmm2, %xmm11
-	andps	%xmm10, %xmm12
-	orps	%xmm11, %xmm1
-	pshufd	$221, %xmm1, %xmm1
-	pshufd	$221, %xmm12, %xmm11
+	movaps	%xmm11, %xmm13
+	movaps	%xmm10, %xmm12
+	cmpordpd %xmm11, %xmm13
+	cmpordpd %xmm10, %xmm12

-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
-	pand	%xmm11, %xmm1
-
-/* Exclude from previous callout mask zero (and not NaN) arguments */
-	movdqa	%xmm1, %xmm13
-	pandn	%xmm4, %xmm13
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+	cmpeqpd	%xmm3, %xmm9
+	cmpeqpd	%xmm3, %xmm1
 /*
  * Path for zero arguments (at least one of both)
  * Check if both args are zeros (den. is zero)
  */
-	movups	(%rsp), %xmm4
-	cmpeqpd	%xmm2, %xmm4
+	cmpeqpd	%xmm3, %xmm5
+	andps	%xmm12, %xmm13
+	orps	%xmm1, %xmm9
+	pshufd	$221, %xmm9, %xmm1
+	pshufd	$221, %xmm13, %xmm9

-/* Go to callout */
-	movmskps %xmm13, %edx
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+	pand	%xmm9, %xmm1
+
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+	movdqa	%xmm1, %xmm14
+	pandn	%xmm4, %xmm14
 /* Set sPIO2 to zero if den. is zero */
-	movaps	%xmm4, %xmm15
-	andps	%xmm2, %xmm4
-	andnps	%xmm5, %xmm15
-	andl	$3, %edx
-	orps	%xmm4, %xmm15
-	pshufd	$221, %xmm9, %xmm5
-	orps	%xmm7, %xmm15
+	movaps	%xmm5, %xmm4
+	andnps	%xmm6, %xmm4
+	andps	%xmm3, %xmm5
 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
-	pshufd	$221, %xmm2, %xmm7
-	pcmpgtd	%xmm5, %xmm7
-	pshufd	$80, %xmm7, %xmm14
-	andps	%xmm3, %xmm14
-	addpd	%xmm14, %xmm15
+	pshufd	$221, %xmm3, %xmm3
+	orps	%xmm5, %xmm4
+	pshufd	$221, %xmm11, %xmm5
+	orps	%xmm8, %xmm4
+	pcmpgtd	%xmm5, %xmm3
+	pshufd	$80, %xmm3, %xmm6
+	andps	%xmm2, %xmm6
+	addpd	%xmm6, %xmm4
+
+/* Go to callout */
+	movmskps %xmm14, %edx
 /* Merge results from main and spec path */
-	pshufd	$80, %xmm1, %xmm3
-	orps	%xmm6, %xmm15
-	movdqa	%xmm3, %xmm6
-	andps	%xmm3, %xmm15
-	andnps	%xmm0, %xmm6
-	movaps	%xmm6, %xmm0
-	orps	%xmm15, %xmm0
+	pshufd	$80, %xmm1, %xmm2
+	orps	%xmm7, %xmm4
+	movdqa	%xmm2, %xmm7
+	andps	%xmm2, %xmm4
+	andnps	%xmm0, %xmm7
+	andl	$3, %edx
+	movaps	%xmm7, %xmm0
+	orps	%xmm4, %xmm0
 /* Return to main vector processing path */
 	jmp	L(AUX_BRANCH_RETURN)
-	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 END(_ZGVbN2vv_atan2_sse4)
 	.section .rodata, "a"
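
For readers following the register changes above, the scalar computation that the kernel's comments describe (argument reduction, a Horner polynomial in R2, reconstruction, the x<0 fix-up and the sign transfer from y) can be sketched in C as below. This is an illustrative model only, not glibc code: the function name atan2_model and the test harness are made up for this note, libm's atan () stands in for the degree-19 dA01..dA19 polynomial stored in __svml_datan2_data_internal, and the kernel's special paths for zero and NaN arguments are not modeled.

/* Scalar model of the algorithm described by the comments in the patched
   kernel.  Illustration only -- this is NOT the libmvec implementation:
   atan () stands in for the degree-19 polynomial (coefficients dA01..dA19
   in __svml_datan2_data_internal), and the kernel's special paths for
   zero/NaN arguments are not modeled here.  */
#include <math.h>
#include <stdio.h>

static double
atan2_model (double y, double x)
{
  double ay = fabs (y), ax = fabs (x);

  /* 1) If |y| <  |x| then a =  |y|, b = |x|, PIO2 = 0
     2) If |y| >= |x| then a = -|x|, b = |y|, PIO2 = Pi/2  */
  double a, b, pio2;
  if (ay < ax)
    {
      a = ay;
      b = ax;
      pio2 = 0.0;
    }
  else
    {
      a = -ax;
      b = ay;
      pio2 = M_PI_2;
    }

  double r = a / b;		/* |r| <= 1 */

  /* Reconstruction: dP = (R + R*dP) + dPIO2.  The kernel builds dP with the
     Horner chain P = A19*R2 + A18, P = P*R2 + A17, ..., P = P*R2 (R2 = r*r);
     atan () replaces that whole step in this model.  */
  double res = atan (r) + pio2;	/* atan2 (|y|, |x|), in [0, Pi/2] */

  /* if x<0, dPI = Pi, else dPI = 0: the kernel ORs in the sign of x and
     then adds dPI, which for negative x amounts to Pi - res.  */
  if (signbit (x))
    res = M_PI - res;

  /* The result carries the sign of y (the kernel ORs the sign bit in).  */
  return copysign (res, y);
}

int
main (void)
{
  /* One of the input pairs quoted in the commit message.  */
  double y = 0x1.bcab29da0e947p-54, x = 0x1.bc41f4d2294b8p-54;
  printf ("model %.17g  libm %.17g\n", atan2_model (y, x), atan2 (y, x));
  return 0;
}

Built with "gcc atan2_model.c -lm", the model agrees with libm's atan2 for the inputs quoted in the commit message. The visible algorithmic change in the regenerated kernel itself is that the old code evaluated the polynomial in three interleaved accumulator chains merged at the end, whereas the new code uses the single Horner chain annotated by the /* P = P*R2 + Axx */ comments.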