2012-04-22 Mike Frysinger <vapier@gentoo.org>
+ * sysdeps/ia64/fpu/e_acosf.S: Trim trailing whitespace.
+ * sysdeps/ia64/fpu/e_acoshl.S: Likewise.
+ * sysdeps/ia64/fpu/e_acosl.S: Likewise.
+ * sysdeps/ia64/fpu/e_asinf.S: Likewise.
+ * sysdeps/ia64/fpu/e_asinl.S: Likewise.
+ * sysdeps/ia64/fpu/e_atan2f.S: Likewise.
+ * sysdeps/ia64/fpu/e_atanhl.S: Likewise.
+ * sysdeps/ia64/fpu/e_coshl.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp.S: Likewise.
+ * sysdeps/ia64/fpu/e_expf.S: Likewise.
+ * sysdeps/ia64/fpu/e_fmodl.S: Likewise.
+ * sysdeps/ia64/fpu/e_hypot.S: Likewise.
+ * sysdeps/ia64/fpu/e_hypotf.S: Likewise.
+ * sysdeps/ia64/fpu/e_hypotl.S: Likewise.
+ * sysdeps/ia64/fpu/e_log.S: Likewise.
+ * sysdeps/ia64/fpu/e_log2.S: Likewise.
+ * sysdeps/ia64/fpu/e_log2f.S: Likewise.
+ * sysdeps/ia64/fpu/e_log2l.S: Likewise.
+ * sysdeps/ia64/fpu/e_logl.S: Likewise.
+ * sysdeps/ia64/fpu/e_powf.S: Likewise.
+ * sysdeps/ia64/fpu/e_remainder.S: Likewise.
+ * sysdeps/ia64/fpu/e_remainderf.S: Likewise.
+ * sysdeps/ia64/fpu/e_remainderl.S: Likewise.
+ * sysdeps/ia64/fpu/e_scalb.S: Likewise.
+ * sysdeps/ia64/fpu/e_scalbf.S: Likewise.
+ * sysdeps/ia64/fpu/e_scalbl.S: Likewise.
+ * sysdeps/ia64/fpu/e_sinhl.S: Likewise.
+ * sysdeps/ia64/fpu/e_sqrt.S: Likewise.
+ * sysdeps/ia64/fpu/e_sqrtf.S: Likewise.
+ * sysdeps/ia64/fpu/e_sqrtl.S: Likewise.
+ * sysdeps/ia64/fpu/libm_cpu_defs.h: Likewise.
+ * sysdeps/ia64/fpu/libm_error_codes.h: Likewise.
+ * sysdeps/ia64/fpu/libm_frexp.S: Likewise.
+ * sysdeps/ia64/fpu/libm_frexpf.S: Likewise.
+ * sysdeps/ia64/fpu/libm_frexpl.S: Likewise.
+ * sysdeps/ia64/fpu/libm_scalblnf.S: Likewise.
+ * sysdeps/ia64/fpu/libm_tan.S: Likewise.
+ * sysdeps/ia64/fpu/s_asinhl.S: Likewise.
+ * sysdeps/ia64/fpu/s_atanf.S: Likewise.
+ * sysdeps/ia64/fpu/s_atanl.S: Likewise.
+ * sysdeps/ia64/fpu/s_cbrtl.S: Likewise.
+ * sysdeps/ia64/fpu/s_cos.S: Likewise.
+ * sysdeps/ia64/fpu/s_cosf.S: Likewise.
+ * sysdeps/ia64/fpu/s_erf.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfc.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfcf.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfcl.S: Likewise.
+ * sysdeps/ia64/fpu/s_erff.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfl.S: Likewise.
+ * sysdeps/ia64/fpu/s_expm1.S: Likewise.
+ * sysdeps/ia64/fpu/s_expm1f.S: Likewise.
+ * sysdeps/ia64/fpu/s_expm1l.S: Likewise.
+ * sysdeps/ia64/fpu/s_fabs.S: Likewise.
+ * sysdeps/ia64/fpu/s_fabsf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fabsl.S: Likewise.
+ * sysdeps/ia64/fpu/s_finite.S: Likewise.
+ * sysdeps/ia64/fpu/s_fma.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmaf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmal.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmax.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmaxf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmaxl.S: Likewise.
+ * sysdeps/ia64/fpu/s_fpclassify.S: Likewise.
+ * sysdeps/ia64/fpu/s_frexp.c: Likewise.
+ * sysdeps/ia64/fpu/s_frexpf.c: Likewise.
+ * sysdeps/ia64/fpu/s_frexpl.c: Likewise.
+ * sysdeps/ia64/fpu/s_ldexp.c: Likewise.
+ * sysdeps/ia64/fpu/s_ldexpf.c: Likewise.
+ * sysdeps/ia64/fpu/s_ldexpl.c: Likewise.
+ * sysdeps/ia64/fpu/s_log1pl.S: Likewise.
+ * sysdeps/ia64/fpu/s_modf.S: Likewise.
+ * sysdeps/ia64/fpu/s_modff.S: Likewise.
+ * sysdeps/ia64/fpu/s_modfl.S: Likewise.
+ * sysdeps/ia64/fpu/s_nextafter.S: Likewise.
+ * sysdeps/ia64/fpu/s_nextafterf.S: Likewise.
+ * sysdeps/ia64/fpu/s_nextafterl.S: Likewise.
+ * sysdeps/ia64/fpu/s_nexttoward.S: Likewise.
+ * sysdeps/ia64/fpu/s_nexttowardf.S: Likewise.
+ * sysdeps/ia64/fpu/s_nexttowardl.S: Likewise.
+ * sysdeps/ia64/fpu/s_round.S: Likewise.
+ * sysdeps/ia64/fpu/s_roundf.S: Likewise.
+ * sysdeps/ia64/fpu/s_roundl.S: Likewise.
+ * sysdeps/ia64/fpu/s_scalblnf.c: Likewise.
+ * sysdeps/ia64/fpu/s_scalbn.c: Likewise.
+ * sysdeps/ia64/fpu/s_scalbnf.c: Likewise.
+ * sysdeps/ia64/fpu/s_scalbnl.c: Likewise.
+ * sysdeps/ia64/fpu/s_signbit.S: Likewise.
+ * sysdeps/ia64/fpu/s_significand.S: Likewise.
+ * sysdeps/ia64/fpu/s_significandf.S: Likewise.
+ * sysdeps/ia64/fpu/s_significandl.S: Likewise.
+ * sysdeps/ia64/fpu/s_tan.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanf.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanh.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanhf.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanhl.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanl.S: Likewise.
+ * sysdeps/ia64/fpu/w_tgamma.S: Likewise.
+ * sysdeps/ia64/fpu/w_tgammaf.S: Likewise.
+ * sysdeps/ia64/fpu/w_tgammal.S: Likewise.
+ * sysdeps/ia64/softpipe.h: Likewise.
+ * sysdeps/ia64/strchr.S: Likewise.
+ * sysdeps/ia64/strlen.S: Likewise.
+ * sysdeps/ia64/strncmp.S: Likewise.
+ * sysdeps/unix/sysv/linux/ia64/register-dump.h: Likewise.
+
+2012-04-22 Mike Frysinger <vapier@gentoo.org>
+
* sysdeps/ia64/Implies: Copied from the main tree.
* sysdeps/ia64/Makefile: Likewise.
* sysdeps/ia64/Versions: Likewise.
// The acosf function returns the arc cosine in the range [0, +pi] radians.
// acos(1) returns +0
-// acos(x) returns a Nan and raises the invalid exception for |x| >1
+// acos(x) returns a Nan and raises the invalid exception for |x| >1
// |x| <= sqrt(2)/2. get Ax and Bx
.section .text
GLOBAL_LIBM_ENTRY(acosf)
-
+
// Load the addresses of the two tables.
// Then, load the coefficients and other constants.
-{ .mfi
+{ .mfi
alloc r32 = ar.pfs,1,8,4,0
fnma.s1 acosf_t = f8,f8,f1
dep.z ACOSF_GR_1by2 = 0x3f,24,8 // 0x3f000000
-}
-{ .mfi
+}
+{ .mfi
addl ACOSF_Addr1 = @ltoff(acosf_coeff_1_table),gp
fma.s1 acosf_x2 = f8,f8,f0
addl ACOSF_Addr2 = @ltoff(acosf_coeff_2_table),gp ;;
}
-
-{ .mfi
+
+{ .mfi
ld8 ACOSF_Addr1 = [ACOSF_Addr1]
fmerge.s acosf_abs_x = f1,f8
dep ACOSF_GR_3by2 = -1,r0,22,8 // 0x3fc00000
-}
-{ .mlx
+}
+{ .mlx
nop.m 999
movl ACOSF_GR_5by2 = 0x40200000;;
}
-
-{ .mfi
+
+{ .mfi
setf.s acosf_1by2 = ACOSF_GR_1by2
fmerge.s acosf_sgn_x = f8,f1
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ld8 ACOSF_Addr2 = [ACOSF_Addr2]
nop.f 0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
setf.s acosf_5by2 = ACOSF_GR_5by2
fcmp.lt.s1 p11,p12 = f8,f0
nop.i 999;;
}
-{ .mmf
+{ .mmf
ldfpd acosf_coeff_P1,acosf_coeff_P4 = [ACOSF_Addr1],16
setf.s acosf_3by2 = ACOSF_GR_3by2
fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan
}
-
-{ .mfi
+
+{ .mfi
ldfpd acosf_coeff_P7,acosf_coeff_P6 = [ACOSF_Addr1],16
fma.s1 acosf_t2 = acosf_t,acosf_t,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd acosf_coeff_P3,acosf_coeff_P8 = [ACOSF_Addr2],16
fma.s1 acosf_x4 = acosf_x2,acosf_x2,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd acosf_coeff_P9,acosf_const_sqrt2by2 = [ACOSF_Addr1]
fclass.m.unc p10,p0 = f8, 0x07 //@zero
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd acosf_coeff_P5,acosf_coeff_P2 = [ACOSF_Addr2],16
fma.s1 acosf_x3 = f8,acosf_x2,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfd acosf_const_piby2 = [ACOSF_Addr2]
frsqrta.s1 acosf_B,p0 = acosf_t
nop.i 999
-}
-{ .mfb
+}
+{ .mfb
nop.m 999
(p8) fma.s.s0 f8 = f8,f1,f0
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
-
-{ .mfb
+
+{ .mfb
nop.m 999
fcmp.eq.s1 p6,p0 = acosf_abs_x,f1
(p10) br.cond.spnt ACOSF_ZERO ;; // Branch if x=0
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fcmp.gt.s1 p9,p0 = acosf_abs_x,f1
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 acosf_x8 = acosf_x4,acosf_x4,f0
nop.i 999
-}
-{ .mfb
+}
+{ .mfb
nop.m 999
fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0
(p6) br.cond.spnt ACOSF_ABS_ONE ;; // Branch if |x|=1
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 acosf_x5 = acosf_x2,acosf_x3,f0
nop.i 999
}
-{ .mfb
+{ .mfb
(p9) mov GR_Parameter_TAG = 59
fma.s1 acosf_yby2 = acosf_t,acosf_1by2,f0
(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 acosf_Az = acosf_t,acosf_B,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_B2 = acosf_B,acosf_B,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p1 = f8,acosf_coeff_P1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_2poly_p1 = acosf_coeff_P1,acosf_t,f1
nop.i 999;;
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p3 = acosf_coeff_P4,acosf_x2,acosf_coeff_P3
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_2poly_p6 = acosf_coeff_P7,acosf_t,acosf_coeff_P6
nop.i 999;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p7 = acosf_x2,acosf_coeff_P8,acosf_coeff_P7
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_2poly_p2 = acosf_coeff_P3,acosf_t,acosf_coeff_P2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p5 = acosf_x2,acosf_coeff_P6,acosf_coeff_P5
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_2poly_p4 = acosf_coeff_P5,acosf_t,acosf_coeff_P4
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 acosf_x11 = acosf_x8,acosf_x3,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fnma.s1 acosf_dz = acosf_B2,acosf_yby2,acosf_1by2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p1a = acosf_x2,acosf_poly_p1,f8
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 acosf_2poly_p8 = acosf_coeff_P9,acosf_t,acosf_coeff_P8
nop.i 999;;
}
-
+
// Get the absolute value of x and determine the region in which x lies
-{ .mfi
+{ .mfi
nop.m 999
fcmp.le.s1 p7,p8 = acosf_abs_x,acosf_const_sqrt2by2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p2 = acosf_x2,acosf_poly_p3,acosf_coeff_P2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 acosf_poly_p7a = acosf_x4,acosf_coeff_P9,acosf_poly_p7
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 acosf_2poly_p2a = acosf_2poly_p2,acosf_t2,acosf_2poly_p1
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_sgnx_t4 = acosf_sgn_x,acosf_t4,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_2poly_p4a = acosf_2poly_p6,acosf_t2,acosf_2poly_p4
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_Sz = acosf_5by2,acosf_dz,acosf_3by2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_d2z = acosf_dz,acosf_dz,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fnma.d.s1 acosf_sgn_x_piby2 = acosf_sgn_x,acosf_const_piby2,acosf_const_piby2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.s1 acosf_poly_Ax = acosf_x5,acosf_poly_p2,acosf_poly_p1a
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
(p7) fma.s1 acosf_poly_Bx = acosf_x4,acosf_poly_p7a,acosf_poly_p5
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_sgnx_2poly_p2 = acosf_sgn_x,acosf_2poly_p2a,f0
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_2poly_p4b = acosf_2poly_p8,acosf_t4,acosf_2poly_p4a
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 acosf_Fz = acosf_d2z,acosf_Sz,acosf_dz
nop.i 999;;
-}
+}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.d.s1 acosf_Pt = acosf_2poly_p4b,acosf_sgnx_t4,acosf_sgnx_2poly_p2
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
(p8) fma.d.s1 acosf_z = acosf_Az,acosf_Fz,acosf_Az
nop.i 999 ;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
(p7) fma.d.s1 acosf_sinf1 = acosf_x11,acosf_poly_Bx,acosf_poly_Ax
nop.i 999;;
-}
-
+}
+
.pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2
-{ .mfi
+{ .mfi
nop.m 999
(p8) fma.s.s0 f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2
nop.i 999
-}
-
-{ .mfb
+}
+
+{ .mfb
nop.m 999
(p7) fms.s.s0 f8 = acosf_const_piby2,f1,acosf_sinf1
br.ret.sptk b0 ;;
-}
+}
ACOSF_ZERO:
// Here if x=0
-{ .mfb
+{ .mfb
nop.m 999
fma.s.s0 f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2
br.ret.sptk b0 ;;
-}
+}
ACOSF_ABS_ONE:
.pred.rel "mutex",p11,p12
// Here if |x|=1
-{ .mfi
+{ .mfi
nop.m 999
(p11) fma.s.s0 f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi
nop.i 999
-}
-{ .mfb
+}
+{ .mfb
nop.m 999
(p12) fma.s.s0 f8 = f1,f0,f0 // acosf(1)=0
br.ret.sptk b0 ;;
-}
+}
GLOBAL_LIBM_END(acosf)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 10/01/01 Initial version
// 10/10/01 Performance improved
// 12/11/01 Changed huges_logp to not be global
//
// Overview of operation
//==============================================================
-//
+//
// There are 6 paths:
// 1. x = 1
// Return acoshl(x) = 0;
//
// 3. x = [S,Q]Nan or +INF
// Return acoshl(x) = x + x;
-//
+//
// 4. 'Near 1': 1 < x < 1+1/8
-// Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
+// Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
// where y = x - 1, P(y)/Q(y) - rational approximation
//
// 5. 'Huges': x > 0.5*2^64
// Return acoshl(x) = (logl(2*x-1));
-//
+//
// 6. 'Main path': 1+1/8 < x < 0.5*2^64
// b_hi + b_lo = x + sqrt(x^2 - 1);
// acoshl(x) = logl_special(b_hi, b_lo);
-//
-// Algorithm description
+//
+// Algorithm description
//==============================================================
//
// I. Near 1 path algorithm
// **************************************************************
-// The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
+// The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
// where y = x - 1, P(y)/Q(y) - rational approximation
//
// 1) y = x - 1, y2 = 2 * y
//
// 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y)
// a) sqrtl computation method described below (main path algorithm, item 2))
-// As result we obtain (gg+gl) - multiprecision result
+// As result we obtain (gg+gl) - multiprecision result
// as pair of double extended values
// b) P(y) and Q(y) calculated without any extra precision manipulations
// c) P/Q division:
// y = frcpa(Q) initial approximation of 1/Q
// z = P*y initial approximation of P/Q
-//
+//
// e = 1 - b*y
// e2 = e + e^2
// e1 = e^2
// b) res = ((((gl + ll) + lh) + hl) + hh) + gg;
// (exactly in this order)
//
-// II. Main path algorithm
+// II. Main path algorithm
// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
// **********************************************************************
//
// 1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining
// ------------------------------------
// m2_hi = x2_hi - 1, where x2_hi = x * x;
-// m2_lo = x2_lo + p1_lo, where
-// x2_lo = FMS(x*x-x2_hi),
+// m2_lo = x2_lo + p1_lo, where
+// x2_lo = FMS(x*x-x2_hi),
// p1_lo = (1 + m2_hi) - x2_hi;
//
// 2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo)
// ----------------------------------------------
// r = invsqrt(m2_hi) (8-bit reciprocal square root approximation);
// g = m2_hi * r (first 8 bit-approximation of sqrt);
-//
+//
// h = 0.5 * r;
// e = 0.5 - g * h;
// g = g * e + g (second 16 bit-approximation of sqrt);
-//
+//
// h = h * e + h;
// e = 0.5 - g * h;
// g = g * e + g (third 32 bit-approximation of sqrt);
// h = h * e + h;
// e = 0.5 - g * h;
// g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
-//
+//
// Remainder computation:
// h = h * e + h;
// d = (m2_hi - g_hi * g_hi) + m2_lo;
// -------------------------------------------------------------------
// b_hi = (g_hi + x) + gl;
// b_lo = (x - b_hi) + g_hi + gl;
-//
+//
// Now we pass b presented as sum b_hi + b_lo to special version
// of logl function which accept a pair of arguments as
-// mutiprecision value.
-//
+// mutiprecision value.
+//
// Special log algorithm overview
// ================================
// Here we use a table lookup method. The basic idea is that in
-// order to compute logl(Arg) for an argument Arg in [1,2),
+// order to compute logl(Arg) for an argument Arg in [1,2),
// we construct a value G such that G*Arg is close to 1 and that
// logl(1/G) is obtainable easily from a table of values calculated
// beforehand. Thus
// G := G_1 * G_2 * G_3
// r := (G * S_hi - 1) + G * S_lo
//
-// These G_j's have the property that the product is exactly
+// These G_j's have the property that the product is exactly
// representable and that |r| < 2^(-12) as a result.
//
// Step 2: Approximation
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f32 -> f95 (64 registers)
-// General registers used:
+// General registers used:
// r32 -> r67 (36 registers)
// Predicate registers used:
// p7 for 'NaNs, Inf' path
// p8 for 'near 1' path
// p9 for 'huges' path
-// p10 for x = 1
+// p10 for x = 1
// p11 for x < 1
//
//*********************************************************************
// IEEE Special Conditions:
//
// acoshl(+inf) = +inf
-// acoshl(-inf) = QNaN
-// acoshl(1) = 0
+// acoshl(-inf) = QNaN
+// acoshl(1) = 0
// acoshl(x<1) = QNaN
// acoshl(SNaN) = QNaN
// acoshl(QNaN) = QNaN
// Data tables
//==============================================================
-
+
RODATA
.align 64
// Near 1 path rational approximation coefficients
LOCAL_OBJECT_START(Poly_P)
-data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4
-data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2
-data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1
-data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1
-data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1
-data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26
+data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4
+data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2
+data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1
+data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1
+data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1
+data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26
LOCAL_OBJECT_END(Poly_P)
//
LOCAL_OBJECT_START(Poly_Q)
-data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3
-data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2
-data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1
-data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302
-data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526
-data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748
+data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3
+data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2
+data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1
+data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302
+data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526
+data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748
LOCAL_OBJECT_END(Poly_Q)
-// Q coeffs
+// Q coeffs
LOCAL_OBJECT_START(Constants_Q)
-data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
LOCAL_OBJECT_END(Constants_Q)
// Z1 - 16 bit fixed
data8 0xBE049391B6B7C239
LOCAL_OBJECT_END(Constants_G_H_h2)
-// G3 and H3 - IEEE single and h3 - IEEE double
+// G3 and H3 - IEEE single and h3 - IEEE double
LOCAL_OBJECT_START(Constants_G_H_h3)
data4 0x3F7FFC00,0x38800100
data8 0x3D355595562224CD
FR_QQ4 = f42
FR_QQ5 = f43
-FR_Q1 = f44
-FR_Q2 = f45
-FR_Q3 = f46
-FR_Q4 = f47
+FR_Q1 = f44
+FR_Q2 = f45
+FR_Q3 = f46
+FR_Q4 = f47
FR_Half = f48
FR_Two = f49
-FR_log2_hi = f50
-FR_log2_lo = f51
+FR_log2_hi = f50
+FR_log2_lo = f51
FR_X2 = f52
// Special logl registers
-FR_XLog_Hi = f65
-FR_XLog_Lo = f66
+FR_XLog_Hi = f65
+FR_XLog_Lo = f66
-FR_Y_hi = f67
+FR_Y_hi = f67
FR_Y_lo = f68
-FR_S_hi = f69
-FR_S_lo = f70
+FR_S_hi = f69
+FR_S_lo = f70
FR_poly_lo = f71
FR_poly_hi = f72
FR_G2 = f76
FR_H2 = f77
-FR_h2 = f78
+FR_h2 = f78
-FR_r = f79
-FR_rsq = f80
-FR_rcub = f81
+FR_r = f79
+FR_rsq = f80
+FR_rcub = f81
-FR_float_N = f82
+FR_float_N = f82
-FR_G3 = f83
-FR_H3 = f84
-FR_h3 = f85
+FR_G3 = f83
+FR_H3 = f84
+FR_h3 = f85
-FR_2_to_minus_N = f86
+FR_2_to_minus_N = f86
// Near 1 registers
FR_QV2 = f76
FR_Y0 = f77
-FR_Q0 = f78
+FR_Q0 = f78
FR_E0 = f79
FR_E2 = f80
FR_E1 = f81
GR_Poly_Q = r38
// Special logl registers
-GR_Index1 = r39
-GR_Index2 = r40
-GR_signif = r41
-GR_X_0 = r42
-GR_X_1 = r43
-GR_X_2 = r44
+GR_Index1 = r39
+GR_Index2 = r40
+GR_signif = r41
+GR_X_0 = r42
+GR_X_1 = r43
+GR_X_2 = r44
GR_minus_N = r45
-GR_Z_1 = r46
-GR_Z_2 = r47
-GR_N = r48
-GR_Bias = r49
-GR_M = r50
-GR_Index3 = r51
-GR_exp_2tom80 = r52
-GR_exp_mask = r53
-GR_exp_2tom7 = r54
-GR_ad_ln10 = r55
+GR_Z_1 = r46
+GR_Z_2 = r47
+GR_N = r48
+GR_Bias = r49
+GR_M = r50
+GR_Index3 = r51
+GR_exp_2tom80 = r52
+GR_exp_mask = r53
+GR_exp_2tom7 = r54
+GR_ad_ln10 = r55
GR_ad_tbl_1 = r56
GR_ad_tbl_2 = r57
GR_ad_tbl_3 = r58
addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table
fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table
-};;
+};;
-{ .mfi
+{ .mfi
getf.d GR_Arg = FR_Arg // get arument as double (int64)
fma.s0 FR_Two = f1, f1, f1 // construct 2.0
addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables
}
-{ .mlx
- nop.m 0
+{ .mlx
+ nop.m 0
movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments)
-};;
+};;
-{ .mfi
+{ .mfi
ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address
fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0)
nop.i 0
}
-{ .mlx
+{ .mlx
ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address
movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound)
};;
-{ .mfi
+{ .mfi
ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf
cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')
cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path
fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path)
(p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1)
-};;
+};;
-{ .mmi
+{ .mmi
setf.exp FR_Half = GR_Half // construct 0.5
(p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path)
mov GR_exp_mask = 0x1FFFF // Create exponent mask
-};;
+};;
-{ .mmf
+{ .mmf
(p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5
(p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5
fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1
};;
-{ .mfi
+{ .mfi
(p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4
- fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of
+ fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of
// m2 = fma(X*X - m2)
add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
}
{ .mfb
-(p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4
+(p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4
(p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf)
(p7) br.ret.spnt b0 // return (Nan, Inf)
-};;
+};;
{ .mfi
(p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3
(p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1
(p9) br.cond.spnt huges_logl // special version of log
}
-;;
+;;
-{ .mfi
+{ .mfi
(p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2
(p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0
add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
{ .mfb
(p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2
(p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1)
-(p10) br.ret.spnt b0 // return (arg = 1)
-};;
+(p10) br.ret.spnt b0 // return (arg = 1)
+};;
-{ .mmi
+{ .mmi
(p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1
(p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1
add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
}
;;
-{ .mfi
-(p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0
+{ .mfi
+(p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0
fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2
add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
}
(p8) ldfe FR_QQ0 = [GR_Poly_Q]
nop.f 0
(p8) br.cond.spnt near_1 // near 1 path
-};;
-{ .mfi
+};;
+{ .mfi
ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
nop.f 0
mov GR_Bias = 0x0FFFF // Create exponent bias
};;
-{ .mfi
+{ .mfi
nop.m 0
frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr.
nop.i 0
-};;
+};;
{ .mfi
ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
nop.i 0
};;
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l
nop.i 0
};;
{ .mfi
ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
// 16 bit Newton Raphson iteration
nop.i 0
}
};;
{ .mfi
nop.m 0
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
// 32 bit Newton Raphson iteration
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
// 64 bit Newton Raphson iteration
nop.i 0
}
{ .mfi
nop.m 0
nop.f 0
- extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
};;
{ .mfi
pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
};;
-// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
// (Just nops added - nothing to do here)
{ .mfi
nop.m 0
- fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
// Y_lo=poly_hi+poly_lo
nop.i 0
};;
{ .mmi
ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
- sub GR_N = GR_N, GR_Bias
+ sub GR_N = GR_N, GR_Bias
mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
};;
{ .mmi
ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
nop.m 0
- extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
};;
{ .mmi
pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1*Z_2
};;
-// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
// (Just nops added - nothing to do here)
};;
{ .mfi
nop.m 0
- fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
nop.i 0
};;
{ .mfb
// NEAR ONE INTERVAL
near_1:
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
frsqrta.s1 FR_Rcp, p0 = FR_2XM1 // Rcp = 1/x reciprocal appr. &SQRT&
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_PV6 = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_QV6 = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_PV4 = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_QV4 = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_XM12 = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_PV2 = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_QV2 = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
- fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT&
- nop.i 0
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT&
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT&
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_PV3 = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_QV3 = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_PP = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_QQ = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
frcpa.s1 FR_Y0,p0 = f1,FR_QQ // y = frcpa(b) #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT&
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_Q0 = FR_PP,FR_Y0,f0 // q = a*y #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_E0 = FR_Y0,FR_QQ,f1 // e = 1 - b*y #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
- nop.i 0
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_GG = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT&
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_R0 = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
- fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
- nop.i 0
+ nop.m 0
+ fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_E4 = FR_QQ,FR_Y2,f1 // e4 = 1-b*y2 #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_X_Hi = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h &SQRT&
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV#
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fnma.s1 FR_R1 = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HH = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_LH = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV#
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_LL = FR_GL, FR_X_lo, f0 // ll = gl*x_lo
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_HL = FR_GG, FR_X_lo, f0 // hl = gg * x_lo
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fms.s1 FR_Res = FR_GL, f1, FR_LL // res = gl + ll
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fms.s1 FR_Res = FR_Res, f1, FR_LH // res = res + lh
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fms.s1 FR_Res = FR_Res, f1, FR_HL // res = res + hl
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fms.s1 FR_Res = FR_Res, f1, FR_HH // res = res + hh
- nop.i 0
+ nop.i 0
};;
{ .mfb
- nop.m 0
+ nop.m 0
fma.s0 FR_Res = FR_Res, f1, FR_GG // result = res + gg
br.ret.sptk b0 // Exit for near 1 path
};;
acoshl_lt_pone:
{ .mfi
- nop.m 0
+ nop.m 0
fmerge.s FR_Arg_X = FR_Arg, FR_Arg
- nop.i 0
+ nop.i 0
};;
{ .mfb
mov GR_Parameter_TAG = 135
{ .mib
stfe [GR_Parameter_X] = FR_Arg_X // Parameter 1 to stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = FR_Res // Parameter 3 to stack
F_CS7 = f37
F_CS8 = f38
F_CS9 = f39
-F_S23 = f40
-F_S45 = f41
-F_S67 = f42
-F_S89 = f43
-F_S25 = f44
-F_S69 = f45
-F_S29 = f46
-F_X2 = f47
-F_X4 = f48
-F_TSQRT = f49
-F_DTX = f50
-F_R = f51
-F_R2 = f52
-F_R3 = f53
-F_R4 = f54
-
-F_C3 = f55
-F_C5 = f56
-F_C7 = f57
-F_C9 = f58
-F_P79 = f59
-F_P35 = f60
-F_P39 = f61
-
-F_ATHI = f62
-F_ATLO = f63
-
-F_T1 = f64
-F_Y = f65
-F_Y2 = f66
-F_ANDMASK = f67
-F_ORMASK = f68
-F_S = f69
-F_05 = f70
-F_SQRT_1S2 = f71
-F_DS = f72
-F_Z = f73
-F_1T2 = f74
-F_DZ = f75
-F_ZE = f76
-F_YZ = f77
-F_Y1S2 = f78
-F_Y1S2X = f79
-F_1X = f80
-F_ST = f81
-F_1T2_ST = f82
-F_TSS = f83
-F_Y1S2X2 = f84
-F_DZ_TERM = f85
-F_DTS = f86
-F_DS2X = f87
-F_T2 = f88
-F_ZY1S2S = f89
-F_Y1S2_1X = f90
+F_S23 = f40
+F_S45 = f41
+F_S67 = f42
+F_S89 = f43
+F_S25 = f44
+F_S69 = f45
+F_S29 = f46
+F_X2 = f47
+F_X4 = f48
+F_TSQRT = f49
+F_DTX = f50
+F_R = f51
+F_R2 = f52
+F_R3 = f53
+F_R4 = f54
+
+F_C3 = f55
+F_C5 = f56
+F_C7 = f57
+F_C9 = f58
+F_P79 = f59
+F_P35 = f60
+F_P39 = f61
+
+F_ATHI = f62
+F_ATLO = f63
+
+F_T1 = f64
+F_Y = f65
+F_Y2 = f66
+F_ANDMASK = f67
+F_ORMASK = f68
+F_S = f69
+F_05 = f70
+F_SQRT_1S2 = f71
+F_DS = f72
+F_Z = f73
+F_1T2 = f74
+F_DZ = f75
+F_ZE = f76
+F_YZ = f77
+F_Y1S2 = f78
+F_Y1S2X = f79
+F_1X = f80
+F_ST = f81
+F_1T2_ST = f82
+F_TSS = f83
+F_Y1S2X2 = f84
+F_DZ_TERM = f85
+F_DTS = f86
+F_DS2X = f87
+F_T2 = f88
+F_ZY1S2S = f89
+F_Y1S2_1X = f90
F_TS = f91
-F_PI2_LO = f92
-F_PI2_HI = f93
-F_S19 = f94
-F_INV1T2_2 = f95
-F_CORR = f96
-F_DZ0 = f97
-
-F_C11 = f98
-F_C13 = f99
+F_PI2_LO = f92
+F_PI2_HI = f93
+F_S19 = f94
+F_INV1T2_2 = f95
+F_CORR = f96
+F_DZ0 = f97
+
+F_C11 = f98
+F_C13 = f99
F_C15 = f100
F_C17 = f101
F_P1113 = f102
// History
//==============================================================
// 02/02/00 Initial version
-// 06/28/00 Improved speed
+// 06/28/00 Improved speed
// 06/31/00 Changed register allocation because of some duplicate macros
-// moved nan exit bundle up to gain a cycle.
+// moved nan exit bundle up to gain a cycle.
// 08/08/00 Improved speed by avoiding SIR flush.
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
-
+
// Description
//=========================================
// The asinf function computes the arc sine of x in the range [-pi/2,+pi/2].
// A domain error occurs for arguments not in the range [-1,+1].
// asinf(+-0) returns +-0
-// asinf(x) returns a Nan and raises the invalid exception for |x| >1
+// asinf(x) returns a Nan and raises the invalid exception for |x| >1
// The acosf function returns the arc cosine in the range [0, +pi] radians.
// A domain error occurs for arguments not in the range [-1,+1].
.section .text
GLOBAL_LIBM_ENTRY(asinf)
-
+
// Load the addresses of the two tables.
// Then, load the coefficients and other constants.
-{ .mfi
+{ .mfi
alloc r32 = ar.pfs,1,8,4,0
fnma.s1 asinf_t = f8,f8,f1
dep.z ASINF_GR_1by2 = 0x3f,24,8 // 0x3f000000
-}
-{ .mfi
+}
+{ .mfi
addl ASINF_Addr1 = @ltoff(asinf_coeff_1_table),gp
fma.s1 asinf_x2 = f8,f8,f0
addl ASINF_Addr2 = @ltoff(asinf_coeff_2_table),gp ;;
}
-
-{ .mfi
+
+{ .mfi
ld8 ASINF_Addr1 = [ASINF_Addr1]
fmerge.s asinf_abs_x = f1,f8
dep ASINF_GR_3by2 = -1,r0,22,8 // 0x3fc00000
-}
-{ .mlx
+}
+{ .mlx
nop.m 999
movl ASINF_GR_5by2 = 0x40200000;;
}
-
-{ .mfi
+
+{ .mfi
setf.s asinf_1by2 = ASINF_GR_1by2
fmerge.s asinf_sgn_x = f8,f1
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ld8 ASINF_Addr2 = [ASINF_Addr2]
nop.f 0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
setf.s asinf_5by2 = ASINF_GR_5by2
fcmp.lt.s1 p11,p12 = f8,f0
nop.i 999;;
}
-{ .mmf
+{ .mmf
ldfpd asinf_coeff_P1,asinf_coeff_P4 = [ASINF_Addr1],16
setf.s asinf_3by2 = ASINF_GR_3by2
fclass.m.unc p8,p0 = f8, 0xc3 ;; //@qnan | @snan
}
-
-{ .mfi
+
+{ .mfi
ldfpd asinf_coeff_P7,asinf_coeff_P6 = [ASINF_Addr1],16
fma.s1 asinf_t2 = asinf_t,asinf_t,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd asinf_coeff_P3,asinf_coeff_P8 = [ASINF_Addr2],16
fma.s1 asinf_x4 = asinf_x2,asinf_x2,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd asinf_coeff_P9,asinf_const_sqrt2by2 = [ASINF_Addr1]
fclass.m.unc p10,p0 = f8, 0x07 //@zero
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd asinf_coeff_P5,asinf_coeff_P2 = [ASINF_Addr2],16
fma.s1 asinf_x3 = f8,asinf_x2,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfd asinf_const_piby2 = [ASINF_Addr2]
frsqrta.s1 asinf_B,p0 = asinf_t
nop.i 999
-}
-{ .mfb
+}
+{ .mfb
nop.m 999
(p8) fma.s.s0 f8 = f8,f1,f0
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
-
-{ .mfb
+
+{ .mfb
nop.m 999
fcmp.eq.s1 p6,p0 = asinf_abs_x,f1
(p10) br.ret.spnt b0 ;; // Exit if x=0
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fcmp.gt.s1 p9,p0 = asinf_abs_x,f1
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 asinf_x8 = asinf_x4,asinf_x4,f0
nop.i 999
-}
-{ .mfb
+}
+{ .mfb
nop.m 999
fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0
(p6) br.cond.spnt ASINF_ABS_ONE ;; // Branch if |x|=1
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 asinf_x5 = asinf_x2,asinf_x3,f0
nop.i 999
}
-{ .mfb
+{ .mfb
(p9) mov GR_Parameter_TAG = 62
fma.s1 asinf_yby2 = asinf_t,asinf_1by2,f0
(p9) br.cond.spnt __libm_error_region ;; // Branch if |x|>1
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 asinf_Az = asinf_t,asinf_B,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_B2 = asinf_B,asinf_B,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p1 = f8,asinf_coeff_P1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_2poly_p1 = asinf_coeff_P1,asinf_t,f1
nop.i 999;;
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p3 = asinf_coeff_P4,asinf_x2,asinf_coeff_P3
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_2poly_p6 = asinf_coeff_P7,asinf_t,asinf_coeff_P6
nop.i 999;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p7 = asinf_x2,asinf_coeff_P8,asinf_coeff_P7
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_2poly_p2 = asinf_coeff_P3,asinf_t,asinf_coeff_P2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p5 = asinf_x2,asinf_coeff_P6,asinf_coeff_P5
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_2poly_p4 = asinf_coeff_P5,asinf_t,asinf_coeff_P4
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.d.s1 asinf_x11 = asinf_x8,asinf_x3,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fnma.s1 asinf_dz = asinf_B2,asinf_yby2,asinf_1by2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p1a = asinf_x2,asinf_poly_p1,f8
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 asinf_2poly_p8 = asinf_coeff_P9,asinf_t,asinf_coeff_P8
nop.i 999;;
}
-
+
// Get the absolute value of x and determine the region in which x lies
-{ .mfi
+{ .mfi
nop.m 999
fcmp.le.s1 p7,p8 = asinf_abs_x,asinf_const_sqrt2by2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p2 = asinf_x2,asinf_poly_p3,asinf_coeff_P2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 asinf_poly_p7a = asinf_x4,asinf_coeff_P9,asinf_poly_p7
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 asinf_2poly_p2a = asinf_2poly_p2,asinf_t2,asinf_2poly_p1
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_sgnx_t4 = asinf_sgn_x,asinf_t4,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_2poly_p4a = asinf_2poly_p6,asinf_t2,asinf_2poly_p4
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_Sz = asinf_5by2,asinf_dz,asinf_3by2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_d2z = asinf_dz,asinf_dz,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_sgn_x_piby2 = asinf_sgn_x,asinf_const_piby2,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.d.s1 asinf_poly_Ax = asinf_x5,asinf_poly_p2,asinf_poly_p1a
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
(p7) fma.d.s1 asinf_poly_Bx = asinf_x4,asinf_poly_p7a,asinf_poly_p5
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_sgnx_2poly_p2 = asinf_sgn_x,asinf_2poly_p2a,f0
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fcmp.eq.s0 p6,p0 = f8,f0 // Only purpose is to set D if x denormal
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_2poly_p4b = asinf_2poly_p8,asinf_t4,asinf_2poly_p4a
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 asinf_Fz = asinf_d2z,asinf_Sz,asinf_dz
nop.i 999;;
-}
+}
+
-
-{ .mfi
+{ .mfi
nop.m 999
(p8) fma.d.s1 asinf_Pt = asinf_2poly_p4b,asinf_sgnx_t4,asinf_sgnx_2poly_p2
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
(p8) fma.d.s1 asinf_z = asinf_Az,asinf_Fz,asinf_Az
nop.i 999;;
-}
-
+}
+
.pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2
-{ .mfi
+{ .mfi
nop.m 999
(p8) fnma.s.s0 f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2
nop.i 999
-}
-
-{ .mfb
+}
+
+{ .mfb
nop.m 999
(p7) fma.s.s0 f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax
br.ret.sptk b0 ;;
-}
+}
ASINF_ABS_ONE:
// Here for short exit if |x|=1
-{ .mfb
+{ .mfb
nop.m 999
fma.s.s0 f8 = asinf_sgn_x,asinf_const_piby2,f0
br.ret.sptk b0
-}
+}
;;
GLOBAL_LIBM_END(asinf)
// Stack operations when calling error support.
-// (1) (2)
-// sp -> + psp -> +
-// | |
-// | | <- GR_Y
-// | |
-// | <-GR_Y Y2->|
-// | |
-// | | <- GR_X
-// | |
-// sp-64 -> + sp -> +
-// save ar.pfs save b0
-// save gp
+// (1) (2)
+// sp -> + psp -> +
+// | |
+// | | <- GR_Y
+// | |
+// | <-GR_Y Y2->|
+// | |
+// | | <- GR_X
+// | |
+// sp-64 -> + sp -> +
+// save ar.pfs save b0
+// save gp
// Stack operations when calling error support.
F_CS7 = f37
F_CS8 = f38
F_CS9 = f39
-F_S23 = f40
-F_S45 = f41
-F_S67 = f42
-F_S89 = f43
-F_S25 = f44
-F_S69 = f45
-F_S29 = f46
-F_X2 = f47
-F_X4 = f48
-F_TSQRT = f49
-F_DTX = f50
-F_R = f51
-F_R2 = f52
-F_R3 = f53
-F_R4 = f54
-
-F_C3 = f55
-F_C5 = f56
-F_C7 = f57
-F_C9 = f58
-F_P79 = f59
-F_P35 = f60
-F_P39 = f61
-
-F_ATHI = f62
-F_ATLO = f63
-
-F_T1 = f64
-F_Y = f65
-F_Y2 = f66
-F_ANDMASK = f67
-F_ORMASK = f68
-F_S = f69
-F_05 = f70
-F_SQRT_1S2 = f71
-F_DS = f72
-F_Z = f73
-F_1T2 = f74
-F_DZ = f75
-F_ZE = f76
-F_YZ = f77
-F_Y1S2 = f78
-F_Y1S2X = f79
-F_1X = f80
-F_ST = f81
-F_1T2_ST = f82
-F_TSS = f83
-F_Y1S2X2 = f84
-F_DZ_TERM = f85
-F_DTS = f86
-F_DS2X = f87
-F_T2 = f88
-F_ZY1S2S = f89
-F_Y1S2_1X = f90
+F_S23 = f40
+F_S45 = f41
+F_S67 = f42
+F_S89 = f43
+F_S25 = f44
+F_S69 = f45
+F_S29 = f46
+F_X2 = f47
+F_X4 = f48
+F_TSQRT = f49
+F_DTX = f50
+F_R = f51
+F_R2 = f52
+F_R3 = f53
+F_R4 = f54
+
+F_C3 = f55
+F_C5 = f56
+F_C7 = f57
+F_C9 = f58
+F_P79 = f59
+F_P35 = f60
+F_P39 = f61
+
+F_ATHI = f62
+F_ATLO = f63
+
+F_T1 = f64
+F_Y = f65
+F_Y2 = f66
+F_ANDMASK = f67
+F_ORMASK = f68
+F_S = f69
+F_05 = f70
+F_SQRT_1S2 = f71
+F_DS = f72
+F_Z = f73
+F_1T2 = f74
+F_DZ = f75
+F_ZE = f76
+F_YZ = f77
+F_Y1S2 = f78
+F_Y1S2X = f79
+F_1X = f80
+F_ST = f81
+F_1T2_ST = f82
+F_TSS = f83
+F_Y1S2X2 = f84
+F_DZ_TERM = f85
+F_DTS = f86
+F_DS2X = f87
+F_T2 = f88
+F_ZY1S2S = f89
+F_Y1S2_1X = f90
F_TS = f91
-F_PI2_LO = f92
-F_PI2_HI = f93
-F_S19 = f94
-F_INV1T2_2 = f95
-F_CORR = f96
-F_DZ0 = f97
-
-F_C11 = f98
-F_C13 = f99
+F_PI2_LO = f92
+F_PI2_HI = f93
+F_S19 = f94
+F_INV1T2_2 = f95
+F_CORR = f96
+F_DZ0 = f97
+
+F_C11 = f98
+F_C13 = f99
F_C15 = f100
F_C17 = f101
F_P1113 = f102
//..
//..Suppose (v,u) = (y,x), we calculate atan(v/u) as follows:
//..A = y * frcpa(x) (so A = (y/x)(1 - beta))
-//..atan(y/x) = atan(A) + atan( ((y/x)-A))/(1 + (y/x)A) ), the second term is
+//..atan(y/x) = atan(A) + atan( ((y/x)-A))/(1 + (y/x)A) ), the second term is
//..a correction.
-//..atan(A) is approximated by a polynomial
+//..atan(A) is approximated by a polynomial
//..A + p1 A^3 + p2 A^5 + ... + p10 A^21,
//..atan(G) is approximated as follows:
//..Let G = (y - Ax)/(x + Ay), atan(G) can be approximated by G + g * p1
//..
//..Suppose (v,u) = (x,y), we calculate atan(v/u) as follows:
//..Z = x * frcpa(y) (so Z = (x/y)(1 - beta))
-//..atan(x/y) = atan(Z) + atan( ((x/y)-Z))/(1 + (x/y)Z) ), the second term is
+//..atan(x/y) = atan(Z) + atan( ((x/y)-Z))/(1 + (x/y)Z) ), the second term is
//..a correction.
-//..atan(Z) is approximated by a polynomial
+//..atan(Z) is approximated by a polynomial
//..Z + p1 Z^3 + p2 Z^5 + ... + p10 Z^21,
//..atan(T) is approximated as follows:
//..Let T = (x - Ay)/(y + Ax), atan(T) can be approximated by T + t * p1
//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
//..
//..This polynomial is computed as follows:
-//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
+//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
//..
//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
//..
//..poly_A4 = p1 * A
//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
-//..poly_A5 = p2 + Asq * poly_A5
+//..poly_A5 = p2 + Asq * poly_A5
//..poly_A4 = poly_A4 + A5 * poly_A5
//..
//..atan_A = poly_A4 + A11 * poly_A1
//..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
//..
//..This polynomial is computed as follows:
-//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
+//..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
//..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
//..
//..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
//..
//..poly_A4 = p1 * A
//..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
-//..poly_A5 = p2 + Asq * poly_A5
+//..poly_A5 = p2 + Asq * poly_A5
//..poly_A4 = poly_A4 + A5 * poly_A5
//..
//..atan_A = poly_A4 + A11 * poly_A1
//coef_pj, j = 1,2,...,10; atan(A) ~=~ A + p1 A^3 + p2 A^5 + ... + p10 A^21
//
// coef_p1 = -.3333332707155439167401311806315789E+00
-// coef_p1 in dbl = BFD5 5555 1219 1621
+// coef_p1 in dbl = BFD5 5555 1219 1621
//
// coef_p2 = .1999967670926658391827857030875748E+00
-// coef_p2 in dbl = 3FC9 997E 7AFB FF4E
+// coef_p2 in dbl = 3FC9 997E 7AFB FF4E
//
// coef_p3 = -.1427989384500152360161563301087296E+00
-// coef_p3 in dbl = BFC2 473C 5145 EE38
+// coef_p3 in dbl = BFC2 473C 5145 EE38
//
// coef_p4 = .1105852823460720770079031213661163E+00
-// coef_p4 in dbl = 3FBC 4F51 2B18 65F5
+// coef_p4 in dbl = 3FBC 4F51 2B18 65F5
//
// coef_p5 = -.8811839915595312348625710228448363E-01
-// coef_p5 in dbl = BFB6 8EED 6A8C FA32
+// coef_p5 in dbl = BFB6 8EED 6A8C FA32
//
// coef_p6 = .6742329836955067042153645159059714E-01
-// coef_p6 in dbl = 3FB1 42A7 3D7C 54E3
+// coef_p6 in dbl = 3FB1 42A7 3D7C 54E3
//
// coef_p7 = -.4468571068774672908561591262231909E-01
-// coef_p7 in dbl = BFA6 E10B A401 393F
+// coef_p7 in dbl = BFA6 E10B A401 393F
//
// coef_p8 = .2252333246746511135532726960586493E-01
-// coef_p8 in dbl = 3F97 105B 4160 F86B
+// coef_p8 in dbl = 3F97 105B 4160 F86B
//
// coef_p9 = -.7303884867007574742501716845542314E-02
-// coef_p9 in dbl = BF7D EAAD AA33 6451
+// coef_p9 in dbl = BF7D EAAD AA33 6451
//
// coef_p10 = .1109686868355312093949039454619058E-02
-// coef_p10 in dbl = 3F52 2E5D 33BC 9BAA
+// coef_p10 in dbl = 3F52 2E5D 33BC 9BAA
//
// Special values
.section .text
GLOBAL_IEEE754_ENTRY(atan2f)
-
-{ .mfi
+
+{ .mfi
alloc r32 = ar.pfs,1,5,4,0
frcpa.s1 atan2f_Z0,p0 = f1,f8 // Approx to 1/y
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
addl atan2f_GR_Addr_1 = @ltoff(atan2f_coef_table1),gp
fma.s1 atan2f_xsq = f9,f9,f0
nop.i 999 ;;
}
-
-{ .mfi
+
+{ .mfi
ld8 atan2f_GR_Addr_1 = [atan2f_GR_Addr_1]
frcpa.s1 atan2f_A0,p0 = f1,f9 // Approx to 1/x
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_ysq = f8,f8,f0
nop.i 999 ;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fcmp.ge.s1 p8,p9 = f9,f0 // Set p8 if x>=0, p9 if x<0
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_xy = f9,f8,f0
nop.i 999 ;;
}
-
-
-{ .mfi
+
+
+{ .mfi
add atan2f_GR_Addr_2 = 0x30, atan2f_GR_Addr_1
fmerge.s atan2f_sgn_Y = f8,f1
nop.i 999 ;;
-}
-
-{ .mmf
+}
+
+{ .mmf
ldfpd atan2f_coef_p1,atan2f_coef_p10 = [atan2f_GR_Addr_1],16
ldfpd atan2f_coef_p9,atan2f_coef_p8 = [atan2f_GR_Addr_2],16
fclass.m p10,p0 = f9,0xe7 // Test x @inf|@snan|@qnan|@zero
-}
+}
;;
-
-{ .mfi
+
+{ .mfi
ldfpd atan2f_coef_p7,atan2f_coef_p6 = [atan2f_GR_Addr_1],16
fma.s1 atan2f_T_denom = atan2f_Z0,atan2f_xsq,f8
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atan2f_coef_p5,atan2f_coef_p4 = [atan2f_GR_Addr_2],16
fma.s1 atan2f_Z = atan2f_Z0,f9,f0
nop.i 999 ;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atan2f_coef_p3,atan2f_coef_p2 = [atan2f_GR_Addr_1],16
fma.s1 atan2f_G_denom = atan2f_A0,atan2f_ysq,f9
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atan2f_const_piby2,atan2f_const_pi = [atan2f_GR_Addr_2],16
fma.s1 atan2f_A = atan2f_A0,f8,f0
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
ldfpd atan2f_const_piby4,atan2f_const_3piby4 = [atan2f_GR_Addr_2]
fclass.m p11,p0 = f8,0xe7 // Test y @inf|@snan|@qnan|@zero
nop.i 999
-}
-{ .mfb
+}
+{ .mfb
nop.m 999
fnma.s1 atan2f_T_numer = atan2f_Z0,atan2f_xy,f9
(p10) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on x nan,inf,zero
-}
+}
// p6 if |y|>|x|, p7 if |x|>=|y| , use xsq and ysq for test
-{ .mfi
+{ .mfi
nop.m 999
fcmp.gt.s1 p6,p7 = atan2f_ysq,atan2f_xsq
nop.i 999
}
-{ .mfb
+{ .mfb
nop.m 999
fnma.s1 atan2f_G_numer = atan2f_A0,atan2f_xy,f8
(p11) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on y nan,inf,zero
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p8) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f0,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p9) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f1,f0
nop.i 999 ;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
(p6) fnma.s1 atan2f_U = atan2f_Z,f1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p6) fma.s1 atan2f_Usq = atan2f_Z,atan2f_Z,f0
nop.i 999 ;;
-}
+}
+
-
-{ .mfi
+{ .mfi
nop.m 999
(p7) fma.s1 atan2f_U = atan2f_A,f1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.s1 atan2f_Usq = atan2f_A,atan2f_A,f0
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
nop.m 999
(p6) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_T_denom
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p6) fma.s1 atan2f_R_denom = atan2f_T_denom,f1,f0
nop.i 999 ;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
(p7) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_G_denom
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.s1 atan2f_R_denom = atan2f_G_denom,f1,f0
nop.i 999 ;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
(p6) fnma.s1 atan2f_R_numer = atan2f_T_numer,f1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.s1 atan2f_R_numer = atan2f_G_numer,f1,f0
nop.i 999 ;;
-}
+}
+
-
-{ .mfi
+{ .mfi
nop.m 999
(p6) fnma.s1 atan2f_p1rnum = atan2f_T_numer,atan2f_coef_p1,f0
nop.i 999 ;;
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.s1 atan2f_p1rnum = atan2f_G_numer,atan2f_coef_p1,f0
nop.i 999 ;;
-}
+}
+
-
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_U4 = atan2f_Usq,atan2f_Usq,f0
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u109 = atan2f_Usq,atan2f_coef_p10,atan2f_coef_p9
nop.i 999 ;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u87 = atan2f_Usq,atan2f_coef_p8,atan2f_coef_p7
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u65 = atan2f_Usq,atan2f_coef_p6,atan2f_coef_p5
nop.i 999 ;;
}
-
-
-{ .mfi
+
+
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u43 = atan2f_Usq,atan2f_coef_p4,atan2f_coef_p3
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fnma.s1 atan2f_Q_beta = atan2f_Q1,atan2f_R_denom,f1
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u21 = atan2f_Usq,atan2f_coef_p2,atan2f_coef_p1
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_r = atan2f_Q1,atan2f_R_numer,f0
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
nop.m 999
(p6) fma.s1 atan2f_C = atan2f_sgn_Y,atan2f_const_piby2,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
(p7) fma.s1 atan2f_C = atan2f_const_1,atan2f_const_pi,f0
nop.i 999 ;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_U6 = atan2f_U4,atan2f_Usq,f0
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_U8 = atan2f_U4,atan2f_U4,f0
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u10to7 = atan2f_U4,atan2f_poly_u109,atan2f_poly_u87
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_pR = atan2f_p1rnum,atan2f_Q1,f0
nop.i 999 ;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u6to3 = atan2f_U4,atan2f_poly_u65,atan2f_poly_u43
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_Q2 = atan2f_Q1,atan2f_Q_beta,atan2f_Q1
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_Q_beta2 = atan2f_Q_beta,atan2f_Q_beta,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_rsq = atan2f_r,atan2f_r,f0
nop.i 999 ;;
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u210 = atan2f_Usq,atan2f_poly_u21,f1
nop.i 999 ;;
-}
-
+}
+
{ .mfi
nop.m 999
fcmp.eq.s0 p8,p0 = f8,f9 // Dummy op to set flag on denormal inputs
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u10to3 = atan2f_U8,atan2f_poly_u10to7,atan2f_poly_u6to3
nop.i 999 ;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_Q3 = atan2f_Q2,atan2f_Q_beta2,atan2f_Q2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atan2f_pRC = atan2f_rsq,atan2f_pR,atan2f_C
nop.i 999 ;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 atan2f_poly_u10to0 = atan2f_U6,atan2f_poly_u10to3,atan2f_poly_u210
nop.i 999 ;;
-}
+}
-{ .mfi
+{ .mfi
nop.m 999
fma.s1 atan2f_pQRC = atan2f_R_numer,atan2f_Q3,atan2f_pRC
nop.i 999 ;;
-}
+}
-{ .mfb
+{ .mfb
nop.m 999
fma.s.s0 f8 = atan2f_U,atan2f_poly_u10to0,atan2f_pQRC
br.ret.sptk b0 ;;
-}
+}
-.file "atanhl.s"
+.file "atanhl.s"
// Copyright (c) 2001 - 2003, Intel Corporation
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code,and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 09/10/01 Initial version
// 12/11/01 Corrected .restore syntax
// 05/20/02 Cleaned up namespace and sf0 syntax
//
//*********************************************************************
//
-// Function: atanhl(x) computes the principle value of the inverse
+// Function: atanhl(x) computes the principle value of the inverse
// hyperbolic tangent of x.
//
//*********************************************************************
// IEEE Special Conditions:
//
// atanhl(inf) = QNaN
-// atanhl(-inf) = QNaN
-// atanhl(+/-0) = +/-0
-// atanhl(1) = +inf
-// atanhl(-1) = -inf
+// atanhl(-inf) = QNaN
+// atanhl(+/-0) = +/-0
+// atanhl(1) = +inf
+// atanhl(-1) = -inf
// atanhl(|x|>1) = QNaN
// atanhl(SNaN) = QNaN
// atanhl(QNaN) = QNaN
// Case atanhl_regular:
//
// Here we use formula atanhl(x) = sign(x)*log1pl(2*|x|/(1-|x|))/2 and
-// calculation is subdivided into two stages. The first stage is
-// calculating of X = 2*|x|/(1-|x|). The second one is calculating of
+// calculation is subdivided into two stages. The first stage is
+// calculating of X = 2*|x|/(1-|x|). The second one is calculating of
// sign(x)*log1pl(X)/2. To obtain required accuracy we use precise division
// algorithm, the output of which is a pair of extended precision values that
// approximate result of division with accuracy higher than working
//
// y = frcpa(b) initial approximation of 1/b
// q = a*y initial approximation of a/b
-//
+//
// e = 1 - b*y
// e2 = e + e^2
// e1 = e^2
// r1 = a - b*X
// r1 = r1 - b_lo*X
// X_lo = r1*y3 low part of a/b
-//
+//
// 2. special log1p algorithm overview
// ***********************************
//
// Here we use a table lookup method. The basic idea is that in
-// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
+// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
// we construct a value G such that G*Arg is close to 1 and that
// logl(1/G) is obtainable easily from a table of values calculated
// beforehand. Thus
// G := G_1 * G_2 * G_3
// r := (G * S_hi - 1) + G * S_lo
//
-// These G_j's have the property that the product is exactly
+// These G_j's have the property that the product is exactly
// representable and that |r| < 2^(-12) as a result.
//
// Step 2: Approximation
data8 0xCCCCCCCCCCCCCCCD,0x00003FFC // C5
data8 0xAAAAAAAAAAAAAAAA,0x00003FFD // C3
data4 0x3f000000 // 1/2
-data4 0x00000000 // pad
+data4 0x00000000 // pad
data4 0x00000000
data4 0x00000000
LOCAL_OBJECT_END(Constants_TaylorSeries)
data8 0xBE049391B6B7C239
LOCAL_OBJECT_END(Constants_G_H_h2)
-// G3 and H3 - IEEE single and h3 - IEEE double
+// G3 and H3 - IEEE single and h3 - IEEE double
LOCAL_OBJECT_START(Constants_G_H_h3)
data4 0x3F7FFC00,0x38800100
data8 0x3D355595562224CD
alloc r32 = ar.pfs,0,17,4,0
fnma.s1 FR_Bp = f8,f1,f1 // b = 1 - |arg| (for x>0)
mov GR_ExpMask = 0x1ffff
-}
-{ .mfi
+}
+{ .mfi
addl GR_ad_taylor = @ltoff(Constants_TaylorSeries),gp
fma.s1 FR_Bn = f8,f1,f1 // b = 1 - |arg| (for x<0)
mov GR_NearZeroBound = 0xfffa // biased exp of 1/32
-};;
-{ .mfi
+};;
+{ .mfi
getf.exp GR_ArgExp = f8
fcmp.lt.s1 p6,p7 = f8,f0 // is negative?
nop.i 0
-}
-{ .mfi
+}
+{ .mfi
ld8 GR_ad_taylor = [GR_ad_taylor]
fmerge.s FR_abs_x = f1,f8
nop.i 0
-};;
-{ .mfi
+};;
+{ .mfi
nop.m 0
fclass.m p8,p0 = f8,0x1C7 // is arg NaT,Q/SNaN or +/-0 ?
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_x2 = f8,f8,f0
nop.i 0
-};;
-{ .mfi
+};;
+{ .mfi
add GR_ad_z_1 = 0x0F0,GR_ad_taylor
fclass.m p9,p0 = f8,0x0a // is arg -denormal ?
add GR_ad_taylor_2 = 0x010,GR_ad_taylor
-}
-{ .mfi
+}
+{ .mfi
add GR_ad_05 = 0x080,GR_ad_taylor
nop.f 0
nop.i 0
-};;
-{ .mfi
+};;
+{ .mfi
ldfe FR_C17 = [GR_ad_taylor],32
fclass.m p10,p0 = f8,0x09 // is arg +denormal ?
add GR_ad_tbl_1 = 0x040,GR_ad_z_1 // point to Constants_G_H_h1
-}
-{ .mfb
+}
+{ .mfb
add GR_ad_z_2 = 0x140,GR_ad_z_1 // point to Constants_Z_2
(p8) fma.s0 f8 = f8,f1,f0 // NaN or +/-0
(p8) br.ret.spnt b0 // exit for Nan or +/-0
-};;
-{ .mfi
+};;
+{ .mfi
ldfe FR_C15 = [GR_ad_taylor_2],32
fclass.m p15,p0 = f8,0x23 // is +/-INF ?
add GR_ad_tbl_2 = 0x180,GR_ad_z_1 // point to Constants_G_H_h2
-}
-{ .mfb
+}
+{ .mfb
ldfe FR_C13 = [GR_ad_taylor],32
(p9) fnma.s0 f8 = f8,f8,f8 // -denormal
(p9) br.ret.spnt b0 // exit for -denormal
-};;
-{ .mfi
+};;
+{ .mfi
ldfe FR_C11 = [GR_ad_taylor_2],32
fcmp.eq.s0 p13,p0 = FR_abs_x,f1 // is |arg| = 1?
nop.i 0
-}
-{ .mfb
+}
+{ .mfb
ldfe FR_C9 = [GR_ad_taylor],32
(p10) fma.s0 f8 = f8,f8,f8 // +denormal
(p10) br.ret.spnt b0 // exit for +denormal
-};;
-{ .mfi
+};;
+{ .mfi
ldfe FR_C7 = [GR_ad_taylor_2],32
(p6) frcpa.s1 FR_Yn,p11 = f1,FR_Bn // y = frcpa(b)
and GR_ArgExp = GR_ArgExp,GR_ExpMask // biased exponent
-}
-{ .mfb
+}
+{ .mfb
ldfe FR_C5 = [GR_ad_taylor],32
fnma.s1 FR_B = FR_abs_x,f1,f1 // b = 1 - |arg|
(p15) br.cond.spnt atanhl_gt_one // |arg| > 1
ldfs FR_Half = [GR_ad_05]
(p7) fnma.s1 FR_B_lo = FR_Bp,f1,f1
nop.i 0
-};;
+};;
{ .mfi
nop.m 0
- (p6) fnma.s1 FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y
+ (p6) fnma.s1 FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y
nop.i 0
-}
-{ .mfb
+}
+{ .mfb
nop.m 0
(p6) fma.s1 FR_Y0 = FR_Yn,f1,f0
(p8) br.cond.spnt atanhl_gt_one // |arg| > 1
};;
{ .mfi
nop.m 0
- (p7) fnma.s1 FR_E0 = FR_Yp,FR_Bp,f1
+ (p7) fnma.s1 FR_E0 = FR_Yp,FR_Bp,f1
nop.i 0
}
{ .mfi
{ .mfi
ldfe FR_log2_lo = [GR_ad_q],16 // load log2_lo
nop.f 0
- sub GR_N = GR_N,GR_Bias
+ sub GR_N = GR_N,GR_Bias
};;
{ .mfi
ldfe FR_Q4 = [GR_ad_q],16 // load Q4
- fms.s1 FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z
+ fms.s1 FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z
sub GR_minus_N = GR_Bias,GR_N // form exponent of 2^(-N)
};;
{ .mmf
{ .mfi
ldfe FR_Q2 = [GR_ad_q],16 // load Q2
nop.f 0
- extr.u GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1
};;
{ .mmi
ldfe FR_Q1 = [GR_ad_q] // load Q1
}
{ .mfi
nop.m 0
- nop.f 0
+ nop.f 0
nop.i 0
};;
{ .mfi
nop.m 0
- nop.f 0
+ nop.f 0
nop.i 0
};;
{ .mfi
nop.m 0
- nop.f 0
+ nop.f 0
nop.i 0
};;
{ .mfb
nop.m 0
fma.s0 f8 = FR_C17,FR_x3,f8
- br.ret.sptk b0
+ br.ret.sptk b0
};;
atanhl_eq_one:
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
//
// Registers used
//==============================================================
-// general registers:
+// general registers:
// r14 -> r40
// predicate registers used:
// p6 -> p11
// floating-point registers used:
-// f9 -> f15; f32 -> f90;
+// f9 -> f15; f32 -> f90;
// f8 has input, then output
//
// Overview of operation
// 1. COSH_BY_POLY 0 < |x| < 0.25
// ===============
// Evaluate cosh(x) by a 12th order polynomial
-// Care is take for the order of multiplication; and P2 is not exactly 1/4!,
+// Care is take for the order of multiplication; and P2 is not exactly 1/4!,
// P3 is not exactly 1/6!, etc.
// cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12)
//
// =============
// cosh(x) = cosh(B+R)
// = cosh(B)cosh(R) + sinh(B)sinh(R)
-//
+//
// ax = |x| = M*log2/64 + R
// B = M*log2/64
-// M = 64*N + j
+// M = 64*N + j
// We will calculate M and get N as (M-j)/64
// The division is a shift.
// exp(B) = exp(N*log2 + j*log2/64)
// = 2^N * 2^(j*log2/64)
// cosh(B) = 1/2(e^B + e^-B)
-// = 1/2(2^N * 2^(j*log2/64) + 2^-N * 2^(-j*log2/64))
-// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
-// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
+// = 1/2(2^N * 2^(j*log2/64) + 2^-N * 2^(-j*log2/64))
+// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
+// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
//
// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
// = 1 + p_odd + p_even
-// where the p_even uses the A coefficients and the p_even uses
+// where the p_even uses the A coefficients and the p_even uses
// the B coefficients
//
// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
GR_Parameter_TAG = r40
-f_ABS_X = f9
+f_ABS_X = f9
f_X2 = f10
f_X4 = f11
f_tmp = f14
f_S_hi = f69
f_SC_hi_temp = f70
-f_C_lo_temp1 = f71
-f_C_lo_temp2 = f72
-f_C_lo_temp3 = f73
-f_C_lo_temp4 = f73
+f_C_lo_temp1 = f71
+f_C_lo_temp2 = f72
+f_C_lo_temp3 = f73
+f_C_lo_temp4 = f73
f_C_lo = f74
f_C_hi = f75
-f_Y_hi = f77
-f_Y_lo_temp = f78
-f_Y_lo = f79
+f_Y_hi = f77
+f_Y_lo_temp = f78
+f_Y_lo = f79
f_NORM_X = f80
f_P1 = f81
}
{ .mfi
nop.m 0
- fnorm.s1 f_NORM_X = f8
+ fnorm.s1 f_NORM_X = f8
mov r_exp_2tom57 = 0xffff-57
}
;;
{ .mfi
setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
fclass.m p10,p0 = f8, 0x0b // Test for denorm
- mov r_exp_mask = 0x1ffff
+ mov r_exp_mask = 0x1ffff
}
{ .mlx
setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
}
{ .mib
- ldfe f_log2by64_hi = [r_ad1],16
+ ldfe f_log2by64_hi = [r_ad1],16
and r_exp_x = r_exp_mask, r_signexp_x
(p7) br.ret.spnt b0 // Exit if x=0
}
// Get the A coefficients for COSH_BY_TBL
{ .mfi
- ldfe f_A1 = [r_ad3],16
+ ldfe f_A1 = [r_ad3],16
fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0
cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25
}
{ .mfb
add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs
-(p6) fma.s0 f8 = f8,f8,f0 // Result for x nan, inf
+(p6) fma.s0 f8 = f8,f8,f0 // Result for x nan, inf
(p6) br.ret.spnt b0 // Exit for x nan, inf
}
;;
// Calculate X2 = ax*ax for COSH_BY_POLY
{ .mfi
- ldfe f_log2by64_lo = [r_ad1],16
+ ldfe f_log2by64_lo = [r_ad1],16
nop.f 0
nop.i 0
}
{ .mfb
- ldfe f_A2 = [r_ad3],16
+ ldfe f_A2 = [r_ad3],16
fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0
(p7) br.cond.spnt COSH_BY_POLY
}
;;
// Here if |x| >= 0.25
-COSH_BY_TBL:
+COSH_BY_TBL:
// ******************************************************
// STEP 1 (TBL and EXP) - Argument reduction
// ******************************************************
-// Get the following constants.
+// Get the following constants.
// Inv_log2by64
// log2by64_hi
// log2by64_lo
// Subtract RSHF constant to get rounded M as a floating point value
// M_temp * 2^(63-6) - 2^63
{ .mfb
- ldfe f_B3 = [r_ad3],16
+ ldfe f_B3 = [r_ad3],16
fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF
(p6) br.cond.spnt COSH_HUGE // Branch if result will overflow
}
;;
{ .mfi
- getf.sig r_M = f_M_temp
+ getf.sig r_M = f_M_temp
nop.f 0
cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
}
;;
-// Calculate j. j is the signed extension of the six lsb of M. It
+// Calculate j. j is the signed extension of the six lsb of M. It
// has a range of -32 thru 31.
// Calculate R
// N = (M-j)/64
{ .mfi
ldfe f_Tjhi = [r_ad_J_hi]
- fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
- shr r_N = r_Mmj, 0x6 // N = (M-j)/64
+ fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
+ shr r_N = r_Mmj, 0x6 // N = (M-j)/64
}
{ .mfi
shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
}
;;
-//
-// If TBL,
+//
+// If TBL,
// Calculate S_hi and S_lo, and C_hi
// SC_hi_temp = sneg * Tmjhi
// S_hi = spos * Tjhi - SC_hi_temp
{ .mfi
nop.m 0
-(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
+(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
nop.i 0
}
;;
-// If TBL,
+// If TBL,
// C_lo_temp3 = sneg * Tmjlo
// C_lo_temp4 = spos * Tjlo + C_lo_temp3
// C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo)
}
;;
-// If EXP,
+// If EXP,
// Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo
{ .mfi
nop.m 0
{ .mfi
nop.m 0
-(p6) fma.s1 f_C_lo_temp2 = f_sneg, f_Tmjhi, f_C_lo_temp1
+(p6) fma.s1 f_C_lo_temp2 = f_sneg, f_Tmjhi, f_C_lo_temp1
nop.i 0
}
;;
;;
// If TBL,
-// Y_hi = C_hi
+// Y_hi = C_hi
// Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo)
{ .mfi
nop.m 0
// Here if 0 < |x| < 0.25
-COSH_BY_POLY:
+COSH_BY_POLY:
{ .mmf
ldfe f_P6 = [r_ad2e],16
ldfe f_P5 = [r_ad2o],16
{ .mmi
ldfe f_P2 = [r_ad2e],16
- ldfe f_P1 = [r_ad2o],16
+ ldfe f_P1 = [r_ad2o],16
nop.i 0
}
;;
// Here if |x| >= overflow limit
-COSH_HUGE:
+COSH_HUGE:
// for COSH_HUGE, put 24000 in exponent; take sign from input
{ .mmi
mov r_exp_huge = 0x15dbf
;;
{ .mfi
- alloc r32 = ar.pfs,0,5,4,0
+ alloc r32 = ar.pfs,0,5,4,0
fma.s1 f_signed_hi_lo = f_huge, f1, f1
nop.i 0
}
{ .mib
stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack
nop.i 0
}
;;
-
+
{ .mfb
nop.m 0
fma.d.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
.section .text
GLOBAL_IEEE754_ENTRY(expf)
-
+
{ .mlx
addl rTblAddr = @ltoff(_expf_table),gp
movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
nop.i 0
}
;;
-
+
{ .mfb
nop.m 0
fma.s.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
{ .mfi
nop.m 0
- // Final iteration (p8): is FR_ABS_A the correct remainder
+ // Final iteration (p8): is FR_ABS_A the correct remainder
// (quotient was not overestimated) ?
(p8) fcmp.lt.unc.s1 p6, p10 = FR_QREM, f0
nop.i 0
nop.m 0
// add b to estimated remainder (to cover the case when the quotient was
// overestimated)
- // also set correct sign by using
+ // also set correct sign by using
// FR_B_SGN_A = |b|*sgn(a), FR_ROUNDCONST = sgn(a)
(p6) fma.s0 f8 = FR_QREM, FR_ROUNDCONST, FR_B_SGN_A
nop.b 0
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 02/02/00 hand-optimized
// 04/04/00 Unwind support added
// 06/20/00 new version
// x2 = x * x in double-extended
// y2 = y * y in double-extended
// temp = x2 + y2 in double-extended
-// sqrt(temp) rounded to double
+// sqrt(temp) rounded to double
//
//*********************************************************************
// Compute x*x
fma.s1 f10=f8,f8,f0
// r2=bias-1
- mov r2=0xfffe
+ mov r2=0xfffe
}
{.mfi
// 63/8
{.mfi
nop.m 0
// if possible overflow, copy f8 to f32
- // set Denormal, if necessary
- // (p8)
+ // set Denormal, if necessary
+ // (p8)
fma.d.s0 f32=f8,f1,f0
nop.i 0;;
}
{ .mfi
nop.m 0
-// Identify Natvals, Infs, NaNs, and Zeros
+// Identify Natvals, Infs, NaNs, and Zeros
// and return result
fclass.m.unc p7, p0 = f12, 0x1E7
nop.i 0;;
-}
+}
{.mfb
// get exponent of x^2+y^2
getf.exp r3=f12
// H0=0.5*z0
(p6) fma.s1 f15=f8,f7,f0
nop.i 0;;
-}
+}
{.mfi
nop.i 0
}
{.mfi
- // Is x^2 + y^2 well less than the overflow
+ // Is x^2 + y^2 well less than the overflow
// threshold?
(p6) cmp.lt.unc p7, p8 = r3,r2
// P=P13+d3*P47
}
{ .mfi
- nop.m 0
-(p8) fsetc.s2 0x7F,0x42
+ nop.m 0
+(p8) fsetc.s2 0x7F,0x42
// Possible overflow path, must detect by
// Setting widest range exponent with prevailing
// rounding mode.
nop.i 0 ;;
}
{ .mfi
- nop.m 0
+ nop.m 0
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
nop.i 0 ;;
}
nop.m 0
mov GR_Parameter_TAG = 46
// No overflow
-(p9) br.ret.sptk b0;;
+(p9) br.ret.sptk b0;;
}
GLOBAL_IEEE754_END(hypot)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 02/02/00 hand-optimized
// 04/04/00 Unwind support added
// 06/26/00 new version
// x2 = x * x in double-extended
// y2 = y * y in double-extended
// temp = x2 + y2 in double-extended
-// sqrt(temp) rounded to single precision
+// sqrt(temp) rounded to single precision
//
//*********************************************************************
// Compute x*x
fma.s1 f10=f8,f8,f0
// r2=bias-1
- mov r2=0xfffe
+ mov r2=0xfffe
}
{.mfi
nop.m 0
{.mfi
nop.m 0
// if possible overflow, copy f8 to f14
- // set Denormal, if necessary
- // (p8)
+ // set Denormal, if necessary
+ // (p8)
fma.s.s0 f14=f8,f1,f0
nop.i 0;;
}
{ .mfi
nop.m 0
-// Identify Natvals, Infs, NaNs, and Zeros
+// Identify Natvals, Infs, NaNs, and Zeros
// and return result
fclass.m.unc p7, p0 = f12, 0x1E7
nop.i 0
-}
+}
{.mfi
nop.m 0
// z0=frsqrta(a)
// H0=0.5*z0
(p6) fma.s1 f10=f8,f7,f0
nop.i 0;;
-}
+}
{.mfi
{.mfi
- // Is x^2 + y^2 well less than the overflow
+ // Is x^2 + y^2 well less than the overflow
// threshold?
(p6) cmp.lt.unc p7, p8 = r3,r2
// P=P01+d2*P23
}
{ .mfi
- nop.m 0
-(p8) fsetc.s2 0x7F,0x42
+ nop.m 0
+(p8) fsetc.s2 0x7F,0x42
// Possible overflow path, must detect by
// Setting widest range exponent with prevailing
// rounding mode.
nop.i 0 ;;
}
{ .mfi
- nop.m 0
+ nop.m 0
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
nop.i 0 ;;
}
nop.m 0
mov GR_Parameter_TAG = 47
// No overflow
-(p9) br.ret.sptk b0;;
+(p9) br.ret.sptk b0;;
}
GLOBAL_IEEE754_END(hypotf)
.prologue
{ .mii
add GR_Parameter_Y=-32,sp // Parameter 2 value
- mov GR_Parameter_TAG = 47
+ mov GR_Parameter_TAG = 47
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 02/02/00 hand-optimized
// 04/04/00 Unwind support added
// 06/20/00 new version
// Compute x*x
fma.s1 f10=f8,f8,f0
// r2=bias-1
- mov r2=0xfffe
+ mov r2=0xfffe
}
{.mfi
nop.m 0
{.mfi
nop.m 0
// if possible overflow, copy f8 to f32
- // set Denormal, if necessary
- // (p8)
+ // set Denormal, if necessary
+ // (p8)
fma.s0 f32=f8,f1,f0
nop.i 0;;
}
}
{ .mfi
nop.m 0
-// Identify Natvals, Infs, NaNs, and Zeros
+// Identify Natvals, Infs, NaNs, and Zeros
// and return result
fclass.m.unc p7, p0 = f12, 0x1E7
nop.i 0
-}
+}
{.mfi
// get exponent of x^2+y^2
getf.exp r3=f12
// H0=0.5*z0
(p6) fma.s1 f15=f8,f7,f0
nop.i 0;;
-}
+}
{.mfb
nop.m 0
nop.i 0
}
{.mfi
- // Is x^2 + y^2 well less than the overflow
+ // Is x^2 + y^2 well less than the overflow
// threshold?
(p6) cmp.lt.unc p7, p8 = r3,r2
// c=dxy+da
}
{ .mfi
- nop.m 0
-(p8) fsetc.s2 0x7F,0x42
+ nop.m 0
+(p8) fsetc.s2 0x7F,0x42
// Possible overflow path, must detect by
// Setting widest range exponent with prevailing
// rounding mode.
nop.i 0 ;;
}
{ .mfi
- nop.m 0
+ nop.m 0
(p8) fcmp.lt.unc.s1 p9, p10 = f12, f11
nop.i 0 ;;
}
nop.m 0
mov GR_Parameter_TAG = 45;
// No overflow
-(p9) br.ret.sptk b0;;
+(p9) br.ret.sptk b0;;
}
GLOBAL_IEEE754_END(hypotl)
fnorm.s1 FR_NormX = f8
mov GR_bias = 0xffff
};;
-
+
{ .mfi
setf.d FR_A3 = GR_A3 // create A3
fcmp.eq.s1 p12,p0 = f1,f8 // is x equal to 1.0?
- dep.z GR_xorg = GR_xorg, 44, 19 // 0x3fefe00000000000
+ dep.z GR_xorg = GR_xorg, 44, 19 // 0x3fefe00000000000
// double precision memory
// representation of 255/256
}
{ .mfi
(p6) getf.exp GR_rexp = FR_r // Get signexp of x-1
(p7) fcvt.xf FR_N = FR_N
-(p8) cmp.eq p9,p6 = r0,r0 // Also set p9 and clear p6 if log10
+(p8) cmp.eq p9,p6 = r0,r0 // Also set p9 and clear p6 if log10
// and arg near 1
};;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//=================================================================
-// 09/11/00 Initial version
+// 09/11/00 Initial version
// 03/19/01 Added one polynomial coefficient, to improve accuracy
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// j=0 if f<128; j=1 if f>=128
// T is a table that stores log2(1/y) (in entries 1..255) rounded to
// double extended precision; f is used as an index; T[255]=0
-//
+//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
-// and 0 is used instead of T[0]
+// and 0 is used instead of T[0]
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
// for m=2(1-r'), 0<=r'<2^{-9})
//
// log2(x) is approximated as
// (l-j) + T[f] + (c1*r+c2*r^2+...+c7*r^7), if f>0
-//
+//
-// Special values
+// Special values
//=================================================================
// log2(0)=-inf, raises Divide by Zero
// log2(+inf)=inf
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35 // This reg. can safely be used
+GR_SAVE_GP = r35 // This reg. can safely be used
GR_SAVE_SP = r36
GR_Parameter_X = r37
GLOBAL_LIBM_ENTRY(log2)
{ .mfi
- alloc r32=ar.pfs,1,4,4,0
- // y=frcpa(x)
+ alloc r32=ar.pfs,1,4,4,0
+ // y=frcpa(x)
frcpa.s1 f6,p0=f1,f8
- // will form significand of 1.5 (to test whether the index is 128 or above)
+ // will form significand of 1.5 (to test whether the index is 128 or above)
mov r24=0xc
}
{.mfi
nop.m 0
- // normalize x
+ // normalize x
fma.s1 f7=f8,f1,f0
// r2 = pointer to C_1...C_6 followed by T_table
addl r2 = @ltoff(poly_coeffs), gp;;
getf.sig r25=f8
// f8 denormal ?
fclass.m p8,p10=f8,0x9
- // will form significand of 1.5 (to test whether the index is 128 or above)
+ // will form significand of 1.5 (to test whether the index is 128 or above)
shl r24=r24,60
}
{.mfi
getf.exp r29=f8
// load start address for C_1...C_6 followed by T_table
ld8 r2=[r2]
- // will continue only for positive normal/denormal numbers
+ // will continue only for positive normal/denormal numbers
fclass.nm.unc p12,p7 = f8, 0x19 ;;
}
{.mmi
// load C_6, C_7
ldfpd f12,f13=[r2],16
- // r27=bias-1 (if index >=128, will add exponent+1)
+ // r27=bias-1 (if index >=128, will add exponent+1)
(p12) mov r27=0xfffe
(p8) shr.u r28=r25,63-8;;
}
{.mmf
// load T (unless first 9 bits after leading 1 are 0)
(p12) ldfe f33=[r2]
- // f8=expon - bias
+ // f8=expon - bias
setf.sig f8=r29
// set T=0 (if first 9 bits after leading 1 are 0)
(p8) fma.s1 f33=f0,f0,f0;;
SPECIAL_LOG2:
-{.mfi
+{.mfi
nop.m 0
// x=+Infinity ?
fclass.m p7,p0=f8,0x21
(p7) br.ret.spnt b0;;
}
{.mfi
- (p8) mov GR_Parameter_TAG = 170
+ (p8) mov GR_Parameter_TAG = 170
// log2(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
(p8) fmerge.ns f8=f0,f8
(p8) br.cond.sptk __libm_error_region;;
}
{.mfb
- (p6) mov GR_Parameter_TAG = 171
+ (p6) mov GR_Parameter_TAG = 171
// x<0: return NaN, raise Invalid
(p6) frcpa.s0 f8,p0=f0,f0
(p6) br.cond.sptk __libm_error_region;;
-}
-
+}
+
{.mfb
nop.m 0
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 09/11/00 Initial version
+// 09/11/00 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// j=0 if f<128; j=1 if f>=128
// T is a table that stores log2(1/y) (in entries 1..255) rounded to
// double extended precision; f is used as an index; T[255]=0
-//
+//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
-// and 0 is used instead of T[0]
+// and 0 is used instead of T[0]
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
// for m=2(1-r'), 0<=r'<2^{-9})
//
// log2f(x) is approximated as
// (l-j) + T[f] + (c1*r+c2*r^2+...+c6*r^6), if f>0
-//
+//
-// Special values
+// Special values
//==============================================================
// log2f(0)=-inf, raises Divide by Zero
// log2f(+inf)=inf
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35 // This reg. can safely be used
+GR_SAVE_GP = r35 // This reg. can safely be used
GR_SAVE_SP = r36
GR_Parameter_X = r37
GLOBAL_LIBM_ENTRY(log2f)
{ .mfi
- alloc r32=ar.pfs,1,4,4,0
- // y=frcpa(x)
+ alloc r32=ar.pfs,1,4,4,0
+ // y=frcpa(x)
frcpa.s1 f6,p0=f1,f8
- // will form significand of 1.5 (to test whether the index is 128 or above)
+ // will form significand of 1.5 (to test whether the index is 128 or above)
mov r24=0xc
}
{.mfi
nop.m 0
- // normalize x
+ // normalize x
fma.s1 f7=f8,f1,f0
// r2 = pointer to C_1...C_6 followed by T_table
addl r2 = @ltoff(poly_coeffs), gp;;
getf.sig r25=f8
// f8 denormal ?
fclass.m p8,p10=f8,0x9
- // will form significand of 1.5 (to test whether the index is 128 or above)
+ // will form significand of 1.5 (to test whether the index is 128 or above)
shl r24=r24,60
}
{.mfi
getf.exp r29=f8
// load start address for C_1...C_6 followed by T_table
ld8 r2=[r2]
- // will continue only for positive normal/denormal numbers
+ // will continue only for positive normal/denormal numbers
fclass.nm.unc p12,p7 = f8, 0x19 ;;
}
// load C_3, C_4
ldfpd f10,f11=[r2],16
nop.f 0
- // r27=bias-1 (if index >=128, will add exponent+1)
+ // r27=bias-1 (if index >=128, will add exponent+1)
(p12) mov r27=0xfffe;;
}
cmp.ltu p8,p12=r25,r26;;
}
{.mfi
- // f8=expon - bias
+ // f8=expon - bias
setf.sig f8=r29
nop.f 0
// get T address
SPECIAL_log2f:
-{.mfi
+{.mfi
nop.m 0
// x=+Infinity ?
fclass.m p7,p0=f8,0x21
(p7) br.ret.spnt b0;;
}
{.mfi
- (p8) mov GR_Parameter_TAG = 172
+ (p8) mov GR_Parameter_TAG = 172
// log2f(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
(p8) fmerge.ns f8=f0,f8
(p8) br.cond.sptk __libm_error_region;;
}
{.mfb
- (p6) mov GR_Parameter_TAG = 173
+ (p6) mov GR_Parameter_TAG = 173
// x<0: return NaN, raise Invalid
(p6) frcpa.s0 f8,p0=f0,f0
(p6) br.cond.sptk __libm_error_region;;
-}
-
+}
+
{.mfb
nop.m 0
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 09/25/00 Initial version
+// 09/25/00 Initial version
// 11/22/00 Fixed accuracy bug (for mantissas near 1, 2)
-// 12/07/00 Fixed C_1l constant, eliminated rounding errors in
+// 12/07/00 Fixed C_1l constant, eliminated rounding errors in
// reduced argument (x*frcpa(x)-1)
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// Implementation
//
// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
-// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8
-// T_hi is a table that stores the 24 most significant bits of log2(1/y)
+// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8
+// T_hi is a table that stores the 24 most significant bits of log2(1/y)
// (in entries 1..255) in single precision format
// T_low is a table that stores (log2(1/y)-T_high), rounded to double
-// precision
+// precision
//
// f is used as an index; T_high[255]=T_low[255]=0
-//
+//
// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
-// and 0 is used instead of T_high[0], T_low[0]
+// and 0 is used instead of T_high[0], T_low[0]
// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
// for m=2(1-r'), 0<=r'<2^{-9})
//
// log2l(x) is approximated as
// (l+T_high[f]+C1r) + (D+r*(c1+c2*r+c3*r^2...+c8*r^7)+(T_low[f]+C_1*E))
-//
+//
-// Special values
+// Special values
//==============================================================
// log2l(0)=-inf, raises Divide by Zero
// log2l(+inf)=inf
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35 // This reg. can safely be used
+GR_SAVE_GP = r35 // This reg. can safely be used
GR_SAVE_SP = r36
GR_Parameter_X = r37
data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
data8 0x3fca61762a7aded9, 0xbfc71547652b82fe // C_7, C_8
-data8 0x3fd2776c50ef9bfe, 0xbfcec709dc3a03fd // C_5, C_6
+data8 0x3fd2776c50ef9bfe, 0xbfcec709dc3a03fd // C_5, C_6
data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe // C_3, C_4
//data8 0xd871319ff0342580, 0x0000bfbd // C_1l (low part of C1)
data8 0x82f0025f2dc582ee, 0x0000bfbe // C_1l (low part of C1)
GLOBAL_IEEE754_ENTRY(log2l)
{ .mfi
- alloc r32=ar.pfs,1,4,4,0
- // normalize x
- // y=frcpa(x)
+ alloc r32=ar.pfs,1,4,4,0
+ // normalize x
+ // y=frcpa(x)
frcpa.s1 f41,p0=f1,f8
// r26=bias-1
mov r26=0xfffe
getf.exp r29=f8
// load start address for C_1...C_7 followed by T_table
ld8 r2=[r2]
- // will continue only for positive normal/unnormal numbers
- fclass.m.unc p0,p12 = f8, 0x19;;
+ // will continue only for positive normal/unnormal numbers
+ fclass.m.unc p0,p12 = f8, 0x19;;
}
}
{.mfb
- add r3=16,r2
+ add r3=16,r2
// r=x*y-1
fms.s1 f6=f41,f8,f1
(p12) br.cond.spnt SPECIAL_log2l
// add 1 to the exponent additive term, and estimate log2(1-r)
(p10) add r29=1,r29
nop.f 0
- (p7) br.cond.spnt LOG2_PSEUDO_ZERO
+ (p7) br.cond.spnt LOG2_PSEUDO_ZERO
}
{.mfi
- // get T_low adress
+ // get T_low adress
shladd r3=r28,3,r3
// if first 8 bits after leading 1 are all ones, use polynomial approx. only
(p10) fms.s1 f6=f7,f36,f1
.pred.rel "mutex",p8,p12
{.mfi
- // f8=expon - bias
+ // f8=expon - bias
setf.sig f8=r29
// general case: 2^{16}+C1*r
(p12) fma.s1 f33=f6,f14,f32
mov FR_X=f8
nop.i 0
}
-{.mfi
+{.mfi
nop.m 0
// x=+Infinity ?
fclass.m p7,p0=f8,0x21
(p7) br.ret.spnt b0;;
}
{.mfi
- (p8) mov GR_Parameter_TAG = 168
+ (p8) mov GR_Parameter_TAG = 168
// log2l(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
(p8) fmerge.ns f8=f0,f8
(p8) br.cond.sptk __libm_error_region;;
}
{.mfb
- (p6) mov GR_Parameter_TAG = 169
+ (p6) mov GR_Parameter_TAG = 169
// x<0: return NaN, raise Invalid
(p6) frcpa.s0 f8,p0=f0,f0
(p6) br.cond.sptk __libm_error_region;;
-}
-
+}
+
{.mfb
nop.m 0
nop.i 0
}
{.mfi
- mov GR_Parameter_TAG = 168
+ mov GR_Parameter_TAG = 168
// log2l(+/-0)=-infinity, raises Divide by Zero
// set f8=-0
fmerge.ns f8=f0,f8
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfe [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfe [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
-.file "logl.s"
+.file "logl.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
-// 05/21/01 Extracted logl and log10l from log1pl.s file, and optimized
+// History:
+// 05/21/01 Extracted logl and log10l from log1pl.s file, and optimized
// all paths.
// 06/20/01 Fixed error tag for x=-inf.
// 05/20/02 Cleaned up namespace and sf0 syntax
// IEEE Special Conditions:
//
// Denormal fault raised on denormal inputs
-// Overflow exceptions cannot occur
-// Underflow exceptions raised when appropriate for log1p
+// Overflow exceptions cannot occur
+// Underflow exceptions raised when appropriate for log1p
// (Error Handling Routine called for underflow)
// Inexact raised when appropriate by algorithm
//
// logl(inf) = inf
-// logl(-inf) = QNaN
-// logl(+/-0) = -inf
+// logl(-inf) = QNaN
+// logl(+/-0) = -inf
// logl(SNaN) = QNaN
// logl(QNaN) = QNaN
// logl(EM_special Values) = QNaN
// log10l(inf) = inf
-// log10l(-inf) = QNaN
-// log10l(+/-0) = -inf
+// log10l(-inf) = QNaN
+// log10l(+/-0) = -inf
// log10l(SNaN) = QNaN
// log10l(QNaN) = QNaN
// log10l(EM_special Values) = QNaN
// logl( 1 + X ) can be approximated by a simple polynomial
// in W = X-1. This polynomial resembles the truncated Taylor
// series W - W^/2 + W^3/3 - ...
-//
+//
// Case log_regular:
//
// Here we use a table lookup method. The basic idea is that in
-// order to compute logl(Arg) for an argument Arg in [1,2), we
+// order to compute logl(Arg) for an argument Arg in [1,2), we
// construct a value G such that G*Arg is close to 1 and that
// logl(1/G) is obtainable easily from a table of values calculated
// beforehand. Thus
//
// X = 2^N * S_hi exactly
//
-// where S_hi in [1,2)
+// where S_hi in [1,2)
//
// Step 1: Argument Reduction
//
// G := G_1 * G_2 * G_3
// r := (G * S_hi - 1)
//
-// These G_j's have the property that the product is exactly
+// These G_j's have the property that the product is exactly
// representable and that |r| < 2^(-12) as a result.
//
// Step 2: Approximation
//
// Here we compute a simple polynomial. To exploit parallelism, we split
// the polynomial into two portions.
-//
+//
// W := X - 1
// Wsq := W * W
// W4 := Wsq*Wsq
// Step 0. Initialization
// ----------------------
//
-// Z := X
+// Z := X
// N := unbaised exponent of Z
// S_hi := 2^(-N) * Z
//
// with 1.0000 in fixed point.
//
//
-// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
+// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
// truncated to lsb = 2^(-8). Similar to A_1,
// A_2 is not needed in actual implementation. It
// helps explain how some of the values are defined.
// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
// floating pt. Fetch is done using index_3.
//
-// Compute G := G_1 * G_2 * G_3.
+// Compute G := G_1 * G_2 * G_3.
//
// This is done exactly since each of G_j only has 21 sig. bits.
//
-// Compute
+// Compute
//
-// r := (G*S_hi - 1)
+// r := (G*S_hi - 1)
//
//
// Step 2. Approximation
// Finally
//
// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
-// Y_lo := poly_hi + [ poly_lo +
+// Y_lo := poly_hi + [ poly_lo +
// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
//
// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
-// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1
LOCAL_OBJECT_START(Constants_P)
data8 0xE3936754EFD62B15,0x00003FFB
data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD
LOCAL_OBJECT_END(Constants_P)
-// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
+// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
LOCAL_OBJECT_START(Constants_Q)
data8 0xB172180000000000,0x00003FFE
// Z1 - 16 bit fixed
-
+
LOCAL_OBJECT_START(Constants_Z_1)
data4 0x00008000
data4 0x00007879
data8 0xBE049391B6B7C239
LOCAL_OBJECT_END(Constants_G_H_h2)
-// G3 and H3 - IEEE single and h3 - IEEE double
+// G3 and H3 - IEEE single and h3 - IEEE double
LOCAL_OBJECT_START(Constants_G_H_h3)
data4 0x3F7FFC00,0x38800100
// Floating Point Registers
-FR_Input_X = f8
+FR_Input_X = f8
-FR_Y_hi = f34
+FR_Y_hi = f34
FR_Y_lo = f35
FR_Scale = f36
-FR_X_Prime = f37
-FR_S_hi = f38
+FR_X_Prime = f37
+FR_S_hi = f38
FR_W = f39
FR_G = f40
FR_H = f41
-FR_wsq = f42
+FR_wsq = f42
FR_w4 = f43
FR_h = f44
-FR_w6 = f45
+FR_w6 = f45
FR_G2 = f46
FR_H2 = f47
FR_poly_lo = f48
-FR_P8 = f49
+FR_P8 = f49
FR_poly_hi = f50
-FR_P7 = f51
-FR_h2 = f52
-FR_rsq = f53
+FR_P7 = f51
+FR_h2 = f52
+FR_rsq = f53
FR_P6 = f54
-FR_r = f55
-
-FR_log2_hi = f56
-FR_log2_lo = f57
-FR_p87 = f58
-FR_p876 = f58
-FR_p8765 = f58
-FR_float_N = f59
-FR_Q4 = f60
-
-FR_p43 = f61
-FR_p432 = f61
-FR_p4321 = f61
-FR_P4 = f62
-FR_G3 = f63
-FR_H3 = f64
-FR_h3 = f65
-
-FR_Q3 = f66
-FR_P3 = f67
-FR_Q2 = f68
-FR_P2 = f69
-FR_1LN10_hi = f70
-
-FR_Q1 = f71
-FR_P1 = f72
-FR_1LN10_lo = f73
-FR_P5 = f74
-FR_rcub = f75
-
-FR_Output_X_tmp = f76
+FR_r = f55
+
+FR_log2_hi = f56
+FR_log2_lo = f57
+FR_p87 = f58
+FR_p876 = f58
+FR_p8765 = f58
+FR_float_N = f59
+FR_Q4 = f60
+
+FR_p43 = f61
+FR_p432 = f61
+FR_p4321 = f61
+FR_P4 = f62
+FR_G3 = f63
+FR_H3 = f64
+FR_h3 = f65
+
+FR_Q3 = f66
+FR_P3 = f67
+FR_Q2 = f68
+FR_P2 = f69
+FR_1LN10_hi = f70
+
+FR_Q1 = f71
+FR_P1 = f72
+FR_1LN10_lo = f73
+FR_P5 = f74
+FR_rcub = f75
+
+FR_Output_X_tmp = f76
FR_X = f8
FR_Y = f0
// General Purpose Registers
GR_ad_p = r33
-GR_Index1 = r34
-GR_Index2 = r35
-GR_signif = r36
-GR_X_0 = r37
-GR_X_1 = r38
-GR_X_2 = r39
-GR_Z_1 = r40
-GR_Z_2 = r41
-GR_N = r42
-GR_Bias = r43
-GR_M = r44
-GR_Index3 = r45
+GR_Index1 = r34
+GR_Index2 = r35
+GR_signif = r36
+GR_X_0 = r37
+GR_X_1 = r38
+GR_X_2 = r39
+GR_Z_1 = r40
+GR_Z_2 = r41
+GR_N = r42
+GR_Bias = r43
+GR_M = r44
+GR_Index3 = r45
GR_ad_p2 = r46
-GR_exp_mask = r47
-GR_exp_2tom7 = r48
-GR_ad_ln10 = r49
+GR_exp_mask = r47
+GR_exp_2tom7 = r48
+GR_ad_ln10 = r49
GR_ad_tbl_1 = r50
GR_ad_tbl_2 = r51
GR_ad_tbl_3 = r52
// Common code for logl and log10
-LOGL_BEGIN:
+LOGL_BEGIN:
{ .mfi
ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
fclass.m p10, p0 = FR_Input_X, 0x0b // Test for denormal
{ .mmi
ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
(p14) ldfe FR_1LN10_hi = [GR_ad_ln10],16 // If log10l, load 1/ln10_hi
- sub GR_N = GR_N, GR_Bias
+ sub GR_N = GR_N, GR_Bias
}
;;
{ .mmi
getf.exp GR_M = FR_W // Get signexp of w = x - 1
ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
- extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
}
;;
{ .mfi
nop.m 999
-(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo
+(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo
nop.i 999
}
;;
// Here if x=+-0
-LOGL_64_zero:
+LOGL_64_zero:
//
// If x=+-0 raise divide by zero and return -inf
-//
+//
{ .mfi
(p7) mov GR_Parameter_TAG = 0
- fsub.s1 FR_Output_X_tmp = f0, f1
+ fsub.s1 FR_Output_X_tmp = f0, f1
nop.i 999
}
;;
{ .mfb
-(p14) mov GR_Parameter_TAG = 6
- frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
+(p14) mov GR_Parameter_TAG = 6
+ frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
br.cond.sptk __libm_error_region
}
;;
-LOGL_64_special:
+LOGL_64_special:
{ .mfi
nop.m 999
fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf
}
;;
-//
+//
// For SNaN raise invalid and return QNaN.
// For QNaN raise invalid and return QNaN.
// For +Inf return +Inf.
-//
+//
{ .mfb
nop.m 999
-(p8) fmpy.s0 f8 = FR_Input_X, f1
+(p8) fmpy.s0 f8 = FR_Input_X, f1
(p8) br.ret.sptk b0 // Return for natval, nan, +inf
}
;;
-//
+//
// For -Inf raise invalid and return QNaN.
-//
+//
{ .mmi
(p7) mov GR_Parameter_TAG = 1
nop.m 999
{ .mfb
(p14) mov GR_Parameter_TAG = 7
- fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0
+ fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0
br.cond.sptk __libm_error_region
}
;;
}
;;
-LOGL_64_unsupported:
-//
+LOGL_64_unsupported:
+//
// Return generated NaN or other value.
-//
+//
{ .mfb
nop.m 999
- fmpy.s0 f8 = FR_Input_X, f0
+ fmpy.s0 f8 = FR_Input_X, f0
br.ret.sptk b0
}
;;
// Here if -inf < x < 0
-LOGL_64_negative:
-//
+LOGL_64_negative:
+//
// Deal with x < 0 in a special way - raise
// invalid and produce QNaN indefinite.
-//
+//
{ .mfi
(p7) mov GR_Parameter_TAG = 1
frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
// 0.1...11 2^-3ffe (biased, 1)
// largest dn smallest normal
-// Form small constant (2^-170) to correct underflow result near region of
+// Form small constant (2^-170) to correct underflow result near region of
// smallest denormal in round-nearest.
// Put in s2 (td set, ftz set)
mov pow_GR_rcs0_mask = 0x0c00 // Set mask for rc.s0
}
{ .mfi
-(p12) mov pow_GR_tmp = 0x2ffff - 170
+(p12) mov pow_GR_tmp = 0x2ffff - 170
nop.f 999
-(p13) mov pow_GR_tmp = 0x0ffff - 170
+(p13) mov pow_GR_tmp = 0x0ffff - 170
}
;;
//
// API
//====================================================================
-// double remainder(double,double);
+// double remainder(double,double);
//
// Overview of operation
//====================================================================
// remainder(a,b)=a-i*b,
-// where i is an integer such that, if b!=0 and a is finite,
+// where i is an integer such that, if b!=0 and a is finite,
// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even.
//
// Algorithm
// a). eliminate special cases
// b). if |a/b|<0.25 (first quotient estimate), return a
// c). use single precision divide algorithm to get quotient q
-// rounded to 24 bits of precision
-// d). calculate partial remainders (using both q and q-ulp);
-// select one and RZ(a/b) based on the sign of |a|-|b|*q
+// rounded to 24 bits of precision
+// d). calculate partial remainders (using both q and q-ulp);
+// select one and RZ(a/b) based on the sign of |a|-|b|*q
// e). if the exponent difference (exponent(a)-exponent(b))
-// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b)
+// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b)
// and sticky bits to round to integer; exit loop and
// calculate final remainder
// f). if exponent(a)-exponent(b)>=24, select new value of a as
-// the partial remainder calculated using RZ(a/b);
-// repeat from c).
+// the partial remainder calculated using RZ(a/b);
+// repeat from c).
//
// Special cases
//====================================================================
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
// Y +-NAN, +-inf, +-0? p11
{ .mfi
setf.exp f32=r28
- fclass.m.unc p11,p0 = f9, 0xe7
+ fclass.m.unc p11,p0 = f9, 0xe7
nop.i 999
}
// qnan snan inf norm unorm 0 -+
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
- fclass.m.unc p9,p0 = f8, 0xe3
- nop.i 999;;
+ fclass.m.unc p9,p0 = f8, 0xe3
+ nop.i 999;;
}
{.mfi
// y0 = 1 / b in f10
frcpa.s1 f10,p6=f13,f14
nop.i 0;;
-}
+}
{.bbb
(p9) br.cond.spnt FREM_X_NAN_INF
// set D flag if a (f8) is denormal
fnma.s0 f6=f8,f1,f8
nop.i 0;;
-}
+}
-remloop24:
+remloop24:
{ .mfi
nop.m 0
// Step (2)
nop.m 0
// r2=1.25*2^{-24}
movl r2=0x33a00000;;
-}
+}
{.mfi
nop.m 0
// q2 = q1 + e1 * q1 in f6
(p6) fma.s1 f6=f7,f15,f15
nop.i 0;;
-}
+}
{.mmi
// f15=1.25*2^{-24}
setf.s f15=r2
- // q<1/4 ? (i.e. expon< -2)
+ // q<1/4 ? (i.e. expon< -2)
(p7) cmp.gt p7,p0=r28,r29
nop.i 0;;
}
{.mfb
// r29= -32+bias
mov r29=0xffdf
- // if |a/b|<1/4, set D flag before returning
+ // if |a/b|<1/4, set D flag before returning
(p7) fma.d.s0 f9=f9,f0,f8
nop.b 0;;
}
// set f8 to current a value | sign
fmerge.s f8=f8,f13
nop.i 0;;
-}
+}
{.mfi
nop.m 0
cmp.eq p11,p14=r2,r28
nop.i 0;;
-}
+}
.pred.rel "mutex",p11,p14
{.mfi
// if exp_q=2^23, then r=a-b*2^{23}
(p11) fnma.s1 f13=f12,f14,f13
nop.i 0
-}
+}
{.mfi
nop.m 0
// r2=a-b*q'
.pred.rel "mutex",p8,p9
{.mfi
- nop.m 0
+ nop.m 0
// (p8) Q=q+(last iteration ? sticky bits:0)
// i.e. Q=q+q*x (x=2^{-32} or 0)
(p8) fma.s1 f11=f11,f7,f11
// (p10) new a =r
(p10) mov f13=f6
(p12) br.cond.sptk remloop24;;
-}
+}
// last iteration
{.mfi
// save sign of a
fmerge.s f7=f8,f8
nop.i 0
-} {.mfi
+} {.mfi
nop.m 0
// normalize
fcvt.xf f11=f11
nop.i 0;;
-}
+}
{.mfi
nop.m 0
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
// get remainder using sf1
fnma.d.s1 f12=f9,f11,f8
nop.i 0
{.mfi
nop.m 0
// f12=0?
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
fcmp.eq.unc.s1 p8,p0=f12,f0
nop.i 0;;
}
{.mfb
nop.m 0
// if f8=0, set sign correctly
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
(p8) fmerge.s f8=f7,f8
// return
br.ret.sptk b0;;
}
-FREM_X_NAN_INF:
+FREM_X_NAN_INF:
// Y zero ?
-{.mfi
+{.mfi
nop.m 0
fma.s1 f10=f9,f1,f0
nop.i 0;;
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt FREM_Y_ZERO;;
+ (p11) br.cond.spnt FREM_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
- fclass.m.unc p8,p0 = f8, 0x23
+ fclass.m.unc p8,p0 = f8, 0x23
nop.i 999
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
- fclass.m.unc p11,p0 = f8, 0x23
- nop.i 999;;
+ fclass.m.unc p11,p0 = f8, 0x23
+ nop.i 999;;
}
// Y NaN ?
{.mfi
// also set Denormal flag if necessary
(p8) fma.s0 f9=f9,f1,f0
nop.i 0
-}
+}
{ .mfi
nop.m 999
-(p8) frcpa.s0 f8,p7 = f8,f8
+(p8) frcpa.s0 f8,p7 = f8,f8
nop.i 999 ;;
}
}
{ .mfi
nop.m 999
-(p8) fma.d.s0 f8=f8,f1,f0
- nop.i 0 ;;
+(p8) fma.d.s0 f8=f8,f1,f0
+ nop.i 0 ;;
}
{ .mfb
nop.m 999
- frcpa.s0 f8,p7=f8,f9
- (p11) br.cond.spnt EXP_ERROR_RETURN;;
+ frcpa.s0 f8,p7=f8,f9
+ (p11) br.cond.spnt EXP_ERROR_RETURN;;
}
{ .mib
nop.m 0
nop.i 0
- br.ret.spnt b0 ;;
+ br.ret.spnt b0 ;;
}
-FREM_Y_NAN_INF_ZERO:
+FREM_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
- fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.d.s0 f8=f8,f1,f0
-(p7) br.ret.spnt b0 ;;
+(p7) fma.d.s0 f8=f8,f1,f0
+(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
- fclass.m.unc p9,p0 = f9, 0xc3
+ fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.d.s0 f8=f9,f1,f0
-(p9) br.ret.spnt b0 ;;
+(p9) fma.d.s0 f8=f9,f1,f0
+(p9) br.ret.spnt b0 ;;
}
FREM_Y_ZERO:
// X NAN?
{ .mfi
nop.m 999
- fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p10 = f8, 0xff
+(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) frcpa.s0 f11,p7 = f0,f0
- nop.i 999;;
+(p10) frcpa.s0 f11,p7 = f0,f0
+ nop.i 999;;
}
{ .mfi
nop.m 999
- fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
- fma.d.s0 f8=f11,f1,f0
+ fma.d.s0 f8=f11,f1,f0
nop.i 999
}
-EXP_ERROR_RETURN:
+EXP_ERROR_RETURN:
{ .mib
- mov GR_Parameter_TAG = 124
+ mov GR_Parameter_TAG = 124
nop.i 999
- br.sptk __libm_error_region;;
+ br.sptk __libm_error_region;;
}
GLOBAL_IEEE754_END(remainder)
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
// History
//====================================================================
// 02/02/00 Initial version
-// 03/02/00 New algorithm
+// 03/02/00 New algorithm
// 04/04/00 Unwind support added
// 07/21/00 Fixed quotient=2^{24*m+23} bug
// 08/15/00 Bundle added after call to __libm_error_support to properly
//
// API
//====================================================================
-// float remainderf(float,float);
+// float remainderf(float,float);
//
// Overview of operation
//====================================================================
// remainder(a,b)=a-i*b,
-// where i is an integer such that, if b!=0 and a is finite,
+// where i is an integer such that, if b!=0 and a is finite,
// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even.
//
// Algorithm
// a). eliminate special cases
// b). if |a/b|<0.25 (first quotient estimate), return a
// c). use single precision divide algorithm to get quotient q
-// rounded to 24 bits of precision
-// d). calculate partial remainders (using both q and q-ulp);
-// select one and RZ(a/b) based on the sign of |a|-|b|*q
+// rounded to 24 bits of precision
+// d). calculate partial remainders (using both q and q-ulp);
+// select one and RZ(a/b) based on the sign of |a|-|b|*q
// e). if the exponent difference (exponent(a)-exponent(b))
-// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b)
+// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b)
// and sticky bits to round to integer; exit loop and
// calculate final remainder
// f). if exponent(a)-exponent(b)>=24, select new value of a as
-// the partial remainder calculated using RZ(a/b);
-// repeat from c).
+// the partial remainder calculated using RZ(a/b);
+// repeat from c).
//
// Special cases
//====================================================================
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
// Y +-NAN, +-inf, +-0? p11
{ .mfi
nop.m 999
- fclass.m.unc p11,p0 = f9, 0xe7
+ fclass.m.unc p11,p0 = f9, 0xe7
nop.i 999
}
// qnan snan inf norm unorm 0 -+
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
- fclass.m.unc p9,p0 = f8, 0xe3
- nop.i 999;;
+ fclass.m.unc p9,p0 = f8, 0xe3
+ nop.i 999;;
}
{.mfi
// y0 = 1 / b in f10
frcpa.s1 f10,p6=f13,f14
nop.i 0;;
-}
+}
{.bbb
(p9) br.cond.spnt FREM_X_NAN_INF
(p11) br.cond.spnt FREM_Y_NAN_INF_ZERO
// set D flag if a (f8) is denormal
fnma.s0 f6=f8,f1,f8
nop.i 0;;
-}
+}
.align 32
-remloop24:
+remloop24:
{ .mfi
// f12=2^{24}-2
setf.s f12=r3
// q0 = a * y0 in f15
(p6) fma.s1 f15=f13,f10,f0
nop.i 0
-}
+}
{ .mfi
nop.m 0
// Step (3)
// e0 = 1 - b * y0 in f7
(p6) fnma.s1 f7=f14,f10,f1
nop.i 0;;
-}
+}
{.mlx
nop.m 0
// r2=1.25*2^{-24}
movl r2=0x33a00000;;
-}
+}
{ .mfi
nop.m 0
// Step (4)
// q1 = q0 + e0 * q0 in f6
(p6) fma.s1 f6=f7,f15,f15
nop.i 0
-}
+}
{ .mfi
nop.m 0
// Step (5)
// q2 = q1 + e1 * q1 in f6
(p6) fma.s1 f6=f7,f6,f6
nop.i 0
-}
+}
{ .mfi
mov r2=0x3e7
// Step (7)
// e2 = e1 * e1 in f7
(p6) fma.s1 f7=f7,f7,f0
nop.i 0;;
-}
+}
{.mmi
- // q<1/4 ? (i.e. expon< -2)
+ // q<1/4 ? (i.e. expon< -2)
(p7) cmp.gt.unc p7,p0=r28,r29
nop.m 0
// r2=0x3e7000000
{.mfb
// r2=0x3e7000001
add r2=1,r2
- // if |a/b|<1/4, set D flag before returning
+ // if |a/b|<1/4, set D flag before returning
(p7) fma.s.s0 f9=f9,f0,f8
nop.b 0;;
}
fmerge.s f8=f8,f13
// r2=2^{-24}+2^{-48} (double prec.)
shl r2=r2,28;;
-}
+}
{ .mfi
// q3 = q2 + e2 * q2 in f6
(p6) fma.d.s1 f6=f7,f6,f6
nop.i 0;;
-}
+}
{ .mfi
nop.m 0
// Step (9)
// q = q3 in f11
(p6) fma.s.s1 f11=f6,f1,f0
nop.i 0;;
-}
+}
{.mfi
// f7=2^{-24}
setf.d f7=r2
// r=a-b*q
fnma.s1 f6=f14,f11,f13
nop.i 0
-}
+}
{.mfi
nop.m 0
// q'=q-q*(1.25*2^{-24}) (q'=q-ulp)
// r>0 iff q=RZ(a/b) and inexact
fcmp.gt.unc.s1 p8,p0=f6,f0
nop.i 0
-}
+}
{.mfi
nop.m 0
// r<0 iff q'=RZ(a/b) and inexact
// i.e. Q=q+q*x (x=2^{-32} or 0)
(p8) fma.s1 f11=f11,f12,f11
nop.i 0
-}
+}
{.mfi
nop.m 0
// (p9) Q=q'+(last iteration ? sticky bits:0)
// (p10) new a =r
(p10) mov f13=f6
(p12) br.cond.sptk remloop24;;
-}
+}
// last iteration
{.mfi
// save sign of a
fmerge.s f7=f8,f8
nop.i 0
-}
-{.mfi
+}
+{.mfi
nop.m 0
// normalize
fcvt.xf f11=f11
nop.i 0;;
-}
+}
{.mfi
nop.m 0
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
// get remainder using sf1
fnma.s.s1 f12=f9,f11,f8
nop.i 0
{.mfi
nop.m 0
// f12=0?
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
fcmp.eq.unc.s1 p8,p0=f12,f0
nop.i 0;;
}
{.mfb
nop.m 0
// if f8=0, set sign correctly
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
(p8) fmerge.s f8=f7,f8
// return
br.ret.sptk b0;;
}
-FREM_X_NAN_INF:
+FREM_X_NAN_INF:
// Y zero ?
-{.mfi
+{.mfi
nop.m 0
fma.s1 f10=f9,f1,f0
nop.i 0;;
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt FREM_Y_ZERO;;
+ (p11) br.cond.spnt FREM_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
- fclass.m.unc p8,p0 = f8, 0x23
+ fclass.m.unc p8,p0 = f8, 0x23
nop.i 999
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
- fclass.m.unc p11,p0 = f8, 0x23
- nop.i 999;;
+ fclass.m.unc p11,p0 = f8, 0x23
+ nop.i 999;;
}
// Y NaN ?
{.mfi
// also set Denormal flag if necessary
(p8) fma.s0 f9=f9,f1,f0
nop.i 0
-}
+}
{ .mfi
nop.m 999
-(p8) frcpa.s0 f8,p7 = f8,f8
+(p8) frcpa.s0 f8,p7 = f8,f8
nop.i 999 ;;
}
}
{ .mfi
nop.m 999
-(p8) fma.s.s0 f8=f8,f1,f0
- nop.i 0 ;;
+(p8) fma.s.s0 f8=f8,f1,f0
+ nop.i 0 ;;
}
{ .mfb
nop.m 999
- frcpa.s0 f8,p7=f8,f9
- (p11) br.cond.spnt EXP_ERROR_RETURN;;
+ frcpa.s0 f8,p7=f8,f9
+ (p11) br.cond.spnt EXP_ERROR_RETURN;;
}
{ .mib
nop.m 0
nop.i 0
- br.ret.spnt b0 ;;
+ br.ret.spnt b0 ;;
}
-FREM_Y_NAN_INF_ZERO:
+FREM_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
- fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.s.s0 f8=f8,f1,f0
-(p7) br.ret.spnt b0 ;;
+(p7) fma.s.s0 f8=f8,f1,f0
+(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
- fclass.m.unc p9,p0 = f9, 0xc3
+ fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.s.s0 f8=f9,f1,f0
-(p9) br.ret.spnt b0 ;;
+(p9) fma.s.s0 f8=f9,f1,f0
+(p9) br.ret.spnt b0 ;;
}
FREM_Y_ZERO:
// X NAN?
{ .mfi
nop.m 999
- fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p10 = f8, 0xff
+(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) frcpa.s0 f11,p7 = f0,f0
+(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
- fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
- fma.s.s0 f8=f11,f1,f0
+ fma.s.s0 f8=f11,f1,f0
nop.i 999
}
-EXP_ERROR_RETURN:
+EXP_ERROR_RETURN:
{ .mib
- mov GR_Parameter_TAG = 125
+ mov GR_Parameter_TAG = 125
nop.i 999
- br.sptk __libm_error_region;;
+ br.sptk __libm_error_region;;
}
GLOBAL_IEEE754_END(remainderf)
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support#;; // Call error handling function
}
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
// History
//====================================================================
// 02/02/00 Initial version
-// 03/02/00 New algorithm
+// 03/02/00 New algorithm
// 04/04/00 Unwind support added
// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug
// 08/15/00 Bundle added after call to __libm_error_support to properly
//
// API
//====================================================================
-// long double remainderl(long double,long double);
+// long double remainderl(long double,long double);
//
// Overview of operation
//====================================================================
// remainder(a,b)=a-i*b,
-// where i is an integer such that, if b!=0 and a is finite,
+// where i is an integer such that, if b!=0 and a is finite,
// |a/b-i|<=1/2. If |a/b-i|=1/2, i is even.
//
// Algorithm
// a). eliminate special cases
// b). if |a/b|<0.25 (first quotient estimate), return a
// c). use single precision divide algorithm to get quotient q
-// rounded to 24 bits of precision
-// d). calculate partial remainders (using both q and q-ulp);
-// select one and RZ(a/b) based on the sign of |a|-|b|*q
+// rounded to 24 bits of precision
+// d). calculate partial remainders (using both q and q-ulp);
+// select one and RZ(a/b) based on the sign of |a|-|b|*q
// e). if the exponent difference (exponent(a)-exponent(b))
-// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b)
+// is less than 24 (quotient estimate<2^{24}-2), use RZ(a/b)
// and sticky bits to round to integer; exit loop and
// calculate final remainder
// f). if exponent(a)-exponent(b)>=24, select new value of a as
-// the partial remainder calculated using RZ(a/b);
-// repeat from c).
+// the partial remainder calculated using RZ(a/b);
+// repeat from c).
//
// Special cases
//====================================================================
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
// Y +-NAN, +-inf, +-0? p11
{ .mfi
nop.m 999
-(p10) fclass.m p11,p10 = f9, 0xe7
+(p10) fclass.m p11,p10 = f9, 0xe7
nop.i 999
}
// qnan snan inf norm unorm 0 -+
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
- fclass.m.unc p9,p8 = f8, 0xe3
- nop.i 999;;
+ fclass.m.unc p9,p8 = f8, 0xe3
+ nop.i 999;;
}
{.mfi
// y0 = 1 / b in f10
frcpa.s1 f10,p6=f13,f14
nop.i 0;;
-}
+}
// Y +-NAN, +-inf, +-0? p11
{ .mfi
nop.m 999
// pseudo-NaN ?
-(p10) fclass.nm p11,p0 = f9, 0xff
+(p10) fclass.nm p11,p0 = f9, 0xff
nop.i 999
}
{ .mfi
nop.m 999
-(p8) fclass.nm p9,p0 = f8, 0xff
+(p8) fclass.nm p9,p0 = f8, 0xff
nop.i 999;;
}
// set D flag if a (f8) is denormal
fnma.s0 f6=f8,f1,f8
nop.i 0;;
-}
+}
-remloop24:
+remloop24:
{ .mfi
nop.m 0
// Step (2)
nop.m 0
// r2=1.25*2^{-24}
movl r2=0x33a00000;;
-}
+}
{.mfi
nop.m 0
// q2 = q1 + e1 * q1 in f6
(p6) fma.s1 f6=f7,f15,f15
nop.i 0;;
-}
+}
{.mmi
// f15=1.25*2^{-24}
setf.s f15=r2
- // q<1/4 ? (i.e. expon< -2)
+ // q<1/4 ? (i.e. expon< -2)
(p7) cmp.gt p7,p0=r28,r29
nop.i 0;;
}
{.mfb
// r29= -32+bias
mov r29=0xffdf
- // if |a/b|<1/4, set D flag before returning
+ // if |a/b|<1/4, set D flag before returning
(p7) fma.s0 f9=f9,f0,f8
nop.b 0;;
}
// set f8 to current a value | sign
fmerge.s f8=f8,f13
nop.i 0;;
-}
+}
{.mfi
getf.exp r28=f6
// last step ? (q<2^{23})
nop.m 0
cmp.eq p11,p14=r2,r28
nop.i 0;;
-}
+}
.pred.rel "mutex",p11,p14
{.mfi
// if exp_q=2^23, then r=a-b*2^{23}
(p11) fnma.s1 f13=f12,f14,f13
nop.i 0
-}
+}
{.mfi
nop.m 0
// r2=a-b*q'
.pred.rel "mutex",p8,p9
{.mfi
- nop.m 0
+ nop.m 0
// (p8) Q=q+(last iteration ? sticky bits:0)
// i.e. Q=q+q*x (x=2^{-32} or 0)
(p8) fma.s1 f11=f11,f7,f11
// (p10) new a =r
(p10) mov f13=f6
(p12) br.cond.sptk remloop24;;
-}
+}
// last iteration
{.mfi
// save sign of a
fmerge.s f7=f8,f8
nop.i 0
-} {.mfi
+} {.mfi
nop.m 0
// normalize
fcvt.xf f11=f11
nop.i 0;;
-}
+}
{.mfi
nop.m 0
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
// get remainder using sf1
fnma.s1 f12=f9,f11,f8
nop.i 0
{.mfi
nop.m 0
// f12=0?
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
fcmp.eq.unc.s1 p8,p0=f12,f0
nop.i 0;;
}
{.mfb
nop.m 0
// if f8=0, set sign correctly
- // This can be removed if sign of 0 is not important
+ // This can be removed if sign of 0 is not important
(p8) fmerge.s f8=f7,f8
// return
br.ret.sptk b0;;
-FREM_X_NAN_INF:
+FREM_X_NAN_INF:
// Y zero ?
-{.mfi
+{.mfi
nop.m 0
fma.s1 f10=f9,f1,f0
nop.i 0;;
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt FREM_Y_ZERO;;
+ (p11) br.cond.spnt FREM_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
- fclass.m.unc p8,p0 = f8, 0x23
+ fclass.m.unc p8,p0 = f8, 0x23
nop.i 999
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
- fclass.m.unc p11,p0 = f8, 0x23
- nop.i 999;;
+ fclass.m.unc p11,p0 = f8, 0x23
+ nop.i 999;;
}
// Y NaN ?
{.mfi
// also set Denormal flag if necessary
(p8) fnma.s0 f9=f9,f1,f9
nop.i 0
-}
+}
{ .mfi
nop.m 999
-(p8) frcpa.s0 f8,p7 = f8,f8
+(p8) frcpa.s0 f8,p7 = f8,f8
nop.i 999 ;;
}
}
{ .mfi
nop.m 999
-(p8) fma.s0 f8=f8,f1,f0
- nop.i 0 ;;
+(p8) fma.s0 f8=f8,f1,f0
+ nop.i 0 ;;
}
{ .mfb
nop.m 999
- frcpa.s0 f8,p7=f8,f9
- (p11) br.cond.spnt EXP_ERROR_RETURN;;
+ frcpa.s0 f8,p7=f8,f9
+ (p11) br.cond.spnt EXP_ERROR_RETURN;;
}
{ .mib
nop.m 0
nop.i 0
- br.ret.spnt b0 ;;
+ br.ret.spnt b0 ;;
}
-FREM_Y_NAN_INF_ZERO:
+FREM_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
- fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.s0 f8=f8,f1,f0
-(p7) br.ret.spnt b0 ;;
+(p7) fma.s0 f8=f8,f1,f0
+(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
- fclass.m.unc p9,p10 = f9, 0xc3
+ fclass.m.unc p9,p10 = f9, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p0 = f9, 0xff
+(p10) fclass.nm p9,p0 = f9, 0xff
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.s0 f8=f9,f1,f0
-(p9) br.ret.spnt b0 ;;
+(p9) fma.s0 f8=f9,f1,f0
+(p9) br.ret.spnt b0 ;;
}
FREM_Y_ZERO:
// X NAN?
{ .mfi
nop.m 999
- fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p10 = f8, 0xff
+(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
}
{ .mfi
nop.m 999
-(p10) frcpa.s0 f11,p7 = f0,f0
+(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
- fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
- fma.s0 f8=f11,f1,f0
+ fma.s0 f8=f11,f1,f0
nop.i 999;;
}
-EXP_ERROR_RETURN:
+EXP_ERROR_RETURN:
{ .mib
- mov GR_Parameter_TAG = 123
+ mov GR_Parameter_TAG = 123
nop.i 999
- br.sptk __libm_error_region;;
+ br.sptk __libm_error_region;;
}
GLOBAL_IEEE754_END(remainderl)
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
SCALB_NAN_INF_ZERO:
//
-// Before entry, N has been converted to a fp integer in significand of
+// Before entry, N has been converted to a fp integer in significand of
// FR_N_float_int
//
// Convert N_float_int to floating point value
SCALBF_NAN_INF_ZERO:
//
-// Before entry, N has been converted to a fp integer in significand of
+// Before entry, N has been converted to a fp integer in significand of
// FR_N_float_int
//
// Convert N_float_int to floating point value
SCALBL_NAN_INF_ZERO:
//
-// Before entry, N has been converted to a fp integer in significand of
+// Before entry, N has been converted to a fp integer in significand of
// FR_N_float_int
//
// Convert N_float_int to floating point value
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 10/12/00 Update to set denormal operand and underflow flags
-// 01/22/01 Fixed to set inexact flag for small args. Fixed incorrect
+// 01/22/01 Fixed to set inexact flag for small args. Fixed incorrect
// call to __libm_error_support for 710.476 < x < 11357.2166.
// 05/02/01 Reworked to improve speed of all paths
// 05/20/02 Cleaned up namespace and sf0 syntax
//
// Registers used
//==============================================================
-// general registers:
+// general registers:
// r14 -> r40
// predicate registers used:
// p6 -> p11
// floating-point registers used:
-// f9 -> f15; f32 -> f90;
+// f9 -> f15; f32 -> f90;
// f8 has input, then output
//
// Overview of operation
// 1. SINH_BY_POLY 0 < |x| < 0.25
// ===============
// Evaluate sinh(x) by a 13th order polynomial
-// Care is take for the order of multiplication; and P_1 is not exactly 1/3!,
+// Care is take for the order of multiplication; and P_1 is not exactly 1/3!,
// P_2 is not exactly 1/5!, etc.
// sinh(x) = sign * (series(e^x) - series(e^-x))/2
// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11!
// =============
// sinh(x) = sinh(B+R)
// = sinh(B)cosh(R) + cosh(B)sinh(R)
-//
+//
// ax = |x| = M*log2/64 + R
// B = M*log2/64
-// M = 64*N + j
+// M = 64*N + j
// We will calculate M and get N as (M-j)/64
// The division is a shift.
// exp(B) = exp(N*log2 + j*log2/64)
// = 2^N * 2^(j*log2/64)
// sinh(B) = 1/2(e^B -e^-B)
-// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
-// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
-// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
+// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
+// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
+// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
//
// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
// = 1 + p_odd + p_even
-// where the p_even uses the A coefficients and the p_even uses
+// where the p_even uses the A coefficients and the p_even uses
// the B coefficients
//
// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
GR_Parameter_TAG = r40
-f_ABS_X = f9
+f_ABS_X = f9
f_X2 = f10
f_X4 = f11
f_tmp = f14
f_S_hi = f69
f_SC_hi_temp = f70
-f_S_lo_temp1 = f71
-f_S_lo_temp2 = f72
-f_S_lo_temp3 = f73
-f_S_lo_temp4 = f73
+f_S_lo_temp1 = f71
+f_S_lo_temp2 = f72
+f_S_lo_temp3 = f73
+f_S_lo_temp4 = f73
f_S_lo = f74
f_C_hi = f75
-f_Y_hi = f77
-f_Y_lo_temp = f78
-f_Y_lo = f79
+f_Y_hi = f77
+f_Y_lo_temp = f78
+f_Y_lo = f79
f_NORM_X = f80
f_P1 = f81
}
{ .mfi
nop.m 0
- fnorm.s1 f_NORM_X = f8
+ fnorm.s1 f_NORM_X = f8
mov r_exp_2tom57 = 0xffff-57
}
;;
{ .mfi
setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
fclass.m p10,p0 = f8, 0x0b // Test for denorm
- mov r_exp_mask = 0x1ffff
+ mov r_exp_mask = 0x1ffff
}
{ .mlx
setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
}
{ .mib
- ldfe f_log2by64_hi = [r_ad1],16
+ ldfe f_log2by64_hi = [r_ad1],16
and r_exp_x = r_exp_mask, r_signexp_x
(p7) br.ret.spnt b0 // Exit if x=0
}
// Get the A coefficients for SINH_BY_TBL
{ .mfi
- ldfe f_A1 = [r_ad3],16
+ ldfe f_A1 = [r_ad3],16
fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0
cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25
}
{ .mfb
add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs
-(p6) fma.s0 f8 = f8,f1,f0 // Result for x nan, inf
+(p6) fma.s0 f8 = f8,f1,f0 // Result for x nan, inf
(p6) br.ret.spnt b0 // Exit for x nan, inf
}
;;
// Calculate X2 = ax*ax for SINH_BY_POLY
{ .mfi
- ldfe f_log2by64_lo = [r_ad1],16
+ ldfe f_log2by64_lo = [r_ad1],16
nop.f 0
nop.i 0
}
{ .mfb
- ldfe f_A2 = [r_ad3],16
+ ldfe f_A2 = [r_ad3],16
fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0
(p7) br.cond.spnt SINH_BY_POLY
}
;;
// Here if |x| >= 0.25
-SINH_BY_TBL:
+SINH_BY_TBL:
// ******************************************************
// STEP 1 (TBL and EXP) - Argument reduction
// ******************************************************
-// Get the following constants.
+// Get the following constants.
// Inv_log2by64
// log2by64_hi
// log2by64_lo
// Subtract RSHF constant to get rounded M as a floating point value
// M_temp * 2^(63-6) - 2^63
{ .mfb
- ldfe f_B3 = [r_ad3],16
+ ldfe f_B3 = [r_ad3],16
fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF
(p6) br.cond.spnt SINH_HUGE // Branch if result will overflow
}
;;
{ .mfi
- getf.sig r_M = f_M_temp
+ getf.sig r_M = f_M_temp
nop.f 0
cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
}
;;
-// Calculate j. j is the signed extension of the six lsb of M. It
+// Calculate j. j is the signed extension of the six lsb of M. It
// has a range of -32 thru 31.
// Calculate R
// N = (M-j)/64
{ .mfi
ldfe f_Tjhi = [r_ad_J_hi]
- fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
- shr r_N = r_Mmj, 0x6 // N = (M-j)/64
+ fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
+ shr r_N = r_Mmj, 0x6 // N = (M-j)/64
}
{ .mfi
shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
}
;;
-//
-// If TBL,
+//
+// If TBL,
// Calculate S_hi and S_lo, and C_hi
// SC_hi_temp = sneg * Tmjhi
// S_hi = spos * Tjhi - SC_hi_temp
{ .mfi
nop.m 0
-(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
+(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
nop.i 0
}
;;
-// If TBL,
+// If TBL,
// S_lo_temp3 = sneg * Tmjlo
// S_lo_temp4 = spos * Tjlo - S_lo_temp3
// S_lo_temp4 = spos * Tjlo -(sneg * Tmjlo)
}
;;
-// If EXP,
+// If EXP,
// Compute sgnx * 2^(N-1) * Tjhi and sgnx * 2^(N-1) * Tjlo
{ .mfi
nop.m 0
{ .mfi
nop.m 0
-(p6) fnma.s1 f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1
+(p6) fnma.s1 f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1
nop.i 0
}
;;
;;
// If TBL,
-// Y_hi = S_hi
+// Y_hi = S_hi
// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo)
{ .mfi
nop.m 0
// Here if 0 < |x| < 0.25
-SINH_BY_POLY:
+SINH_BY_POLY:
{ .mmf
ldfe f_P6 = [r_ad2e],16
ldfe f_P5 = [r_ad2o],16
{ .mmi
ldfe f_P2 = [r_ad2e],16
- ldfe f_P1 = [r_ad2o],16
+ ldfe f_P1 = [r_ad2o],16
nop.i 0
}
;;
{ .mfi
nop.m 0
(p6) fma.s0 f8 = f8,f8,f8 // If x +denorm, result=x+x^2
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
// Here if |x| >= overflow limit
-SINH_HUGE:
+SINH_HUGE:
// for SINH_HUGE, put 24000 in exponent; take sign from input
{ .mmi
mov r_exp_huge = 0x15dbf
.pred.rel "mutex",p8,p9
{ .mfi
- alloc r32 = ar.pfs,0,5,4,0
+ alloc r32 = ar.pfs,0,5,4,0
(p8) fnma.s1 f_signed_hi_lo = f_huge, f1, f1
nop.i 0
}
{ .mib
stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack
// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
+//
// Contributed 2000 by the Intel Numerics Group, Intel Corporation
-//
+//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//********************************************************************
setf.s f10=r3
// Step (1)
// y0 = 1/sqrt(a) in f7
- fclass.m.unc p7,p8 = f8,0x3A
+ fclass.m.unc p7,p8 = f8,0x3A
nop.i 0;;
} { .mlx
nop.m 0
// g2 = g1 + d * h1 in f7
(p6) fma.d.s0 f8=f9,f6,f7
(p6) br.ret.sptk b0 ;;
-}
+}
{ .mfb
nop.m 0
//
// This branch includes all those special values that are not negative,
// with the result equal to frcpa(x)
-//
+//
.prologue
// We are distinguishing between over(under)flow and letting
{ .mib
stfd [GR_Parameter_X] = f15 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
//********************************************************************
//
-// Accuracy: Correctly Rounded
+// Accuracy: Correctly Rounded
//
//********************************************************************
//
// All faults and exceptions should be raised correctly.
// sqrtf(QNaN) = QNaN
// sqrtf(SNaN) = QNaN
-// sqrtf(+/-0) = +/-0
+// sqrtf(+/-0) = +/-0
// sqrtf(negative) = QNaN and error handling is called
//
//********************************************************************
GR_SAVE_B0 = r34
GR_SAVE_PFS = r33
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_Parameter_X = r37
GR_Parameter_Y = r38
setf.exp f12 = r2
// Step (1)
// y0 = 1/sqrt(a) in f7
- fclass.m.unc p7,p8 = f8,0x3A
+ fclass.m.unc p7,p8 = f8,0x3A
nop.i 0
} { .mfi
nop.m 0
- // Make a copy of x just in case
- mov f13 = f8
+ // Make a copy of x just in case
+ mov f13 = f8
nop.i 0;;
} { .mfi
nop.m 0
.prologue
{ .mii
add GR_Parameter_Y=-32,sp // Parameter 2 value
- mov GR_Parameter_TAG = 50
+ mov GR_Parameter_TAG = 50
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//********************************************************************
nop.i 0;;
} { .mfi
nop.m 0
- // Make copy input x
- mov f13=f8
+ // Make copy input x
+ mov f13=f8
nop.i 0
} { .mfi
nop.m 0
// d0 = 1/2 - S0 * H0 in f10
(p6) fnma.s1 f10=f7,f9,f12
nop.i 0;;
-}
+}
{ .mfi
nop.m 0
mov f15=f8
/* file: libm_cpu_defs.h */
-
-
+
+
// Copyright (c) 2000 - 2004, Intel Corporation
// All rights reserved.
//
/* file: libm_error_codes.h */
-
-/*
+
+/*
// Copyright (c) 2000 - 2004, Intel Corporation
// All rights reserved.
//
*/
#if !defined(__LIBM_ERROR_CODES_H__)
-#define __LIBM_ERROR_CODES_H__
+#define __LIBM_ERROR_CODES_H__
typedef enum
{
tgamma_overflow, tgamma_negative, tgamma_reserve, /* 258, 259, 260 */
tgammaf_overflow, tgammaf_negative, tgammaf_reserve, /* 261, 262, 263 */
exp10l_underflow, exp10_underflow, exp10f_underflow, /* 264, 265, 266 */
- nextafterl_underflow, nextafter_underflow,
+ nextafterl_underflow, nextafter_underflow,
nextafterf_underflow, /* 267, 268, 269 */
- nexttowardl_underflow, nexttoward_underflow,
+ nexttowardl_underflow, nexttoward_underflow,
nexttowardf_underflow /* 270, 271, 272 */
} error_types;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Registers used
//==============================================================
//
-// general registers:
+// general registers:
// r14 exponent bias for x negative
// r15 exponent bias for x positive
// r16 signexp of x
// The normalization also sets fault flags and takes faults if necessary
{ .mfi
mov r20 = 0x1003f
- fnorm.s0 f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
{ .mfi
setf.exp f11 = r14
(p7) fcmp.lt.s0 p7,p8 = f8,f0
-(p6) cmp.eq.unc p10,p11 = r34, r0 ;;
+(p6) cmp.eq.unc p10,p11 = r34, r0 ;;
}
// If x NAN, ZERO, INFINITY, set *y=0 and exit
{ .mfi
(p9) add r15 = 64, r15
(p9) fmpy.s0 f9 = f9, f12
- cmp.eq p10,p11 = r34, r0 ;;
+ cmp.eq p10,p11 = r34, r0 ;;
}
// true exponent stored to int pointer
-// the bias is treated as 0xfffe instead of
+// the bias is treated as 0xfffe instead of
// normal 0xffff because we want the significand
// to be in the range <=0.5 sig < 1.0
// Store the value of the exponent at the pointer in r33
-// If x>0 form significand result
+// If x>0 form significand result
{ .mfi
nop.m 999
(p8) fmerge.se f8 = f10,f9
}
// Get signexp of normalized x
-// If x<0 form significand result
+// If x<0 form significand result
{ .mfi
getf.exp r16 = f9
(p7) fmerge.se f8 = f11,f9
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Registers used
//==============================================================
//
-// general registers:
+// general registers:
// r14 exponent bias for x negative
// r15 exponent bias for x positive
// r16 signexp of x
// The normalization also sets fault flags and takes faults if necessary
{ .mfi
mov r20 = 0x1003f
- fnorm.s0 f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
{ .mfi
setf.exp f11 = r14
(p7) fcmp.lt.s0 p7,p8 = f8,f0
-(p6) cmp.eq.unc p10,p11 = r34, r0 ;;
+(p6) cmp.eq.unc p10,p11 = r34, r0 ;;
}
// If x NAN, ZERO, INFINITY, set *y=0 and exit
{ .mfi
(p9) add r15 = 64, r15
(p9) fmpy.s0 f9 = f9, f12
- cmp.eq p10,p11 = r34, r0 ;;
+ cmp.eq p10,p11 = r34, r0 ;;
}
// true exponent stored to int pointer
-// the bias is treated as 0xfffe instead of
+// the bias is treated as 0xfffe instead of
// normal 0xffff because we want the significand
// to be in the range <=0.5 sig < 1.0
// Store the value of the exponent at the pointer in r33
-// If x>0 form significand result
+// If x>0 form significand result
{ .mfi
nop.m 999
(p8) fmerge.se f8 = f10,f9
}
// Get signexp of normalized x
-// If x<0 form significand result
+// If x<0 form significand result
{ .mfi
getf.exp r16 = f9
(p7) fmerge.se f8 = f11,f9
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Registers used
//==============================================================
//
-// general registers:
+// general registers:
// r14 exponent bias for x negative
// r15 exponent bias for x positive
// r16 signexp of x
// The normalization also sets fault flags and takes faults if necessary
{ .mfi
mov r20 = 0x1003f
- fnorm.s0 f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
{ .mfi
setf.exp f11 = r14
(p7) fcmp.lt.s0 p7,p8 = f8,f0
-(p6) cmp.eq.unc p10,p11 = r35, r0 ;;
+(p6) cmp.eq.unc p10,p11 = r35, r0 ;;
}
// If x NAN, ZERO, INFINITY, set *y=0 and exit
{ .mfi
(p9) add r15 = 64, r15
(p9) fmpy.s0 f9 = f9, f12
- cmp.eq p10,p11 = r35, r0 ;;
+ cmp.eq p10,p11 = r35, r0 ;;
}
// true exponent stored to int pointer
-// the bias is treated as 0xfffe instead of
+// the bias is treated as 0xfffe instead of
// normal 0xffff because we want the significand
// to be in the range <=0.5 sig < 1.0
// Store the value of the exponent at the pointer in r34
-// If x>0 form significand result
+// If x>0 form significand result
{ .mfi
nop.m 999
(p8) fmerge.se f8 = f10,f9
}
// Get signexp of normalized x
-// If x<0 form significand result
+// If x<0 form significand result
{ .mfi
getf.exp r16 = f9
(p7) fmerge.se f8 = f11,f9
//
// API
//==============================================================
-// float __libm_scalblnf (float x, long int n, int long_int_type)
-// input floating point f8 and long int n (r33)
+// float __libm_scalblnf (float x, long int n, int long_int_type)
+// input floating point f8 and long int n (r33)
// input long_int_type = 0 if long int defined as 32 bits, = 1 if 64 bits
// output floating point f8
//
// Copyright (C) 2000, 2001, Intel Corporation
// All rights reserved.
-//
+//
// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
//
// products derived from this software without specific prior written
// permission.
//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://developer.intel.com/opensource.
//
// *********************************************************************
//
-// History:
-// 02/02/00 Initial Version
+// History:
+// 02/02/00 Initial Version
// 4/04/00 Unwind support added
// 12/28/00 Fixed false invalid flags
//
//
// *********************************************************************
//
-// Accuracy: Very accurate for double-precision values
+// Accuracy: Very accurate for double-precision values
//
// *********************************************************************
//
//
// Denormal fault raised on denormal inputs
// Overflow exceptions do not occur
-// Underflow exceptions raised when appropriate for tan
+// Underflow exceptions raised when appropriate for tan
// (No specialized error handling for this routine)
// Inexact raised when appropriate by algorithm
//
// tan( B + x ) = ------------------------
// 1 - tan(B)*tan(x)
//
-// / \
+// / \
// | tan(B) + tan(x) |
// = tan(B) + | ------------------------ - tan(B) |
// cot( B + x ) = ------------------------
// tan(B) + tan(x)
//
-// / \
+// / \
// | 1 - tan(B)*tan(x) |
// = cot(B) + | ----------------------- - cot(B) |
// / (1/[sin(B)*cos(B)]) * tan(x)
// tan(Arg) = sgn_r * | tan(B) + --------------------------------
// \ cot(B) - tan(x)
-// \
+// \
// + CORR |
// /
// / (1/[sin(B)*cos(B)]) * tan(x)
// tan(Arg) = sgn_r * | -cot(B) + --------------------------------
// \ tan(B) + tan(x)
-// \
+// \
// + CORR |
// /
// / (1/[sin(B)*cos(B)]) * tan(x)
// sgn_r * | tan(B) + -------------------------------- +
// \ cot(B) - tan(x)
-// \
+// \
// CORR |
// /
// / (1/[sin(B)*cos(B)]) * tan(x)
// sgn_r * | -cot(B) + -------------------------------- +
// \ tan(B) + tan(x)
-// \
+// \
// CORR |
// /
data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000
-Arg = f8
+Arg = f8
Result = f8
fp_tmp = f9
U_2 = f10
table_ptr1 = r37
table_ptr2 = r38
i_0 = r39
-i_1 = r40
-N_fix_gr = r41
-N_inc = r42
-exp_Arg = r43
-exp_r = r44
-sig_r = r45
-lookup = r46
-table_offset = r47
-Create_B = r48
+i_1 = r40
+N_fix_gr = r41
+N_inc = r42
+exp_Arg = r43
+exp_r = r44
+sig_r = r45
+lookup = r46
+table_offset = r47
+Create_B = r48
gr_tmp = r49
GR_Parameter_X = r49
.proc __libm_tan
-__libm_tan:
+__libm_tan:
{ .mfi
alloc r32 = ar.pfs, 0,17,2,0
(p0) fclass.m.unc p6,p0 = Arg, 0x1E7
- addl gr_tmp = -1,r0
+ addl gr_tmp = -1,r0
}
;;
;;
//
-// Check for NatVals, Infs , NaNs, and Zeros
+// Check for NatVals, Infs , NaNs, and Zeros
// Check for everything - if false, then must be pseudo-zero
// or pseudo-nan.
// Local table pointer
{ .mbb
(p0) add table_ptr2 = 96, table_ptr1
-(p6) br.cond.spnt __libm_TAN_SPECIAL
+(p6) br.cond.spnt __libm_TAN_SPECIAL
(p7) br.cond.spnt __libm_TAN_SPECIAL ;;
}
//
// Point to Inv_P_0
-// Branch out to deal with unsupporteds and special values.
+// Branch out to deal with unsupporteds and special values.
//
{ .mmf
{ .mmi
(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
//
-// Do fcmp to generate Denormal exception
+// Do fcmp to generate Denormal exception
// - can't do FNORM (will generate Underflow when U is unmasked!)
// Normalize input argument.
//
}
-TAN_LARGER_ARG:
+TAN_LARGER_ARG:
{ .mmf
(p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
nop.m 999
-(p0) fmpy.s1 N_0 = Arg, Inv_P_0
+(p0) fmpy.s1 N_0 = Arg, Inv_P_0
}
;;
}
-TAN_SMALL_R:
+TAN_SMALL_R:
{ .mii
nop.m 999
(p11) ldfe P1_8 = [table_ptr1], -16 ;;
//
// N even: Poly1 = P1_2 + P1_3 * rsq
-// N odd: poly1 = 1.0 + S_hi * r
+// N odd: poly1 = 1.0 + S_hi * r
// 16 bits partial account for necessary (-1)
//
(p11) ldfe P1_7 = [table_ptr1], -16
}
-TAN_NORMAL_R:
+TAN_NORMAL_R:
{ .mfi
(p0) getf.sig sig_r = r
// xsq = x * x
// N even: Tx = T_hi * x
// Load T_lo.
-// Load C_lo - increment pointer to get SC_inv
+// Load C_lo - increment pointer to get SC_inv
// - cant get all the way, do an add later.
//
(p0) add table_ptr2 = 569, table_ptr2 ;;
.proc __libm_callout
__libm_callout:
-TAN_ARG_TOO_LARGE:
+TAN_ARG_TOO_LARGE:
.prologue
// (1)
{ .mfi
// (4)
{ .mmi
mov gp = GR_SAVE_GP // Restore gp
-(p0) mov N_fix_gr = r8
+(p0) mov N_fix_gr = r8
nop.i 999
}
;;
.restore sp
add sp = 64,sp // Restore stack pointer
(p6) br.cond.spnt TAN_SMALL_R
-(p0) br.cond.sptk TAN_NORMAL_R
+(p0) br.cond.sptk TAN_NORMAL_R
}
;;
.endp __libm_callout
{ .mfb
nop.m 999
(p0) fmpy.s0 Arg = Arg, f0
-(p0) br.ret.sptk b0
+(p0) br.ret.sptk b0
}
.endp __libm_TAN_SPECIAL
ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 09/04/01 Initial version
// 09/13/01 Performance improved, symmetry problems fixed
// 10/10/01 Performance improved, split issues removed
//
// Overview of operation
//==============================================================
-//
+//
// There are 6 paths:
// 1. x = 0, [S,Q]Nan or +/-INF
// Return asinhl(x) = x + x;
-//
+//
// 2. x = + denormal
// Return asinhl(x) = x - x^2;
-//
+//
// 3. x = - denormal
// Return asinhl(x) = x + x^2;
-//
+//
// 4. 'Near 0': max denormal < |x| < 1/128
// Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
//
// 5. 'Huges': |x| > 2^63
// Return asinhl(x) = sign(x)*(logl(2*x));
-//
+//
// 6. 'Main path': 1/128 < |x| < 2^63
// b_hi + b_lo = x + sqrt(x^2 + 1);
// asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
-//
-// Algorithm description
+//
+// Algorithm description
//==============================================================
//
-// Main path algorithm
+// Main path algorithm
// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
// *************************************************************************
//
// 1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
// ------------------------------------
// p2_hi = x2_hi + 1, where x2_hi = x * x;
-// p2_lo = x2_lo + p1_lo, where
-// x2_lo = FMS(x*x-x2_hi),
+// p2_lo = x2_lo + p1_lo, where
+// x2_lo = FMS(x*x-x2_hi),
// p1_lo = (1 - p2_hi) + x2_hi;
//
// 2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
// ----------------------------------------------
// r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
// g = p2_hi * r (first 8 bit-approximation of sqrt);
-//
+//
// h = 0.5 * r;
// e = 0.5 - g * h;
// g = g * e + g (second 16 bit-approximation of sqrt);
-//
+//
// h = h * e + h;
// e = 0.5 - g * h;
// g = g * e + g (third 32 bit-approximation of sqrt);
// h = h * e + h;
// e = 0.5 - g * h;
// g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
-//
+//
// Remainder computation:
// h = h * e + h;
// d = (p2_hi - g_hi * g_hi) + p2_lo;
// -------------------------------------------------------------------
// b_hi = (g_hi + x) + gl;
// b_lo = (g_hi - b_hi) + x + gl;
-//
+//
// Now we pass b presented as sum b_hi + b_lo to special version
// of logl function which accept a pair of arguments as
-// 'mutiprecision' value.
-//
+// 'mutiprecision' value.
+//
// Special log algorithm overview
// ================================
// Here we use a table lookup method. The basic idea is that in
-// order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
+// order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
// we construct a value G such that G*Arg is close to 1 and that
// logl(1/G) is obtainable easily from a table of values calculated
// beforehand. Thus
// G := G_1 * G_2 * G_3
// r := (G * S_hi - 1) + G * S_lo
//
-// These G_j's have the property that the product is exactly
+// These G_j's have the property that the product is exactly
// representable and that |r| < 2^(-12) as a result.
//
// Step 2: Approximation
//
// Step 3: Reconstruction
//
-// Finally,
+// Finally,
//
// logl( X ) = logl( 2^N * (S_hi + S_lo) )
// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f32 -> f101 (70 registers)
-// General registers used:
+// General registers used:
// r32 -> r57 (26 registers)
// Predicate registers used:
// p6 -> p11
// p6 for '0, NaNs, Inf' path
-// p7 for '+ denormals' path
+// p7 for '+ denormals' path
// p8 for 'near 0' path
// p9 for 'huges' path
-// p10 for '- denormals' path
+// p10 for '- denormals' path
// p11 for negative values
//
// Data tables
//==============================================================
-
+
RODATA
.align 64
data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
LOCAL_OBJECT_END(Poly_C_near_0_35)
-// Q coeffs
+// Q coeffs
LOCAL_OBJECT_START(Constants_Q)
-data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
LOCAL_OBJECT_END(Constants_Q)
// Z1 - 16 bit fixed
data8 0xBE049391B6B7C239
LOCAL_OBJECT_END(Constants_G_H_h2)
-// G3 and H3 - IEEE single and h3 - IEEE double
+// G3 and H3 - IEEE single and h3 - IEEE double
LOCAL_OBJECT_START(Constants_G_H_h3)
data4 0x3F7FFC00,0x38800100
data8 0x3D355595562224CD
FR_Arg = f8
FR_Res = f8
FR_AX = f32
-FR_XLog_Hi = f33
-FR_XLog_Lo = f34
+FR_XLog_Hi = f33
+FR_XLog_Lo = f34
// Special logl registers
-FR_Y_hi = f35
+FR_Y_hi = f35
FR_Y_lo = f36
FR_Scale = f37
-FR_X_Prime = f38
-FR_S_hi = f39
+FR_X_Prime = f38
+FR_S_hi = f39
FR_W = f40
FR_G = f41
FR_H = f42
-FR_wsq = f43
+FR_wsq = f43
FR_w4 = f44
FR_h = f45
-FR_w6 = f46
+FR_w6 = f46
FR_G2 = f47
FR_H2 = f48
FR_poly_lo = f49
-FR_P8 = f50
+FR_P8 = f50
FR_poly_hi = f51
-FR_P7 = f52
-FR_h2 = f53
-FR_rsq = f54
+FR_P7 = f52
+FR_h2 = f53
+FR_rsq = f54
FR_P6 = f55
-FR_r = f56
+FR_r = f56
+
+FR_log2_hi = f57
+FR_log2_lo = f58
-FR_log2_hi = f57
-FR_log2_lo = f58
-
-FR_float_N = f59
-FR_Q4 = f60
+FR_float_N = f59
+FR_Q4 = f60
-FR_G3 = f61
-FR_H3 = f62
-FR_h3 = f63
+FR_G3 = f61
+FR_H3 = f62
+FR_h3 = f63
-FR_Q3 = f64
-FR_Q2 = f65
-FR_1LN10_hi = f66
+FR_Q3 = f64
+FR_Q2 = f65
+FR_1LN10_hi = f66
-FR_Q1 = f67
-FR_1LN10_lo = f68
-FR_P5 = f69
-FR_rcub = f70
+FR_Q1 = f67
+FR_1LN10_lo = f68
+FR_P5 = f69
+FR_rcub = f70
-FR_Neg_One = f71
-FR_Z = f72
-FR_AA = f73
-FR_BB = f74
-FR_S_lo = f75
-FR_2_to_minus_N = f76
+FR_Neg_One = f71
+FR_Z = f72
+FR_AA = f73
+FR_BB = f74
+FR_S_lo = f75
+FR_2_to_minus_N = f76
// Huge & Main path prolog registers
GR_Poly_C_79 = r46
// Special logl registers
-GR_Index1 = r34
-GR_Index2 = r35
-GR_signif = r36
-GR_X_0 = r37
-GR_X_1 = r38
-GR_X_2 = r39
-GR_Z_1 = r40
-GR_Z_2 = r41
-GR_N = r42
-GR_Bias = r43
-GR_M = r44
-GR_Index3 = r45
-GR_exp_2tom80 = r45
-GR_exp_mask = r47
-GR_exp_2tom7 = r48
-GR_ad_ln10 = r49
+GR_Index1 = r34
+GR_Index2 = r35
+GR_signif = r36
+GR_X_0 = r37
+GR_X_1 = r38
+GR_X_2 = r39
+GR_Z_1 = r40
+GR_Z_2 = r41
+GR_N = r42
+GR_Bias = r43
+GR_M = r44
+GR_Index3 = r45
+GR_exp_2tom80 = r45
+GR_exp_mask = r47
+GR_exp_2tom7 = r48
+GR_ad_ln10 = r49
GR_ad_tbl_1 = r50
GR_ad_tbl_2 = r51
GR_ad_tbl_3 = r52
{ .mfb
cmp.le p9, p0 = GR_TwoP63, GR_ArgExp // if arg > 2^63 ('huges')
(p6) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a
-(p6) br.ret.spnt b0 // return
+(p6) br.ret.spnt b0 // return
};;
// (X^2 + 1) computation
{ .mfi
{ .mfi
ldfe FR_Q1 = [GR_ad_q] // Load Q1
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
// 16 bit Newton Raphson iteration
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
// 32 bit Newton Raphson iteration
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
// 64 bit Newton Raphson iteration
nop.i 0
}
{ .mfi
nop.m 0
nop.f 0
- extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
};;
pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
};;
-// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
// So we can negate Q coefficients there for negative values
{ .mfi
nop.m 0
- fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
// Y_lo=poly_hi+poly_lo
nop.i 0
}
{ .mmi
ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
- sub GR_N = GR_N, GR_Bias
+ sub GR_N = GR_N, GR_Bias
mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
};;
{ .mmi
nop.m 0
ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
- extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
};;
{ .mmi
.section .text
GLOBAL_LIBM_ENTRY(atanf)
-{ .mfi
+{ .mfi
alloc r32 = ar.pfs,1,2,0,0
frcpa.s1 atanf_z,p0 = f1,f8
addl EXP_Addr2 = @ltoff(atanf_coeff_2_table),gp
-}
-{ .mfi
+}
+{ .mfi
addl EXP_Addr1 = @ltoff(atanf_coeff_1_table),gp
fma.s1 atanf_t = f8,f8,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fmerge.s atanf_sgn_x = f8,f1
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
ld8 EXP_Addr1 = [EXP_Addr1]
fmerge.s atanf_abs_x = f1,f8
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ld8 EXP_Addr2 = [EXP_Addr2]
nop.f 999
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fclass.m p8,p0 = f8,0x7 // @zero
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fcmp.eq.unc.s0 p9,p10 = f8,f1
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
ldfpd atanf_coeff_R4,atanf_coeff_R5 = [EXP_Addr1],16
fnma.s1 atanf_b = f8,atanf_z,f1
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atanf_coeff_R1,atanf_coeff_R2 = [EXP_Addr2],16
fma.s1 atanf_zsq = atanf_z,atanf_z,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atanf_coeff_R3,atanf_coeff_P1 = [EXP_Addr1],16
fma.s1 atanf_xcub = f8,atanf_t,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atanf_coeff_Q6,atanf_coeff_Q7 = [EXP_Addr2],16
fma.s1 atanf_tsq = atanf_t,atanf_t,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atanf_coeff_Q8,atanf_coeff_Q9 = [EXP_Addr1],16
// fcmp.le.s1 atanf_pred_LE1,atanf_pred_GT1 = atanf_abs_x,f1
fcmp.le.s1 p6,p7 = atanf_abs_x,f1
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atanf_coeff_Q4,atanf_coeff_Q5 = [EXP_Addr2],16
nop.f 999
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atanf_coeff_Q2,atanf_coeff_Q3 = [EXP_Addr1],16
fclass.m p8,p0 = f8,0xe7 // @inf|@qnan|@snan|@zero
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atanf_coeff_P5,atanf_coeff_P6 = [EXP_Addr2],16
nop.f 999
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atanf_coeff_Q0,atanf_coeff_Q1 = [EXP_Addr1],16
nop.f 999
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atanf_coeff_P7,atanf_coeff_P8 = [EXP_Addr2],16
nop.f 999
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atanf_coeff_P3,atanf_coeff_P4 = [EXP_Addr1],16
fma.s1 atanf_bsq = atanf_b,atanf_b,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
ldfpd atanf_coeff_P9,atanf_coeff_P10 = [EXP_Addr2]
fma.s1 atanf_z4 = atanf_zsq,atanf_zsq,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
ldfpd atanf_coeff_P2,atanf_piby2 = [EXP_Addr1]
fma.s1 atanf_x6 = atanf_t,atanf_tsq,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_t4 = atanf_tsq,atanf_tsq,f0
nop.i 999;;
}
-
-{ .mfb
+
+{ .mfb
nop.m 999
fma.s1 atanf_x5 = atanf_t,atanf_xcub,f0
(p8) br.cond.spnt ATANF_X_INF_NAN_ZERO
-}
+}
;;
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_r1 = atanf_b,atanf_coeff_R1,f1
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_r3 = atanf_b,atanf_coeff_R5,atanf_coeff_R4
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_r2 = atanf_b,atanf_coeff_R3,atanf_coeff_R2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_z8 = atanf_z4,atanf_z4,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q2 = atanf_t,atanf_coeff_Q5,atanf_coeff_Q4
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q3 = atanf_t,atanf_coeff_Q7,atanf_coeff_Q6
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_z5 = atanf_z,atanf_z4,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q1 = atanf_t,atanf_coeff_Q9,atanf_coeff_Q8
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q4 = atanf_t,atanf_coeff_Q1,atanf_coeff_Q0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q5 = atanf_t,atanf_coeff_Q3,atanf_coeff_Q2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p4 = f8,atanf_coeff_P1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p5 = atanf_t,atanf_coeff_P4,atanf_coeff_P3
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_r1 = atanf_z8,atanf_poly_r1,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_z8_bsq = atanf_z8,atanf_bsq,f0
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q2 = atanf_tsq,atanf_poly_q3,atanf_poly_q2
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_r2 = atanf_bsq,atanf_poly_r3,atanf_poly_r2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p2 = atanf_t,atanf_coeff_P8,atanf_coeff_P7
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q1 = atanf_poly_q1,f1,atanf_tsq
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_z13 = atanf_z5,atanf_z8,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p1 = atanf_t,atanf_coeff_P10,atanf_coeff_P9
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p4 = atanf_t,atanf_poly_p4,f8
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q4 = atanf_tsq,atanf_poly_q5,atanf_poly_q4
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p3 = atanf_t,atanf_coeff_P6,atanf_coeff_P5
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p5 = atanf_t,atanf_poly_p5,atanf_coeff_P2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_x11 = atanf_x5,atanf_x6,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_r = atanf_z8_bsq,atanf_poly_r2,atanf_poly_r1
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s0 atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0
nop.i 999
-}
-{ .mfi
+}
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q2 = atanf_t4,atanf_poly_q1,atanf_poly_q2
nop.i 999;;
}
-
-{ .mfi
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p2
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p4 = atanf_x5,atanf_poly_p5,atanf_poly_p4
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 atanf_z21_poly_r = atanf_z13,atanf_poly_r,f0
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_q = atanf_t4,atanf_poly_q2,atanf_poly_q4
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p3
nop.i 999;;
-}
-
-{ .mfi
+}
+
+{ .mfi
nop.m 999
//(atanf_pred_GT1) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
(p7) fnma.s.s0 atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
nop.i 999;;
-}
-
-{ .mfb
+}
+
+{ .mfb
nop.m 999
//(atanf_pred_LE1) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
(p6) fma.s.s0 atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
br.ret.sptk b0
-}
+}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//
// IEEE Special Conditions:
//
// Denormal fault raised on denormal inputs
-// Underflow exceptions may occur
+// Underflow exceptions may occur
// Special error handling for the y=0 and x=0 case
// Inexact raised when appropriate by algorithm
//
// atanl(SNaN) = QNaN
// atanl(QNaN) = QNaN
// atanl(+/-0) = +/- 0
-// atanl(+/-Inf) = +/-pi/2
+// atanl(+/-Inf) = +/-pi/2
//
// atan2l(Any NaN for x or y) = QNaN
-// atan2l(+/-0,x) = +/-0 for x > 0
-// atan2l(+/-0,x) = +/-pi for x < 0
-// atan2l(+/-0,+0) = +/-0
-// atan2l(+/-0,-0) = +/-pi
+// atan2l(+/-0,x) = +/-0 for x > 0
+// atan2l(+/-0,x) = +/-pi for x < 0
+// atan2l(+/-0,+0) = +/-0
+// atan2l(+/-0,-0) = +/-pi
// atan2l(y,+/-0) = pi/2 y > 0
// atan2l(y,+/-0) = -pi/2 y < 0
// atan2l(+/-y, Inf) = +/-0 for finite y > 0
-// atan2l(+/-Inf, x) = +/-pi/2 for finite x
-// atan2l(+/-y, -Inf) = +/-pi for finite y > 0
+// atan2l(+/-Inf, x) = +/-pi/2 for finite x
+// atan2l(+/-y, -Inf) = +/-pi for finite y > 0
// atan2l(+/-Inf, Inf) = +/-pi/4
// atan2l(+/-Inf, -Inf) = +/-3pi/4
//
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
sign_X = r36
-sign_Y = r37
-swap = r38
-table_ptr1 = r39
-table_ptr2 = r40
-k = r41
-lookup = r42
-exp_ArgX = r43
-exp_ArgY = r44
-exponent_Q = r45
-significand_Q = r46
-special = r47
-sp_exp_Q = r48
-sp_exp_4sig_Q = r49
-table_base = r50
+sign_Y = r37
+swap = r38
+table_ptr1 = r39
+table_ptr2 = r40
+k = r41
+lookup = r42
+exp_ArgX = r43
+exp_ArgY = r44
+exponent_Q = r45
+significand_Q = r46
+special = r47
+sp_exp_Q = r48
+sp_exp_4sig_Q = r49
+table_base = r50
int_temp = r51
GR_Parameter_X = r49
GR_temp = r52
RODATA
-.align 16
+.align 16
LOCAL_OBJECT_START(Constants_atan)
// double pi/2
// Entries Tbl_lo (single precision)
// B = 1+Index/16+1/32 Index = 0
//
-data8 0x3FE9A000A935BD8E
+data8 0x3FE9A000A935BD8E
data4 0x23ACA08F, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
//
-data8 0x3FDE77EB7F175A34
+data8 0x3FDE77EB7F175A34
data4 0x238729EE, 0x00000000
-data8 0x3FE0039C73C1A40B
+data8 0x3FE0039C73C1A40B
data4 0x249334DB, 0x00000000
-data8 0x3FE0C6145B5B43DA
+data8 0x3FE0C6145B5B43DA
data4 0x22CBA7D1, 0x00000000
-data8 0x3FE1835A88BE7C13
+data8 0x3FE1835A88BE7C13
data4 0x246310E7, 0x00000000
-data8 0x3FE23B71E2CC9E6A
+data8 0x3FE23B71E2CC9E6A
data4 0x236210E5, 0x00000000
-data8 0x3FE2EE628406CBCA
+data8 0x3FE2EE628406CBCA
data4 0x2462EAF5, 0x00000000
-data8 0x3FE39C391CD41719
+data8 0x3FE39C391CD41719
data4 0x24B73EF3, 0x00000000
-data8 0x3FE445065B795B55
+data8 0x3FE445065B795B55
data4 0x24C11260, 0x00000000
-data8 0x3FE4E8DE5BB6EC04
+data8 0x3FE4E8DE5BB6EC04
data4 0x242519EE, 0x00000000
-data8 0x3FE587D81F732FBA
+data8 0x3FE587D81F732FBA
data4 0x24D4346C, 0x00000000
-data8 0x3FE6220D115D7B8D
+data8 0x3FE6220D115D7B8D
data4 0x24ED487B, 0x00000000
-data8 0x3FE6B798920B3D98
+data8 0x3FE6B798920B3D98
data4 0x2495FF1E, 0x00000000
-data8 0x3FE748978FBA8E0F
+data8 0x3FE748978FBA8E0F
data4 0x223D9531, 0x00000000
-data8 0x3FE7D528289FA093
+data8 0x3FE7D528289FA093
data4 0x242B0411, 0x00000000
-data8 0x3FE85D69576CC2C5
+data8 0x3FE85D69576CC2C5
data4 0x2335B374, 0x00000000
-data8 0x3FE8E17AA99CC05D
+data8 0x3FE8E17AA99CC05D
data4 0x24C27CFB, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
//
-data8 0x3FD025FA510665B5
+data8 0x3FD025FA510665B5
data4 0x24263482, 0x00000000
data8 0x3FD1151A362431C9
data4 0x242C8DC9, 0x00000000
{ .mfi
ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
- nop.i 999
+ nop.i 999
}
;;
{ .mfi
ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
- nop.f 999
- nop.i 999
+ nop.f 999
+ nop.i 999
}
{ .mfi
nop.m 999
fma.s1 M = f1, f1, f0 // Set M = 1.0
- nop.i 999
+ nop.i 999
}
;;
{ .mfi
nop.m 999
fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
- nop.i 999
+ nop.i 999
}
{ .mfb
nop.m 999
{ .mfi
ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
- nop.i 999
+ nop.i 999
}
;;
{ .mfi
ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero
- nop.i 999
+ nop.i 999
}
{ .mfi
nop.m 999
fma.s1 M = f1, f1, f0 // Set M = 1.0
- nop.i 999
+ nop.i 999
}
;;
{ .mfi
nop.m 999
fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
- nop.i 999
+ nop.i 999
}
{ .mfb
nop.m 999
}
;;
-// Create a single precision representation of the signexp of Q with the
+// Create a single precision representation of the signexp of Q with the
// 4 most significant bits of the significand followed by a 1 and then 18 0's
{ .mfi
nop.m 999
;;
//
-// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision
+// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision
// representation. Note sign of Q is always 0.
//
{ .mfi
// C_hi_hold = 1 - C_hi * U_prime_hi (1)
{ .mfi
nop.m 999
- fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
nop.i 999
}
;;
// C_hi_hold = 1 - C_hi * U_prime_hi (3)
{ .mfi
nop.m 999
- fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
nop.i 999
}
;;
;;
-ATANL_POLY:
+ATANL_POLY:
// Here if 0 < V/U < 2^-3
//
// ***********************************************
// Create small double in case need to raise underflow
{ .mfi
- setf.d FR_temp = GR_temp
+ setf.d FR_temp = GR_temp
fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1
nop.i 999
}
}
;;
-//
+//
// If Res_lo is denormal test if Result equals zero
-//
+//
{ .mfi
nop.m 999
(p14) fclass.m.unc p14, p0 = Result, 0x07
;;
-ATANL_UNSUPPORTED:
+ATANL_UNSUPPORTED:
{ .mfb
nop.m 999
- fmpy.s0 Result = ArgX,ArgY
+ fmpy.s0 Result = ArgX,ArgY
br.ret.sptk b0
}
;;
// Here if x or y inf or zero
-ATANL_SPECIAL_HANDLING:
+ATANL_SPECIAL_HANDLING:
{ .mfi
nop.m 999
fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero
;;
// Here if y not zero
-ATANL_ArgY_Not_ZERO:
+ATANL_ArgY_Not_ZERO:
{ .mfi
nop.m 999
fclass.m p0, p10 = ArgY, 0x023 // Test y inf
;;
{ .mfi
-(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite
+(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite
fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
nop.i 999
}
;;
// Here if y not INF, and x=0 or INF
-ATANL_ArgY_Not_INF:
+ATANL_ArgY_Not_INF:
//
// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
;;
GLOBAL_IEEE754_END(atan2l)
-
+
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
//
// The result is computed as
// cbrt(x)= cbrt(1 - (1 - x*y)) * (1/cbrt(y))
-// where y = frcpa(x) = (-1)^sgn_y * 2^(3*k+j) * m_y,
+// where y = frcpa(x) = (-1)^sgn_y * 2^(3*k+j) * m_y,
// m_y in [1,2), j in {0,1,2}
//
// cbrt(1 - (1 - x*y)) is approximated by a degree-6 polynomial
// (double-extended precision) and D (single precision) as follows:
// T_hi (1 + D)= 1/cbrt(y) to about 80 bits of accuracy
//
-// The tables are only stored for three exponent values (i.e.
+// The tables are only stored for three exponent values (i.e.
// only for 2^j * m_y, where j in {0,1,2} and m_y covers the 256
// possible mantissas for an frcpa result); the index is formed
// by the 8 leading mantissa bits of x, which is the same index used
// by the hardware to get frcpa(x).
//
-// The table values are multiplied by 2^k where e is the exponent of
+// The table values are multiplied by 2^k where e is the exponent of
// the input number. This multiplication is carried out in parallel with
// the polynomial evaluation:
// T= 2^(k) * T_hi
GR_TMP1 = r21
GR_SGNMASK = r22
GR_T_INDEX = r23
- GR_IX_T = r23
+ GR_IX_T = r23
GR_IX_D = r24
GR_D_INDEX = r24
GR_TMP2 = r25
and GR_SIGN = GR_NORMEXPSGN, GR_SGNMASK
// eliminate leading 1 from GR_NORMSIG = 2nd table index
shl GR_INDEX2 = GR_NORMSIG, 1
- // eliminate sign from exponent
+ // eliminate sign from exponent
andcm GR_NORMEXP = GR_NORMEXPSGN, GR_SGNMASK
}
;;
(p6) fnma.s1 FR_R = FR_RCP, FR_XNORM, f1
// Start computation of floor(exponent/3) by
// computing (2^20+2)/3*exponent = exponent*0x55556
- // 1: exponent* = 5;
- // (2^{16}-1)/3 = 0x5555:
+ // 1: exponent* = 5;
+ // (2^{16}-1)/3 = 0x5555:
// will form 0x5555*exponent by using shladd's
shladd GR_EXP5 = GR_NORMEXP, 2, GR_NORMEXP
}
// 09/05/02 Work range is widened by reduction strengthen (3 parts of Pi/16)
// 02/10/03 Reordered header: .section, .global, .proc, .align
// 08/08/03 Improved performance
-// 10/28/04 Saved sincos_r_sincos to avoid clobber by dynamic loader
+// 10/28/04 Saved sincos_r_sincos to avoid clobber by dynamic loader
// 03/31/05 Reformatted delimiters between data tables
// API
// nfloat = Round result to integer (round-to-nearest)
//
// r = x - nfloat * pi/2^k
-// Do this as ((((x - nfloat * HIGH(pi/2^k))) -
-// nfloat * LOW(pi/2^k)) -
+// Do this as ((((x - nfloat * HIGH(pi/2^k))) -
+// nfloat * LOW(pi/2^k)) -
// nfloat * LOWEST(pi/2^k) for increased accuracy.
// pi/2^k is stored as two numbers that when added make pi/2^k.
// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k)
-// HIGH and LOW parts are rounded to zero values,
+// HIGH and LOW parts are rounded to zero values,
// and LOWEST is rounded to nearest one.
//
// x = (nfloat * pi/2^k) + r
{ .mfi
ldfe sincos_Pi_by_16_3 = [sincos_AD_1],16
nop.f 999
- dep.z sincos_r_exp = sincos_r_signexp, 0, 17
+ dep.z sincos_r_exp = sincos_r_signexp, 0, 17
};;
// Polynomial coefficients (Q4, P4, Q3, P3, Q2, Q1, P2, P1) loading
// p10 is true if f8 exp is >= 0x1001a (2^27)
{ .mmb
ldfpd sincos_P4,sincos_Q4 = [sincos_AD_1],16
- cmp.ge p10,p0 = sincos_r_exp,sincos_exp_limit
+ cmp.ge p10,p0 = sincos_r_exp,sincos_exp_limit
(p10) br.cond.spnt _SINCOS_LARGE_ARGS // Go to "large args" routine
};;
{ .mfi
ldfpd sincos_P1,sincos_Q1 = [sincos_AD_1],16
fnma.s1 sincos_r = sincos_NFLOAT, sincos_Pi_by_16_1, sincos_NORM_f8
- nop.i 999
+ nop.i 999
};;
// Add 2^(k-1) (which is in sincos_r_sincos) to N
;;
// Get M (least k+1 bits of N)
and sincos_GR_m = 0x1f,sincos_GR_n
- nop.i 999
+ nop.i 999
};;
// sincos_r = sincos_r -sincos_Nfloat * sincos_Pi_by_16_2
{ .mfi
add sincos_AD_2 = sincos_GR_32m, sincos_AD_1
(p8) fclass.m.unc p10,p0 = f8,0x0b
- nop.i 999
+ nop.i 999
};;
// Load Sin and Cos table value using obtained index m (sincosf_AD_2)
{ .mfi
ldfe sincos_Sm = [sincos_AD_2],16
- nop.f 999
- nop.i 999
+ nop.f 999
+ nop.i 999
};;
// get rsq = r*r
{ .mfi
nop.m 999
fmpy.s0 fp_tmp = fp_tmp,fp_tmp // forces inexact flag
- nop.i 999
+ nop.i 999
};;
// sincos_r_exact = sincos_r -sincos_Nfloat * sincos_Pi_by_16_3
{ .mfi
nop.m 999
fnma.s1 sincos_r_exact = sincos_NFLOAT, sincos_Pi_by_16_3, sincos_r
- nop.i 999
+ nop.i 999
};;
-// Polynomials calculation
+// Polynomials calculation
// P_1 = P4*r^2 + P3
// Q_2 = Q4*r^2 + Q3
{ .mfi
{ .mfi
nop.m 999
fma.s1 sincos_Q_temp1 = sincos_rsq, sincos_Q4, sincos_Q3
- nop.i 999
+ nop.i 999
};;
// get rcube = r^3 and S[m]*r^2
{ .mfi
nop.m 999
fmpy.s1 sincos_rcub = sincos_r_exact, sincos_rsq
- nop.i 999
+ nop.i 999
};;
-// Polynomials calculation
+// Polynomials calculation
// Q_2 = Q_1*r^2 + Q2
// P_1 = P_1*r^2 + P2
{ .mfi
{ .mfi
nop.m 999
fma.s1 sincos_P_temp2 = sincos_rsq, sincos_P_temp1, sincos_P2
- nop.i 999
+ nop.i 999
};;
-// Polynomials calculation
+// Polynomials calculation
// Q = Q_2*r^2 + Q1
// P = P_2*r^2 + P1
{ .mfi
{ .mfi
nop.m 999
fma.s1 sincos_P = sincos_rsq, sincos_P_temp2, sincos_P1
- nop.i 999
+ nop.i 999
};;
// Get final P and Q
{ .mfi
nop.m 999
fma.s1 sincos_P = sincos_rcub,sincos_P, sincos_r_exact
- nop.i 999
+ nop.i 999
};;
// If sin(denormal), force underflow to be set
_SINCOS_UNORM:
// Here if x=unorm
{ .mfb
- getf.exp sincos_r_signexp = sincos_NORM_f8 // Get signexp of x
+ getf.exp sincos_r_signexp = sincos_NORM_f8 // Get signexp of x
fcmp.eq.s0 p11,p0 = f8, f0 // Dummy op to set denorm flag
br.cond.sptk _SINCOS_COMMON2 // Return to main path
};;
{ .mfi
nop.m 999
fmpy.s0 sincos_save_tmp = sincos_save_tmp, sincos_save_tmp
- nop.i 999
+ nop.i 999
};;
{ .mib
// nfloat = Round result to integer (round-to-nearest)
//
// r = x - nfloat * pi/2^k
-// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k)
+// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k)
// for increased accuracy.
// pi/2^k is stored as two numbers that when added make pi/2^k.
// Polynomial coefficients (Q2, Q1, P2, P1) loading
{ .mmi
ldfpd sincosf_P2,sincosf_Q2 = [sincosf_AD_1],16
- nop.m 999
- nop.i 999
+ nop.m 999
+ nop.i 999
};;
// Select exponent (17 lsb)
{ .mmi
ldfpd sincosf_P1,sincosf_Q1 = [sincosf_AD_1],16
- nop.m 999
+ nop.m 999
dep.z sincosf_r_exp = sincosf_r_signexp, 0, 17
};;
// Multiply x by scaled 16/pi and add large const to shift integer part of W to
// rightmost bits of significand
{ .mfi
- nop.m 999
+ nop.m 999
fma.s1 sincosf_W_2TO61_RSH = sincosf_NORM_f8, sincosf_SIG_INV_PI_BY_16_2TO61, sincosf_RSHF_2TO61
- nop.i 999
+ nop.i 999
};;
// sincosf_NFLOAT = Round_Int_Nearest(sincosf_W)
{ .mfi
nop.m 999
fms.s1 sincosf_NFLOAT = sincosf_W_2TO61_RSH,sincosf_2TOM61,sincosf_RSHF
- nop.i 999
+ nop.i 999
};;
// get N = (int)sincosf_int_Nfloat
{ .mfi
getf.sig sincosf_GR_n = sincosf_W_2TO61_RSH // integer N value
nop.f 999
- nop.i 999
+ nop.i 999
};;
// Add 2^(k-1) (which is in sincosf_r_sincos=8) to N
{ .mfi
add sincosf_GR_n = sincosf_GR_n, sincosf_r_sincos
fnma.s1 sincosf_r = sincosf_NFLOAT, sincosf_Pi_by_16_1, sincosf_NORM_f8
- nop.i 999
+ nop.i 999
};;
// Get M (least k+1 bits of N)
{ .mmi
- and sincosf_GR_m = 0x1f,sincosf_GR_n // Put mask 0x1F -
+ and sincosf_GR_m = 0x1f,sincosf_GR_n // Put mask 0x1F -
nop.m 999 // - select k+1 bits
nop.i 999
};;
{ .mfi
shladd sincosf_AD_2 = sincosf_GR_32m, 4, sincosf_AD_1
(p8) fclass.m.unc p10,p0 = f8,0x0b // If sin denormal input -
- nop.i 999
+ nop.i 999
};;
// Load Sin and Cos table value using obtained index m (sincosf_AD_2)
{ .mfi
nop.m 999
fmpy.s0 fp_tmp = fp_tmp, fp_tmp // forces inexact flag
- nop.i 999
+ nop.i 999
};;
-// Polynomials calculation
+// Polynomials calculation
// Q = Q2*r^2 + Q1
// P = P2*r^2 + P1
{ .mfi
{ .mfi
nop.m 999
fma.s1 sincosf_P = sincosf_rsq, sincosf_P2, sincosf_P1
- nop.i 999
+ nop.i 999
};;
// get rcube and S[m]*r^2
{ .mfi
nop.m 999
fmpy.s1 sincosf_rcub = sincosf_r_exact, sincosf_rsq
- nop.i 999
+ nop.i 999
};;
// Get final P and Q
{ .mfi
nop.m 999
fma.s1 sincosf_P = sincosf_rcub,sincosf_P,sincosf_r_exact
- nop.i 999
+ nop.i 999
};;
// If sinf(denormal) - force underflow to be set
}
{ .mfi // force inexact set
nop.m 999
- fmpy.s0 sincosf_save_tmp = sincosf_save_tmp, sincosf_save_tmp
- nop.i 999
+ fmpy.s0 sincosf_save_tmp = sincosf_save_tmp, sincosf_save_tmp
+ nop.i 999
};;
{ .mib
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Return erf(x) = x *Pol9(x^2)
//
// 3. For several subranges of 0.5 <= |x| < 5.90625
-// Return erf(x) = sign(x)*Pol19(y),
+// Return erf(x) = sign(x)*Pol19(y),
// where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19
//
// For each subrange there is particular set of coefficients.
// 6. |x| = INF
// Return erf(x) = sign(x) * 1.0
//
-// 7. x = [S,Q]NaN
+// 7. x = [S,Q]NaN
// Return erf(x) = QNaN
//
// 8. x is positive denormal
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input, output
// f32 -> f63
-// General registers used:
+// General registers used:
// r32 -> r48, r2, r3
// Predicate registers used:
// p6 to filter out case when x = denormal
// p7 to filter out case when x = [Q,S]NaN or +/-0,
// used also to process denormals
-// p8 to filter out case when 3.25 <= |x| < 4.0,
+// p8 to filter out case when 3.25 <= |x| < 4.0,
// used also to process denormals
// p9 to filter out case when |x| = inf
// p10 to filter out case when |x| < 0.5
fTQuadr = f59
fTDeg3 = f60
fTDeg7 = f61
-fArgAbsNormSgn = f62
+fArgAbsNormSgn = f62
fTQuadrSgn = f63
// Data tables
LOCAL_OBJECT_START(erf_data)
// Coefficients ##0..15
-// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
+// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19
data8 0x90AD48C0118FA10C, 0x00003FD7 //A18
data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17
data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6
data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5
data8 0xBB793EF404C09A22, 0x00003FF8 //A4
-// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
+// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19
data8 0x8A0FD46092F95D44, 0x0000BFEA //A18
data8 0xA37B3242B7809E12, 0x00003FEC //A17
data8 0x9722D22DA628A17B, 0x00003FF7 //A6
data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5
data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4
-// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
+// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19
data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18
data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17
data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6
data8 0xD673A02CB5766633, 0x00003FFD //A5
data8 0x8D162CBAD8A12649, 0x0000BFFE //A4
-// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
+// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19
data8 0xF76BE1935675D5C8, 0x00003FFE //A18
data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17
data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4
//
// Coefficients ##16..19
-// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
+// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
data8 0x95FA98C337005D13, 0x0000BFF9 //A3
data8 0xE0F7E524D2808A97, 0x0000BFFB //A2
data8 0xE0F7E524D2808A98, 0x00003FFD //A1
data8 0x853F7AE0C76E915F, 0x00003FFE //A0
-// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
+// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3
data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2
data8 0xD488F84B7DE12E9C, 0x00003FFD //A1
data8 0xC58571D23D5C4B3A, 0x00003FFD //A3
data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2
data8 0xA94DCF467CD10A16, 0x00003FFA //A1
-data8 0xFECD70A13CAF1997, 0x00003FFE //A0
-// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
+data8 0xFECD70A13CAF1997, 0x00003FFE //A0
+// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3
data8 0x8858A465CE594BD1, 0x0000BFEE //A2
data8 0x8858A447456DE61D, 0x00003FEA //A1
data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0
-// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5
+// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5
data8 0xBE839EDBB36C7FCE //A9
data8 0x3EBB7745A18DD242 //A8
data8 0xBF4C02DB238F2AFC //A5
data8 0x3FBCE2F21A042B25 //A2
data8 0x906EBA8214DB688D, 0x00003FFF //A0
// 1.0 - 2^(-63)
-data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
-// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0
+data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
+// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0
data8 0x95E91576C7A12250, 0x00003FE7 //A14
data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13
data8 0xED761DAFAF814DE9, 0x00003FEB //A12
data8 0xF4DAF4680DA54C02, 0x00003FEF //A1
data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0
// A = 2.0/sqrt(Pi)
-data8 0x906EBA8214DB688D, 0x00003FFF
+data8 0x906EBA8214DB688D, 0x00003FFF
LOCAL_OBJECT_END(erf_data)
}
;;
{ .mfi
- getf.d rArg = f8 // x in GR
+ getf.d rArg = f8 // x in GR
fclass.m p6,p0 = f8, 0x0b // is x denormal ?
shl rThreeAndQ = rThreeAndQ, 44 // bits of 3.25
}
nop.f 0
(p6) br.cond.spnt erf_denormal // branch out if x is denormal
}
-;;
+;;
{ .mfi
and rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
fmerge.s fArgAbs = f1, f8 // |x|
(p7) fma.d.s0 f8 = f8,f1,f8 // NaN or +/-0
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
}
-;;
+;;
{ .mfi
sub rIndex = rShiftedArgMasked, rBias // index << 8
- nop.f 0
- cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5
+ nop.f 0
+ cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5
}
{ .mfb
- // p8 = 1 if 3.25 <= |x| < 4.0
-(p8) cmp.lt p8, p11 = rShiftedAbsArg, rBiasedExpOf4
+ // p8 = 1 if 3.25 <= |x| < 4.0
+(p8) cmp.lt p8, p11 = rShiftedAbsArg, rBiasedExpOf4
fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1
(p10) br.cond.spnt erf_near_zero // branch out if |x| < 0.5
}
}
;;
{ .mfi
- adds rCoeffAddr2 = 16, rCoeffAddr1
+ adds rCoeffAddr2 = 16, rCoeffAddr1
fmerge.s fSignumX = f8, f1 // signum(x)
nop.i 0
-}
+}
{ .mfb
cmp.lt p12, p0 = rSaturation, rShiftedAbsArg // |x| > 5.90625?
nop.f 0
ldfe fA13 = [rCoeffAddr1], 32
nop.f 0
// address of coefficients ##16..23
- add rCoeffAddr3 = rCoeffAddr3, rIndex
+ add rCoeffAddr3 = rCoeffAddr3, rIndex
}
{.mfi
ldfe fA12 = [rCoeffAddr2], 32
ldfe fA6 = [rCoeffAddr2], 32
nop.f 0
(p8) br.cond.spnt erf_3q_4 // branch out if 3.25 < |x| < 4.0
-}
+}
;;
{.mfi
ldfe fA5 = [rCoeffAddr1], 32
;;
{ .mfi
nop.m 0
- fma.s1 fA15 = fA15, fTSqr, fA13
+ fma.s1 fA15 = fA15, fTSqr, fA13
nop.i 0
}
{ .mfi
}
;;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fA7 = fA7, fTSqr, fA5
nop.i 0
}
;;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fRes = fRes, fTQuadr, fA15
nop.i 0
}
;;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fA4 = fA4, fTSqr, fA2
nop.i 0
}
}
;;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fA4 = fA7, fTDeg3, fA4
nop.i 0
}
// Here if 3.25 < |x| < 4.0
.align 32
-erf_3q_4:
+erf_3q_4:
.pred.rel "mutex", p14, p15
{ .mfi
ldfe fA5 = [rCoeffAddr1], 32
fma.s1 fA15 = fA15, fArgAbs, fA14
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 fA13 = fA13, fArgAbs, fA12
fma.s1 fA11 = fA11, fArgAbs, fA10
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 fA9 = fA9, fArgAbs, fA8
fma.s1 fArgAbsNormSgn = fArgAbs, fSignumX, f0
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 fTQuadr = fTSqr, fTSqr, f0
fma.s1 fRes = fRes, fTSqr, fA17
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 fA15 = fA15, fTSqr, fA13
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 fA11 = fA11, fTSqr, fA9
nop.i 0
-}
+}
{ .mfi
nop.m 0
fma.s1 fA7 = fA7, fArgAbs, fA6
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 fTDeg7 = fTQuadr, fTSqr, f0
fma.s1 fRes = fRes, fTQuadr, fA15
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
- fma.s1 fA11 = fA11, fTSqr, fA7
+ fma.s1 fA11 = fA11, fTSqr, fA7
nop.i 0
}
;;
fma.s1 fRes = fRes, fTDeg7, fA11
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
// result for negative argument
erf_near_zero:
{ .mfi
adds rCoeffAddr1 = 1280, rDataPtr // address of A9
- fma.s1 fTSqr = fArgSqr, fArgSqr, f0 // x^4
+ fma.s1 fTSqr = fArgSqr, fArgSqr, f0 // x^4
nop.i 0
}
{ .mfi
br.ret.sptk b0 // Exit for 5.90625 <=|x|< +inf
}
;;
-
+
// Here if x is double precision denormal
.align 32
erf_denormal:
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Overview of operation
//==============================================================
// 1. 0 <= x <= 28.0
-//
+//
// erfc(x) = P14(z) * exp( -x^2 ), z = x - x(i).
//
// Comment:
// Let x(i) = -1.0 + 2^(i/4),i=0,...19. So we have 20 unequal
// argument intervals [x(i),x(i+1)] with length ratio q = 2^(1/4).
// Values x(i) we have in the table erfc_xb_table.
-//
+//
// Let x(i)<= x < x(i+1).
// We can find i as exponent of number (x + 1)^4.
-//
+//
// Let P14(z) - polynomial approximation of degree 14 for function
// erfc(z+x(i)) * exp( (z+x(i))^2) and 0 <= z <= x(i+1)-x(i).
// Polynomial coeffitients we have in the table erfc_p_table.
//
// So we can find result for erfc(x) as above.
-// Algorithm description for exp function see below.
-//
+// Algorithm description for exp function see below.
+//
// 2. -6 <= x < 0
//
// erfc(x) = 2.0 - erfc(-x)
//
// 3. x > 28.0
-// erfc(x) ~=~ 0.0
+// erfc(x) ~=~ 0.0
//
-// 4. x < -6.0
-// erfc(x) ~=~ 2.0
+// 4. x < -6.0
+// erfc(x) ~=~ 2.0
-// Special values
+// Special values
//==============================================================
// erfc(+0) = 1.0
// erfc(-0) = 1.0
-// erfc(+qnan) = +qnan
-// erfc(-qnan) = -qnan
-// erfc(+snan) = +qnan
-// erfc(-snan) = -qnan
+// erfc(+qnan) = +qnan
+// erfc(-qnan) = -qnan
+// erfc(+snan) = +qnan
+// erfc(-snan) = -qnan
-// erfc(-inf) = 2.0
+// erfc(-inf) = 2.0
// erfc(+inf) = +0
//==============================================================
// Comment for exp for erfc:
//
// We use quad precision for calculate input argument -x^2 and add
-// result low bits to value delta in exp.
+// result low bits to value delta in exp.
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f9 -> f15, f32 -> f93
-// General registers used:
-// r32 -> r68
+// General registers used:
+// r32 -> r68
// Predicate registers used:
// p6 -> p15
EXP_W_2TO56_RSH = f9
EXP_RSHF_2TO56 = f10
-exp_P4 = f11
-exp_P3 = f12
-exp_P2 = f13
-exp_P1 = f14
+exp_P4 = f11
+exp_P3 = f12
+exp_P2 = f13
+exp_P1 = f14
exp_ln2_by_128_hi = f15
-
-exp_ln2_by_128_lo = f32
+
+exp_ln2_by_128_lo = f32
EXP_RSHF = f33
-EXP_Nfloat = f34
+EXP_Nfloat = f34
exp_r = f35
exp_f = f36
exp_rsq = f37
exp_P_hi = f44
exp_P = f45
exp_S = f46
-EXP_NORM_f8 = f47
+EXP_NORM_f8 = f47
exp_S2 = f48
exp_T2 = f49
LOCAL_OBJECT_START(erfc_p_table)
-// Pol0
+// Pol0
data8 0x8000000000000000, 0x00003FFF //A0 = +1.00000000000000000000e+00L
data8 0x906EBA8214DB688D, 0x0000BFFF //A1 = -1.12837916709551257389e+00L
data8 0xFFFFFFFFFFFFFFEB, 0x00003FFE //A2 = +9.99999999999999998841e-01L
data8 0xB3DD6B2DB3307D2E, 0x00003FF5 //A12 = +1.37226041156280127011e-03L
data8 0x8018A34267FED226, 0x0000BFF4 //A13 = -4.88648380816410282971e-04L
data8 0xFBBA6A7AEBD3ABD9, 0x00003FF1 //A14 = +1.20033353451879025825e-04L
-// Pol1
+// Pol1
data8 0xD15A1EF03BB91E71, 0x00003FFE //A0 = +8.17781385088640600540e-01L
data8 0xD1A4ADDAC3337118, 0x0000BFFE //A1 = -8.18919053944410683867e-01L
data8 0xA9AF9FFA2AD18CB0, 0x00003FFE //A2 = +6.62836073471060947628e-01L
data8 0x8D535042E11A0D89, 0x00003FF4 //A12 = +5.39113782651680545599e-04L
data8 0xBE589447DB26564E, 0x0000BFF2 //A13 = -1.81528103431449706486e-04L
data8 0xABC8C7EF636F5B0A, 0x00003FF0 //A14 = +4.09565689009869217620e-05L
-// Pol2
+// Pol2
data8 0xA9973ABB272898B2, 0x00003FFE //A0 = +6.62463827792779356910e-01L
data8 0x945F1A7993F7AADD, 0x0000BFFE //A1 = -5.79576162988785154930e-01L
data8 0xD84439C6609A8A62, 0x00003FFD //A2 = +4.22395520654665085222e-01L
data8 0xC2503856CE48A657, 0x00003FF2 //A12 = +1.85311660448280465934e-04L
data8 0xF52642F22A26965B, 0x0000BFF0 //A13 = -5.84481856856861454591e-05L
data8 0xC98588E1A95FFDBD, 0x00003FEE //A14 = +1.20116245684500489648e-05L
-// Pol3
+// Pol3
data8 0x887CBA2C47B1E2B5, 0x00003FFE //A0 = +5.33153186617432643784e-01L
data8 0xCD81909CF194328E, 0x0000BFFD //A1 = -4.01379126699602646289e-01L
data8 0x84DCA15C52122372, 0x00003FFD //A2 = +2.59495775718310530164e-01L
data8 0xE7704D06A3080C19, 0x00003FF0 //A12 = +5.51792801195012080688e-05L
data8 0x875A5B53E510F305, 0x0000BFEF //A13 = -1.61353297293572230995e-05L
data8 0xC8F10CDDB9CC9A42, 0x00003FEC //A14 = +2.99426321046583353559e-06L
-// Pol4
+// Pol4
data8 0xDAEC3C07CAB590C1, 0x00003FFD //A0 = +4.27583576155807004411e-01L
data8 0x8BE271F8BE0280AC, 0x0000BFFD //A1 = -2.73212014783898564863e-01L
data8 0x9E13941E19661429, 0x00003FFC //A2 = +1.54371561371908397882e-01L
data8 0xED10FE8FC0E44CAD, 0x00003FEE //A12 = +1.41302576244352578317e-05L
data8 0xFE49912328516F81, 0x0000BFEC //A13 = -3.78917710289305330220e-06L
data8 0xA8F6077E25DAFD33, 0x00003FEA //A14 = +6.29428967202166402369e-07L
-// Pol5
+// Pol5
data8 0xAF72220985BED710, 0x00003FFD //A0 = +3.42667640364081975844e-01L
data8 0xBC1CB559042410AB, 0x0000BFFC //A1 = -1.83703263815036934677e-01L
data8 0xB730BF62E0B63A3C, 0x00003FFB //A2 = +8.94484474229911741150e-02L
data8 0xD023CF5C3F915685, 0x00003FEC //A12 = +3.10152594473606007552e-06L
data8 0xCA7016FADFF584F5, 0x0000BFEA //A13 = -7.54139761055503416594e-07L
data8 0xEEBB5CC0901D2BB0, 0x00003FE7 //A14 = +1.11168196441717301549e-07L
-// Pol6
+// Pol6
data8 0x8CD1160326A754AF, 0x00003FFD //A0 = +2.75032699474947383325e-01L
data8 0xFB22A4C657119388, 0x0000BFFB //A1 = -1.22624671271190511269e-01L
data8 0xD02B2CA872A774E9, 0x00003FFA //A2 = +5.08224243596176920409e-02L
data8 0x9CE72C0409A3E800, 0x00003FEA //A12 = +5.84509280984781223375e-07L
data8 0x88CCD7A000D1C213, 0x0000BFE8 //A13 = -1.27405082040077425019e-07L
data8 0x8DF4EC84F093B1C0, 0x00003FE5 //A14 = +1.65259388738830506389e-08L
-// Pol7
+// Pol7
data8 0xE2BF82A153B1B82E, 0x00003FFC //A0 = +2.21433678719152843912e-01L
data8 0xA72A9AE0BD7F29D5, 0x0000BFFB //A1 = -8.16242313227913578068e-02L
data8 0xE98939292289EDBE, 0x00003FF9 //A2 = +2.85078159732432477516e-02L
data8 0xCCA1CA2AC3EB8973, 0x00003FE7 //A12 = +9.52891963880517988726e-08L
data8 0x9E26A080F9DA39DE, 0x0000BFE5 //A13 = -1.84111863600343741644e-08L
data8 0x8F3DC58F64A92C62, 0x00003FE2 //A14 = +2.08443519336792003049e-09L
-// Pol8
+// Pol8
data8 0xB74C13E914E9666F, 0x00003FFC //A0 = +1.79001151181389950418e-01L
data8 0xDEB57268A58B763B, 0x0000BFFA //A1 = -5.43722600071728705200e-02L
data8 0x821FF0D4C605A4CD, 0x00003FF9 //A2 = +1.58843711598712515609e-02L
data8 0xE9F15C8E7F58CF90, 0x00003FE4 //A12 = +1.36172642554216769522e-08L
data8 0x9E90F22B11FAF8B5, 0x0000BFE2 //A13 = -2.30744183054978535129e-09L
data8 0xF8CF74F1A138FBBA, 0x00003FDE //A14 = +2.26291720693360003233e-10L
-// Pol9
+// Pol9
data8 0x94D45274A831ED57, 0x00003FFC //A0 = +1.45341194505862183128e-01L
data8 0x94D4518B699A4A68, 0x0000BFFA //A1 = -3.63352952323113355459e-02L
data8 0x90C3B59FF403A916, 0x00003FF8 //A2 = +8.83572327421709216515e-03L
data8 0xEEBB49645DE0E34C, 0x00003FE1 //A12 = +1.73700091999434388879e-09L
data8 0x8C86D8677DEACFBA, 0x0000BFDF //A13 = -2.55616650187281815453e-10L
data8 0xBDB223D0FE2A7D6B, 0x00003FDB //A14 = +2.15659223402509415592e-11L
-// Pol10
+// Pol10
data8 0xF2C1812715E4050A, 0x00003FFB //A0 = +1.18533143048567888157e-01L
data8 0xC7DA2C565ADAEE57, 0x0000BFF9 //A1 = -2.43960252726894623056e-02L
data8 0xA15CEFFD632F697D, 0x00003FF7 //A2 = +4.92440908672041077933e-03L
data8 0xDE1CE78ADB6DDF04, 0x00003FDE //A12 = +2.02010513073041015283e-10L
data8 0xE124FFAA267301A5, 0x0000BFDB //A13 = -2.55959692063871343080e-11L
data8 0x81F1BEBEFBE168D2, 0x00003FD8 //A14 = +1.84661980716000872722e-12L
-// Pol11
+// Pol11
data8 0xC6CE5D7D18203EAA, 0x00003FFB //A0 = +9.70732978630764996752e-02L
data8 0x86E8A30A76923C88, 0x0000BFF9 //A1 = -1.64683517829920230086e-02L
data8 0xB4A1CBB7576B4183, 0x00003FF6 //A2 = +2.75622581042760461528e-03L
data8 0xC0970F2551C52F96, 0x00003FDB //A12 = +2.18949565869759698947e-11L
data8 0xA6E029ABB3BB500C, 0x0000BFD8 //A13 = -2.37144541649446501026e-12L
data8 0xA3E43F3857D1B6A5, 0x00003FD4 //A14 = +1.45564973108152568130e-13L
-// Pol12
+// Pol12
data8 0xA36E35FC807B3E64, 0x00003FFB //A0 = +7.98000543291529334886e-02L
data8 0xB725A29237C8F94F, 0x0000BFF8 //A1 = -1.11784064873715046550e-02L
data8 0xCB51EF23EAD5F327, 0x00003FF5 //A2 = +1.55120891755237931425e-03L
data8 0x9EDB00104DB66DD9, 0x00003FD8 //A12 = +2.25747200093121867690e-12L
data8 0xE9F80AF513F2B8AB, 0x0000BFD4 //A13 = -2.07806143133802417637e-13L
data8 0xC2B840C3859AB166, 0x00003FD0 //A14 = +1.08091168358477817812e-14L
-// Pol13
+// Pol13
data8 0x86CD0BF01914407A, 0x00003FFB //A0 = +6.58207829138836028568e-02L
data8 0xF9F4A17FA70807C3, 0x0000BFF7 //A1 = -7.62803922344113067603e-03L
data8 0xE63BF84EDE20EDAA, 0x00003FF4 //A2 = +8.78273993036530088653e-04L
data8 0xFDB2E0599016AD1E, 0x00003FD4 //A12 = +2.25329742249079975388e-13L
data8 0x9E179A99CDD4BF4B, 0x0000BFD1 //A13 = -1.75517603530017718494e-14L
data8 0xDE4DE992A707C7BC, 0x00003FCC //A14 = +7.71273133169032472595e-16L
-// Pol14
+// Pol14
data8 0xDF0639E60CF6E96C, 0x00003FFA //A0 = +5.44492971101228988138e-02L
data8 0xAB6737B6065BD1C2, 0x0000BFF7 //A1 = -5.23081035867078490333e-03L
data8 0x8322CC0765FD9C27, 0x00003FF4 //A2 = +5.00243857322493802503e-04L
data8 0xC6D8869855133985, 0x00003FD1 //A12 = +2.20763189681614758000e-14L
data8 0xD10AC0B228ABCECC, 0x0000BFCD //A13 = -1.45052027893524847250e-15L
data8 0xF7C6DEB4522487A3, 0x00003FC8 //A14 = +5.37280367113168366711e-17L
-// Pol15
+// Pol15
data8 0xB8F57DECFAC3B255, 0x00003FFA //A0 = +4.51559943173131409760e-02L
data8 0xEC1B8A6C822C036F, 0x0000BFF6 //A1 = -3.60271577347565115947e-03L
data8 0x963A6DD66951B72E, 0x00003FF3 //A2 = +2.86537625289770759336e-04L
data8 0x9A88033A08842BEA, 0x00003FCE //A12 = +2.14455258045503137285e-15L
data8 0x88BCF775B7B3A939, 0x0000BFCA //A13 = -1.18601440246395438386e-16L
data8 0x88687B63A5B7135E, 0x00003FC5 //A14 = +3.69734984736162880476e-18L
-// Pol16
+// Pol16
data8 0x99B8A501204BF3E7, 0x00003FFA //A0 = +3.75296063885057657456e-02L
data8 0xA33FA20D2867C79C, 0x0000BFF6 //A1 = -2.49097544033960143953e-03L
data8 0xACFD14CA6AA55829, 0x00003FF2 //A2 = +1.64974783411741182991e-04L
data8 0xEFCCD20DE93A138E, 0x00003FCA //A12 = +2.07993414310230172191e-16L
data8 0xB259764466732080, 0x0000BFC6 //A13 = -9.66834364652262630640e-18L
data8 0x9597C1DB6AF830E4, 0x00003FC1 //A14 = +2.53420063550355940811e-19L
-// Pol17
+// Pol17
data8 0xFFFCBD66BAA4368C, 0x00003FF9 //A0 = +3.12484454387527380657e-02L
data8 0xE28174723762D197, 0x0000BFF5 //A1 = -1.72810121976742793952e-03L
data8 0xC81D832836019EC4, 0x00003FF1 //A2 = +9.54224026432644399736e-05L
data8 0xBA94473E52495304, 0x00003FC7 //A12 = +2.02289587087169937807e-17L
data8 0xE913D34CBB853CEE, 0x0000BFC2 //A13 = -7.89697093687557412061e-19L
data8 0xA44576A85E8CAB59, 0x00003FBD //A14 = +1.73929048516879172258e-20L
-// Pol18
+// Pol18
data8 0xD579A3FE4622DED2, 0x00003FF9 //A0 = +2.60589793198885278242e-02L
data8 0x9D97EB84E7CD89C8, 0x0000BFF5 //A1 = -1.20234251012583627659e-03L
data8 0xE86EFDC2CCA5C47B, 0x00003FF0 //A2 = +5.54164790116744315389e-05L
data8 0x91F283C0351A9ACA, 0x00003FC4 //A12 = +1.97795505638619048412e-18L
data8 0x990BC4FAFA9C7542, 0x0000BFBF //A13 = -6.48174913943425248713e-20L
data8 0xB536865B89676892, 0x00003FB9 //A14 = +1.19916696090758913485e-21L
-// Pol19
+// Pol19
data8 0xB241CEB1B7C953F1, 0x00003FF9 //A0 = +2.17598950382519671244e-02L
data8 0xDBD6FBA9B11B85E1, 0x0000BFF4 //A1 = -8.38622198373701898430e-04L
data8 0x877605B1AD082441, 0x00003FF0 //A2 = +3.22964249573360786077e-05L
mov exp_GR_rshf_2to56 = 0x4768 // begin 1.1 2^(63+56)
}
{ .mlx
- mov exp_TB1_size = 0x100
+ mov exp_TB1_size = 0x100
movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc //signif. of 1/ln2
};;
{ .mfi
mov exp_GR_exp_2tom56 = 0xffff-56
fnma.s1 EXP_NORM_f8 = f8, f8, f0 // high bits for -x^2
- nop.i 0
+ nop.i 0
};;
{ .mfi
setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // form 1/ln2 * 2^63
(p6) fma.s1 FR_AbsArg = f1, f0, f8 // |x|, if x >= 0
- mov GR_POS_ARG_ASYMP = 0x403C
+ mov GR_POS_ARG_ASYMP = 0x403C
}
{ .mfi
mov GR_NEG_ARG_ASYMP = 0x4018
{ .mfi
setf.exp EXP_2TOM56 = exp_GR_exp_2tom56 // 2^-56 for scaling Nfloat
fclass.m p10,p0 = f8, 0x21 // p10: x = +inf
- mov exp_GR_17ones = 0x1FFFF
+ mov exp_GR_17ones = 0x1FFFF
}
-{ .mlx
+{ .mlx
setf.d EXP_RSHF_2TO56 = exp_GR_rshf_2to56 // const 1.10*2^(63+56)
- movl GR_ERFC_XB_TB = 0x1A0
+ movl GR_ERFC_XB_TB = 0x1A0
};;
shl exp_GR_rshf = exp_GR_rshf, 48 //end 1.1 2^63 for right shift
}
{ .mfi
- nop.m 0
+ nop.m 0
(p7) fma.s1 FR_Tmp = FR_Tmp1, FR_Tmp1, f0 // (|x|+1)^2, x<0
- mov GR_0x1 = 0x1
+ mov GR_0x1 = 0x1
};;
{ .mfi
{ .mfi
nop.m 0
fclass.m p11,p0 = f8, 0xc3 // p11: x = nan
- nop.i 0
+ nop.i 0
}
{ .mfi
setf.d EXP_RSHF = exp_GR_rshf //Form right shift const 1.100 * 2^63
};;
{ .mfi
- setf.d FR_EpsNorm = GR_EpsNorm
- nop.f 0
+ setf.d FR_EpsNorm = GR_EpsNorm
+ nop.f 0
(p6) shl GR_ARG_ASYMP = GR_POS_ARG_ASYMP, 48//p6:ARG_ASYMP= 28.0,x>=0
}
{ .mfi
};;
{ .mfi
- sub GR_mBIAS = r0, GR_BIAS
+ sub GR_mBIAS = r0, GR_BIAS
fma.s1 FR_Tmp = FR_Tmp, FR_Tmp, f0 // (|x|+1)^4
nop.i 0
}
{ .mfi
ldfe exp_ln2_by_128_lo = [EXP_AD_TB1], 16
nop.f 0
- nop.i 0
+ nop.i 0
};;
{ .mfi
- getf.d GR_AbsArg = FR_AbsArg
+ getf.d GR_AbsArg = FR_AbsArg
nop.f 0
add GR_ERFC_XB_TB = GR_ERFC_XB_TB, EXP_AD_TB1//pointer to XB_TBL
}
fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8,EXP_INV_LN2_2TO63,EXP_RSHF_2TO56
shladd GR_ShftPi_bias = GR_ShftPi_bias, 4, r0 // BIAS * 240
}
-{ .mfb
+{ .mfb
nop.m 0
(p10) fma.d.s0 f8 = f0, f1, f0 // p10: y = 0 for x = +inf
(p10) br.ret.spnt b0 // p10: quick exit for x = +inf
.pred.rel "mutex",p6,p7
{ .mfi
(p6) cmp.gt.unc p15,p0 = GR_AbsArg,GR_ARG_ASYMP //p15: x > 28.0,p6: x >= 0
- nop.f 0
+ nop.f 0
(p7) cmp.gt.unc p14,p0 = GR_AbsArg, GR_ARG_ASYMP //p14: x < - 6.0,p7: x < 0
}
{ .mfb
(p11) fma.d.s0 f8 = f8, f1, f0 //p11: y = x for x = nan
(p11) br.ret.spnt b0 //p11: quick exit for x = nan
};;
-
-{ .mfi
- add EXP_AD_P = exp_TB2_size, EXP_AD_TB2
+
+{ .mfi
+ add EXP_AD_P = exp_TB2_size, EXP_AD_TB2
fms.s1 f8_sq_lo = f1, f1, f8_sq_lo // 1 - low bits for -x^2
nop.i 0
};;
ldfpd exp_P4, exp_P3 = [EXP_AD_P], 16
fmerge.s FR_X = f8,f8
shladd GR_ShftXBi_bias = GR_mBIAS, 4, r0
-}
+}
{ .mfb
nop.m 0
(p14) fnma.d.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,FR_2 //p14:y ~=~ 2,x< -6.0
(p14) br.ret.spnt b0 //p14: quick exit for x < -6.0
};;
-//p15: y ~=~ 0.0(result with underflow error), x > ARG_ASYMP = 28,
+//p15: y ~=~ 0.0(result with underflow error), x > ARG_ASYMP = 28,
{ .mfi
ldfpd exp_P2, exp_P1 = [EXP_AD_P]
fma.d.s0 FR_Tmpf = f1, f1, FR_EpsNorm // flag i
}
{ .mfb
(p15) mov GR_Parameter_TAG = 208
-(p15) fma.d.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0
+(p15) fma.d.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0
(p15) br.cond.spnt __libm_error_region
};;
//p8: x < 27.0, result without underflow error
{ .mfi
getf.exp GR_IndxPlusBias = FR_Tmp // exp + bias for (|x|+1)^4
- fcmp.lt.s1 p8,p0 = FR_NormX,FR_UnfBound
+ fcmp.lt.s1 p8,p0 = FR_NormX,FR_UnfBound
nop.i 0
}
{ .mfi
{ .mmi
shladd GR_ShftXBi = GR_IndxPlusBias, 4, GR_ShftXBi_bias
shladd GR_ShftPi = GR_IndxPlusBias, 4, GR_ShftPi_bias
- shl GR_ShftPi_8 = GR_IndxPlusBias, 8
+ shl GR_ShftPi_8 = GR_IndxPlusBias, 8
};;
{ .mmi
- getf.sig exp_GR_N = EXP_W_2TO56_RSH
+ getf.sig exp_GR_N = EXP_W_2TO56_RSH
add GR_ERFC_XB_TB = GR_ERFC_XB_TB, GR_ShftXBi// pointer to XB[i]
sub GR_ShftPi = GR_ShftPi_8, GR_ShftPi // (256-16)*i
};;
{ .mmi
ldfe FR_Xb = [GR_ERFC_XB_TB]
add GR_ShftA12 = 0xC0, GR_ShftPi // pointer shift for A12
- add GR_ShftA13 = 0xD0, GR_ShftPi // pointer shift for A13
+ add GR_ShftA13 = 0xD0, GR_ShftPi // pointer shift for A13
};;
{ .mfi
add GR_P_A13 = GR_ERFC_P_TB, GR_ShftA13 // pointer to A13
nop.f 0
- and exp_GR_index_1 = 0x0f, exp_GR_N
+ and exp_GR_index_1 = 0x0f, exp_GR_N
}
{ .mfi
add GR_P_A12 = GR_ERFC_P_TB, GR_ShftA12 // pointer to A12
};;
{ .mfi
- ldfe FR_A12 = [GR_P_A12], -64
+ ldfe FR_A12 = [GR_P_A12], -64
nop.f 0
- and exp_GR_index_2_16 = 0x70, exp_GR_N
+ and exp_GR_index_2_16 = 0x70, exp_GR_N
}
{ .mfi
- ldfe FR_A13 = [GR_P_A13], -64
+ ldfe FR_A13 = [GR_P_A13], -64
nop.f 0
- shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1
-};;
+ shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1
+};;
-{ .mmi
+{ .mmi
ldfe FR_A8 = [GR_P_A12], 32
- ldfe FR_A9 = [GR_P_A13], 32
+ ldfe FR_A9 = [GR_P_A13], 32
add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16
};;
{ .mmi
ldfe FR_A10 = [GR_P_A12], -96
ldfe FR_A11 = [GR_P_A13], -96
- nop.i 0
+ nop.i 0
};;
{ .mmi
ldfe FR_A4 = [GR_P_A12], 32
- ldfe FR_A5 = [GR_P_A13], 32
+ ldfe FR_A5 = [GR_P_A13], 32
shr r2 = exp_GR_N, 0x7
};;
-{ .mfi
- ldfe FR_A6 = [GR_P_A12], -64
+{ .mfi
+ ldfe FR_A6 = [GR_P_A12], -64
fma.s1 exp_rP4pP3 = exp_r, exp_P4, exp_P3
nop.i 0
}
-{ .mfi
- ldfe FR_A7 = [GR_P_A13], -64
+{ .mfi
+ ldfe FR_A7 = [GR_P_A13], -64
fma.s1 exp_rsq = exp_r, exp_r, f0
nop.i 0
};;
{ .mmi
ldfe FR_A2 = [GR_P_A12], -32
- ldfe FR_A3 = [GR_P_A13], -32
+ ldfe FR_A3 = [GR_P_A13], -32
addl exp_GR_biased_M = 0xffff, r2
};;
-{ .mmi
+{ .mmi
ldfe FR_A0 = [GR_P_A12], 224
ldfe FR_A1 = [GR_P_A13]
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 exp_rcube = exp_r, exp_rsq, f0
nop.i 0
-}
-{ .mfi
- nop.m 0
+}
+{ .mfi
+ nop.m 0
fma.s1 exp_P_lo = exp_r, exp_rP4pP3, exp_P2
nop.i 0
};;
{ .mfi
nop.m 0
fnma.s1 exp_f = EXP_Nfloat, exp_ln2_by_128_lo, f8_sq_lo
- nop.i 0
-};;
+ nop.i 0
+};;
{ .mfi
nop.m 0
fma.s1 FR_P14_0_1 = FR_LocArg, FR_LocArg, f0 // xloc ^2
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_P14_0_2 = FR_A13, FR_LocArg, FR_A12
- nop.i 0
+ nop.i 0
};;
{ .mfi
nop.m 0
- fma.s1 FR_P14_1_1 = FR_A9, FR_LocArg, FR_A8
+ fma.s1 FR_P14_1_1 = FR_A9, FR_LocArg, FR_A8
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_P14_1_2 = FR_A11, FR_LocArg, FR_A10
+ fma.s1 FR_P14_1_2 = FR_A11, FR_LocArg, FR_A10
nop.i 0
};;
{ .mfi
nop.m 0
fma.s1 FR_P14_2_1 = FR_A5, FR_LocArg, FR_A4
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
fma.s1 FR_P14_2_2 = FR_A7, FR_LocArg, FR_A6
nop.i 0
-};;
+};;
{ .mfi
nop.m 0
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 exp_S1 = EXP_2M, exp_T1, f0
nop.i 0
};;
};;
{ .mfi
- nop.m 0
- fma.s1 exp_S = exp_S1, exp_S2, f0
- nop.i 0
+ nop.m 0
+ fma.s1 exp_S = exp_S1, exp_S2, f0
+ nop.i 0
}
{ .mfi
nop.m 0
{ .mfi
nop.m 0
fma.s1 FR_Exp = exp_S, exp_P, exp_S // exp(-x^2)
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_Pol = FR_P14_13_2, FR_P14_12_1, FR_P14_13_1
+ fma.s1 FR_Pol = FR_P14_13_2, FR_P14_12_1, FR_P14_13_1
nop.i 0
};;
{ .mfi
nop.m 0
fma.d.s0 FR_Tmpf = f8, f1, f0 // flag d
- nop.i 0
+ nop.i 0
};;
//p6: result for 0 < x <= 28.0,
//p7: result for -6.0 <= x < 0,
//p8: exit for - 6.0 <= x < UnfBound ~=~ 26.54..
-
+
.pred.rel "mutex",p6,p7
{ .mfi
nop.m 0
-(p6) fma.d.s0 f8 = FR_Exp, FR_Pol, f0
- nop.i 0
+(p6) fma.d.s0 f8 = FR_Exp, FR_Pol, f0
+ nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 208
-(p7) fnma.d.s0 f8 = FR_Exp, FR_Pol, FR_2
-(p8) br.ret.sptk b0
+(p7) fnma.d.s0 f8 = FR_Exp, FR_Pol, FR_2
+(p8) br.ret.sptk b0
};;
GLOBAL_LIBM_END(erfc)
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
+.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Overview of operation
//==============================================================
// 1. 0 <= x <= 10.06
-//
+//
// erfcf(x) = P15(x) * exp( -x^2 )
//
// Comment:
//
// Let x(0)=0, x(i) = 2^(i), i=1,...3, x(4)= 10.06
-//
+//
// Let x(i)<= x < x(i+1).
// We can find i as exponent of argument x (let i = 0 for 0<= x < 2 )
-//
+//
// Let P15(x) - polynomial approximation of degree 15 for function
// erfcf(x) * exp( x^2) and x(i) <= x <= x(i+1), i = 0,1,2,3
// Polynomial coefficients we have in the table erfc_p_table.
//
// So we can find result for erfcf(x) as above.
// Algorithm description for exp function see below.
-//
+//
// 2. -4.4 <= x < 0
//
// erfcf(x) = 2.0 - erfcf(-x)
// erfcf(x) ~=~ 0.0
//
// 4. x < -4.4
-//
+//
// erfcf(x) ~=~ 2.0
-// Special values
+// Special values
//==============================================================
// erfcf(+0) = 1.0
// erfcf(-0) = 1.0
-// erfcf(+qnan) = +qnan
-// erfcf(-qnan) = -qnan
-// erfcf(+snan) = +qnan
-// erfcf(-snan) = -qnan
+// erfcf(+qnan) = +qnan
+// erfcf(-qnan) = -qnan
+// erfcf(+snan) = +qnan
+// erfcf(-snan) = -qnan
-// erfcf(-inf) = 2.0
+// erfcf(-inf) = 2.0
// erfcf(+inf) = +0
//==============================================================
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f6,f7,f9 -> f11, f32 -> f92
-// General registers used:
-// r14 -> r22,r32 -> r50
+// General registers used:
+// r14 -> r22,r32 -> r50
// Predicate registers used:
// p6 -> p15
EXP_W_2TO56_RSH = f9
exp_ln2_by_128_hi = f11
-EXP_RSHF_2TO56 = f32
-exp_ln2_by_128_lo = f33
+EXP_RSHF_2TO56 = f32
+exp_ln2_by_128_lo = f33
EXP_RSHF = f34
-EXP_Nfloat = f35
+EXP_Nfloat = f35
exp_r = f36
exp_rsq = f37
EXP_2M = f38
exp_T1 = f40
exp_P = f41
exp_S = f42
-EXP_NORM_f8 = f43
+EXP_NORM_f8 = f43
exp_S2 = f44
exp_T2 = f45
// double-extended 1/ln(2)
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
-// 3fff b8aa 3b29 5c17 f0bc
+// 3fff b8aa 3b29 5c17 f0bc
// For speed the significand will be loaded directly with a movl and setf.sig
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
// computations need to scale appropriately.
-// The constant 128/ln(2) is needed for the computation of w. This is also
+// The constant 128/ln(2) is needed for the computation of w. This is also
// obtained by scaling the computations.
//
-// Two shifting constants are loaded directly with movl and setf.d.
-// 1. EXP_RSHF_2TO56 = 1.1000..00 * 2^(63-7)
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. EXP_RSHF_2TO56 = 1.1000..00 * 2^(63-7)
// This constant is added to x*1/ln2 to shift the integer part of
// x*128/ln2 into the rightmost bits of the significand.
// The result of this fma is EXP_W_2TO56_RSH.
-// 2. EXP_RSHF = 1.1000..00 * 2^(63)
+// 2. EXP_RSHF = 1.1000..00 * 2^(63)
// This constant is subtracted from EXP_W_2TO56_RSH * 2^(-56) to give
// the integer part of w, n, as a floating-point number.
// The result of this fms is EXP_Nfloat.
LOCAL_OBJECT_START(erfc_p_table)
-// Pol_0
+// Pol_0
data8 0xBEA3260C63CB0446 //A15 = -5.70673541831883454676e-07
data8 0x3EE63D6178077654 //A14 = +1.06047480138940182343e-05
data8 0xBF18646BC5FC70A7 //A13 = -9.30491237309283694347e-05
data8 0x3FEFFFFFC67295FC //A2 = +9.99999892800303301771e-01
data8 0xBFF20DD74F8CD2BF //A1 = -1.12837916445020868099e+00
data8 0x3FEFFFFFFFFE7C1D //A0 = +9.99999999988975570714e-01
-// Pol_1
+// Pol_1
data8 0xBDE8EC4BDD953B56 //A15 = -1.81338928934942767144e-10
data8 0x3E43607F269E2A1C //A14 = +9.02309090272196442358e-09
data8 0xBE8C4D9E69C10E02 //A13 = -2.10875261143659275328e-07
data8 0x3FEE7C91BDF13578 //A2 = +9.52706213932898128515e-01
data8 0xBFF1CB5B61F8C589 //A1 = -1.11214769621105541214e+00
data8 0x3FEFEA56BC81FD37 //A0 = +9.97355812243688815239e-01
-// Pol_2
+// Pol_2
data8 0xBD302724A12F46E0 //A15 = -5.73866382814058809406e-14
data8 0x3D98889B75D3102E //A14 = +5.57829983681360947356e-12
data8 0xBDF16EA15074A1E9 //A13 = -2.53671153922423457844e-10
data8 0x3FE28A95CB8C6D3E //A2 = +5.79417131000276437708e-01
data8 0xBFEC21205D358672 //A1 = -8.79043752717008257224e-01
data8 0x3FEDAE44D5EDFE5B //A0 = +9.27523057776805771830e-01
-// Pol_3
+// Pol_3
data8 0xBCA3BCA734AC82F1 //A15 = -1.36952437983096410260e-16
data8 0x3D16740DC3990612 //A14 = +1.99425676175410093285e-14
data8 0xBD77F4353812C46A //A13 = -1.36162367755616790260e-12
.section .text
GLOBAL_LIBM_ENTRY(erfcf)
-// Form index i for table erfc_p_table as exponent of x
-// We use i + bias in real calculations
+// Form index i for table erfc_p_table as exponent of x
+// We use i + bias in real calculations
{ .mlx
getf.exp GR_IndxPlusBias = f8 // (sign + exp + bias) of x
movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc //signif.of 1/ln2
;;
// Form two constants we need
-// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
// p9: x = 0,+inf,-inf,nan,unnorm.
// p10: x!= 0,+inf,-inf,nan,unnorm.
{ .mfi
setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // Form 1/ln2*2^63
- fclass.m p9,p10 = f8,0xef
+ fclass.m p9,p10 = f8,0xef
shl GR_ShftPi_bias = GR_BIAS, 7
}
{ .mfi
}
;;
-// Form shift GR_ShftPi from the beginning of erfc_p_table
+// Form shift GR_ShftPi from the beginning of erfc_p_table
// to the polynomial with number i
{ .mfi
ldfps FR_UnfBound, FR_EpsNorm = [EXP_AD_TB1],8
{ .mfi
setf.d EXP_RSHF = exp_GR_rshf // Form right shift 1.100 * 2^63
(p7) fms.s1 FR_AbsArg = f1, f0, f8 // |x| if x < 0
- mov exp_TB1_size = 0x100
+ mov exp_TB1_size = 0x100
}
;;
-// Form pointer GR_P_POINT_3 to the beginning of erfc_p_table
+// Form pointer GR_P_POINT_3 to the beginning of erfc_p_table
{ .mfi
setf.d FR_05 = GR_05
nop.f 0
add GR_P_POINT_2 = GR_P_POINT_3, GR_ShftPi
}
{ .mfi
- ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16
+ ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16
fma.s1 FR_NormX = f8,f1,f0
add GR_P_POINT_3 = GR_P_POINT_3, GR_ShftPi
}
// Load coefficients for polynomial P15(x)
{ .mfi
ldfpd FR_A15, FR_A14 = [GR_P_POINT_1], 16
- nop.f 0
+ nop.f 0
add GR_P_POINT_3 = 0x30, GR_P_POINT_3
}
{ .mfi
ldfe exp_ln2_by_128_lo = [EXP_AD_TB1], 16
- nop.f 0
- add GR_P_POINT_2 = 0x20, GR_P_POINT_2
+ nop.f 0
+ add GR_P_POINT_2 = 0x20, GR_P_POINT_2
}
;;
// Now EXP_AD_TB1 points to the beginning of table 1
{ .mlx
- ldfpd FR_A13, FR_A12 = [GR_P_POINT_1]
+ ldfpd FR_A13, FR_A12 = [GR_P_POINT_1]
movl GR_1_by_6 = 0x3FC5555555555555
}
{ .mfi
// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
{ .mfi
- ldfpd FR_A7, FR_A6 = [GR_P_POINT_3]
+ ldfpd FR_A7, FR_A6 = [GR_P_POINT_3]
fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8,EXP_INV_LN2_2TO63,EXP_RSHF_2TO56
add EXP_AD_TB2 = exp_TB1_size, EXP_AD_TB1
-
+
}
{ .mfi
ldfpd FR_A5, FR_A4 = [GR_P_POINT_4], 16
fmerge.s FR_X = f8,f8
nop.i 0
}
-{ .mfi
+{ .mfi
ldfpd FR_A1, FR_A0 = [GR_P_POINT_1]
nop.f 0
nop.i 0
nop.m 0
(p6) fcmp.gt.unc.s1 p15,p0 = FR_AbsArg, FR_POS_ARG_ASYMP //p6: x > 0
nop.i 0
-}
+}
;;
{ .mfi
}
;;
-// Nfloat = round_int(W)
+// Nfloat = round_int(W)
// The significand of EXP_W_2TO56_RSH contains the rounded integer part of W,
// as a twos complement number in the lower bits (that is, it may be negative).
// That twos complement number (called N) is put into exp_GR_N.
nop.m 0
fms.s1 EXP_Nfloat = EXP_W_2TO56_RSH, EXP_2TOM56, EXP_RSHF
nop.i 0
-}
+}
{ .mfb
(p15) mov GR_Parameter_TAG = 209
(p15) fma.s.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0 //Result.for x>10.06
(p15) br.cond.spnt __libm_error_region
-}
+}
;;
// Now we can calculate polynomial P15(x)
{ .mfi
nop.m 0
- fma.s1 FR_P15_1_2 = FR_A13, FR_AbsArg, FR_A12
- nop.i 0
+ fma.s1 FR_P15_1_2 = FR_A13, FR_AbsArg, FR_A12
+ nop.i 0
}
;;
{ .mfi
- getf.sig exp_GR_N = EXP_W_2TO56_RSH
- fma.s1 FR_P15_2_1 = FR_A9, FR_AbsArg, FR_A8
- nop.i 0
+ getf.sig exp_GR_N = EXP_W_2TO56_RSH
+ fma.s1 FR_P15_2_1 = FR_A9, FR_AbsArg, FR_A8
+ nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_P15_2_2 = FR_A11, FR_AbsArg, FR_A10
+ fma.s1 FR_P15_2_2 = FR_A11, FR_AbsArg, FR_A10
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 FR_P15_3_1 = FR_A5, FR_AbsArg, FR_A4
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
and exp_GR_index_1 = 0x0f, exp_GR_N
fma.s1 FR_P15_4_1 = FR_A1, FR_AbsArg, FR_A0
shr r2 = exp_GR_N, 0x7
-
+
}
{ .mfi
and exp_GR_index_2_16 = 0x70, exp_GR_N
}
;;
-// EXP_AD_T1 has address of T1
-// EXP_AD_T2 has address if T2
+// EXP_AD_T1 has address of T1
+// EXP_AD_T2 has address if T2
{ .mfi
add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16
- nop.f 0
+ nop.f 0
shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1
}
{ .mfi
;;
// Create Scale = 2^M
-// r = x - Nfloat * ln2_by_128_hi
-
+// r = x - Nfloat * ln2_by_128_hi
+
{ .mfi
setf.exp EXP_2M = exp_GR_biased_M
fma.s1 FR_P15_7_1 = FR_P15_0_1, FR_P15_1_1, FR_P15_1_2
nop.i 0
}
{ .mfi
- ldfe exp_T2 = [EXP_AD_T2]
+ ldfe exp_T2 = [EXP_AD_T2]
nop.f 0
nop.i 0
}
}
{ .mfi
nop.m 0
- fma.s1 FR_P15_8_1 = FR_P15_1_1, FR_P15_2_2, FR_P15_2_1
+ fma.s1 FR_P15_8_1 = FR_P15_1_1, FR_P15_2_2, FR_P15_2_1
nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_P15_9_1 = FR_P15_1_1, FR_P15_4_2, FR_P15_4_1
+ fma.s1 FR_P15_9_1 = FR_P15_1_1, FR_P15_4_2, FR_P15_4_1
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_P15_9_2 = FR_P15_1_1, FR_P15_3_2, FR_P15_3_1
+ fma.s1 FR_P15_9_2 = FR_P15_1_1, FR_P15_3_2, FR_P15_3_1
nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 exp_P = FR_1_by_6, exp_r, FR_05
+ fma.s1 exp_P = FR_1_by_6, exp_r, FR_05
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 exp_rsq = exp_r, exp_r, f0
+ fma.s1 exp_rsq = exp_r, exp_r, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_P15_13_1 = FR_P15_7_2, FR_P15_7_1, FR_P15_8_1
+ fma.s1 FR_P15_13_1 = FR_P15_7_2, FR_P15_7_1, FR_P15_8_1
nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_P15_14_1 = FR_P15_7_2, FR_P15_9_2, FR_P15_9_1
+ fma.s1 FR_P15_14_1 = FR_P15_7_2, FR_P15_9_2, FR_P15_9_1
nop.i 0
}
{ .mfi
}
{ .mfi
nop.m 0
- fma.s1 exp_S1 = EXP_2M, exp_T2, f0
+ fma.s1 exp_S1 = EXP_2M, exp_T2, f0
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 FR_Exp = exp_S, exp_P, exp_S // exp(-x^2)
- nop.i 0
+ nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
fma.s.s0 FR_Tmpf = f8, f1, f0 // Flag d
- nop.i 0
+ nop.i 0
}
;;
-//p6: result for 0 < x < = POS_ARG_ASYMP
+//p6: result for 0 < x < = POS_ARG_ASYMP
//p7: result for - NEG_ARG_ASYMP <= x < 0
//p8: exit for - NEG_ARG_ASYMP <= x <= UnfBound, x!=0
.pred.rel "mutex",p6,p7
{ .mfi
nop.m 0
-(p6) fma.s.s0 f8 = FR_Exp, FR_Pol, f0
- nop.i 0
+(p6) fma.s.s0 f8 = FR_Exp, FR_Pol, f0
+ nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 209
(p7) fnma.s.s0 f8 = FR_Exp, FR_Pol, FR_2
-(p8) br.ret.sptk b0
+(p8) br.ret.sptk b0
}
;;
{ .mfb
nop.m 0
nop.f 0
-(p10) br.cond.spnt __libm_error_region
+(p10) br.cond.spnt __libm_error_region
}
;;
// Call via (p10) br.cond.spnt __libm_error_region
-// for UnfBound < x < = POS_ARG_ASYMP
+// for UnfBound < x < = POS_ARG_ASYMP
// and
-//
+//
// call via (p15) br.cond.spnt __libm_error_region
// for x > POS_ARG_ASYMP
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
+.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Implementation and Algorithm Notes:
//==============================================================
// 1. 0 <= x <= 107.0
-//
+//
// erfcl(x) ~=~ P15(z) * expl( -x^2 )/(dx + x), z = x - xc(i).
//
// Comment:
// argument intervals [x(i),x(i+1)] with length ratio q = 2^(1/4).
// Values xc(i) we have in the table erfc_xc_table,xc(i)=x(i)for i = 0
// and xc(i)= 0.5*( x(i)+x(i+1) ) for i>0.
-//
+//
// Let x(i)<= x < x(i+1).
// We can find i as exponent of number (x + 1)^4.
-//
+//
// Let P15(z)= a0+ a1*z +..+a15*z^15 - polynomial approximation of degree 15
-// for function erfcl(z+xc(i)) * expl( (z+xc(i))^2)* (dx+z+xc(i)) and
+// for function erfcl(z+xc(i)) * expl( (z+xc(i))^2)* (dx+z+xc(i)) and
// -0.5*[x(i+1)-x(i)] <= z <= 0.5*[x(i+1)-x(i)].
//
// Let Q(z)= (P(z)- S)/S, S = a0, rounded to 16 bits.
// Polynomial coefficients for Q(z) we have in the table erfc_Q_table as
// long double values
//
-// We use multi precision to calculate input argument -x^2 for expl and
-// for u = 1/(dx + x).
+// We use multi precision to calculate input argument -x^2 for expl and
+// for u = 1/(dx + x).
//
// Algorithm description for expl function see below. In accordance with
// denotation of this algorithm we have for expl:
//
-// expl(X) ~=~ 2^K*T_1*(1+W_1)*T_2*(1+W_2)*(1+ poly(r)), X = -x^2.
+// expl(X) ~=~ 2^K*T_1*(1+W_1)*T_2*(1+W_2)*(1+ poly(r)), X = -x^2.
//
// Final calculations for erfcl:
-//
+//
// erfcl(x) ~=~
//
// 2^K*T_1*(1+W_1)*T_2*(1+W_2)*(1+ poly(r))*(1-dy)*S*(1+Q(z))*u*(1+du),
// 1) M = 2^K*T_1*T_2*S without rounding error,
// 2) W = W_1 + (W_2 + W_1*W_2), where 1+W ~=~ (1+W_1)(1+W_2),
// 3) H = W - dy, where 1+H ~=~ (1+W )(1-dy),
-// 4) R = poly(r)*H + poly(r),
+// 4) R = poly(r)*H + poly(r),
// 5) R = H + R , where 1+R ~=~ (1+H )(1+poly(r)),
// 6) G = Q(z)*R + Q(z),
// 7) R1 = R + du, where 1+R1 ~=~ (1+R)(1+du),
// 8) G1 = R1 + G, where 1+G1 ~=~ (1+R1)(1+Q(z)),
// 9) V = G1*M*u,
-// 10) erfcl(x) ~=~ M*u + V
-//
+// 10) erfcl(x) ~=~ M*u + V
+//
// 2. -6.5 <= x < 0
//
// erfcl(x) = 2.0 - erfl(-x)
//
// 3. x > 107.0
-// erfcl(x) ~=~ 0.0
+// erfcl(x) ~=~ 0.0
//
-// 4. x < -6.5
-// erfcl(x) ~=~ 2.0
+// 4. x < -6.5
+// erfcl(x) ~=~ 2.0
-// Special values
+// Special values
//==============================================================
// erfcl(+0) = 1.0
// erfcl(-0) = 1.0
-// erfcl(+qnan) = +qnan
-// erfcl(-qnan) = -qnan
-// erfcl(+snan) = +qnan
-// erfcl(-snan) = -qnan
+// erfcl(+qnan) = +qnan
+// erfcl(-qnan) = -qnan
+// erfcl(+snan) = +qnan
+// erfcl(-snan) = -qnan
-// erfcl(-inf) = 2.0
+// erfcl(-inf) = 2.0
// erfcl(+inf) = +0
//==============================================================
//
// On input, X is in register format
//
-// On output,
+// On output,
//
// scale*(Y_hi + Y_lo) approximates exp(X)
//
// The accuracy is sufficient for a highly accurate 64 sig.
-// bit implementation. Safe is set if there is no danger of
-// overflow/underflow when the result is composed from scale,
-// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
-// Otherwise, one must prepare to handle the possible exception
-// appropriately. Note that SAFE not set (false) does not mean
+// bit implementation. Safe is set if there is no danger of
+// overflow/underflow when the result is composed from scale,
+// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
+// Otherwise, one must prepare to handle the possible exception
+// appropriately. Note that SAFE not set (false) does not mean
// that overflow/underflow will occur; only the setting of SAFE
// guarantees the opposite.
//
-// **** High Level Overview ****
+// **** High Level Overview ****
//
// The method consists of three cases.
-//
+//
// If |X| < Tiny use case exp_tiny;
// else if |X| < 2^(-6) use case exp_small;
// else use case exp_regular;
//
// Case exp_tiny:
//
-// 1 + X can be used to approximate exp(X)
+// 1 + X can be used to approximate exp(X)
// X + X^2/2 can be used to approximate exp(X) - 1
//
// Case exp_small:
//
-// Here, exp(X) and exp(X) - 1 can all be
+// Here, exp(X) and exp(X) - 1 can all be
// approximated by a relatively simple polynomial.
//
// This polynomial resembles the truncated Taylor series
// r := (X - N*L_hi) - N*L_lo
//
// We pick L_hi such that N*L_hi is representable in 64 sig. bits
-// and thus the FMA X - N*L_hi is error free. So r is the
-// 1 rounding error from an exact reduction with respect to
-//
+// and thus the FMA X - N*L_hi is error free. So r is the
+// 1 rounding error from an exact reduction with respect to
+//
// L_hi + L_lo.
//
// In particular, L_hi has 30 significant bits and can be stored
// Step 2: Approximation
//
// exp(r) - 1 is approximated by a short polynomial of the form
-//
+//
// r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
//
-// Step 3: Composition from Table Values
+// Step 3: Composition from Table Values
//
// The value 2^( N / 2^12 ) can be composed from a couple of tables
// of precalculated values. First, express N as three integers
// lsb's, M_1 is the next 6, and K is simply N shifted right
// arithmetically (sign extended) by 12 bits.
//
-// Now, 2^( N / 2^12 ) is simply
-//
+// Now, 2^( N / 2^12 ) is simply
+//
// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
//
// Clearly, 2^K needs no tabulation. The other two values are less
// Define two mathematical values, delta_1 and delta_2, implicitly
// such that
//
-// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
+// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
// T_2 = exp( [M_2 log(2)/2^12] - delta_2 )
//
// are representable as 24 significant bits. To illustrate the idea,
-// we show how we define delta_1:
+// we show how we define delta_1:
//
// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
-// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
+// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
//
// The last equality means mathematical equality. We then tabulate
//
// T and W via
//
// T := T_1 * T_2 ...exactly
-// W := W_1 + (1 + W_1)*W_2
+// W := W_1 + (1 + W_1)*W_2
//
// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2.
// The mathematical product of T and (W+1) is an accurate representation
//
// Step 4. Reconstruction
//
-// Finally, we can reconstruct exp(X), exp(X) - 1.
+// Finally, we can reconstruct exp(X), exp(X) - 1.
// Because
//
-// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
+// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
// + (M_2*log(2)/2^12 - delta_2)
// + delta_1 + delta_2 + r ...accurately
// We have
//
// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] )
-// ~=~ 2^K * ( T + T*[(exp(delta)-1)
+// ~=~ 2^K * ( T + T*[(exp(delta)-1)
// + exp(delta)*(exp(r)-1)] )
// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
// ~=~ 2^K * ( Y_hi + Y_lo )
// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
//
-// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
+// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
// numbers Y_hi + Y_lo carefully.
//
// **** Algorithm Details ****
//
// Case exp_tiny:
//
-// The important points are to ensure an accurate result under
-// different rounding directions and a correct setting of the SAFE
+// The important points are to ensure an accurate result under
+// different rounding directions and a correct setting of the SAFE
// flag.
//
// If expm1 is 1, then
// Here we compute a simple polynomial. To exploit parallelism, we split
// the polynomial into several portions.
//
-// Let r = X
+// Let r = X
//
// If exp ...i.e. exp( argument )
//
-// rsq := r * r;
+// rsq := r * r;
// r4 := rsq*rsq
// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
// poly_hi := r + rsq*(P_1 + r*P_2)
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f9 -> f14, f36 -> f126
-// General registers used:
-// r32 -> r71
+// General registers used:
+// r32 -> r71
// Predicate registers used:
// p6 -> p15
LOCAL_OBJECT_END(Constants_exp_64_C)
LOCAL_OBJECT_START(Constants_exp_64_T1)
-data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
-data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
+data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
+data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
LOCAL_OBJECT_END(Constants_exp_64_T1)
LOCAL_OBJECT_START(Constants_exp_64_T2)
-data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
-data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
-data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
-data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
-data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
-data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
-data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
-data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
-data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
-data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
-data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
-data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
-data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
-data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
+data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
+data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
+data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
+data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
+data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
+data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
+data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
+data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
+data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
+data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
+data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
+data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
+data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
+data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
+data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
LOCAL_OBJECT_END(Constants_exp_64_T2)
LOCAL_OBJECT_START(erfc_Q_table)
// Q(z)= (P(z)- S)/S
//
-// Pol0
+// Pol0
data8 0x98325D50F9DC3499, 0x0000BFAA //A0 = +3.07358861423101280650e-26L
data8 0xED35081A2494DDD9, 0x00003FF8 //A1 = +1.44779757616302832466e-02L
data8 0x9443549BCD0F94CE, 0x0000BFFD //A2 = -2.89576190966300084405e-01L
data8 0x87F0E77BA914FBEB, 0x00003FF5 //A13 = +1.03714776726541296794e-03L
data8 0xC306C2894C5CEF2D, 0x0000BFF3 //A14 = -3.71983348634136412407e-04L
data8 0xBDAB416A989D0697, 0x00003FF1 //A15 = +9.04412111877987292294e-05L
-// Pol1
+// Pol1
data8 0x82808893DA2DD83F, 0x00003FEE //A0 = +7.77853035974467145290e-06L
data8 0xAE9CD9DCADC86113, 0x0000BFFB //A1 = -8.52601070853077921197e-02L
data8 0x9D429743E312AD9F, 0x0000BFFB //A2 = -7.67871682732076080494e-02L
data8 0xDF67AC6287A63B03, 0x00003FF2 //A13 = +2.13055585989529858265e-04L
data8 0xA719CFEE67FCE1CE, 0x0000BFF1 //A14 = -7.96798844477905965933e-05L
data8 0xEF926367BABBB029, 0x00003FEF //A15 = +2.85591875675765038065e-05L
-// Pol2
+// Pol2
data8 0x82B5E5A93B059C50, 0x00003FEF //A0 = +1.55819100856330860049e-05L
data8 0xDC856BC2542B1938, 0x0000BFFB //A1 = -1.07676355235999875911e-01L
data8 0xDF225EF5694F14AE, 0x0000BFF8 //A2 = -1.36190345125628043277e-02L
data8 0x82901D055A0D5CB6, 0x00003FF1 //A13 = +6.22572626227726684168e-05L
data8 0xBB957698542D6FD0, 0x0000BFEF //A14 = -2.23617364009159182821e-05L
data8 0x810740E1DF572394, 0x00003FEE //A15 = +7.69068800065192940487e-06L
-// Pol3
+// Pol3
data8 0x9526D1C87655AFA8, 0x00003FEC //A0 = +2.22253260814242012255e-06L
data8 0xA47E21EBFE73F72F, 0x0000BFF8 //A1 = -1.00398379581527733314e-02L
data8 0xDE65685FCDF7A913, 0x0000BFFA //A2 = -5.42959286802879105148e-02L
data8 0xB40F241C01C907E9, 0x00003FEF //A13 = +2.14647227210702861416e-05L
data8 0xF436D84AD7D4D316, 0x0000BFED //A14 = -7.27815144835213913238e-06L
data8 0x9EB432503FB0B7BC, 0x00003FEC //A15 = +2.36487228755136968792e-06L
-// Pol4
+// Pol4
data8 0xE0BA539E4AFC4741, 0x00003FED //A0 = +6.69741148991838024429e-06L
data8 0x8583BF71139452CF, 0x0000BFFA //A1 = -3.25963476363756051657e-02L
data8 0x8384FEF6D08AD6CE, 0x0000BFF9 //A2 = -1.60546283500634200479e-02L
data8 0x9459160B1E6F1F8D, 0x00003FED //A13 = +4.42111470121432700283e-06L
data8 0xBE0A05701BD0DD42, 0x0000BFEB //A14 = -1.41590196994052764542e-06L
data8 0xE905D729105081BF, 0x00003FE9 //A15 = +4.34038814785401120999e-07L
-// Pol5
+// Pol5
data8 0xA33649C3AB459832, 0x00003FEE //A0 = +9.72819704141525206634e-06L
data8 0x9E4EA2F44C9A24BD, 0x0000BFFA //A1 = -3.86492123987296806210e-02L
data8 0xE80C0B1280F357BF, 0x0000BFF2 //A2 = -2.21297306012713370124e-04L
data8 0xCD2C2F079D2FCB36, 0x00003FEA //A13 = +7.64327468786076941271e-07L
data8 0xF5EF4A4B2EA426F2, 0x0000BFE8 //A14 = -2.29044563492386125272e-07L
data8 0x8CE52181393820FC, 0x00003FE7 //A15 = +6.56093668622712763489e-08L
-// Pol6
+// Pol6
data8 0xB2015D7F1864B7CF, 0x00003FEC //A0 = +2.65248615880090351276e-06L
data8 0x954EA7A861B4462A, 0x0000BFFA //A1 = -3.64519642954351295215e-02L
data8 0x9E46F2A4D9157E69, 0x00003FF7 //A2 = +4.83023498390681965101e-03L
data8 0xED209EBD68E1145F, 0x00003FE7 //A13 = +1.10421060667544991323e-07L
data8 0x83A126E22A17568D, 0x0000BFE6 //A14 = -3.06473811074239684132e-08L
data8 0x8B778496EDE9F415, 0x00003FE4 //A15 = +8.11804009754249175736e-09L
-// Pol7
+// Pol7
data8 0x8E152F522501B7B9, 0x00003FEE //A0 = +8.46879203970927626532e-06L
data8 0xFD22F92EE21F491E, 0x0000BFF9 //A1 = -3.09004656656418947425e-02L
data8 0xAF0C41847D89EC14, 0x00003FF7 //A2 = +5.34203719233189217519e-03L
data8 0xE56A19A67DD66100, 0x00003FE4 //A13 = +1.33536787408751203998e-08L
data8 0xE964D255CB31DFFA, 0x0000BFE2 //A14 = -3.39632729387679010008e-09L
data8 0xE22E62E932B704D4, 0x00003FE0 //A15 = +8.22842400379225526299e-10L
-// Pol8
+// Pol8
data8 0xB8B835882D46A6C8, 0x00003FEF //A0 = +2.20202883282415435401e-05L
data8 0xC9D1F63F89B74E90, 0x0000BFF9 //A1 = -2.46362504515706189782e-02L
data8 0x8E376748B1274F30, 0x00003FF7 //A2 = +4.34010070001387441657e-03L
data8 0xBB15B0021581C8B6, 0x00003FE1 //A13 = +1.36122047057936849125e-09L
data8 0xAC9D6585D4AF505E, 0x0000BFDF //A14 = -3.13984547328132268695e-10L
data8 0x975A1439C3795183, 0x00003FDD //A15 = +6.88268624429648826457e-11L
-// Pol9
+// Pol9
data8 0x99A7676284CDC9FE, 0x00003FEF //A0 = +1.83169747921764176475e-05L
data8 0x9AD0AE249A02896C, 0x0000BFF9 //A1 = -1.88983346204739151909e-02L
data8 0xCB89B4AEC19898BE, 0x00003FF6 //A2 = +3.10574208447745576452e-03L
data8 0x81D0E2AA27DEB74A, 0x00003FDE //A13 = +1.18066926578104076645e-10L
data8 0xD75FB9049190BEFD, 0x0000BFDB //A14 = -2.44851795398843967972e-11L
data8 0xA9384A51D48C8703, 0x00003FD9 //A15 = +4.80951837368635202609e-12L
-// Pol10
+// Pol10
data8 0xD2B3482EE449C535, 0x00003FEE //A0 = +1.25587177382575655080e-05L
data8 0xE7939B2D0607DFCF, 0x0000BFF8 //A1 = -1.41343131436717436429e-02L
data8 0x8810EB4AC5F0F1CE, 0x00003FF6 //A2 = +2.07620377002350121270e-03L
data8 0x9A4A95EE59127779, 0x00003FDA //A13 = +8.77044784978207256260e-12L
data8 0xE518330AF013C2F6, 0x0000BFD7 //A14 = -1.62781453276882333209e-12L
data8 0xA036A9DF71BD108A, 0x00003FD5 //A15 = +2.84596398987114375607e-13L
-// Pol11
+// Pol11
data8 0x9191CFBF001F3BB3, 0x00003FEE //A0 = +8.67662287973472452343e-06L
data8 0xAA47E0CF01AE9730, 0x0000BFF8 //A1 = -1.03931136509584404513e-02L
data8 0xAEABE7F17B01D18F, 0x00003FF5 //A2 = +1.33263784731775399430e-03L
data8 0x9BC3A7D6396C6756, 0x00003FD6 //A13 = +5.53385887288503961220e-13L
data8 0xD0110D5683740B8C, 0x0000BFD3 //A14 = -9.24001363293241428519e-14L
data8 0x81786D7856A5CC92, 0x00003FD1 //A15 = +1.43741041714595023996e-14L
-// Pol12
+// Pol12
data8 0xB85654F6033B3372, 0x00003FEF //A0 = +2.19747106911869287049e-05L
data8 0xF78B40078736B406, 0x0000BFF7 //A1 = -7.55444170413862312647e-03L
data8 0xDA8FDE84D88E5D5D, 0x00003FF4 //A2 = +8.33747822263358628569e-04L
data8 0xFC1441C4CD105981, 0x00003FD1 //A13 = +2.79864052545369490865e-14L
data8 0x9CC959853267F026, 0x0000BFCF //A14 = -4.35170017302700609509e-15L
data8 0xB06BA14016154F1E, 0x00003FCC //A15 = +6.12081320471295704631e-16L
-// Pol13
+// Pol13
data8 0xA59E74BF544F2422, 0x00003FEF //A0 = +1.97433196215210145261e-05L
data8 0xB2814F4EDAE15330, 0x0000BFF7 //A1 = -5.44754383528015875700e-03L
data8 0x867C249D378F0A23, 0x00003FF4 //A2 = +5.13019308804593120161e-04L
data8 0xEC9CAF64237B5060, 0x00003FCC //A13 = +8.20912960028437475035e-16L
data8 0xA9156668FCF01479, 0x0000BFCA //A14 = -1.46656639874123613261e-16L
data8 0xBAEF58D8118DD5D4, 0x00003FC7 //A15 = +2.02675278255254907493e-17L
-// Pol14
+// Pol14
data8 0xC698952E9CEAA800, 0x00003FEF //A0 = +2.36744912073515619263e-05L
data8 0x800395F8C7B4FA00, 0x0000BFF7 //A1 = -3.90667746392883642897e-03L
data8 0xA3B2467B6B391831, 0x00003FF3 //A2 = +3.12226081793919541155e-04L
data8 0xCCEBE3043B689428, 0x0000BFC8 //A13 = -4.44352525147076912166e-17L
data8 0xA779DAB4BE1F80BB, 0x0000BFBC //A14 = -8.86610526981738255206e-21L
data8 0xB171271F3517282C, 0x00003FC1 //A15 = +3.00598445879282370850e-19L
-// Pol15
+// Pol15
data8 0xB7AC727D1C3FEB05, 0x00003FEE //A0 = +1.09478009914822049780e-05L
data8 0xB6E6274485C10B0A, 0x0000BFF6 //A1 = -2.79081782038927199588e-03L
data8 0xC5CAE2122D009506, 0x00003FF2 //A2 = +1.88629638738336219173e-04L
data8 0xD6B789E01141231B, 0x0000BFC6 //A13 = -1.16398290506765191078e-17L
data8 0xB5EEE343E9CFE3EC, 0x00003FC2 //A14 = +6.16413506924643419723e-19L
data8 0x859B41A39D600346, 0x0000BFBE //A15 = -2.82922705825870414438e-20L
-// Pol16
+// Pol16
data8 0x85708B69FD184E11, 0x00003FED //A0 = +3.97681079176353356199e-06L
data8 0x824D92BC60A1F70A, 0x0000BFF6 //A1 = -1.98826630037499070532e-03L
data8 0xEDCF7D3576BB5258, 0x00003FF1 //A2 = +1.13396885054265675352e-04L
data8 0xDF62F9F44F5C7170, 0x0000BFC3 //A13 = -1.51372666097522872780e-18L
data8 0xBA4E118E88CFDD31, 0x00003FBF //A14 = +7.89032177282079635722e-20L
data8 0x942AD897FC4D2F2A, 0x0000BFBB //A15 = -3.92195756076319409245e-21L
-// Pol17
+// Pol17
data8 0xCB8514540566C717, 0x00003FEF //A0 = +2.42614557068144130848e-05L
data8 0xB94F08D6816E0CD4, 0x0000BFF5 //A1 = -1.41379340061829929314e-03L
data8 0x8E7C342C2DABB51B, 0x00003FF1 //A2 = +6.79422240687700109911e-05L
data8 0xC0D22F655BA5EF39, 0x0000BFC0 //A13 = -1.63325770165403860181e-19L
data8 0x8F14B9EBD5A9AB25, 0x00003FBC //A14 = +7.57464305512080733773e-21L
data8 0xCD4804BBF6DC1B6F, 0x0000BFB7 //A15 = -3.39609459750208886298e-22L
-// Pol18
+// Pol18
data8 0xE251DFE45AB0C22E, 0x00003FEE //A0 = +1.34897126299700418200e-05L
data8 0x83943CC7D59D4215, 0x0000BFF5 //A1 = -1.00386850310061655307e-03L
data8 0xAA57896951134BCA, 0x00003FF0 //A2 = +4.06126834109940757047e-05L
data8 0x98F194AEE31D188D, 0x0000BFBD //A13 = -1.61935414722333263347e-20L
data8 0xC42F5029BB622157, 0x00003FB8 //A14 = +6.49121108201931196678e-22L
data8 0xF43BD08079E50E0F, 0x0000BFB3 //A15 = -2.52531675510242468317e-23L
-// Pol19
+// Pol19
data8 0x82557B149A04D08E, 0x00003FEF //A0 = +1.55370127331027842820e-05L
data8 0xBAAB433307CE614B, 0x0000BFF4 //A1 = -7.12085701486669872724e-04L
data8 0xCB52D9DBAC16FE82, 0x00003FEF //A2 = +2.42380662859334411743e-05L
data8 0xE7D49EC23F1A16A0, 0x0000BFB9 //A13 = -1.53412587409583783059e-21L
data8 0xFDE429BC9947B2BE, 0x00003FB4 //A14 = +5.25034823750902928092e-23L
data8 0x872137A062C042EF, 0x0000BFB0 //A15 = -1.74651114923000080365e-24L
-// Pol20
+// Pol20
data8 0x8B9B185C6A2659AC, 0x00003FEF //A0 = +1.66423130594825442963e-05L
data8 0x84503AD52588A1E8, 0x0000BFF4 //A1 = -5.04735556466270303549e-04L
data8 0xF26C7C2B566388E1, 0x00003FEE //A2 = +1.44495826764677427386e-05L
data8 0xAB15D69425373A67, 0x0000BFB6 //A13 = -1.41518447770061562822e-22L
data8 0x9EF95456F75B4DF4, 0x00003FB1 //A14 = +4.10938011540250142351e-24L
data8 0x8FADCC45E81433E7, 0x0000BFAC //A15 = -1.16062889679749879834e-25L
-// Pol21
+// Pol21
data8 0xB47A917B0F7B50AE, 0x00003FEF //A0 = +2.15147474240529518138e-05L
data8 0xBB77DC3BA0C937B3, 0x0000BFF3 //A1 = -3.57567223048598672970e-04L
data8 0x90694DFF4EBF7370, 0x00003FEE //A2 = +8.60758700336677694536e-06L
data8 0xF86F9772306F56C1, 0x0000BFB2 //A13 = -1.28438352359240135735e-23L
data8 0xC32F6FEEDE86528E, 0x00003FAD //A14 = +3.15338862172962186458e-25L
data8 0x9534ED189744D7D4, 0x0000BFA8 //A15 = -7.53301543611470014315e-27L
-// Pol22
+// Pol22
data8 0xCBA0A2DB94A2C494, 0x00003FEF //A0 = +2.42742878212752702946e-05L
data8 0x84C089154A49E0E8, 0x0000BFF3 //A1 = -2.53204520651046300034e-04L
data8 0xABF5665BD0D8B0CD, 0x00003FED //A2 = +5.12476542947092361490e-06L
data8 0xB28C15C117CC604F, 0x0000BFAF //A13 = -1.15383428132352407085e-24L
data8 0xECB8428626DA072C, 0x00003FA9 //A14 = +2.39025879246942839796e-26L
data8 0x98B731BCFA2CE2B2, 0x0000BFA4 //A15 = -4.81885474332093262902e-28L
-// Pol23
+// Pol23
data8 0xC6D013811314D31B, 0x00003FED //A0 = +5.92508308918577687876e-06L
data8 0xBBF3057B8DBACBCF, 0x0000BFF2 //A1 = -1.79242422493281965934e-04L
data8 0xCCADECA501162313, 0x00003FEC //A2 = +3.04996061562356504918e-06L
data8 0xFEF9ED74D4F4C9B0, 0x0000BFAB //A13 = -1.02984099170876754831e-25L
data8 0x8E6F410068C12043, 0x00003FA6 //A14 = +1.79777721804459361762e-27L
data8 0x9AE2F6705481630E, 0x0000BFA0 //A15 = -3.05459905177379058768e-29L
-// Pol24
+// Pol24
data8 0xD2D858D5B01C9434, 0x00003FEE //A0 = +1.25673476165670766128e-05L
data8 0x8505330F8B4FDE49, 0x0000BFF2 //A1 = -1.26858053564784963985e-04L
data8 0xF39171C8B1D418C2, 0x00003FEB //A2 = +1.81472407620770441249e-06L
data8 0xB550CEA09DA96F44, 0x0000BFA8 //A13 = -9.15410112414783078242e-27L
data8 0xAA9149317996F32F, 0x00003FA2 //A14 = +1.34554050666508391264e-28L
data8 0x9C3008EFE3F52F19, 0x0000BF9C //A15 = -1.92516125328592532359e-30L
-// Pol25
+// Pol25
data8 0xA68E78218806283F, 0x00003FEF //A0 = +1.98550844852103406280e-05L
data8 0xBC41423996DC8A37, 0x0000BFF1 //A1 = -8.97669395268764751516e-05L
data8 0x90E55AE31A2F8271, 0x00003FEB //A2 = +1.07955871580069359702e-06L
data8 0x8098FA125C18D8DB, 0x0000BFA5 //A13 = -8.11564737276592661642e-28L
data8 0xCB9E4D5C08923227, 0x00003F9E //A14 = +1.00391606269366059664e-29L
data8 0x9CEC3BF7A0BE2CAF, 0x0000BF98 //A15 = -1.20888920108938909316e-31L
-// Pol26
+// Pol26
data8 0xC17AB25E269272F7, 0x00003FEE //A0 = +1.15322640047234590651e-05L
data8 0x85310509E633FEF2, 0x0000BFF1 //A1 = -6.35106483144690768696e-05L
data8 0xAC5E4C4DCB2D940C, 0x00003FEA //A2 = +6.42122148740412561597e-07L
data8 0xB61C8A29D98F24C0, 0x0000BFA1 //A13 = -7.18303147470398859453e-29L
data8 0xF296F69FE45BDA7D, 0x00003F9A //A14 = +7.47537230021540031251e-31L
data8 0x9D4B25BF6FB7234B, 0x0000BF94 //A15 = -7.57340869663212138051e-33L
-// Pol27
+// Pol27
data8 0xC7772CC326D6FBB8, 0x00003FEE //A0 = +1.18890718679826004395e-05L
data8 0xE0F9D5410565D55D, 0x0000BFF0 //A1 = -5.36384368533203585378e-05L
data8 0x85C0BE825680E148, 0x00003FEA //A2 = +4.98268406609692971520e-07L
.section .text
GLOBAL_LIBM_ENTRY(erfcl)
-
+
{ .mfi
alloc r32 = ar.pfs, 0, 36, 4, 0
fma.s1 FR_Tmp = f1, f1, f8 // |x|+1, if x >= 0
{ .mfi
nop.m 0
fnma.s1 FR_norm_x = f8, f8, f0 //high bits for -x^2
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
(p6) fma.s1 FR_AbsArg = f1, f0, f8 // |x|, if x >= 0
- nop.i 0
+ nop.i 0
}
{ .mfi
setf.d FR_RSHF_2TO51 = GR_rshf_2to51 //const 1.10 * 2^(63+51)
fclass.m p10,p0 = f8, 0x21 // p10: x = +inf
mov GR_exp_bias = 0x0FFFF // Set exponent bias
}
-{ .mlx
+{ .mlx
setf.d FR_RSHF = GR_rshf // Right shift const 1.1000 * 2^63
- movl GR_ERFC_XC_TB = 0x650
+ movl GR_ERFC_XC_TB = 0x650
}
;;
{ .mfi
ldfpd FR_POS_ARG_ASYMP,FR_NEG_ARG_ASYMP = [GR_ad_Arg], 16
(p7) fma.s1 FR_Tmp = FR_Tmp1, FR_Tmp1, f0 // (|x|+1)^2, x<0
- mov GR_0x1 = 0x1
+ mov GR_0x1 = 0x1
}
;;
-//p8: y = 1.0, x = 0.0,quick exit
+//p8: y = 1.0, x = 0.0,quick exit
{ .mfi
ldfpd FR_dx,FR_dx1 = [GR_ad_Arg], 16
fclass.m p9,p0 = f8, 0x22 // p9: x = -inf
}
{ .mfb
- nop.m 0
-(p8) fma.s0 f8 = f1, f1, f0
-(p8) br.ret.spnt b0
+ nop.m 0
+(p8) fma.s0 f8 = f1, f1, f0
+(p8) br.ret.spnt b0
}
;;
{ .mfi
- ldfe FR_UnfBound = [GR_ad_Arg], 16
+ ldfe FR_UnfBound = [GR_ad_Arg], 16
fclass.m p11,p0 = f8, 0xc3 // p11: x = nan
- mov GR_BIAS = 0x0FFFF
+ mov GR_BIAS = 0x0FFFF
}
{ .mfi
nop.m 0
{ .mfi
add GR_ad_C = 0x20, GR_ad_Arg // Point to C table
- nop.f 0
+ nop.f 0
add GR_ad_T1 = 0x50, GR_ad_Arg // Point to T1 table
}
{ .mfi
add GR_ad_T2 = 0x150, GR_ad_Arg // Point to T2 table
- nop.f 0
+ nop.f 0
add GR_ERFC_XC_TB = GR_ERFC_XC_TB, GR_ad_Arg //poin.to XB_TBL
}
;;
// p9: y = 2.0, x = -inf, quick exit
{ .mfi
- sub GR_mBIAS = r0, GR_BIAS
+ sub GR_mBIAS = r0, GR_BIAS
fma.s1 FR_2 = f1, f1, f1
- nop.i 0
+ nop.i 0
}
{ .mfb
ldfe FR_L_lo = [GR_ad_Arg],16 // Get L_lo
-(p9) fma.s0 f8 = f1, f1, f1
-(p9) br.ret.spnt b0
+(p9) fma.s0 f8 = f1, f1, f1
+(p9) br.ret.spnt b0
}
;;
fma.s1 FR_N_signif = FR_norm_x, FR_INV_LN2_2TO63, FR_RSHF_2TO51
and GR_exp_x = GR_signexp_x, GR_exp_mask
}
-{ .mfb
+{ .mfb
adds GR_ERFC_S_TB = 0x1C0, GR_ERFC_XC_TB // pointer to S_TBL
-(p10) fma.s0 f8 = f0, f1, f0
-(p10) br.ret.spnt b0
+(p10) fma.s0 f8 = f0, f1, f0
+(p10) br.ret.spnt b0
}
;;
// p11: y = x, x = nan, quick exit
{ .mfi
ldfe FR_C3 = [GR_ad_C],16 // Get C3 for normal path
- fcmp.lt.s1 p12,p0 = FR_AbsArg, FR_ch_dx
+ fcmp.lt.s1 p12,p0 = FR_AbsArg, FR_ch_dx
shl GR_ShftPi_bias = GR_BIAS, 8 // BIAS * 256
}
{ .mfb
sub GR_exp_x = GR_exp_x, GR_exp_bias // Get exponent
(p11) fma.s0 f8 = f8, f1, f0
-(p11) br.ret.spnt b0
+(p11) br.ret.spnt b0
}
;;
{ .mfi
ldfe FR_C1 = [GR_ad_C],16 // Get C1 for main path
(p6) fcmp.gt.unc.s1 p15,p0 = FR_AbsArg, FR_POS_ARG_ASYMP // p6: x >= 0
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
(p7) fcmp.gt.unc.s1 p14,p0 = FR_AbsArg,FR_NEG_ARG_ASYMP // p7: x < 0
shladd GR_ShftXBi_bias = GR_mBIAS, 4, r0
}
-;;
+;;
{ .mfi
nop.m 0
}
{ .mfi
nop.m 0
- fms.s1 FR_float_N = FR_N_signif, FR_2TOM51, FR_RSHF
+ fms.s1 FR_float_N = FR_N_signif, FR_2TOM51, FR_RSHF
nop.i 0
}
;;
// p14: y ~=~ 2, x < -6.5,quick exit
{ .mfi
getf.exp GR_IndxPlusBias = FR_Tmp // exp + bias for (|x|+1)^4
- fcmp.lt.s1 p8,p0 = FR_NormX,FR_UnfBound
+ fcmp.lt.s1 p8,p0 = FR_NormX,FR_UnfBound
nop.i 0
}
{ .mfb
nop.m 0
(p14) fnma.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,FR_2
-(p14) br.ret.spnt b0
+(p14) br.ret.spnt b0
}
;;
// p15: y ~=~ 0.0 (result with underflow error), x > POS_ARG_ASYMP = 107.0,
-// call __libm_error_region
+// call __libm_error_region
{ .mfb
(p15) mov GR_Parameter_TAG = 207
-(p15) fma.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0
-(p15) br.cond.spnt __libm_error_region
+(p15) fma.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0
+(p15) br.cond.spnt __libm_error_region
}
;;
getf.sig GR_N_fix = FR_N_signif // Get N from significand
nop.f 0
shl GR_ShftPi = GR_IndxPlusBias, 8
-
+
}
{ .mfi
shladd GR_ShftXBi = GR_IndxPlusBias, 4, GR_ShftXBi_bias
fma.s1 FR_Xpdx_hi = FR_AbsArg, f1, FR_dx // x + dx
add GR_ShftA14 = 0xE0, GR_ShftPi // pointer shift for A14
-
+
}
{ .mfi
ldfe FR_S = [GR_ERFC_S_TB]
}
;;
-{ .mfi
+{ .mfi
ldfe FR_A14 = [GR_P_POINT_1], -32
- nop.f 0
+ nop.f 0
extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2
}
-{ .mfi
+{ .mfi
ldfe FR_A15 = [GR_P_POINT_2], -32
- nop.f 0
+ nop.f 0
shladd GR_ad_W1 = GR_M1,3,GR_ad_W1 // Point to W1
}
;;
nop.f 0
shladd GR_ad_T1 = GR_M1,2,GR_ad_T1 // Point to T1
}
-;;
+;;
{ .mfi
ldfe FR_A8 = [GR_P_POINT_1], 32
shladd GR_ad_W2 = GR_M2,3,GR_ad_W2 // Point to W2
}
;;
-
+
{ .mfi
- ldfe FR_A10 = [GR_P_POINT_1], -96
+ ldfe FR_A10 = [GR_P_POINT_1], -96
nop.f 0
shladd GR_ad_T2 = GR_M2,2,GR_ad_T2 // Point to T2
}
{ .mfi
- ldfe FR_A11 = [GR_P_POINT_2], -96
+ ldfe FR_A11 = [GR_P_POINT_2], -96
fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r //r = -L_lo*float_N + r
nop.i 0
}
-;;
+;;
-{ .mfi
+{ .mfi
ldfe FR_A4 = [GR_P_POINT_1], 32
(p10) fms.s1 FR_Tmp = FR_dx,f1, FR_Xpdx_hi //for lo of x+dx, x<=dx
nop.i 0
}
-{ .mfi
+{ .mfi
ldfe FR_A5 = [GR_P_POINT_2], 32
(p9) fms.s1 FR_Tmp = FR_AbsArg, f1, FR_Xpdx_hi //for lo of x+dx, x>dx
nop.i 0
}
;;
-{ .mfi
- ldfe FR_A6 = [GR_P_POINT_1], -64
+{ .mfi
+ ldfe FR_A6 = [GR_P_POINT_1], -64
frcpa.s1 FR_U,p11 = f1, FR_Xpdx_hi // hi of 1 /(x + dx)
nop.i 0
}
-{ .mfi
- ldfe FR_A7 = [GR_P_POINT_2], -64
+{ .mfi
+ ldfe FR_A7 = [GR_P_POINT_2], -64
nop.f 0
nop.i 0
}
{ .mfi
ldfe FR_A2 = [GR_P_POINT_1], -32
- nop.f 0
- nop.i 0
+ nop.f 0
+ nop.i 0
}
{ .mfi
ldfe FR_A3 = [GR_P_POINT_2], -32
- nop.f 0
- nop.i 0
+ nop.f 0
+ nop.i 0
}
;;
-{ .mfi
+{ .mfi
ldfe FR_A0 = [GR_P_POINT_1], 224
nop.f 0
nop.i 0
{ .mfi
ldfd FR_W1 = [GR_ad_W1],0 // Get W1
- nop.f 0
- nop.i 0
+ nop.f 0
+ nop.i 0
}
{ .mfi
ldfd FR_W2 = [GR_ad_W2],0 // Get W2
fma.s1 FR_poly = FR_r, FR_C3, FR_C2 // poly = r * A3 + A2
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
ldfs FR_T1 = [GR_ad_T1],0 // Get T1
(p10) fma.s1 FR_Xpdx_lo = FR_AbsArg,f1, FR_Tmp//lo of x + dx , x <= dx
- nop.i 0
+ nop.i 0
}
{ .mfi
ldfs FR_T2 = [GR_ad_T2],0 // Get T2
(p9) fma.s1 FR_Xpdx_lo = FR_dx,f1, FR_Tmp // lo of x + dx, x > dx
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
fnma.s1 FR_Tmp1 = FR_Xpdx_hi, FR_U, FR_2 // N-R, iter. N1
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
- nop.i 0
+ nop.i 0
}
;;
nop.i 0
}
{ .mfi
- nop.m 0
- fma.s1 FR_P15_0_1 = FR_A15, FR_LocArg, FR_A14
- nop.i 0
+ nop.m 0
+ fma.s1 FR_P15_0_1 = FR_A15, FR_LocArg, FR_A14
+ nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_P15_1_2 = FR_A13, FR_LocArg, FR_A12
- nop.i 0
+ fma.s1 FR_P15_1_2 = FR_A13, FR_LocArg, FR_A12
+ nop.i 0
}
{ .mfi
nop.m 0
fma.s1 FR_poly = FR_r, FR_poly, FR_C1 // poly = r * poly + A1
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_P15_2_1 = FR_A9, FR_LocArg, FR_A8
- nop.i 0
+ fma.s1 FR_P15_2_1 = FR_A9, FR_LocArg, FR_A8
+ nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_P15_2_2 = FR_A11, FR_LocArg, FR_A10
+ fma.s1 FR_P15_2_2 = FR_A11, FR_LocArg, FR_A10
nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 FR_P15_3_1 = FR_A5, FR_LocArg, FR_A4
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
fma.s1 FR_W = FR_W1, FR_W2, FR_W2 // W = W1 * W2 + W2
nop.i 0
}
-;;
-
+;;
+
{ .mfi
nop.m 0
fmpy.s1 FR_T = FR_T1, FR_T2 // T = T1 * T2
{ .mfi
nop.m 0
fma.s1 FR_T_scale = FR_T, FR_scale, f0 // T_scale = T * scale
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 FR_U = FR_U, FR_Tmp, f0 // N-R, iter. N2
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_M = FR_T_scale, FR_S, f0
- nop.i 0
+ fma.s1 FR_M = FR_T_scale, FR_S, f0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
fnma.s1 FR_Tmp = FR_Xpdx_hi, FR_U, FR_2 // N-R, iter. N3
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
fms.s1 FR_H = FR_W, f1, FR_xsq_lo // H = W - xsq_lo
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s1 FR_U = FR_U, FR_Tmp, f0 // N-R, iter. N3
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_Q = FR_A1, FR_LocArg, FR_Q
+ fma.s1 FR_Q = FR_A1, FR_LocArg, FR_Q
nop.i 0
}
;;
{ .mfi
nop.m 0
fnma.s1 FR_Tmp = FR_Xpdx_hi, FR_U, f1 // for du
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_R = FR_H, FR_poly, FR_poly
- nop.i 0
+ fma.s1 FR_R = FR_H, FR_poly, FR_poly
+ nop.i 0
}
;;
nop.m 0
fma.s1 FR_res_pos_x_hi = FR_M, FR_U, f0 // M *U
nop.i 0
-
+
}
-;;
+;;
{ .mfi
nop.m 0
fma.s1 FR_R = FR_R, f1, FR_H // R = H + P(r) + H*P(r)
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
fma.s0 FR_Tmpf = f8, f1, f0 // flag d
- nop.i 0
+ nop.i 0
}
;;
{ .mfi
nop.m 0
- fnma.s1 FR_dU = FR_Xpdx_lo, FR_U, FR_Tmp
- nop.i 0
+ fnma.s1 FR_dU = FR_Xpdx_lo, FR_U, FR_Tmp
+ nop.i 0
}
;;
// for -6.5 <= x < 0
{ .mfi
nop.m 0
- fms.s1 FR_res_pos_x_lo = FR_M, FR_U, FR_res_pos_x_hi
- nop.i 0
-
+ fms.s1 FR_res_pos_x_lo = FR_M, FR_U, FR_res_pos_x_hi
+ nop.i 0
+
}
{ .mfi
nop.m 0
(p7) fnma.s1 FR_Tmp1 = FR_res_pos_x_hi, f1, FR_2 //p7: x < 0
- nop.i 0
-
+ nop.i 0
+
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_G = FR_R, FR_Q, FR_Q
+ fma.s1 FR_G = FR_R, FR_Q, FR_Q
nop.i 0
-
+
}
;;
nop.m 0
fma.s1 FR_Tmp = FR_R, f1, FR_dU // R + du
nop.i 0
-
+
}
;;
{ .mfi
nop.m 0
(p7) fnma.s1 FR_Tmp2 = FR_Tmp1, f1, FR_2 //p7: x < 0
- nop.i 0
-
+ nop.i 0
+
}
;;
{ .mfi
nop.m 0
- fma.s1 FR_G = FR_G, f1, FR_Tmp
+ fma.s1 FR_G = FR_G, f1, FR_Tmp
nop.i 0
-
+
}
;;
{ .mfi
nop.m 0
(p7) fnma.s1 FR_Tmp2 = FR_res_pos_x_hi, f1, FR_Tmp2 //p7: x < 0
- nop.i 0
-
+ nop.i 0
+
}
;;
nop.m 0
fma.s1 FR_V = FR_G, FR_res_pos_x_hi, f0 // V = G * M *U
nop.i 0
-
+
}
;;
{ .mfi
nop.m 0
(p7) fma.s1 FR_res_pos_x_lo = FR_res_pos_x_lo, f1, FR_V //p7: x < 0
- nop.i 0
-
+ nop.i 0
+
}
;;
{ .mfi
nop.m 0
(p7) fnma.s1 FR_Tmp2 = FR_res_pos_x_lo, f1, FR_Tmp2 //p7: x < 0
- nop.i 0
-
+ nop.i 0
+
}
;;
-//p6: result for 0 < x < = POS_ARG_ASYMP
+//p6: result for 0 < x < = POS_ARG_ASYMP
//p7: result for - NEG_ARG_ASYMP <= x < 0
//p8: exit for - NEG_ARG_ASYMP <= x < UnfBound
-ERFC_RESULT:
+ERFC_RESULT:
.pred.rel "mutex",p6,p7
{ .mfi
nop.m 0
(p6) fma.s0 f8 = FR_M, FR_U, FR_V // p6: x >= 0
- nop.i 0
+ nop.i 0
}
{ .mfb
mov GR_Parameter_TAG = 207
(p7) fma.s0 f8 = FR_Tmp2, f1, FR_Tmp1 // p7: x < 0
-(p8) br.ret.sptk b0
+(p8) br.ret.sptk b0
};;
GLOBAL_LIBM_END(erfcl)
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfe [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
+.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
stfe [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// 5. |x| = INF
// Return erff(x) = sign(x) * 1.0
//
-// 6. x = [S,Q]NaN
+// 6. x = [S,Q]NaN
// Return erff(x) = QNaN
//
// 7. x is positive denormal
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f32 -> f59
-// General registers used:
+// General registers used:
// r32 -> r45, r2, r3
// Predicate registers used:
data8 0x40312115B0932F24 // D0
data8 0xC0160D6CD0991EA3 // D1
data8 0xBFE04A567A6DBE4A // D2
-data8 0xBF4207BC640D1509 // B0
+data8 0xBF4207BC640D1509 // B0
// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5
data8 0x3F90849356383F58 // C0
data8 0x3F830BD5BA240F09 // C1
data8 0x406EFF5F088CEC4B // D1
data8 0xC03A5743DF38FDE0 // D2
data8 0xBEE397A9FA5686A2 // B0
-// Polynomial coefficients for the erf(x), -0.125 < x < 0.125
+// Polynomial coefficients for the erf(x), -0.125 < x < 0.125
data8 0x3FF20DD7504270CB // C0
data8 0xBFD8127465AFE719 // C1
data8 0x3FBCE2D77791DD77 // C2
;;
{ .mfi
- getf.s rArg = f8 // x in GR
+ getf.s rArg = f8 // x in GR
fclass.m p7,p0 = f8, 0x0b // is x denormal ?
// sign bit and 2 most bits in significand
- shl rMask = rMask, 20
+ shl rMask = rMask, 20
}
{ .mfi
ld8 rDataPtr = [rDataPtr]
{ .mfi
andcm rOffset2 = rArg, rMask
fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
- shl rBound = rBound, 20 // 0.125f in GR
+ shl rBound = rBound, 20 // 0.125f in GR
}
{ .mfb
andcm rAbsArg = rArg, rSignBit // |x| in GR
shr rOffset2 = rOffset2, 21
}
{ .mfi
- cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.125?
+ cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.125?
nop.f 0
adds rCoeffAddr3 = 16, rDataPtr
}
{ .mfi
shladd rCoeffAddr1 = rBias, 4, rDataPtr
fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
- // is |x| < 4.0?
- cmp.lt p11, p12 = rAbsArg, rSaturation
+ // is |x| < 4.0?
+ cmp.lt p11, p12 = rAbsArg, rSaturation
}
{ .mfi
shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3
{ .mfi
(p11) ldfpd fC0, fC1 = [rCoeffAddr1]
(p9) fmerge.s f8 = f8,f1 // +/- inf
-(p12) adds rDataPtr = 512, rDataPtr
+(p12) adds rDataPtr = 512, rDataPtr
}
{ .mfb
(p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16
{ .mfi
nop.m 0
- fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
+ fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
nop.i 0
}
{ .mfi
{ .mfi
nop.m 0
// C3*|x|^3 + C2*x^2 + C1*|x| + C0
- fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
+ fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
nop.i 0
}
;;
{ .mfi
nop.m 0
// PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
- fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
+ fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
nop.i 0
}
;;
{ .mfi
nop.m 0
- // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
- fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
+ // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
+ fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
- // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
- fma.d.s1 fPolC = fPolC, f1, fB0
+ // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
+ fma.d.s1 fPolC = fPolC, f1, fB0
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
(p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
br.ret.sptk b0 // Exit for 4.0 <=|x|< +inf
}
;;
-
+
// Here if x is single precision denormal
erff_denormal:
{ .mfi
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//
// 3. Main path: 1/8 <= |x| < 6.53
// For several ranges of 1/8 <= |x| < 6.53
-// Return erfl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
+// Return erfl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
// + y^3*A3 + y^4*A4 + ... + y^25*A25 )
// where y = (|x|/a) - b
//
// 4.0 <= |x| < 6.53 a = 4.0, b = 1.5
// ( [3.25;4.0] subrange separated for monotonicity issues resolve )
//
-// 4. Saturation path: 6.53 <= |x| < +INF
+// 4. Saturation path: 6.53 <= |x| < +INF
// Return erfl(x) = sign(x)*(1.0 - tiny_value)
// (tiny_value ~ 1e-1233)
//
// Multiprecision have to be performed only for first few
// polynomial iterations (up to 3-rd x degree)
// Here we use the same parallelisation way as above:
-// Split whole polynomial to first, "multiprecision" part, and second,
+// Split whole polynomial to first, "multiprecision" part, and second,
// so called "tail", native precision part.
//
-// 1) Multiprecision part:
+// 1) Multiprecision part:
// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
// v1 and v2 terms calculated in parallel
//
// v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 )
// v3 is splitted to 2 even parts (10 coefficient in each one).
// These 2 parts are also factorized using binary tree technique.
-//
+//
// So Multiprecision and Tail parts cost is almost the same
// and we have both results ready before final summation.
//
-// 4. Saturation path: 6.53 <= |x| < +INF
+// 4. Saturation path: 6.53 <= |x| < +INF
//
// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
// just to meet IEEE requirements for different rounding modes in this case.
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8 - input & output
// f32 -> f90
-// General registers used:
-// r2, r3, r32 -> r52
+// General registers used:
+// r2, r3, r32 -> r52
// Predicate registers used:
// p0, p6 -> p11, p14, p15
fA17 = f52
fA18 = f53
fA19 = f54
-fA20 = f55
-fA21 = f56
-fA22 = f57
+fA20 = f55
+fA21 = f56
+fA22 = f57
fA23 = f58
fA24 = f59
fA25 = f60
fRes3L = f80
fRes4 = f81
-fTT = f82
+fTT = f82
fTH = f83
fTL = f84
-fTT2 = f85
+fTT2 = f85
fTH2 = f86
fTL2 = f87
LOCAL_OBJECT_START(erfl_data)
////////// Main tables ///////////
_0p125_to_0p25_data: // exp = 2^-3
-// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
+// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
data8 0xACD9ED470F0BB048, 0x0000BFF4 //A3 = -6.5937529303909561891162915809e-04
data8 0xBF6A254428DDB452 //A2H = -3.1915980570631852578089571182e-03
data8 0xBC131B3BE3AC5079 //A2L = -2.5893976889070198978842231134e-19
LOCAL_OBJECT_END(erfl_data)
LOCAL_OBJECT_START(_0p25_to_0p5_data)
-// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
+// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
data8 0xF083628E8F7CE71D, 0x0000BFF6 //A3 = -3.6699405305266733332335619531e-03
data8 0xBF978749A434FE4E //A2H = -2.2977018973732214746075186440e-02
data8 0xBC30B3FAFBC21107 //A2L = -9.0547407100537663337591537643e-19
LOCAL_OBJECT_END(_0p25_to_0p5_data)
LOCAL_OBJECT_START(_0p5_to_1_data)
-// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
+// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
data8 0xDB742C8FB372DBE0, 0x00003FF6 //A3 = 3.3485993187250381721535255963e-03
data8 0xBFBEDC5644353C26 //A2H = -1.2054957547410136142751468924e-01
data8 0xBC6D7215B023455F //A2L = -1.2770012232203569059818773287e-17
LOCAL_OBJECT_END(_0p5_to_1_data)
LOCAL_OBJECT_START(_1_to_2_data)
-// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
+// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
data8 0x8E15015F5B55BEAC, 0x00003FFC //A3 = 1.3875200409423426678618977531e-01
data8 0xBFC6D5A95D0A1B7E //A2H = -1.7839543383544403942764233761e-01
data8 0xBC7499F704C80E02 //A2L = -1.7868888188464394090788198634e-17
LOCAL_OBJECT_END(_1_to_2_data)
LOCAL_OBJECT_START(_2_to_3p25_data)
-// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
+// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
data8 0xCEDBA58E8EE6F055, 0x00003FF7 //A3 = 6.3128050215859026984338771121e-03
data8 0xBF5B60D5E974CBBD //A2H = -1.6710366233609740427984435840e-03
data8 0xBC0E11E2AEC18AF6 //A2L = -2.0376133202996259839305825162e-19
LOCAL_OBJECT_END(_2_to_3p25_data)
LOCAL_OBJECT_START(_4_to_6p53_data)
-// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
+// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
data8 0xDF3151BE8652827E, 0x00003FD5 //A3 = 3.9646979666953349095427642209e-13
data8 0xBD1C4A9787DF888B //A2H = -2.5127788450714750484839908889e-14
data8 0xB99B35483E4603FD //A2L = -3.3536613901268985626466020210e-31
LOCAL_OBJECT_END(_4_to_6p53_data)
LOCAL_OBJECT_START(_3p25_to_4_data)
-// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
+// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
data8 0xB01D29846286CE08, 0x00003FEE //A3 = 1.0497207328743021499800978059e-05
data8 0xBEC10B1488AEB234 //A2H = -2.0317175474986489113480084279e-06
data8 0xBB7F19701B8B74F9 //A2L = -4.1159669348226960337518214996e-22
//////// "Tail" tables //////////
LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
-// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
+// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
data8 0x93086CBD21ED3962, 0x00003FCA //A13 = 1.2753071968462837024755878679e-16
data8 0x83CB5045A6D4B419, 0x00003FCF //A12 = 3.6580237062957773626379648530e-15
data8 0x8FCDB723209690EB, 0x0000BFD3 //A11 = -6.3861616307180801527566117146e-14
LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
-// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
+// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
data8 0x8CEAC59AF361B78A, 0x0000BFD6 //A13 = -5.0063802958258679384986669123e-13
data8 0x9BC67404F348C0CE, 0x00003FDB //A12 = 1.7709590771868743572061278273e-11
data8 0xF4B5D0348AFAAC7A, 0x00003FDB //A11 = 2.7820329729584630464848160970e-11
LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
LOCAL_OBJECT_START(_0p5_to_1_data_tail)
-// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
+// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
data8 0x9ED99EDF111CB785, 0x0000BFE4 //A13 = -9.2462916180079278241704711522e-09
data8 0xDEAF7539AE2FB062, 0x0000BFE5 //A12 = -2.5923990465973151101298441139e-08
data8 0xA392D5E5CC9DB1A7, 0x00003FE9 //A11 = 3.0467952847327075747032372101e-07
LOCAL_OBJECT_END(_0p5_to_1_data_tail)
LOCAL_OBJECT_START(_1_to_2_data_tail)
-// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
+// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
data8 0x969EAC5C7B46CAB9, 0x00003FEF //A13 = 1.7955281439310148162059582795e-05
data8 0xA2ED832912E9FCD9, 0x00003FF1 //A12 = 7.7690020847111408916570845775e-05
data8 0x85677C39C48E43E7, 0x0000BFF3 //A11 = -2.5444839340796031538582511806e-04
LOCAL_OBJECT_END(_1_to_2_data_tail)
LOCAL_OBJECT_START(_2_to_3p25_data_tail)
-// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
+// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
data8 0x847C24DAC7C7558B, 0x00003FF5 //A13 = 1.0107798565424606512130100541e-03
data8 0xCB6340EAF02C3DF8, 0x00003FF8 //A12 = 1.2413800617425931997420375435e-02
data8 0xB5163D252DBBC107, 0x0000BFF9 //A11 = -2.2105330871844825370020459523e-02
LOCAL_OBJECT_END(_2_to_3p25_data_tail)
LOCAL_OBJECT_START(_4_to_6p53_data_tail)
-// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
+// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
data8 0xD8235ABF08B8A6D1, 0x00003FEE //A13 = 1.2882834877224764938429832586e-05
data8 0xAEDF44F9C77844C2, 0x0000BFEC //A12 = -2.6057980393716019511497492890e-06
data8 0xCCD5490956A4FCFD, 0x00003FEA //A11 = 7.6306293047300300284923464089e-07
LOCAL_OBJECT_END(_4_to_6p53_data_tail)
LOCAL_OBJECT_START(_3p25_to_4_data_tail)
-// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
+// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
data8 0x95BE1BEAD738160F, 0x00003FF2 //A13 = 1.4280568455209843005829620687e-04
data8 0x8108C8FFAC0F0B21, 0x0000BFF4 //A12 = -4.9222685622046459346377033307e-04
data8 0xD72A7FAEE7832BBE, 0x00003FF4 //A11 = 8.2079319302109644436194651098e-04
LOCAL_OBJECT_START(_0_to_1o8_data)
-// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.125
+// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.125
data8 0x3FF20DD750429B6D, 0x3C71AE3A8DDFFEDE //A1H, A1L
data8 0xF8B0DACE42525CC2, 0x0000BFEE //A15
data8 0xFCD02E1BF0EC2C37, 0x00003FF1 //A13
GLOBAL_LIBM_ENTRY(erfl)
{ .mfi
- alloc r32 = ar.pfs, 0, 21, 0, 0
+ alloc r32 = ar.pfs, 0, 21, 0, 0
fmerge.se fArgAbsNorm = f1, f8 // normalized x (1.0 <= x < 2.0)
addl rSignBit = 0x20000, r0 // Set sign bit for exponent
}
{ .mfi
getf.exp rArgExp = f8 // Get arg exponent
- fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
+ fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
// 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
- addl rBias = 0xfffc, r0 // Value to subtract from exp
+ addl rBias = 0xfffc, r0 // Value to subtract from exp
// to get actual interval number
}
{ .mfi
ld8 rDataPtr = [rDataPtr] // Get real common data pointer
fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path)
- addl r2to4 = 0x10000, r0 // unbiased exponent
+ addl r2to4 = 0x10000, r0 // unbiased exponent
// for [2;4] binary interval
};;
{ .mfi
- getf.sig rArgSig = f8 // Get arg significand
+ getf.sig rArgSig = f8 // Get arg significand
fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive?
addl rSaturation = 0xd0e, r0 // First 12 bits of
// saturation value signif.
}
{ .mfi
- setf.d f1p5 = r1p5 // 1.5 construction
+ setf.d f1p5 = r1p5 // 1.5 construction
fma.s1 f2p0 = f1,f1,f1 // 2.0 construction
addl r3p25Sign = 0xd00, r0 // First 12 bits of
// 3.25 value signif.
{ .mfi
sub rInterval = rArgExp, rBias // Get actual interval number
nop.f 0
- shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign.
+ shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign.
}
{ .mfi
adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
};;
{ .mfi
-(p8) cmp.le p8, p10 = r3p25Sign, rArgSig // If sign. is greater
+(p8) cmp.le p8, p10 = r3p25Sign, rArgSig // If sign. is greater
// than or equal to 1.625 (= 3.25/2)? (means arg is in [3.25;4] interval)
nop.f 0
- shl rOffset = rInterval, 8 // Make offset from
+ shl rOffset = rInterval, 8 // Make offset from
// interval number
}
{ .mfi
};;
{ .mfi
-(p8) adds rOffset = 0x200, rOffset // Add additional offset
+(p8) adds rOffset = 0x200, rOffset // Add additional offset
// if arg is in [3.25;4] (another data set)
fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path)
shl rTailOffset = rInterval, 7 // Make offset to "tail" data
// from interval number
}
{ .mib
- setf.exp fTiny = rTiny // Construct "tiny" value
+ setf.exp fTiny = rTiny // Construct "tiny" value
// for saturation path
cmp.ltu p11, p0 = 0x5, rInterval // if arg > 8
-(p9) br.cond.spnt _0_to_1o8
+(p9) br.cond.spnt _0_to_1o8
};;
{ .mfi
- add rAddr1 = rDataPtr, rOffset // Get address for
- // interval data
+ add rAddr1 = rDataPtr, rOffset // Get address for
+ // interval data
nop.f 0
shl rTailAddOffset = rInterval, 5 // Offset to interval
- // "tail" data
+ // "tail" data
}
{ .mib
add rAddr2 = rShiftedDataPtr, rOffset // Get second
- // address for interval data
-(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
+ // address for interval data
+(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
// in [6.53;8] interval
(p11) br.cond.spnt _saturation // Branch to Saturation path
};;
.pred.rel "mutex",p8,p10
{ .mfi
ldfe fA18 = [rAddr1], 16 // Load A18
-(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0
+(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0
// to normalized arg (for [3.25;4] interval)
adds rTailAddr2 = 0x10, rTailAddr1 // First tail
// data address
}
{ .mfi
- ldfe fA25 = [rAddr2], 16 // Load A25
-(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5
+ ldfe fA25 = [rAddr2], 16 // Load A25
+(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5
// to normalized arg
nop.i 0
};;
fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail
+ fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail
nop.i 0
};;
fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail
nop.i 0
fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0
nop.i 0
fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail
nop.i 0
fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail
nop.i 0
fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail
nop.i 0
fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1L = fA0H, f1, fRes1H // A1*x+A0
nop.i 0
fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail
nop.i 0
fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail
nop.i 0
fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
nop.i 0
fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail
nop.i 0
fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail
nop.i 0
{ .mfi
nop.m 0
- fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of
+ fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of
// polynomial tail
nop.i 0
};;
fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fResH = fRes2H, f1, fRes1H // High result
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fResL = fRes1H, f1, fResH // Low result
nop.i 0
fma.s1 fRes1L = fRes1L, f1, fRes2L // Low result
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fResL = fResL, f1, fRes2H // Low result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fneg fResH = fResH // Invert high result if arg is neg.
nop.i 0
};;
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fma.s0 f8 = fResH, f1, fResL // Add high and low results
nop.i 0
}
-{ .mfb
+{ .mfb
nop.m 0
(p15) fms.s0 f8 = fResH, f1, fResL // Add high and low results
br.ret.sptk b0 // Main path return
_saturation:
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
(p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1
br.ret.sptk b0 // Saturation path return
// 0, denormals and special IEEE numbers path /////////////////////////////////
erfl_spec:
-{ .mfi
+{ .mfi
addl rDataPtr = 0xBE0, rDataPtr // Ptr to denormals coeffs
fclass.m p6,p0 = f8, 0x23 // To filter infinities
- // 0x23 = @pos|@neg|@inf
+ // 0x23 = @pos|@neg|@inf
nop.i 0
};;
-{ .mfi
+{ .mfi
ldfpd fA1H, fA1L = [rDataPtr] // Load denormals coeffs A1H, A1L
fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
// 0xC7 = @pos|@neg|@zero|@qnan|@snan
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
-(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
+(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
(p6) br.ret.spnt b0 // exit for x = INF
};;
-{ .mfb
+{ .mfb
nop.m 0
-(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
+(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
// and NaNs for NaNs
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
};;
-{ .mfi
+{ .mfi
nop.m 0
fnorm.s0 f8 = f8 // Normalize arg
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1H = f8, fA1H, f0 // HighRes
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1L = f8, fA1L, f0 // LowRes
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1Hd = f8, fA1H, fRes1H // HighRes delta
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fRes = fRes1L, f1, fRes1Hd // LowRes+HighRes delta
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fRes = f8, f8, fRes // r=x^2+r
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
fma.s0 f8 = fRes, f1, fRes1H // res = r+ResHigh
br.ret.sptk b0 // 0, denormals, specials return
// 0 < |x| < 1/8 path /////////////////////////////////////////////////////////
_0_to_1o8:
-{ .mmi
+{ .mmi
adds rAddr1 = 0xB60, rDataPtr // Ptr 1 to coeffs
adds rAddr2 = 0xB80, rDataPtr // Ptr 2 to coeffs
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfpd fA1H, fA1L = [rAddr1], 16 // Load A1High, A1Low
ldfe fA13 = [rAddr2], 16 // Load A13
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfe fA15 = [rAddr1], 48 // Load A15
ldfe fA11 = [rAddr2], 32 // Load A11
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfe fA9 = [rAddr1], 32 // Load A9
ldfe fA7 = [rAddr2], 32 // Load A7
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfe fA5 = [rAddr1] // Load A5
ldfe fA3 = [rAddr2] // Load A3
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1H = f8, fA1H, f0 // x*(A1H+A1L)
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1L = f8, fA1L, f0 // x*(A1H+A1L)
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // a^4
+ fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // a^4
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1Hd = f8, fA1H, fRes1H // x*(A1H+A1L) delta
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fArgEight = fArgFour, fArgFour, f0 // a^8
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 f8 = fRes1L, f1, fRes1Hd // x*(A1H+A1L)
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
- fma.s0 f8 = f8, f1, fRes1H // (Polynomial tail)*x^3 +
+ fma.s0 f8 = f8, f1, fRes1H // (Polynomial tail)*x^3 +
// + x*(A1H+A1L)
br.ret.sptk b0 // [0;1/8] interval return
};;
-
+
GLOBAL_LIBM_END(erfl)
// 5. x >= 709.7827
// Result overflows. Set I, O, and call error support
//
-// 6. 2^-2 <= x < 709.7827 or -48.0 <= x < -2^-2
+// 6. 2^-2 <= x < 709.7827 or -48.0 <= x < -2^-2
// This is the main path. The algorithm is described below:
// Take the input x. w is "how many log2/128 in x?"
// Here we know result is essentially -1 + eps, where eps only affects
// rounded result. Set I.
//
-// 5. x >= 88.7228
+// 5. x >= 88.7228
// Result overflows. Set I, O, and call error support
//
-// 6. 2^-2 <= x < 88.7228 or -24.0 <= x < -2^-2
+// 6. 2^-2 <= x < 88.7228 or -24.0 <= x < -2^-2
// This is the main path. The algorithm is described below:
// Take the input x. w is "how many log2/128 in x?"
{ .mfb
nop.m 0
-(p7) fma.s.s0 f8 = fA8765432, fXsq, fNormX // Small path,
+(p7) fma.s.s0 f8 = fA8765432, fXsq, fNormX // Small path,
// result=xsq*A8765432+x
(p7) br.ret.spnt b0 // Exit if 2^-40 <= |x| < 2^-2
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// 04/17/03 Eliminated misplaced and unused data label
// 12/15/03 Eliminated call to error support on expm1l underflow
//
-//*********************************************************************
+//*********************************************************************
//
// Function: Combined expl(x) and expm1l(x), where
-// x
+// x
// expl(x) = e , for double-extended precision x values
// x
// expm1l(x) = e - 1 for double-extended precision x values
//
-//*********************************************************************
+//*********************************************************************
//
// Resources Used:
//
-// Floating-Point Registers: f8 (Input and Return Value)
-// f9-f15,f32-f77
+// Floating-Point Registers: f8 (Input and Return Value)
+// f9-f15,f32-f77
//
-// General Purpose Registers:
+// General Purpose Registers:
// r14-r38
// r35-r38 (Used to pass arguments to error handling routine)
-//
+//
// Predicate Registers: p6-p15
//
-//*********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
-// Denormal fault raised on denormal inputs
+// Denormal fault raised on denormal inputs
// Overflow exceptions raised when appropriate for exp and expm1
// Underflow exceptions raised when appropriate for exp and expm1
// (Error Handling Routine called for overflow and Underflow)
-// Inexact raised when appropriate by algorithm
+// Inexact raised when appropriate by algorithm
//
// exp(inf) = inf
// exp(-inf) = +0
// exp(0) = 1
// exp(EM_special Values) = QNaN
// expm1(inf) = inf
-// expm1(-inf) = -1
+// expm1(-inf) = -1
// expm1(SNaN) = QNaN
// expm1(QNaN) = QNaN
// expm1(0) = 0
// expm1(EM_special Values) = QNaN
-//
-//*********************************************************************
+//
+//*********************************************************************
//
// Implementation and Algorithm Notes:
//
// p6 for exp,
// p7 for expm1,
//
-// On output,
+// On output,
//
// scale*(Y_hi + Y_lo) approximates exp(X) if exp
// scale*(Y_hi + Y_lo) approximates exp(X)-1 if expm1
//
// The accuracy is sufficient for a highly accurate 64 sig.
-// bit implementation. Safe is set if there is no danger of
-// overflow/underflow when the result is composed from scale,
-// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
-// Otherwise, one must prepare to handle the possible exception
-// appropriately. Note that SAFE not set (false) does not mean
+// bit implementation. Safe is set if there is no danger of
+// overflow/underflow when the result is composed from scale,
+// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
+// Otherwise, one must prepare to handle the possible exception
+// appropriately. Note that SAFE not set (false) does not mean
// that overflow/underflow will occur; only the setting of SAFE
// guarantees the opposite.
//
-// **** High Level Overview ****
+// **** High Level Overview ****
//
// The method consists of three cases.
-//
+//
// If |X| < Tiny use case exp_tiny;
// else if |X| < 2^(-m) use case exp_small; m=12 for exp, m=7 for expm1
// else use case exp_regular;
//
// Case exp_tiny:
//
-// 1 + X can be used to approximate exp(X)
+// 1 + X can be used to approximate exp(X)
// X + X^2/2 can be used to approximate exp(X) - 1
//
// Case exp_small:
//
-// Here, exp(X) and exp(X) - 1 can all be
+// Here, exp(X) and exp(X) - 1 can all be
// approximated by a relatively simple polynomial.
//
// This polynomial resembles the truncated Taylor series
// r := (X - N*L_hi) - N*L_lo
//
// We pick L_hi such that N*L_hi is representable in 64 sig. bits
-// and thus the FMA X - N*L_hi is error free. So r is the
-// 1 rounding error from an exact reduction with respect to
-//
+// and thus the FMA X - N*L_hi is error free. So r is the
+// 1 rounding error from an exact reduction with respect to
+//
// L_hi + L_lo.
//
// In particular, L_hi has 30 significant bits and can be stored
// Step 2: Approximation
//
// exp(r) - 1 is approximated by a short polynomial of the form
-//
+//
// r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
//
-// Step 3: Composition from Table Values
+// Step 3: Composition from Table Values
//
// The value 2^( N / 2^12 ) can be composed from a couple of tables
// of precalculated values. First, express N as three integers
// lsb's, M_1 is the next 6, and K is simply N shifted right
// arithmetically (sign extended) by 12 bits.
//
-// Now, 2^( N / 2^12 ) is simply
-//
+// Now, 2^( N / 2^12 ) is simply
+//
// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
//
// Clearly, 2^K needs no tabulation. The other two values are less
// Define two mathematical values, delta_1 and delta_2, implicitly
// such that
//
-// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
+// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
// T_2 = exp( [M_2 log(2)/2^12] - delta_2 )
//
// are representable as 24 significant bits. To illustrate the idea,
-// we show how we define delta_1:
+// we show how we define delta_1:
//
// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
-// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
+// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
//
// The last equality means mathematical equality. We then tabulate
//
// T and W via
//
// T := T_1 * T_2 ...exactly
-// W := W_1 + (1 + W_1)*W_2
+// W := W_1 + (1 + W_1)*W_2
//
// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2.
// The mathematical product of T and (W+1) is an accurate representation
//
// Step 4. Reconstruction
//
-// Finally, we can reconstruct exp(X), exp(X) - 1.
+// Finally, we can reconstruct exp(X), exp(X) - 1.
// Because
//
-// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
+// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
// + (M_2*log(2)/2^12 - delta_2)
// + delta_1 + delta_2 + r ...accurately
// We have
//
// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] )
-// ~=~ 2^K * ( T + T*[(exp(delta)-1)
+// ~=~ 2^K * ( T + T*[(exp(delta)-1)
// + exp(delta)*(exp(r)-1)] )
// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
// ~=~ 2^K * ( Y_hi + Y_lo )
// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
//
-// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
+// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
// numbers Y_hi + Y_lo carefully.
//
// **** Algorithm Details ****
//
// Case exp_tiny:
//
-// The important points are to ensure an accurate result under
-// different rounding directions and a correct setting of the SAFE
+// The important points are to ensure an accurate result under
+// different rounding directions and a correct setting of the SAFE
// flag.
//
// If expm1 is 1, then
// Here we compute a simple polynomial. To exploit parallelism, we split
// the polynomial into several portions.
//
-// Let r = X
+// Let r = X
//
// If exp ...i.e. exp( argument )
//
-// rsq := r * r;
+// rsq := r * r;
// r4 := rsq*rsq
// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
// poly_hi := r + rsq*(P_1 + r*P_2)
GR_Parameter_X = r35
GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
-GR_Parameter_TAG = r38
+GR_Parameter_TAG = r38
// Floating Point Registers
//
// double-extended 1/ln(2)
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
-// 3fff b8aa 3b29 5c17 f0bc
+// 3fff b8aa 3b29 5c17 f0bc
// For speed the significand will be loaded directly with a movl and setf.sig
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
// computations need to scale appropriately.
-// The constant 2^12/ln(2) is needed for the computation of N. This is also
+// The constant 2^12/ln(2) is needed for the computation of N. This is also
// obtained by scaling the computations.
//
-// Two shifting constants are loaded directly with movl and setf.d.
-// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
// This constant is added to x*1/ln2 to shift the integer part of
// x*2^12/ln2 into the rightmost bits of the significand.
// The result of this fma is N_signif.
-// 2. RSHF = 1.1000..00 * 2^(63)
+// 2. RSHF = 1.1000..00 * 2^(63)
// This constant is subtracted from N_signif * 2^(-51) to give
// the integer part of N, N_fix, as a floating-point number.
// The result of this fms is float_N.
RODATA
-.align 64
+.align 64
LOCAL_OBJECT_START(Constants_exp_64_Arg)
//data8 0xB8AA3B295C17F0BC,0x0000400B // Inv_L = 2^12/log(2)
data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12
LOCAL_OBJECT_END(Constants_exp_64_Q)
LOCAL_OBJECT_START(Constants_exp_64_T1)
-data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
-data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
+data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
+data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
LOCAL_OBJECT_END(Constants_exp_64_T1)
LOCAL_OBJECT_START(Constants_exp_64_T2)
-data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
-data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
-data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
-data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
-data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
-data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
-data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
-data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
-data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
-data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
-data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
-data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
-data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
-data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
+data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
+data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
+data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
+data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
+data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
+data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
+data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
+data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
+data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
+data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
+data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
+data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
+data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
+data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
+data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
LOCAL_OBJECT_END(Constants_exp_64_T2)
//
// Set p7 true for expm1, p6 false
-//
+//
{ .mlx
getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm
movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
{ .mlx
- addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp
+ addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp
movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
}
;;
{ .mfi
ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table
fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero
- cmp.eq p7, p6 = r0, r0
+ cmp.eq p7, p6 = r0, r0
}
{ .mfb
mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path
fnorm.s1 FR_norm_x = f8 // Normalize x
- br.cond.sptk exp_continue
+ br.cond.sptk exp_continue
}
;;
GLOBAL_IEEE754_ENTRY(expl)
//
// Set p7 false for exp, p6 true
-//
+//
{ .mlx
getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm
movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
{ .mlx
- addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp
+ addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp
movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
}
;;
}
;;
-exp_continue:
+exp_continue:
// Form two constants we need
-// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
{ .mfi
// Now we are on the main path for |x| >= 2^-m, m=12 for exp, m=7 for expm1
//
-// float_N = round_int(N_signif)
+// float_N = round_int(N_signif)
// The significand of N_signif contains the rounded integer part of X * 2^12/ln2,
// as a twos complement number in the lower bits (that is, it may be negative).
// That twos complement number (called N) is put into GR_N.
;;
{ .mfi
-(p7) cmp.lt.unc p8, p9 = 10, GR_K // If expm1, set p8 if K > 10
+(p7) cmp.lt.unc p8, p9 = 10, GR_K // If expm1, set p8 if K > 10
fma.s1 FR_poly = FR_r, FR_poly, FR_A1 // poly = r * poly + A1
nop.i 999
}
}
;;
-
-EXP_SMALL:
+
+EXP_SMALL:
// Here if 2^-60 < |x| < 2^-m, m=12 for exp, m=7 for expm1
{ .mfi
(p7) ldfe FR_Q3 = [GR_ad_Q],16 // Get Q3 for small path, if expm1
;;
-EXP_VERY_SMALL:
+EXP_VERY_SMALL:
//
// Here if 0 < |x| < 2^-60
// If exp, result = 1.0 + x
(p7) br.ret.sptk b0 // If expm1, exit
}
;;
-
-
+
+
EXP_OVERFLOW:
// Here if x >= min_oflow_x
{ .mmi
;;
-EXP_64_SPECIAL:
+EXP_64_SPECIAL:
// Here if x natval, nan, inf, zero
// If x natval, +inf, or if expm1 and x zero, just return x.
// The other cases must be tested for, and results set.
;;
-EXP_64_UNSUPPORTED:
+EXP_64_UNSUPPORTED:
// Here if x unsupported type
{ .mfb
nop.m 999
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 02/07/02 Added __libm_fabs entry point to test in case compiler inlines
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// returns absolute value of x
+// returns absolute value of x
// floating-point registers used: 1
// f8, input
{ .mfi
nop.m 999
- fcmp.eq.unc.s0 p6,p7 = f8,f1
+ fcmp.eq.unc.s0 p6,p7 = f8,f1
nop.i 999 ;;
}
{ .mfb
nop.m 999
- fmerge.s f8 = f0,f8
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f0,f8
+ br.ret.sptk b0 ;;
}
GLOBAL_IEEE754_END(fabs)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 02/07/02 Added __libm_fabsf entry point to test in case compiler inlines
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// returns absolute value of x
+// returns absolute value of x
// floating-point registers used: 1
// f8, input
{ .mfi
nop.m 999
- fcmp.eq.unc.s0 p6,p7 = f8,f1
+ fcmp.eq.unc.s0 p6,p7 = f8,f1
nop.i 999 ;;
}
{ .mfb
nop.m 999
- fmerge.s f8 = f0,f8
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f0,f8
+ br.ret.sptk b0 ;;
}
GLOBAL_IEEE754_END(fabsf)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 02/07/02 Added __libm_fabsl entry point to test in case compiler inlines
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// returns absolute value of x
+// returns absolute value of x
// floating-point registers used: 1
// f8, input
{ .mfi
nop.m 999
- fcmp.eq.unc.s0 p6,p7 = f8,f1
+ fcmp.eq.unc.s0 p6,p7 = f8,f1
nop.i 999 ;;
}
{ .mfb
nop.m 999
- fmerge.s f8 = f0,f8
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f0,f8
+ br.ret.sptk b0 ;;
}
GLOBAL_IEEE754_END(fabsl)
(p6) mov ret0 = 0
(p7) mov ret0 = 1
br.ret.sptk.many rp
-}
+}
END (__finite)
strong_alias (__finite, __finitef)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 06/07/01 Initial version
+// 06/07/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
{ .mfb
nop.m 999
fma.d.s0 f8 = f8, f9, f10 // Result = x * y + z
- br.ret.sptk b0
+ br.ret.sptk b0
}
;;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 06/07/01 Initial version
+// 06/07/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
{ .mfb
nop.m 999
fma.s.s0 f8 = f8, f9, f10 // Result = x * y + z
- br.ret.sptk b0
+ br.ret.sptk b0
}
;;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 06/07/01 Initial version
+// 06/07/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
{ .mfb
nop.m 999
fma.s0 f8 = f8, f9, f10 // Result = x * y + z
- br.ret.sptk b0
+ br.ret.sptk b0
}
;;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 05/31/01 Initial version
+// 05/31/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
// returns the algebraic maximum of 2 input values
-//
+//
// Special cases:
// fmax(x, nan) returns x if x is numeric // Must special case this one
// fmax(nan, y) returns y if y is numeric
// fmax(-0,+0) returns +0
// fmax(-0,-0) returns -0
// fmax(+0,-0) returns +0 // Must special case this one
-//
+//
// SNaN causes invalid to be set
// floating-point registers used: 2
{ .mfb
nop.m 999
(p10) fmerge.s f8 = f9, f9 // If x nan, return y, else do nothing (returns x)
- br.ret.sptk b0
+ br.ret.sptk b0
}
;;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 05/31/01 Initial version
+// 05/31/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
// returns the algebraic maximum of 2 input values
-//
+//
// Special cases:
// fmaxf(x, nan) returns x if x is numeric // Must special case this one
// fmaxf(nan, y) returns y if y is numeric
// fmaxf(-0,+0) returns +0
// fmaxf(-0,-0) returns -0
// fmaxf(+0,-0) returns +0 // Must special case this one
-//
+//
// SNaN causes invalid to be set
// floating-point registers used: 2
{ .mfb
nop.m 999
(p10) fmerge.s f8 = f9, f9 // If x nan, return y, else do nothing (returns x)
- br.ret.sptk b0
+ br.ret.sptk b0
}
;;
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 05/31/01 Initial version
+// 05/31/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
// returns the algebraic maximum of 2 input values
-//
+//
// Special cases:
// fmaxl(x, nan) returns x if x is numeric // Must special case this one
// fmaxl(nan, y) returns y if y is numeric
// fmaxl(-0,+0) returns +0
// fmaxl(-0,-0) returns -0
// fmaxl(+0,-0) returns +0 // Must special case this one
-//
+//
// SNaN causes invalid to be set
// floating-point registers used: 2
{ .mfb
nop.m 999
(p10) fmerge.s f8 = f9, f9 // If x nan, return y, else do nothing (returns x)
- br.ret.sptk b0
+ br.ret.sptk b0
}
;;
(p7) fclass.m p7, p8 = farg0, @inf
(p6) br.ret.sptk.many rp
;;
-}
+}
{
.mfb
(p7) mov ret0 = 1
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
double frexp(double x, int *y)
{
-#ifdef SIZE_INT_64
+#ifdef SIZE_INT_64
return( __libm_frexp(x, y, 1) );
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return( __libm_frexp(x, y, 0) );
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
float frexpf(float x, int *y)
{
-#ifdef SIZE_INT_64
+#ifdef SIZE_INT_64
return( __libm_frexpf(x, y, 1) );
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return( __libm_frexpf(x, y, 0) );
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
long double frexpl(long double x, int *y)
{
-#ifdef SIZE_INT_64
+#ifdef SIZE_INT_64
return( __libm_frexpl(x, y, 1) );
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return( __libm_frexpl(x, y, 0) );
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
double ldexp(double x, int n)
{
-#ifdef SIZE_INT_64
- return __libm_ldexp(x,n,1);
+#ifdef SIZE_INT_64
+ return __libm_ldexp(x,n,1);
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return __libm_ldexp(x,n,0);
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
float ldexpf(float x, int n)
{
-#ifdef SIZE_INT_64
- return __libm_ldexpf(x,n,1);
+#ifdef SIZE_INT_64
+ return __libm_ldexpf(x,n,1);
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return __libm_ldexpf(x,n,0);
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
long double ldexpl(long double x, int n)
{
-#ifdef SIZE_INT_64
- return __libm_ldexpl(x,n,1);
+#ifdef SIZE_INT_64
+ return __libm_ldexpl(x,n,1);
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return __libm_ldexpl(x,n,0);
#endif
-.file "log1pl.s"
+.file "log1pl.s"
// Copyright (c) 2000 - 2003, Intel Corporation
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 02/02/00 Initial version
// 04/04/00 Unwind support added
// 08/15/00 Bundle added after call to __libm_error_support to properly
// IEEE Special Conditions:
//
// Denormal fault raised on denormal inputs
-// Overflow exceptions cannot occur
-// Underflow exceptions raised when appropriate for log1p
+// Overflow exceptions cannot occur
+// Underflow exceptions raised when appropriate for log1p
// Inexact raised when appropriate by algorithm
//
// log1pl(inf) = inf
-// log1pl(-inf) = QNaN
-// log1pl(+/-0) = +/-0
-// log1pl(-1) = -inf
+// log1pl(-inf) = QNaN
+// log1pl(+/-0) = +/-0
+// log1pl(-1) = -inf
// log1pl(SNaN) = QNaN
// log1pl(QNaN) = QNaN
// log1pl(EM_special Values) = QNaN
// log1pl( X ) = log( X+1 ) can be approximated by a simple polynomial
// in W = X. This polynomial resembles the truncated Taylor
// series W - W^/2 + W^3/3 - ...
-//
+//
// Case log_regular:
//
// Here we use a table lookup method. The basic idea is that in
-// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
+// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
// we construct a value G such that G*Arg is close to 1 and that
// logl(1/G) is obtainable easily from a table of values calculated
// beforehand. Thus
// G := G_1 * G_2 * G_3
// r := (G * S_hi - 1) + G * S_lo
//
-// These G_j's have the property that the product is exactly
+// These G_j's have the property that the product is exactly
// representable and that |r| < 2^(-12) as a result.
//
// Step 2: Approximation
//
// Although log1pl(X) is basically X, we would like to preserve the inexactness
// nature as well as consistent behavior under different rounding modes.
-// We can do this by computing the result as
-//
+// We can do this by computing the result as
+//
// log1pl(X) = X - X*X
//
//
//
// Here we compute a simple polynomial. To exploit parallelism, we split
// the polynomial into two portions.
-//
+//
// W := X
// Wsq := W * W
// W4 := Wsq*Wsq
// with 1.0000 in fixed point.
//
//
-// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
+// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
// truncated to lsb = 2^(-8). Similar to A_1,
// A_2 is not needed in actual implementation. It
// helps explain how some of the values are defined.
// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
// floating pt. Fetch is done using index_3.
//
-// Compute G := G_1 * G_2 * G_3.
+// Compute G := G_1 * G_2 * G_3.
//
// This is done exactly since each of G_j only has 21 sig. bits.
//
-// Compute
+// Compute
//
// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
//
// Finally
//
// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
-// Y_lo := poly_hi + [ poly_lo +
+// Y_lo := poly_hi + [ poly_lo +
// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
//
// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
-// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1
LOCAL_OBJECT_START(Constants_P)
//data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD
LOCAL_OBJECT_END(Constants_P)
-// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
+// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
LOCAL_OBJECT_START(Constants_Q)
-//data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
+//data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
//data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
//data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
//data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
//data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
-//data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
+//data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
data8 0xB172180000000000,0x00003FFE
data8 0x82E308654361C4C6,0x0000BFE2
data8 0xCCCCCAF2328833CB,0x00003FFC
// Z1 - 16 bit fixed
-
+
LOCAL_OBJECT_START(Constants_Z_1)
data4 0x00008000
data4 0x00007879
data8 0xBE049391B6B7C239
LOCAL_OBJECT_END(Constants_G_H_h2)
-// G3 and H3 - IEEE single and h3 - IEEE double
+// G3 and H3 - IEEE single and h3 - IEEE double
LOCAL_OBJECT_START(Constants_G_H_h3)
data4 0x3F7FFC00,0x38800100
// Floating Point Registers
-FR_Input_X = f8
+FR_Input_X = f8
-FR_Y_hi = f34
+FR_Y_hi = f34
FR_Y_lo = f35
FR_Scale = f36
-FR_X_Prime = f37
-FR_S_hi = f38
+FR_X_Prime = f37
+FR_S_hi = f38
FR_W = f39
FR_G = f40
FR_H = f41
-FR_wsq = f42
+FR_wsq = f42
FR_w4 = f43
FR_h = f44
-FR_w6 = f45
+FR_w6 = f45
FR_G2 = f46
FR_H2 = f47
FR_poly_lo = f48
-FR_P8 = f49
+FR_P8 = f49
FR_poly_hi = f50
-FR_P7 = f51
-FR_h2 = f52
-FR_rsq = f53
+FR_P7 = f51
+FR_h2 = f52
+FR_rsq = f53
FR_P6 = f54
-FR_r = f55
-
-FR_log2_hi = f56
-FR_log2_lo = f57
-FR_p87 = f58
-FR_p876 = f58
-FR_p8765 = f58
-FR_float_N = f59
-FR_Q4 = f60
-
-FR_p43 = f61
-FR_p432 = f61
-FR_p4321 = f61
-FR_P4 = f62
-FR_G3 = f63
-FR_H3 = f64
-FR_h3 = f65
-
-FR_Q3 = f66
-FR_P3 = f67
-FR_Q2 = f68
-FR_P2 = f69
-FR_1LN10_hi = f70
-
-FR_Q1 = f71
-FR_P1 = f72
-FR_1LN10_lo = f73
-FR_P5 = f74
-FR_rcub = f75
-
-FR_Output_X_tmp = f76
-FR_Neg_One = f77
-FR_Z = f78
-FR_AA = f79
-FR_BB = f80
-FR_S_lo = f81
-FR_2_to_minus_N = f82
+FR_r = f55
+
+FR_log2_hi = f56
+FR_log2_lo = f57
+FR_p87 = f58
+FR_p876 = f58
+FR_p8765 = f58
+FR_float_N = f59
+FR_Q4 = f60
+
+FR_p43 = f61
+FR_p432 = f61
+FR_p4321 = f61
+FR_P4 = f62
+FR_G3 = f63
+FR_H3 = f64
+FR_h3 = f65
+
+FR_Q3 = f66
+FR_P3 = f67
+FR_Q2 = f68
+FR_P2 = f69
+FR_1LN10_hi = f70
+
+FR_Q1 = f71
+FR_P1 = f72
+FR_1LN10_lo = f73
+FR_P5 = f74
+FR_rcub = f75
+
+FR_Output_X_tmp = f76
+FR_Neg_One = f77
+FR_Z = f78
+FR_AA = f79
+FR_BB = f80
+FR_S_lo = f81
+FR_2_to_minus_N = f82
FR_X = f8
FR_Y = f0
// General Purpose Registers
GR_ad_p = r33
-GR_Index1 = r34
-GR_Index2 = r35
-GR_signif = r36
-GR_X_0 = r37
-GR_X_1 = r38
-GR_X_2 = r39
+GR_Index1 = r34
+GR_Index2 = r35
+GR_signif = r36
+GR_X_0 = r37
+GR_X_1 = r38
+GR_X_2 = r39
GR_minus_N = r39
-GR_Z_1 = r40
-GR_Z_2 = r41
-GR_N = r42
-GR_Bias = r43
-GR_M = r44
-GR_Index3 = r45
-GR_exp_2tom80 = r45
+GR_Z_1 = r40
+GR_Z_2 = r41
+GR_N = r42
+GR_Bias = r43
+GR_M = r44
+GR_Index3 = r45
+GR_exp_2tom80 = r45
GR_ad_p2 = r46
-GR_exp_mask = r47
-GR_exp_2tom7 = r48
-GR_ad_ln10 = r49
+GR_exp_mask = r47
+GR_exp_2tom7 = r48
+GR_ad_ln10 = r49
GR_ad_tbl_1 = r50
GR_ad_tbl_2 = r51
GR_ad_tbl_3 = r52
//
{ .mmi
ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
- sub GR_N = GR_N, GR_Bias
+ sub GR_N = GR_N, GR_Bias
mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
}
;;
{ .mfi
ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
- fms.s1 FR_S_lo = FR_AA, f1, FR_Z // Form S_lo = AA - Z
+ fms.s1 FR_S_lo = FR_AA, f1, FR_Z // Form S_lo = AA - Z
sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
}
;;
{ .mmi
getf.exp GR_M = FR_W // Get signexp of w = x
ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
- extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
}
;;
{ .mfi
nop.m 999
-(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo
+(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo
nop.i 999
}
;;
// Here if x=-1
-LOG1P_EQ_Minus_1:
+LOG1P_EQ_Minus_1:
//
// If x=-1 raise divide by zero and return -inf
-//
+//
{ .mfi
mov GR_Parameter_TAG = 138
- fsub.s1 FR_Output_X_tmp = f0, f1
+ fsub.s1 FR_Output_X_tmp = f0, f1
nop.i 999
}
;;
{ .mfb
nop.m 999
- frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
+ frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
br.cond.sptk __libm_error_region
}
;;
-LOG1P_special:
+LOG1P_special:
{ .mfi
nop.m 999
fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf
}
;;
-//
+//
// For SNaN raise invalid and return QNaN.
// For QNaN raise invalid and return QNaN.
// For +Inf return +Inf.
-//
+//
{ .mfb
nop.m 999
-(p8) fmpy.s0 f8 = FR_Input_X, f1
+(p8) fmpy.s0 f8 = FR_Input_X, f1
(p8) br.ret.sptk b0 // Return for natval, nan, +inf
}
;;
-//
+//
// For -Inf raise invalid and return QNaN.
-//
+//
{ .mfb
mov GR_Parameter_TAG = 139
- fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0
+ fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0
br.cond.sptk __libm_error_region
}
;;
-LOG1P_unsupported:
-//
+LOG1P_unsupported:
+//
// Return generated NaN or other value.
-//
+//
{ .mfb
nop.m 999
- fmpy.s0 f8 = FR_Input_X, f0
+ fmpy.s0 f8 = FR_Input_X, f0
br.ret.sptk b0
}
;;
// Here if -inf < x < -1
-LOG1P_LT_Minus_1:
-//
+LOG1P_LT_Minus_1:
+//
// Deal with x < -1 in a special way - raise
// invalid and produce QNaN indefinite.
-//
+//
{ .mfb
mov GR_Parameter_TAG = 139
frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 02/02/00 Initial version
// 04/04/00 Improved speed, corrected result for NaN input
-// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
+// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
// qnans nor for inputs larger than 2^63.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// CALCULATION: NOT HUGE, NOT SMALL
// To get the integer part
-// Take the floating-point input and truncate
+// Take the floating-point input and truncate
// then convert this integer to fp Call it MODF_INTEGER_PART
// Subtract MODF_INTEGER_PART from MODF_NORM_F8 to get fraction part
-// Then put fraction part in f8
+// Then put fraction part in f8
// put integer part MODF_INTEGER_PART into *iptr
// Registers used
//==============================================================
-// predicate registers used:
+// predicate registers used:
// p6 - p13
// 0xFFFF 0x10033
// p13 --------------------------------------------------->|
//
-// floating-point registers used:
+// floating-point registers used:
MODF_NORM_F8 = f9
MODF_FRACTION_PART = f10
MODF_INTEGER_PART = f11
MODF_INT_INTEGER_PART = f12
-// general registers used
+// general registers used
modf_signexp = r14
modf_GR_no_frac = r15
modf_GR_FFFF = r16
-modf_17_ones = r17
+modf_17_ones = r17
modf_exp = r18
// r33 = iptr
-
+
.section .text
GLOBAL_LIBM_ENTRY(modf)
// Assume input is normalized and get signexp
// Normalize input just in case
-// Form exponent bias
+// Form exponent bias
{ .mfi
getf.exp modf_signexp = f8
fnorm.s0 MODF_NORM_F8 = f8
// Is x unnorm?
// qnan snan inf norm unorm 0 -+
// 0 0 0 0 1 0 11 = 0x0b UNORM
-// Set p13 to indicate calculation path, else p6 if nan or inf
+// Set p13 to indicate calculation path, else p6 if nan or inf
{ .mfi
- and modf_exp = modf_17_ones, modf_signexp
+ and modf_exp = modf_17_ones, modf_signexp
fclass.m.unc p8,p0 = f8, 0x0b
nop.i 999 ;;
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 02/02/00 Initial version
// 04/04/00 Improved speed, corrected result for NaN input
-// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
+// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
// qnans nor for inputs larger than 2^63.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// CALCULATION: NOT HUGE, NOT SMALL
// To get the integer part
-// Take the floating-point input and truncate
+// Take the floating-point input and truncate
// then convert this integer to fp Call it MODF_INTEGER_PART
// Subtract MODF_INTEGER_PART from MODF_NORM_F8 to get fraction part
-// Then put fraction part in f8
+// Then put fraction part in f8
// put integer part MODF_INTEGER_PART into *iptr
// Registers used
//==============================================================
-// predicate registers used:
+// predicate registers used:
// p6 - p13
// 0xFFFF 0x10016
// p13 --------------------------------------------------->|
//
-// floating-point registers used:
+// floating-point registers used:
MODF_NORM_F8 = f9
MODF_FRACTION_PART = f10
MODF_INTEGER_PART = f11
MODF_INT_INTEGER_PART = f12
-// general registers used
+// general registers used
modf_signexp = r14
modf_GR_no_frac = r15
modf_GR_FFFF = r16
-modf_17_ones = r17
+modf_17_ones = r17
modf_exp = r18
// r33 = iptr
-
+
.section .text
GLOBAL_LIBM_ENTRY(modff)
// Assume input is normalized and get signexp
// Normalize input just in case
-// Form exponent bias
+// Form exponent bias
{ .mfi
getf.exp modf_signexp = f8
fnorm.s0 MODF_NORM_F8 = f8
// Is x unnorm?
// qnan snan inf norm unorm 0 -+
// 0 0 0 0 1 0 11 = 0x0b UNORM
-// Set p13 to indicate calculation path, else p6 if nan or inf
+// Set p13 to indicate calculation path, else p6 if nan or inf
{ .mfi
- and modf_exp = modf_17_ones, modf_signexp
+ and modf_exp = modf_17_ones, modf_signexp
fclass.m.unc p8,p0 = f8, 0x0b
nop.i 999 ;;
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// 02/02/00 Initial version
// 04/04/00 Improved speed, corrected result for NaN input
// 05/30/00 Fixed bug for exponent 0x1003e
-// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
+// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
// qnans nor for inputs larger than 2^63.
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// CALCULATION: NOT HUGE, NOT SMALL
// To get the integer part
-// Take the floating-point input and truncate
+// Take the floating-point input and truncate
// then convert this integer to fp Call it MODF_INTEGER_PART
// Subtract MODF_INTEGER_PART from MODF_NORM_F8 to get fraction part
-// Then put fraction part in f8
+// Then put fraction part in f8
// put integer part MODF_INTEGER_PART into *iptr
// Registers used
//==============================================================
-// predicate registers used:
+// predicate registers used:
// p6 - p13
// 0xFFFF 0x1003e
// p13 --------------------------------------------------->|
//
-// floating-point registers used:
+// floating-point registers used:
MODF_NORM_F8 = f9
MODF_FRACTION_PART = f10
MODF_INTEGER_PART = f11
MODF_INT_INTEGER_PART = f12
-// general registers used
+// general registers used
modf_signexp = r14
modf_GR_no_frac = r15
modf_GR_FFFF = r16
-modf_17_ones = r17
+modf_17_ones = r17
modf_exp = r18
// r34 = iptr
-
+
.section .text
GLOBAL_LIBM_ENTRY(modfl)
// Assume input is normalized and get signexp
// Normalize input just in case
-// Form exponent bias
+// Form exponent bias
{ .mfi
getf.exp modf_signexp = f8
fnorm.s0 MODF_NORM_F8 = f8
// Is x unnorm?
// qnan snan inf norm unorm 0 -+
// 0 0 0 0 1 0 11 = 0x0b UNORM
-// Set p13 to indicate calculation path, else p6 if nan or inf
+// Set p13 to indicate calculation path, else p6 if nan or inf
{ .mfi
- and modf_exp = modf_17_ones, modf_signexp
+ and modf_exp = modf_17_ones, modf_signexp
fclass.m.unc p8,p0 = f8, 0x0b
nop.i 999 ;;
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 03/03/00 Modified to conform to C9X, and improve speed of main path
// 03/14/00 Fixed case where x is a power of 2, and x > y, improved speed
// 04/04/00 Unwind support added
//
// Overview of operation
//==============================================================
-// nextafter determines the next representable value
-// after x in the direction of y.
+// nextafter determines the next representable value
+// after x in the direction of y.
.section .text
// Form smallest denormal significand = ulp size
{ .mfi
getf.exp GR_exp = f8
- fcmp.lt.s1 p10,p11 = f8, f9
+ fcmp.lt.s1 p10,p11 = f8, f9
addl GR_sden_sig = 0x800, r0
}
// Form largest normal significand 0xfffffffffffff800
// Form largest normal exponent
{ .mfi
getf.sig GR_sig = f8
- fcmp.eq.s0 p6,p0 = f8, f9
+ fcmp.eq.s0 p6,p0 = f8, f9
addl GR_max_pexp = 0x103fe, r0
}
// Move largest normal significand to fp reg for special cases
// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
{ .mfi
setf.sig FR_sden_sig = GR_sden_sig
- fclass.m p8,p0 = f8, 0xc3
+ fclass.m p8,p0 = f8, 0xc3
(p10) cmp.lt p12,p13 = GR_exp, GR_sign_mask
}
{ .mfi
// Form new exponent in case result exponent needs incrementing or decrementing
{ .mfi
setf.exp FR_new_exp = GR_exp
- fclass.m p9,p0 = f9, 0xc3
+ fclass.m p9,p0 = f9, 0xc3
(p12) add GR_exp1 = 1, GR_exp
}
{ .mib
}
{ .mfb
nop.m 999
-(p8) fma.s0 f8 = f8,f1,f9
+(p8) fma.s0 f8 = f8,f1,f9
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
// Is x=inf?
{ .mfi
setf.exp FR_exp1 = GR_exp1
- fclass.m p6,p0 = f8, 0x23
+ fclass.m p6,p0 = f8, 0x23
addl GR_exp_mask = 0x1ffff, r0
}
{ .mfb
setf.sig FR_snorm_sig = GR_snorm_sig
-(p9) fma.s0 f8 = f8,f1,f9
+(p9) fma.s0 f8 = f8,f1,f9
(p9) br.ret.spnt b0 ;; // Exit if y=nan
}
{ .mfb
setf.sig FR_lden_sig = GR_lden_sig
mov FR_save_f8 = f8
-(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
+(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
}
// Mask off the sign to get x_exp
{ .mfb
and GR_x_exp = GR_exp_mask, GR_exp
nop.f 999
-(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
+(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
}
// Check 6 special cases when significand rolls over:
// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
// 5 sig size decr, x_sig=min_den_sig, x_exp = min_exp
// Set p10, result is zero, sign of x, signal underflow and inexact
-// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
+// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
// Set p14, result is zero, sign of x, signal underflow and inexact
//
// Form exponent of smallest double denormal (if normalized register format)
br.cond.sptk NEXT_COMMON_FINISH ;;
}
-NEXT_INF:
+NEXT_INF:
// Here if f8 is +- infinity
// INF
// if f8 is +inf, no matter what y is return largest double
{ .mfb
nop.m 999
- fmerge.s f8 = f8,FR_lnorm
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f8,FR_lnorm
+ br.ret.sptk b0 ;;
}
-NEXT_ZERO:
+NEXT_ZERO:
// Here if f8 is +- zero
// ZERO
-// if f8 is zero and y is +, return + smallest double denormal
-// if f8 is zero and y is -, return - smallest double denormal
+// if f8 is zero and y is +, return + smallest double denormal
+// if f8 is zero and y is -, return - smallest double denormal
{ .mfi
nop.m 999
// Add correct sign from direction arg
{ .mfi
nop.m 999
- fmerge.s f8 = f9,FR_sden
+ fmerge.s f8 = f9,FR_sden
nop.i 999 ;;
}
br.cond.sptk NEXT_UNDERFLOW ;;
}
-NEXT_UNDERFLOW:
+NEXT_UNDERFLOW:
// Here if result is a denorm, or input is finite and result is zero
// Call error support to report possible range error
{ .mib
}
;;
-NEXT_OVERFLOW:
+NEXT_OVERFLOW:
// Here if input is finite, but result will be infinite
// Use frcpa to generate infinity of correct sign
// Call error support to report possible range error
{ .mib
stfd [GR_Parameter_X] = FR_save_f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 03/03/00 Modified to conform to C9X, and improve speed of main path
// 03/14/00 Fixed case where x is a power of 2, and x > y, improved speed
// 04/04/00 Unwind support added
//
// Overview of operation
//==============================================================
-// nextafterf determines the next representable value
-// after x in the direction of y.
+// nextafterf determines the next representable value
+// after x in the direction of y.
.section .text
// Extract significand from x
// Form largest normal significand
{ .mlx
- nop.m 0
+ nop.m 0
movl GR_lnorm_sig = 0xffffff0000000000 ;;
}
// Move largest normal significand to fp reg for special cases
{ .mfi
setf.sig FR_lnorm_sig = GR_lnorm_sig
- nop.f 0
+ nop.f 0
addl GR_sign_mask = 0x20000, r0 ;;
}
// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
{ .mfi
setf.sig FR_sden_sig = GR_sden_sig
- fclass.m p8,p0 = f8, 0xc3
+ fclass.m p8,p0 = f8, 0xc3
(p10) cmp.lt p12,p13 = GR_exp, GR_sign_mask
}
{ .mfi
// Form new exponent in case result exponent needs incrementing or decrementing
{ .mfi
setf.exp FR_new_exp = GR_exp
- fclass.m p9,p0 = f9, 0xc3
+ fclass.m p9,p0 = f9, 0xc3
(p12) add GR_exp1 = 1, GR_exp
}
{ .mib
}
{ .mfb
nop.m 999
-(p8) fma.s0 f8 = f8,f1,f9
+(p8) fma.s0 f8 = f8,f1,f9
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
// Is x=inf?
{ .mfi
setf.exp FR_exp1 = GR_exp1
- fclass.m p6,p0 = f8, 0x23
+ fclass.m p6,p0 = f8, 0x23
addl GR_exp_mask = 0x1ffff, r0
}
{ .mfb
setf.sig FR_snorm_sig = GR_snorm_sig
-(p9) fma.s0 f8 = f8,f1,f9
+(p9) fma.s0 f8 = f8,f1,f9
(p9) br.ret.spnt b0 ;; // Exit if y=nan
}
{ .mfb
setf.sig FR_lden_sig = GR_lden_sig
mov FR_save_f8 = f8
-(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
+(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
}
// Mask off the sign to get x_exp
{ .mfb
and GR_x_exp = GR_exp_mask, GR_exp
nop.f 999
-(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
+(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
}
// Check 6 special cases when significand rolls over:
// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
// 5 sig size decr, x_sig=min_den_sig, x_exp = min_exp
// Set p10, result is zero, sign of x, signal underflow and inexact
-// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
+// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
// Set p14, result is zero, sign of x, signal underflow and inexact
//
// Form exponent of smallest float denormal (if normalized register format)
br.cond.sptk NEXT_COMMON_FINISH ;;
}
-NEXT_INF:
+NEXT_INF:
// Here if f8 is +- infinity
// INF
// if f8 is +inf, no matter what y is return largest float
{ .mfb
nop.m 999
- fmerge.s f8 = f8,FR_lnorm
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f8,FR_lnorm
+ br.ret.sptk b0 ;;
}
-NEXT_ZERO:
+NEXT_ZERO:
// Here if f8 is +- zero
// ZERO
-// if f8 is zero and y is +, return + smallest float denormal
-// if f8 is zero and y is -, return - smallest float denormal
+// if f8 is zero and y is +, return + smallest float denormal
+// if f8 is zero and y is -, return - smallest float denormal
{ .mfi
nop.m 999
// Add correct sign from direction arg
{ .mfi
nop.m 999
- fmerge.s f8 = f9,FR_sden
+ fmerge.s f8 = f9,FR_sden
nop.i 999 ;;
}
br.cond.sptk NEXT_UNDERFLOW ;;
}
-NEXT_UNDERFLOW:
+NEXT_UNDERFLOW:
// Here if result is a denorm, or input is finite and result is zero
// Call error support to report possible range error
{ .mib
}
;;
-NEXT_OVERFLOW:
+NEXT_OVERFLOW:
// Here if input is finite, but result will be infinite
// Use frcpa to generate infinity of correct sign
// Call error support to report possible range error
{ .mib
stfs [GR_Parameter_X] = FR_save_f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 02/02/00 Initial version
+// 02/02/00 Initial version
// 03/03/00 Modified to conform to C9X, and improve speed of main path
// 03/14/00 Fixed case where x is a power of 2, and x > y, improved speed
// 04/04/00 Unwind support added
// set [the previously overwritten] GR_Parameter_RESULT.
// 09/09/00 Updated fcmp so that qnans do not raise invalid.
// 12/15/00 Fixed case of smallest long double normal to largest denormal,
-// now adhere to C99 for two zero args, and fixed flag settings
+// now adhere to C99 for two zero args, and fixed flag settings
// for several cases
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// nextafterl determines the next representable value
-// after x in the direction of y.
+// nextafterl determines the next representable value
+// after x in the direction of y.
.section .text
// Form smallest denormal significand = ulp size
{ .mfi
getf.exp GR_exp = f8
- fcmp.lt.s1 p10,p11 = f8, f9
+ fcmp.lt.s1 p10,p11 = f8, f9
addl GR_sden_sig = 0x1, r0
}
// Form largest normal significand 0xffffffffffffffff
// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
{ .mfi
setf.sig FR_sden_sig = GR_sden_sig
- fclass.m p8,p0 = f8, 0xc3
+ fclass.m p8,p0 = f8, 0xc3
(p10) cmp.lt p12,p13 = GR_exp, GR_sign_mask
}
// Move smallest normal exp to fp regs
// Form new exponent in case result exponent needs incrementing or decrementing
{ .mfi
setf.exp FR_new_exp = GR_exp
- fclass.m p9,p0 = f9, 0xc3
+ fclass.m p9,p0 = f9, 0xc3
(p12) add GR_exp1 = 1, GR_exp
}
{ .mib
}
{ .mfb
setf.exp FR_den_exp = GR_min_pexp
-(p8) fma.s0 f8 = f8,f1,f9
+(p8) fma.s0 f8 = f8,f1,f9
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
// Is x=inf?
{ .mfi
setf.exp FR_exp1 = GR_exp1
- fclass.m p6,p0 = f8, 0x23
+ fclass.m p6,p0 = f8, 0x23
addl GR_exp_mask = 0x1ffff, r0
}
{ .mfb
setf.sig FR_snorm_sig = GR_snorm_sig
-(p9) fma.s0 f8 = f8,f1,f9
+(p9) fma.s0 f8 = f8,f1,f9
(p9) br.ret.spnt b0 ;; // Exit if y=nan
}
{ .mfb
setf.sig FR_lden_sig = GR_lden_sig
mov FR_save_f8 = f8
-(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
+(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
}
// Mask off the sign to get x_exp
{ .mfb
and GR_x_exp = GR_exp_mask, GR_exp
nop.f 999
-(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
+(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
}
// Check 5 special cases when significand rolls over:
{ .mmi
(p6) cmp.lt.unc p6,p7 = GR_x_exp, GR_max_pexp
(p10) cmp.eq.unc p10,p0 = GR_new_sig, r0
-(p9) cmp.le.unc p9,p8 = GR_x_exp, GR_min_pexp
+(p9) cmp.le.unc p9,p8 = GR_x_exp, GR_min_pexp
;;
}
br.cond.sptk NEXT_COMMON_FINISH ;;
}
-NEXT_INF:
+NEXT_INF:
// Here if f8 is +- infinity
// INF
// if f8 is +inf, no matter what y is return largest long double
{ .mfb
nop.m 999
- fmerge.s f8 = f8,FR_lnorm
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f8,FR_lnorm
+ br.ret.sptk b0 ;;
}
-NEXT_ZERO:
+NEXT_ZERO:
// Here if f8 is +- zero
// ZERO
-// if f8 is zero and y is +, return + smallest long double denormal
-// if f8 is zero and y is -, return - smallest long double denormal
+// if f8 is zero and y is +, return + smallest long double denormal
+// if f8 is zero and y is -, return - smallest long double denormal
{ .mfi
nop.m 999
// Add correct sign from direction arg
{ .mfi
nop.m 999
- fmerge.s f8 = f9,FR_sden
+ fmerge.s f8 = f9,FR_sden
nop.i 999 ;;
}
br.cond.sptk NEXT_UNDERFLOW ;;
}
-NEXT_UNDERFLOW:
+NEXT_UNDERFLOW:
// Here if result is a denorm, or input is finite and result is zero
// Call error support to report possible range error
{ .mib
}
;;
-NEXT_OVERFLOW:
+NEXT_OVERFLOW:
// Here if input is finite, but result will be infinite
// Use frcpa to generate infinity of correct sign
// Call error support to report possible range error
{ .mib
stfe [GR_Parameter_X] = FR_save_f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 08/15/01 Initial version
+// 08/15/01 Initial version
// 08/23/01 Corrected error tag number
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// nexttoward determines the next representable value
-// after x in the direction of y.
+// nexttoward determines the next representable value
+// after x in the direction of y.
.section .text
// Form smallest denormal significand = ulp size
{ .mfi
getf.exp GR_exp = f8
- fcmp.lt.s1 p10,p11 = f8, f9
+ fcmp.lt.s1 p10,p11 = f8, f9
addl GR_sden_sig = 0x800, r0
}
// Form largest normal significand 0xfffffffffffff800
// Form largest normal exponent
{ .mfi
getf.sig GR_sig = f8
- fcmp.eq.s0 p6,p0 = f8, f9
+ fcmp.eq.s0 p6,p0 = f8, f9
addl GR_max_pexp = 0x103fe, r0
}
// Move largest normal significand to fp reg for special cases
// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
{ .mfi
setf.sig FR_sden_sig = GR_sden_sig
- fclass.m p8,p0 = f8, 0xc3
+ fclass.m p8,p0 = f8, 0xc3
(p10) cmp.lt p12,p13 = GR_exp, GR_sign_mask
}
{ .mfi
// Form new exponent in case result exponent needs incrementing or decrementing
{ .mfi
setf.exp FR_new_exp = GR_exp
- fclass.m p9,p0 = f9, 0xc3
+ fclass.m p9,p0 = f9, 0xc3
(p12) add GR_exp1 = 1, GR_exp
}
{ .mib
}
{ .mfb
nop.m 999
-(p8) fma.s0 f8 = f8,f1,f9
+(p8) fma.s0 f8 = f8,f1,f9
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
// Is x=inf?
{ .mfi
setf.exp FR_exp1 = GR_exp1
- fclass.m p6,p0 = f8, 0x23
+ fclass.m p6,p0 = f8, 0x23
addl GR_exp_mask = 0x1ffff, r0
}
{ .mfb
setf.sig FR_snorm_sig = GR_snorm_sig
-(p9) fma.s0 f8 = f8,f1,f9
+(p9) fma.s0 f8 = f8,f1,f9
(p9) br.ret.spnt b0 ;; // Exit if y=nan
}
{ .mfb
setf.sig FR_lden_sig = GR_lden_sig
mov FR_save_f8 = f8
-(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
+(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
}
// Mask off the sign to get x_exp
{ .mfb
and GR_x_exp = GR_exp_mask, GR_exp
nop.f 999
-(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
+(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
}
// Check 6 special cases when significand rolls over:
// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
// 5 sig size decr, x_sig=min_den_sig, x_exp = min_exp
// Set p10, result is zero, sign of x, signal underflow and inexact
-// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
+// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
// Set p14, result is zero, sign of x, signal underflow and inexact
//
// Form exponent of smallest double denormal (if normalized register format)
br.cond.sptk NEXT_COMMON_FINISH ;;
}
-NEXT_INF:
+NEXT_INF:
// Here if f8 is +- infinity
// INF
// if f8 is +inf, no matter what y is return largest double
{ .mfb
nop.m 999
- fmerge.s f8 = f8,FR_lnorm
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f8,FR_lnorm
+ br.ret.sptk b0 ;;
}
-NEXT_ZERO:
+NEXT_ZERO:
// Here if f8 is +- zero
// ZERO
-// if f8 is zero and y is +, return + smallest double denormal
-// if f8 is zero and y is -, return - smallest double denormal
+// if f8 is zero and y is +, return + smallest double denormal
+// if f8 is zero and y is -, return - smallest double denormal
{ .mfi
nop.m 999
// Add correct sign from direction arg
{ .mfi
nop.m 999
- fmerge.s f8 = f9,FR_sden
+ fmerge.s f8 = f9,FR_sden
nop.i 999 ;;
}
br.cond.sptk NEXT_UNDERFLOW ;;
}
-NEXT_UNDERFLOW:
+NEXT_UNDERFLOW:
// Here if result is a denorm, or input is finite and result is zero
// Call error support to report possible range error
{ .mib
}
;;
-NEXT_OVERFLOW:
+NEXT_OVERFLOW:
// Here if input is finite, but result will be infinite
// Use frcpa to generate infinity of correct sign
// Call error support to report possible range error
{ .mib
stfd [GR_Parameter_X] = FR_save_f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 08/15/01 Initial version
+// 08/15/01 Initial version
// 08/23/01 Corrected error tag number
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// nexttowardf determines the next representable value
-// after x in the direction of y.
+// nexttowardf determines the next representable value
+// after x in the direction of y.
.section .text
// Extract significand from x
// Form largest normal significand
{ .mlx
- nop.m 0
+ nop.m 0
movl GR_lnorm_sig = 0xffffff0000000000 ;;
}
// Move largest normal significand to fp reg for special cases
{ .mfi
setf.sig FR_lnorm_sig = GR_lnorm_sig
- nop.f 0
+ nop.f 0
addl GR_sign_mask = 0x20000, r0 ;;
}
// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
{ .mfi
setf.sig FR_sden_sig = GR_sden_sig
- fclass.m p8,p0 = f8, 0xc3
+ fclass.m p8,p0 = f8, 0xc3
(p10) cmp.lt p12,p13 = GR_exp, GR_sign_mask
}
{ .mfi
// Form new exponent in case result exponent needs incrementing or decrementing
{ .mfi
setf.exp FR_new_exp = GR_exp
- fclass.m p9,p0 = f9, 0xc3
+ fclass.m p9,p0 = f9, 0xc3
(p12) add GR_exp1 = 1, GR_exp
}
{ .mib
}
{ .mfb
nop.m 999
-(p8) fma.s0 f8 = f8,f1,f9
+(p8) fma.s0 f8 = f8,f1,f9
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
// Is x=inf?
{ .mfi
setf.exp FR_exp1 = GR_exp1
- fclass.m p6,p0 = f8, 0x23
+ fclass.m p6,p0 = f8, 0x23
addl GR_exp_mask = 0x1ffff, r0
}
{ .mfb
setf.sig FR_snorm_sig = GR_snorm_sig
-(p9) fma.s0 f8 = f8,f1,f9
+(p9) fma.s0 f8 = f8,f1,f9
(p9) br.ret.spnt b0 ;; // Exit if y=nan
}
{ .mfb
setf.sig FR_lden_sig = GR_lden_sig
mov FR_save_f8 = f8
-(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
+(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
}
// Mask off the sign to get x_exp
{ .mfb
and GR_x_exp = GR_exp_mask, GR_exp
nop.f 999
-(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
+(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
}
// Check 6 special cases when significand rolls over:
// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
// 5 sig size decr, x_sig=min_den_sig, x_exp = min_exp
// Set p10, result is zero, sign of x, signal underflow and inexact
-// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
+// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
// Set p14, result is zero, sign of x, signal underflow and inexact
//
// Form exponent of smallest float denormal (if normalized register format)
br.cond.sptk NEXT_COMMON_FINISH ;;
}
-NEXT_INF:
+NEXT_INF:
// Here if f8 is +- infinity
// INF
// if f8 is +inf, no matter what y is return largest float
{ .mfb
nop.m 999
- fmerge.s f8 = f8,FR_lnorm
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f8,FR_lnorm
+ br.ret.sptk b0 ;;
}
-NEXT_ZERO:
+NEXT_ZERO:
// Here if f8 is +- zero
// ZERO
-// if f8 is zero and y is +, return + smallest float denormal
-// if f8 is zero and y is -, return - smallest float denormal
+// if f8 is zero and y is +, return + smallest float denormal
+// if f8 is zero and y is -, return - smallest float denormal
{ .mfi
nop.m 999
// Add correct sign from direction arg
{ .mfi
nop.m 999
- fmerge.s f8 = f9,FR_sden
+ fmerge.s f8 = f9,FR_sden
nop.i 999 ;;
}
br.cond.sptk NEXT_UNDERFLOW ;;
}
-NEXT_UNDERFLOW:
+NEXT_UNDERFLOW:
// Here if result is a denorm, or input is finite and result is zero
// Call error support to report possible range error
{ .mib
}
;;
-NEXT_OVERFLOW:
+NEXT_OVERFLOW:
// Here if input is finite, but result will be infinite
// Use frcpa to generate infinity of correct sign
// Call error support to report possible range error
{ .mib
stfs [GR_Parameter_X] = FR_save_f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 08/15/01 Initial version
+// 08/15/01 Initial version
// 08/23/01 Corrected error tag number
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// Overview of operation
//==============================================================
-// nexttowardl determines the next representable value
-// after x in the direction of y.
+// nexttowardl determines the next representable value
+// after x in the direction of y.
.section .text
// Form smallest denormal significand = ulp size
{ .mfi
getf.exp GR_exp = f8
- fcmp.lt.s1 p10,p11 = f8, f9
+ fcmp.lt.s1 p10,p11 = f8, f9
addl GR_sden_sig = 0x1, r0
}
// Form largest normal significand 0xffffffffffffffff
// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
{ .mfi
setf.sig FR_sden_sig = GR_sden_sig
- fclass.m p8,p0 = f8, 0xc3
+ fclass.m p8,p0 = f8, 0xc3
(p10) cmp.lt p12,p13 = GR_exp, GR_sign_mask
}
// Move smallest normal exp to fp regs
// Form new exponent in case result exponent needs incrementing or decrementing
{ .mfi
setf.exp FR_new_exp = GR_exp
- fclass.m p9,p0 = f9, 0xc3
+ fclass.m p9,p0 = f9, 0xc3
(p12) add GR_exp1 = 1, GR_exp
}
{ .mib
}
{ .mfb
setf.exp FR_den_exp = GR_min_pexp
-(p8) fma.s0 f8 = f8,f1,f9
+(p8) fma.s0 f8 = f8,f1,f9
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
// Is x=inf?
{ .mfi
setf.exp FR_exp1 = GR_exp1
- fclass.m p6,p0 = f8, 0x23
+ fclass.m p6,p0 = f8, 0x23
addl GR_exp_mask = 0x1ffff, r0
}
{ .mfb
setf.sig FR_snorm_sig = GR_snorm_sig
-(p9) fma.s0 f8 = f8,f1,f9
+(p9) fma.s0 f8 = f8,f1,f9
(p9) br.ret.spnt b0 ;; // Exit if y=nan
}
{ .mfb
setf.sig FR_lden_sig = GR_lden_sig
mov FR_save_f8 = f8
-(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
+(p7) br.cond.spnt NEXT_ZERO ;; // Exit if x=0
}
// Mask off the sign to get x_exp
{ .mfb
and GR_x_exp = GR_exp_mask, GR_exp
nop.f 999
-(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
+(p6) br.cond.spnt NEXT_INF ;; // Exit if x=inf
}
// Check 5 special cases when significand rolls over:
{ .mmi
(p6) cmp.lt.unc p6,p7 = GR_x_exp, GR_max_pexp
(p10) cmp.eq.unc p10,p0 = GR_new_sig, r0
-(p9) cmp.le.unc p9,p8 = GR_x_exp, GR_min_pexp
+(p9) cmp.le.unc p9,p8 = GR_x_exp, GR_min_pexp
;;
}
br.cond.sptk NEXT_COMMON_FINISH ;;
}
-NEXT_INF:
+NEXT_INF:
// Here if f8 is +- infinity
// INF
// if f8 is +inf, no matter what y is return largest long double
{ .mfb
nop.m 999
- fmerge.s f8 = f8,FR_lnorm
- br.ret.sptk b0 ;;
+ fmerge.s f8 = f8,FR_lnorm
+ br.ret.sptk b0 ;;
}
-NEXT_ZERO:
+NEXT_ZERO:
// Here if f8 is +- zero
// ZERO
-// if f8 is zero and y is +, return + smallest long double denormal
-// if f8 is zero and y is -, return - smallest long double denormal
+// if f8 is zero and y is +, return + smallest long double denormal
+// if f8 is zero and y is -, return - smallest long double denormal
{ .mfi
nop.m 999
// Add correct sign from direction arg
{ .mfi
nop.m 999
- fmerge.s f8 = f9,FR_sden
+ fmerge.s f8 = f9,FR_sden
nop.i 999 ;;
}
br.cond.sptk NEXT_UNDERFLOW ;;
}
-NEXT_UNDERFLOW:
+NEXT_UNDERFLOW:
// Here if result is a denorm, or input is finite and result is zero
// Call error support to report possible range error
{ .mib
}
;;
-NEXT_OVERFLOW:
+NEXT_OVERFLOW:
// Here if input is finite, but result will be infinite
// Use frcpa to generate infinity of correct sign
// Call error support to report possible range error
{ .mib
stfe [GR_Parameter_X] = FR_save_f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
{ .mmi
cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5?
cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^52?
- cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
}
;;
{ .mmi
cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5?
cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^23?
- cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
}
;;
{ .mmi
cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5?
cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^63?
- cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
}
;;
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
float scalblnf(float x, long int n)
{
-#ifdef SIZE_LONG_INT_64
- return __libm_scalblnf(x,n,1);
+#ifdef SIZE_LONG_INT_64
+ return __libm_scalblnf(x,n,1);
#else
-#ifdef SIZE_LONG_INT_32
+#ifdef SIZE_LONG_INT_32
return __libm_scalblnf(x,n,0);
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
double scalbn(double x, int n)
{
-#ifdef SIZE_INT_64
- return __libm_scalbn(x,n,1);
+#ifdef SIZE_INT_64
+ return __libm_scalbn(x,n,1);
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return __libm_scalbn(x,n,0);
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
float scalbnf(float x, int n)
{
-#ifdef SIZE_INT_64
- return __libm_scalbnf(x,n,1);
+#ifdef SIZE_INT_64
+ return __libm_scalbnf(x,n,1);
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return __libm_scalbnf(x,n,0);
#endif
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
long double scalbnl(long double x, int n)
{
-#ifdef SIZE_INT_64
- return __libm_scalbnl(x,n,1);
+#ifdef SIZE_INT_64
+ return __libm_scalbnl(x,n,1);
#else
-#ifdef SIZE_INT_32
+#ifdef SIZE_INT_32
return __libm_scalbnl(x,n,0);
#endif
(p6) mov ret0 = 1
(p7) mov ret0 = 0
br.ret.sptk.many rp
-}
+}
END (__signbit)
strong_alias (__signbit, __signbitf)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// If x = sig * 2**n with 1 <= sig < 2
// significand returns sig
//
-// predicate registers used:
+// predicate registers used:
// p6, p7
//
-// floating-point registers used:
-// f8, f9, f10
+// floating-point registers used:
+// f8, f9, f10
.section .text
GLOBAL_LIBM_ENTRY(significand)
// f10 gets f8(sign) with f1(exp,significand)
{ .mfi
nop.m 999
- fmerge.s f10 = f8,f1
+ fmerge.s f10 = f8,f1
nop.i 999
}
{ .mfi
nop.m 999
- fnorm.s0 f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
// return sign(f8) exp(f8) significand(f8), normalized.
{ .mfi
nop.m 999
- fclass.m.unc p0,p6 = f8, 0xe7
+ fclass.m.unc p0,p6 = f8, 0xe7
nop.i 999 ;;
}
{ .mfb
nop.m 999
- fnorm.d.s0 f8 = f8
+ fnorm.d.s0 f8 = f8
br.ret.sptk b0 ;;
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// If x = sig * 2**n with 1 <= sig < 2
// significandf returns sig
//
-// predicate registers used:
+// predicate registers used:
// p6, p7
//
-// floating-point registers used:
-// f8, f9, f10
+// floating-point registers used:
+// f8, f9, f10
.section .text
GLOBAL_LIBM_ENTRY(significandf)
// f10 gets f8(sign) with f1(exp,significand)
{ .mfi
nop.m 999
- fmerge.s f10 = f8,f1
+ fmerge.s f10 = f8,f1
nop.i 999
}
{ .mfi
nop.m 999
- fnorm.s0 f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
// return sign(f8) exp(f8) significand(f8), normalized.
{ .mfi
nop.m 999
- fclass.m.unc p0,p6 = f8, 0xe7
+ fclass.m.unc p0,p6 = f8, 0xe7
nop.i 999 ;;
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// If x = sig * 2**n with 1 <= sig < 2
// significandl returns sig
//
-// predicate registers used:
+// predicate registers used:
// p6, p7
//
-// floating-point registers used:
-// f8, f9, f10
+// floating-point registers used:
+// f8, f9, f10
.section .text
GLOBAL_LIBM_ENTRY(significandl)
// f10 gets f8(sign) with f1(exp,significand)
{ .mfi
nop.m 999
- fmerge.s f10 = f8,f1
+ fmerge.s f10 = f8,f1
nop.i 999
}
{ .mfi
nop.m 999
- fnorm.s0 f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
fclass.m.unc p7,p0 = f8, 0x0b
nop.i 999 ;;
}
-
+
// p6 = TRUE ==> x is not (nan,inf,0)
// return sign(f8) exp(f1) significand(f8)
// else x is (nan,inf,0)
// return sign(f8) exp(f8) significand(f8), normalized.
{ .mfi
nop.m 999
- fclass.m.unc p0,p6 = f8, 0xe7
+ fclass.m.unc p0,p6 = f8, 0xe7
nop.i 999 ;;
}
// This will be the final result unless x double-extended denormal
{ .mfi
nop.m 999
- fnorm.s0 f8 = f8
+ fnorm.s0 f8 = f8
nop.i 999 ;;
}
// Final normalization if x double-extended denorm
{ .mfb
nop.m 999
-(p7) fnorm.s0 f8 = f8
+(p7) fnorm.s0 f8 = f8
br.ret.sptk b0 ;;
}
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//
// 3. Main path: 0.25 <= |x| < 19.0625
// For several ranges of 0.25 <= |x| < 19.0625
-// Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 +
+// Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 +
// + y^3*A3 + ... + y^19*A19)
// where y = (|x|/a) - b
-//
+//
// For each range there is particular set of coefficients.
// Below is the list of ranges:
// 1/4 <= |x| < 1/2 a = 0.25, b = 1.0
// 8.0 <= |x| < 13.0 a = 8.0, b = 1.0
// 13.0 <= |x| < 16.0 a = 8.0, b = 2.0
// 16.0 <= |x| < 19.0625 a = 16.0, b = 1.0
-// ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges separated
+// ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges separated
// for monotonicity issues resolve )
//
-// 4. Saturation path: 19.0625 <= |x| < +INF
+// 4. Saturation path: 19.0625 <= |x| < +INF
// Return tanh(x) = sign(x)*(1.0 - tiny_value)
// (tiny_value ~ 2^(-63))
//
// Registers used
//==============================================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8 = input, output
// f32 -> f64
//
-// General registers used:
+// General registers used:
// r32 -> r51, r2, r3
//
// Predicate registers used:
// p6, p8, p10, p11, p12, p14, p15
// p6 arg is zero, denormal or special IEEE
-// p8 to filter out case when signd(x) > 1.625
+// p8 to filter out case when signd(x) > 1.625
// p10 to filter out case when |x| < 0.25
-// p11 to filter out case when signd(x) <= 1.625
+// p11 to filter out case when signd(x) <= 1.625
// p12 to filter out case when |x| >= 19.0625
// p14 set to 1 for positive x
// p15 set to 1 for negative x
fTQuadr = f59
fTDeg3 = f60
fTDeg7 = f61
-fArgAbsNormSgn = f62
+fArgAbsNormSgn = f62
fTQuadrSgn = f63
fTwo = f64
// Main path coefficients:
// Coefficients ##0..15 ("main" coefficient tables)
-// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
+// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19
data8 0xC8C0D38687F36EBA, 0x00003FCE //A18
data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17
data8 0x942226246A8C2A86, 0x00003FF1 //A5
data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4
//
-// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
+// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
data8 0xC4A7B8FB672A8520, 0x00003FDC //A19
data8 0xA20724B847E13499, 0x0000BFE0 //A18
data8 0xE17DB53F02E4D340, 0x00003FE2 //A17
data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5
data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4
//
-// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
+// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19
data8 0xDD9226310A272046, 0x0000BFEC //A18
data8 0xA038042D28B0D665, 0x00003FEF //A17
data8 0xBDACE06F531D9491, 0x0000BFFA //A5
data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4
//
-// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
+// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
data8 0x856EC3B0330A385A, 0x00003FEB //A19
data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18
data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17
data8 0x9CABD76D1D5C3878, 0x00003FFC //A5
data8 0x92906D077941CAA9, 0x0000BFFD //A4
//
-// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
+// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
data8 0x9232D19F71709AC9, 0x0000BFF5 //A19
data8 0x819E31323F5DD3F8, 0x00003FF8 //A18
data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17
data8 0xB99874B482BD17EE, 0x00003FFC //A5
data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4
//
-// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
+// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19
data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18
data8 0x818DA9F5396390A5, 0x00003FFA //A17
data8 0x80E375C1B847B72F, 0x00003FF6 //A5
data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4
//
-// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
+// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
data8 0xE29D17C510F86F6B, 0x00003FF3 //A19
data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18
data8 0xA406547E50360693, 0x00003FF5 //A17
data8 0x98176FD06229A385, 0x0000BFE1 //A4
//
// Binary subranges
-// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
data8 0xEF2EE841288F6706, 0x00003FE9 //A19
data8 0xE65D5B74B85F82A6, 0x00003FEB //A18
data8 0xE495FC21E42A79FF, 0x00003FEA //A17
data8 0xB998746D57061F74, 0x00003FF7 //A5
data8 0xE93FB2F482327C19, 0x0000BFF7 //A4
//
-// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
//
-// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
+// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
data8 0xE2834E2D68C1128C, 0x00003FEA //A18
data8 0x97B117611B317379, 0x00003FEB //A17
data8 0x98176E2309B7C73A, 0x0000BFDD //A4
//
// Coefficients ##16..19 ("tail" coefficient tables)
-// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
+// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
data8 0xF0A4D02960B60E69, 0x00003FFC //A1
data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
//
-// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
+// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
data8 0xC954A37D1A1CA070, 0x00003FFD //A1
data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
//
-// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
+// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
data8 0xD42E9175A6EA1397, 0x00003FFB //A3
data8 0xA3C361378A55CF56, 0x0000BFFD //A2
data8 0xD706E07CC8622983, 0x00003FFD //A1
data8 0xE42327BB13076BD6, 0x00003FD5 //A1
data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
//
-// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25
+// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25
// ('tanh_near_zero' path)
data8 0xBF2BA5D26E479D0C //A9
data8 0x3F4336D96F81EE26 //A8
//
// 1.0 - 2^(-63)
// ('tanh_saturation' path)
-data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
+data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
LOCAL_OBJECT_END(tanh_data)
// CAUTION: The order of table coefficients shouldn't be changed!
};;
{ .mfi
- getf.d rArg = f8 // x in GR
- fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
+ getf.d rArg = f8 // x in GR
+ fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
// 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
shl rArgSgnd = rArgSgnd, 52 // mask for exponent
}
nop.f 0
(p6) br.cond.spnt _tanh_spec // Branch to zero, denorm & specs
};;
-
+
{ .mfi
and rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
fmerge.s fArgAbs = f1, f8 // |x|
- shr rShiftedAbsArg = rAbsArg, 44 // Select only necessary
+ shr rShiftedAbsArg = rAbsArg, 44 // Select only necessary
// bits of absolute arg
}
{ .mfi
{ .mfi
sub rIndex = rShiftedArgMasked, rBias // index << 8
- nop.f 0
+ nop.f 0
cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
}
{ .mfb
(p8) cmp.gt p8, p11 = rAbsArg, rTwo // If arg is greater than 2.0?
// (then we should use binary subranges)
- nop.f 0
+ nop.f 0
(p10) br.cond.spnt tanh_near_zero // branch out if |x| < 0.25
};;
.pred.rel "mutex",p8,p11
{ .mfi
-(p8) add rIndex = 0x400, rIndex // Make pointer to binary
+(p8) add rIndex = 0x400, rIndex // Make pointer to binary
// subranges
(p11) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1 // |x|/b - 1.0
addl rSaturation = 0x40331, r0 // shifted bits of 19.0625
}
{ .mfi
- nop.m 0
+ nop.m 0
(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, fTwo // |x|/b - 2.0
// this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
- nop.i 0
+ nop.i 0
}
;;
adds rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs
fmerge.s fSignumX = f8, f1 // signum(x)
nop.i 0
-}
+}
{ .mfb
cmp.le p12, p0 = rSaturation, rShiftedAbsArg // |x|>=19.0625?
nop.f 0
{.mfi
ldfe fA12 = [rCoeffAddr2], 32 // Load A12
nop.f 0
- cmp.lt p15, p14 = rArg, r0 // Arg positive (p14)
+ cmp.lt p15, p14 = rArg, r0 // Arg positive (p14)
// or negative (p15)?
};;
{.mfi
ldfe fA11 = [rCoeffAddr1], 32 // Load A11
nop.f 0
- add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"
- // coeffs to load
+ add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"
+ // coeffs to load
}
{.mfi
ldfe fA10 = [rCoeffAddr2], 32 // Load A10
{ .mfi
nop.m 0
- fma.s1 fA15 = fA15, fTSqr, fA13 // Polynomial
+ fma.s1 fA15 = fA15, fTSqr, fA13 // Polynomial
nop.i 0
}
{ .mfi
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fA7 = fA7, fTSqr, fA5 // Polynomial
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fRes = fRes, fTQuadr, fA15 // Polynomial
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fA4 = fA4, fTSqr, fA2 // Polynomial
nop.i 0
};;
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 fA4 = fA7, fTDeg3, fA4 // Polynomial
nop.i 0
};;
tanh_near_zero:
{ .mfi
adds rCoeffAddr1 = 0xC80, rDataPtr // address of A9
- fma.s0 fTSqr = fArgSqr, fArgSqr, f0 // x^4
+ fma.s0 fTSqr = fArgSqr, fArgSqr, f0 // x^4
nop.i 0
}
{ .mfi
-
+
// 0, denormals and special IEEE numbers path /////////////////////////////////
_tanh_spec:
-{ .mfi
- cmp.lt p15, p14 = rArg, r0 // Is arg negative (p15)
+{ .mfi
+ cmp.lt p15, p14 = rArg, r0 // Is arg negative (p15)
// or positive p14)
fclass.m p6,p0 = f8, 0x23 // To filter infinities
- // 0x23 = @pos|@neg|@inf
+ // 0x23 = @pos|@neg|@inf
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
// 0xC7 = @pos|@neg|@zero|@qnan|@snan
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
-(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
+(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
(p6) br.ret.spnt b0 // exit for x = INF
};;
-{ .mfb
+{ .mfb
nop.m 0
-(p7) fma.d.s0 f8 = f8, f1, f8 // +/-0 for 0 args
+(p7) fma.d.s0 f8 = f8, f1, f8 // +/-0 for 0 args
// and NaNs for NaNs
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
};;
-{ .mfi
+{ .mfi
nop.m 0
fnorm.s0 f8 = f8 // Normalize arg
nop.i 0
};;
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fnma.d.s0 f8 = f8, f8, f8 // res = r-r^2
nop.i 0
}
-{ .mfb
+{ .mfb
nop.m 0
(p15) fma.d.s0 f8 = f8, f8, f8 // res = r+r^2
br.ret.sptk b0 // 0, denormals, specials return
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// 6. |x| = INF
// Return tanhf(x) = sign(x) * 1.0
//
-// 7. x = [S,Q]NaN
+// 7. x = [S,Q]NaN
// Return tanhf(x) = QNaN
//
// 8. x is positive denormal
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
// f32 -> f59
-// General registers used:
+// General registers used:
// r32 -> r46, r2, r3
// Predicate registers used:
data8 0x4090E74249760FDD // D1
data8 0xC04B6F537FCF2F1E // D2
data8 0x3E0DCD879C91ADEA // B0
-// Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125
+// Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125
data8 0xBFD555551E8245B7 // A0
data8 0x3FC110E63F52E689 // A1
data8 0xBFAB8CD6A5B7BAFA // A2
data8 0x3FFA729FC7085674 // A1
data8 0xBFF2F44D923A8FA4 // A2
data8 0x3FE092FC5712227E // A3
-// Polynomial coefficients for the tanh(x), 8.0 <= |x| <= 9.125
+// Polynomial coefficients for the tanh(x), 8.0 <= |x| <= 9.125
data8 0x3FEFFF5769EE3041 // A0
data8 0x3EFBBF148D850891 // A1
data8 0xBEC86BCEF0F5C2FE // A2
;;
{ .mfi
- getf.s rArg = f8 // x in GR
+ getf.s rArg = f8 // x in GR
fclass.m p7,p0 = f8, 0x0b // is x denormal ?
// sign bit and 2 most bits in significand
- shl rMask = rMask, 20
+ shl rMask = rMask, 20
}
{ .mfi
ld8 rDataPtr = [rDataPtr]
shr rOffset2 = rOffset2, 21
}
{ .mfi
- cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.3125?
+ cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.3125?
nop.f 0
adds rCoeffAddr3 = 16, rDataPtr
}
{ .mfi
shladd rCoeffAddr1 = rBias, 4, rDataPtr
fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
- // is |x| < 9.125?
- cmp.lt p11, p12 = rAbsArg, rSaturation
+ // is |x| < 9.125?
+ cmp.lt p11, p12 = rAbsArg, rSaturation
}
{ .mfi
shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3
{ .mfi
(p11) ldfpd fC0, fC1 = [rCoeffAddr1]
(p9) fmerge.s f8 = f8,f1 // +/- inf
-(p12) adds rDataPtr = 544, rDataPtr
+(p12) adds rDataPtr = 544, rDataPtr
}
{ .mfb
(p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16
{ .mfb
nop.m 0
fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
-(p13) br.cond.spnt tanhf_close_to_saturation
+(p13) br.cond.spnt tanhf_close_to_saturation
}
;;
{ .mfi
nop.m 0
- fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
+ fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
nop.i 0
}
{ .mfi
{ .mfi
nop.m 0
// C3*|x|^3 + C2*x^2 + C1*|x| + C0
- fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
+ fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
nop.i 0
}
;;
{ .mfi
nop.m 0
// PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
- fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
+ fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
nop.i 0
}
;;
{ .mfi
nop.m 0
- // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
- fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
+ // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
+ fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
- // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
- fma.d.s1 fPolC = fPolC, f1, fB0
+ // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
+ fma.d.s1 fPolC = fPolC, f1, fB0
nop.i 0
}
-;;
+;;
{ .mfi
nop.m 0
(p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
br.ret.sptk b0 // Exit for 9.125 <=|x|< +inf
}
;;
-
+
// Here if 8.0 <= |x| < 9.125
tanhf_close_to_saturation:
{ .mfi
nop.m 0
fma.s1 fPolA = fA3, fAbsArg, fA2 // A3*|x| + A2
nop.i 0
-}
+}
;;
.pred.rel "mutex", p14, p15
nop.m 0
// for positive x
(p14) fma.s.s0 f8 = fPolA, fArgSqr, fPolATmp
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//
// 3. Main path: 1/8 <= |x| < 22.8
// For several ranges of 1/8 <= |x| < 22.8
-// Return tanhl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
+// Return tanhl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
// + y^3*A3 + y^4*A4 + ... + y^25*A25 )
// where y = (|x|/a) - b
//
// 8.0 <= |x| < 13.0 a = 8.0, b = 1.5
// 13.0 <= |x| < 16.0 a = 8.0, b = 2.0
// 16.0 <= |x| < 22.8 a = 16.0, b = 1.5
-// ( [3.25;4.0], [6.5;8.0], [13.9;16.0] subranges separated
+// ( [3.25;4.0], [6.5;8.0], [13.9;16.0] subranges separated
// for monotonicity issues resolve )
//
-// 4. Saturation path: 22.8 <= |x| < +INF
+// 4. Saturation path: 22.8 <= |x| < +INF
// Return tanhl(x) = sign(x)*(1.0 - tiny_value)
// (tiny_value ~ 1e-1233)
//
// Multiprecision have to be performed only for first few
// polynomial iterations (up to 3-rd x degree)
// Here we use the same parallelisation way as above:
-// Split whole polynomial to first, "multiprecision" part, and second,
+// Split whole polynomial to first, "multiprecision" part, and second,
// so called "tail", native precision part.
//
-// 1) Multiprecision part:
+// 1) Multiprecision part:
// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
// v1 and v2 terms calculated in parallel
//
// v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 )
// v3 is splitted to 2 even parts (10 coefficient in each one).
// These 2 parts are also factorized using binary tree technique.
-//
+//
// So Multiprecision and Tail parts cost is almost the same
// and we have both results ready before final summation.
//
// not at the last operation but much more earlier and at
// several places.
//
-// 4. Saturation path: 22.8 <= |x| < +INF
+// 4. Saturation path: 22.8 <= |x| < +INF
//
// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
// just to meet IEEE requirements for different rounding modes in this case.
//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8 - input & output
// f32 -> f92
-// General registers used:
-// r2, r3, r32 -> r52
+// General registers used:
+// r2, r3, r32 -> r52
// Predicate registers used:
// p0, p6 -> p11, p14, p15
// p6 - arg is zero, denormal or special IEEE
// p7 - arg is in [16;32] binary interval
-// p8 - arg is in one of subranges
+// p8 - arg is in one of subranges
// [3.25;4.0], [6.5;8.0], [13.9;16.0]
// p9 - arg < 1/8
-// p10 - arg is NOT in one of subranges
+// p10 - arg is NOT in one of subranges
// [3.25;4.0], [6.5;8.0], [13.9;16.0]
// p11 - arg in saturation domain
// p14 - arg is positive
fA17 = f52
fA18 = f53
fA19 = f54
-fA20 = f55
-fA21 = f56
-fA22 = f57
+fA20 = f55
+fA21 = f56
+fA22 = f57
fA23 = f58
fA24 = f59
fA25 = f60
fRes3L = f80
fRes4 = f81
-fTT = f82
+fTT = f82
fTH = f83
fTL = f84
-fTT2 = f85
+fTT2 = f85
fTH2 = f86
fTL2 = f87
////////// Main tables ///////////
_0p125_to_0p25_data: // exp = 2^-3
-// Polynomial coefficients for the tanh(x), 1/8 <= |x| < 1/4
+// Polynomial coefficients for the tanh(x), 1/8 <= |x| < 1/4
data8 0x93D27D6AE7E835F8, 0x0000BFF4 //A3 = -5.6389704216278164626050408239e-04
data8 0xBF66E8668A78A8BC //A2H = -2.7963640930198357253955165902e-03
data8 0xBBD5384EFD0E7A54 //A2L = -1.7974001252014762983581666453e-20
LOCAL_OBJECT_END(tanhl_data)
LOCAL_OBJECT_START(_0p25_to_0p5_data)
-// Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2
+// Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2
data8 0xB6E27B747C47C8AD, 0x0000BFF6 //A3 = -2.7905990032063258105302045572e-03
data8 0xBF93FD54E226F8F7 //A2H = -1.9521070769536099515084615064e-02
data8 0xBC491BC884F6F18A //A2L = -2.7222721075104525371410300625e-18
LOCAL_OBJECT_END(_0p25_to_0p5_data)
LOCAL_OBJECT_START(_0p5_to_1_data)
-// Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1
+// Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1
data8 0xAB402BE491EE72A7, 0x00003FF7 //A3 = 5.2261556931080934657023772945e-03
data8 0xBFB8403D3DDA87BE //A2H = -9.4730212784752659826992271519e-02
data8 0xBC6FF7BC2AB71A8B //A2L = -1.3863786398568460929625760740e-17
LOCAL_OBJECT_END(_0p5_to_1_data)
LOCAL_OBJECT_START(_1_to_2_data)
-// Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0
+// Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0
data8 0xB3D8FB48A548D99A, 0x00003FFB //A3 = 8.7816203264683800892441646129e-02
data8 0xBFC4EFBD8FB38E3B //A2H = -1.6356629864377389416141284073e-01
data8 0xBC77687FD8087B23 //A2L = -2.0303377679446772162287121190e-17
LOCAL_OBJECT_END(_1_to_2_data)
LOCAL_OBJECT_START(_2_to_3p25_data)
-// Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25
+// Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25
data8 0xD45657BEC559E366, 0x00003FFA //A3 = 5.1840155367548909799883161889e-02
data8 0xBFA41B109CA6AB81 //A2H = -3.9268988726084870510835145296e-02
data8 0xBC2C3D708A4E56C5 //A2L = -7.6544669252238280132415018518e-19
LOCAL_OBJECT_END(_2_to_3p25_data)
LOCAL_OBJECT_START(_4_to_6p5_data)
-// Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
+// Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
data8 0x896FDBD321A0BE58, 0x00003FF5 //A3 = 1.0485606995331904734870550114e-03
data8 0xBF39C522B95A37D6 //A2H = -3.9321992640217512306882730044e-04
data8 0xBBA9B3EC39A45338 //A2L = -2.7213922673282819034134988241e-21
LOCAL_OBJECT_END(_4_to_6p5_data)
LOCAL_OBJECT_START(_8_to_13_data)
-// Polynomial coefficients for the tanh(x), 8 <= |x| < 13
+// Polynomial coefficients for the tanh(x), 8 <= |x| < 13
data8 0xDD6050A898303460, 0x00003FE6 //A3 = 5.1543170295688189081352133793e-08
data8 0xBE44C1078FDBADC0 //A2H = -9.6643444318955652627581125180e-09
data8 0xBAF95FCAA6DBBA6F //A2L = -1.3118146684038113473094275420e-24
LOCAL_OBJECT_END(_8_to_13_data)
LOCAL_OBJECT_START(_16_to_22p8_data)
-// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
+// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
data8 0x992C00F33DDE804D, 0x00003FCE //A3 = 2.1256869805798788337547274131e-15
data8 0x3C8D42EA28102760 //A2H = 5.0760412270332007485198379096e-17
data8 0x391A747B43B072DD //A2L = 1.2737621993898125881520341053e-33
LOCAL_OBJECT_END(_16_to_22p8_data)
LOCAL_OBJECT_START(_3p25_to_4_data)
-// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
data8 0xE9E07240432926E6, 0x00003FF7 //A3 = 7.1373517862636557382403555215e-03
data8 0xBF75F495227AF306 //A2H = -5.3602052282115727338540622782e-03
data8 0xBBBE92D355A6B716 //A2L = -6.4741983326810209847018826624e-21
LOCAL_OBJECT_END(_3p25_to_4_data)
LOCAL_OBJECT_START(_6p5_to_8_data)
-// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
data8 0xA11C8A63815E5657, 0x00003FEF //A3 = 1.9205985861286093001394561449e-05
data8 0xBEDE355AD6CB61D8 //A2H = -7.2022479400070228499307345427e-06
data8 0xBB8E6B50B8468A63 //A2L = -8.0518953122203408718779840543e-22
LOCAL_OBJECT_END(_6p5_to_8_data)
LOCAL_OBJECT_START(_13_to_16_data)
-// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
+// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
data8 0x98176FD2075BDBD5, 0x00003FDB //A3 = 1.7290807363028159200235264756e-11
data8 0xBD8C8464F76162D1 //A2H = -3.2420263805679445515400340441e-12
data8 0xBA2D56B508E0F1FD //A2L = -1.8515322669984580704502445180e-28
//////// "Tail" tables //////////
LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
-// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
+// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
data8 0x9D7D206E97ADC83A, 0x0000BFCC //A13 = -5.4639895428711257047470806445e-16
data8 0xA8972B666A845810, 0x00003FD3 //A12 = 7.4869224589947988668562043110e-14
data8 0x9A5B31511C9F4698, 0x0000BFD4 //A11 = -1.3709586467430093373657009487e-13
LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
-// Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2
+// Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2
data8 0x9E2972C008B9965E, 0x0000BFDC //A13 = -3.5961854154738002253192260213e-11
data8 0xC3EABA3D219BEA8A, 0x00003FDB //A12 = 2.2273173303628274478819473067e-11
data8 0xC50FB68D960D5CD9, 0x00003FE1 //A11 = 1.4338102430978399800743148719e-09
LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
LOCAL_OBJECT_START(_0p5_to_1_data_tail)
-// Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1
+// Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1
data8 0xDF67FB36FFA2A538, 0x00003FE7 //A13 = 1.0403160796697495720021114635e-07
data8 0xB7FB80FB5AFA63A4, 0x0000BFE8 //A12 = -1.7134699677764282023124981753e-07
data8 0xC87625A0BA7D6C5F, 0x0000BFEA //A11 = -7.4677732458471897291461679095e-07
LOCAL_OBJECT_END(_0p5_to_1_data_tail)
LOCAL_OBJECT_START(_1_to_2_data_tail)
-// Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0
+// Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0
data8 0xCCAEE174EAC17F78, 0x0000BFEE //A13 = -1.2200065117856038355953618829e-05
data8 0xA39DD0981D1A2776, 0x0000BFF0 //A12 = -3.9009204899026604074167603200e-05
data8 0xB7104FA27FAF80D0, 0x00003FF2 //A11 = 1.7458316338540792661905876072e-04
LOCAL_OBJECT_END(_1_to_2_data_tail)
LOCAL_OBJECT_START(_2_to_3p25_data_tail)
-// Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25
+// Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25
data8 0x92E1711A3BD6408B, 0x0000BFF4 //A13 = -5.6030514548041036913731470443e-04
data8 0x8B9BD885FF3E98C5, 0x00003FF5 //A12 = 1.0651304064581604055612602669e-03
data8 0xD041356C7FA26A22, 0x0000BFF5 //A11 = -1.5888574328066952147023520244e-03
LOCAL_OBJECT_END(_2_to_3p25_data_tail)
LOCAL_OBJECT_START(_4_to_6p5_data_tail)
-// Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
+// Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
data8 0x870CCE8C76C52C7E, 0x00003FF5 //A13 = 1.0303499350193060915603525934e-03
data8 0xE1431E54AD2A738B, 0x0000BFF5 //A12 = -1.7186140560972621669872002486e-03
data8 0xAB20056533E28734, 0x00003FF6 //A11 = 2.6111615345168277554841545330e-03
LOCAL_OBJECT_END(_4_to_6p5_data_tail)
LOCAL_OBJECT_START(_8_to_13_data_tail)
-// Polynomial coefficients for the tanh(x), 8 <= |x| < 13
+// Polynomial coefficients for the tanh(x), 8 <= |x| < 13
data8 0xE50C3476BED020AA, 0x00003FF0 //A13 = 5.4609221347524272615754239857e-05
data8 0xBA16F5F4EDC0EABC, 0x0000BFF0 //A12 = -4.4367239594986428539386662937e-05
data8 0x8B916C2F002C3D91, 0x00003FF0 //A11 = 3.3275617838067362533536610680e-05
LOCAL_OBJECT_END(_8_to_13_data_tail)
LOCAL_OBJECT_START(_16_to_22p8_data_tail)
-// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
+// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
data8 0xEAF4AF87336E81B1, 0x00003FEF //A13 = 2.8008914392791730186582989654e-05
data8 0xD5B309EA768E2711, 0x00003FED //A12 = 6.3687375204024238267961143128e-06
data8 0xA4048CA537113538, 0x00003FEB //A11 = 1.2220276227448617951538196845e-06
LOCAL_OBJECT_END(_16_to_22p8_data_tail)
LOCAL_OBJECT_START(_3p25_to_4_data_tail)
-// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
data8 0xBE9A2BE19F21BA1C, 0x0000BFEE //A13 = -1.1360778336288065244475976873e-05
data8 0xF84910F515BDB014, 0x00003FED //A12 = 7.3994819819577018481862729782e-06
data8 0xC4C84FB788AA4007, 0x00003FEF //A11 = 2.3458298013663976251972482656e-05
LOCAL_OBJECT_END(_3p25_to_4_data_tail)
LOCAL_OBJECT_START(_6p5_to_8_data_tail)
-// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
data8 0xA6881D7D21774BFD, 0x00003FEF //A13 = 1.9852125640303530752913966680e-05
data8 0x875E983AA042E605, 0x0000BFF0 //A12 = -3.2274606306629334402383651599e-05
data8 0xCB19E01E94FC133C, 0x00003FF0 //A11 = 4.8423069963831314927026982707e-05
LOCAL_OBJECT_END(_6p5_to_8_data_tail)
LOCAL_OBJECT_START(_13_to_16_data_tail)
-// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
+// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
data8 0x9D6CCDA4767CA6D9, 0x00003FE5 //A13 = 1.8326683535066775712253572575e-08
data8 0xFFAF154F334BF403, 0x0000BFE4 //A12 = -1.4882762852665077172347508377e-08
data8 0xBFC68FA7C61B6C17, 0x00003FE4 //A11 = 1.1162810813806544919835662888e-08
LOCAL_OBJECT_END(_13_to_16_data_tail)
LOCAL_OBJECT_START(_0_to_1o8_data)
-// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.125
+// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.125
data8 0xBA0EC1879495150B, 0x0000BFF5 // A15 = -1.4195071451378679802688367813e-03
data8 0xEB5A82898D1BCBA4, 0x00003FF6 // A13 = 3.5912102408030526706365632879e-03
data8 0x91370DAFE0B64438, 0x0000BFF8 // A11 = -8.8632234251336964576640807982e-03
GLOBAL_LIBM_ENTRY(tanhl)
{ .mfi
- alloc r32 = ar.pfs, 0, 21, 0, 0
+ alloc r32 = ar.pfs, 0, 21, 0, 0
fmerge.se fArgAbsNorm = f1, f8 // normalized x (1.0 <= x < 2.0)
addl rSignBit = 0x20000, r0 // Set sign bit for exponent
}
{ .mfi
getf.exp rArgExp = f8 // Get arg exponent
- fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
+ fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
// 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
- addl rBias = 0xfffc, r0 // Value to subtract from exp
+ addl rBias = 0xfffc, r0 // Value to subtract from exp
// to get actual interval number
}
{ .mfi
ld8 rDataPtr = [rDataPtr] // Get real common data pointer
fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path)
- addl r2to4 = 0x10000, r0 // unbiased exponent
+ addl r2to4 = 0x10000, r0 // unbiased exponent
// for [2;4] binary interval
};;
{ .mfi
- getf.sig rArgSig = f8 // Get arg significand
+ getf.sig rArgSig = f8 // Get arg significand
fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive?
addl rSaturation = 0xb70, r0 // First 12 bits of
// saturation value signif.
}
{ .mfi
- setf.d f1p5 = r1p5 // 1.5 construction
+ setf.d f1p5 = r1p5 // 1.5 construction
fma.s1 f2p0 = f1,f1,f1 // 2.0 construction
addl r1625Sign = 0xd01, r0 // First 12 bits of
// 1.625 value signif.
{ .mfb
addl rTiny = 0xf000, r0 // Tiny value for saturation path
nop.f 0
-(p6) br.cond.spnt tanhl_spec // Branch to zero, denorm & specs
+(p6) br.cond.spnt tanhl_spec // Branch to zero, denorm & specs
};;
{ .mfi
sub rInterval = rArgExp, rBias // Get actual interval number
nop.f 0
- shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign.
+ shr.u rArgSig = rArgSig, 52 // Leave only 12 bits of sign.
}
{ .mfi
adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
};;
{ .mfi
-(p8) cmp.le p8, p10 = r1625Sign, rArgSig // If signd is greater
+(p8) cmp.le p8, p10 = r1625Sign, rArgSig // If signd is greater
// than 1.625? (arg is at one of binary subranges)
nop.f 0
- shl rOffset = rInterval, 8 // Make offset from
+ shl rOffset = rInterval, 8 // Make offset from
// interval number
}
{ .mfi
};;
{ .mfi
-(p8) adds rOffset = 0x400, rOffset // Add additional offset
+(p8) adds rOffset = 0x400, rOffset // Add additional offset
// (arg is at one of binary subranges)
fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path)
shl rTailOffset = rInterval, 7 // Make offset to "tail" data
// from interval number
}
{ .mib
- setf.exp fTiny = rTiny // Construct "tiny" value
+ setf.exp fTiny = rTiny // Construct "tiny" value
// for saturation path
cmp.ltu p11, p0 = 0x7, rInterval // if arg > 32
-(p9) br.cond.spnt _0_to_1o8
+(p9) br.cond.spnt _0_to_1o8
};;
{ .mfi
- add rAddr1 = rDataPtr, rOffset // Get address for
- // interval data
+ add rAddr1 = rDataPtr, rOffset // Get address for
+ // interval data
nop.f 0
shl rTailAddOffset = rInterval, 5 // Offset to interval
- // "tail" data
+ // "tail" data
}
{ .mib
add rAddr2 = rShiftedDataPtr, rOffset // Get second
- // address for interval data
-(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
+ // address for interval data
+(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
// in [22.8;32] interval
(p11) br.cond.spnt _saturation // Branch to Saturation path
};;
.pred.rel "mutex",p8,p10
{ .mfi
ldfe fA18 = [rAddr1], 16 // Load A18
-(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0
+(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Add 2.0
// (arg is at one of binary subranges)
adds rTailAddr2 = 0x10, rTailAddr1 // First tail
// data address
}
{ .mfi
- ldfe fA25 = [rAddr2], 16 // Load A25
-(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5
+ ldfe fA25 = [rAddr2], 16 // Load A25
+(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Add 1.5
// to normalized arg
nop.i 0
};;
fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail
+ fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail
nop.i 0
};;
fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail
nop.i 0
fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0
nop.i 0
fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail
nop.i 0
fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail
nop.i 0
fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail
nop.i 0
fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fRes1L = fA0H, f1, fRes1H // A1*x+A0
nop.i 0
fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail
nop.i 0
fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail
nop.i 0
fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
nop.i 0
fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail
nop.i 0
fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail
nop.i 0
fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fResH = fRes2H, f1, fRes1H // High result
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 fResL = fRes1H, f1, fResH // Low result
nop.i 0
// .s0 - for symmetry issue resolving at +/-inf rounding mode
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fResL = fResL, f1, fRes2H // Low result
nop.i 0
};;
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fma.s0 f8 = fResL, f1, fResH// Add high and low results
nop.i 0
}
-{ .mfb
+{ .mfb
nop.m 0
(p15) fms.s0 f8 = fResL, f1, fResH // Add high and low results
br.ret.sptk b0 // Main path return
_saturation:
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
(p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1
br.ret.sptk b0 // Saturation path return
// 0, denormals and special IEEE numbers path /////////////////////////////////
tanhl_spec:
-{ .mfi
+{ .mfi
nop.m 0
fclass.m p6,p0 = f8, 0x23 // To filter infinities
- // 0x23 = @pos|@neg|@inf
+ // 0x23 = @pos|@neg|@inf
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
// 0xC7 = @pos|@neg|@zero|@qnan|@snan
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
-(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
+(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
(p6) br.ret.spnt b0 // exit for x = INF
};;
-{ .mfb
+{ .mfb
nop.m 0
-(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
+(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
// and NaNs for NaNs
(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
};;
-{ .mfi
+{ .mfi
nop.m 0
fnorm.s0 f8 = f8 // Normalize arg
nop.i 0
};;
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fnma.s0 f8 = f8, f8, f8 // res = r-r^2
nop.i 0
}
-{ .mfb
+{ .mfb
nop.m 0
(p15) fma.s0 f8 = f8, f8, f8 // res = r+r^2
br.ret.sptk b0 // 0, denormals, IEEE specials return
// 0 < |x| < 1/8 path /////////////////////////////////////////////////////////
_0_to_1o8:
-{ .mmi
+{ .mmi
adds rAddr1 = 0x11e0, rDataPtr // Ptr 1 to coeffs
adds rAddr2 = 0x11f0, rDataPtr // Ptr 2 to coeffs
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfe fA15 = [rAddr1], 32 // Load A15
ldfe fA13 = [rAddr2], 32 // Load A13
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfe fA11 = [rAddr1], 32 // Load A11
ldfe fA9 = [rAddr2], 32 // Load A9
nop.i 0
};;
-{ .mmi
+{ .mmi
ldfe fA7 = [rAddr1], 32 // Load A7
ldfe fA5 = [rAddr2] // Load A5
nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe fA3 = [rAddr1] // Load A3
fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // a^4
+ fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // a^4
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fArgEight = fArgFour, fArgFour, f0 // a^8
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
fma.s0 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3
br.ret.sptk b0 // [0;1/8] interval return
};;
-
+
GLOBAL_LIBM_END(tanhl)
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
//
// 02/02/00 (hand-optimized)
// 04/04/00 Unwind support added
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code,and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 10/12/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// IEEE Special Conditions:
//
// tgamma(+inf) = +inf
-// tgamma(-inf) = QNaN
-// tgamma(+/-0) = +/-inf
+// tgamma(-inf) = QNaN
+// tgamma(+/-0) = +/-inf
// tgamma(x<0, x - integer) = QNaN
// tgamma(SNaN) = QNaN
// tgamma(QNaN) = QNaN
// Overview
//
// The method consists of three cases.
-//
+//
// If 2 <= x < OVERFLOW_BOUNDARY use case tgamma_regular;
// else if 0 < x < 2 use case tgamma_from_0_to_2;
// else if -(i+1) < x < -i, i = 0...184 use case tgamma_negatives;
// r = x - N, note 0 <= r < 1
//
// n = N & ~0xF - index of table that contains coefficient of
-// polynomial approximation
+// polynomial approximation
// i = N & 0xF - is used in recursive formula
-//
+//
//
// Step 2: Approximation
// ---------------------
// -----------------
// In case when i > 0 we need to multiply P22n(r) by product
// R(i)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
-// we can calculate R as follow:
+// we can calculate R as follow:
// R(i) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
// even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
// *(i-1) if i is odd. In both cases we need to calculate
// if 1.25 <= x < 1.5 than GAMMA(x) = P15(x-x_min) where
// x_min is point of local minimum on [1; 2] interval.
// if 1.5 <= x < 2.0 than GAMMA(x) = P15(x-1.5)
-// and
+// and
// if 0 < x < 1 than GAMMA(x) = GAMMA(x+1)/x
//
// Case -(i+1) < x < -i, i = 0...184
//
// Step 1: Reduction
// -----------------
-// Note that period of sin(PI*x) is 2 and range reduction for
-// sin(PI*x) is like to range reduction for GAMMA(x)
+// Note that period of sin(PI*x) is 2 and range reduction for
+// sin(PI*x) is like to range reduction for GAMMA(x)
// i.e r = x - [x] with exception of cases
// when r > 0.5 (in such cases r = 1 - (x - [x])).
//
// Step 2: Approximation
// ---------------------
-// To approximate sin(PI*x)/PI = sin(PI*(2*n+r))/PI =
+// To approximate sin(PI*x)/PI = sin(PI*(2*n+r))/PI =
// = (-1)^n*sin(PI*r)/PI Taylor series is used.
// sin(PI*r)/PI ~ S21(r).
//
// ----------------
// To calculate 1/(x*GAMMA(x)*S21(r)) we use frcpa instruction
// with following Newton-Raphson interations.
-//
+//
//
//*********************************************************************
}
{ .mfb
ldfe FR_C01 = [GR_ad_Co],32
-(p7) fms.s1 FR_r02 = FR_r02,f1,f1
+(p7) fms.s1 FR_r02 = FR_r02,f1,f1
// jump if x is NaTVal, NaN, +/-0, +/-INF
(p10) br.cond.spnt tgamma_spec
};;
{ .mfi
ldfe FR_C30 = [GR_ad_Co],32
fma.s1 FR_Rq3 = FR_Rq3,FR_6,FR_X2pX // (x-5)*(x-6)
- nop.i 0
+ nop.i 0
};;
{ .mfi
ldfe FR_C40 = [GR_ad_Ce],32
}
{ .mfi
ldfe FR_C70 = [GR_ad_Co7],32
- fma.s1 FR_rs = f0,f0,FR_r // reduced arg for sin(pi*x)
+ fma.s1 FR_rs = f0,f0,FR_r // reduced arg for sin(pi*x)
add GR_ad_Co = 0x550,GR_ad_Data
};;
{ .mfi
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C01 = FR_C01,FR_C11,f0
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C21 = FR_C21,FR_C31,f0
nop.i 0
}
(p12) cmp.lt.unc p7,p0 = 2,GR_Sig2 // should mul by FR_Rq2?
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C41 = FR_C41,FR_C51,f0
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
(p12) cmp.lt.unc p9,p0 = 6,GR_Sig2 // should mul by FR_Rq4?
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C61 = FR_C61,FR_C71,f0
(p15) cmp.eq p11,p0 = r0,r0
}
(p12) cmp.lt.unc p8,p0 = 10,GR_Sig2 // should mul by FR_Rq6?
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C81 = FR_C81,FR_C91,f0
nop.i 0
}
(p14) cmp.ltu p0,p11 = 0x9,GR_Tbl_Ind
};;
{ .mfi
- nop.m 0
- fcvt.xf FR_RqLin = FR_Xt
+ nop.m 0
+ fcvt.xf FR_RqLin = FR_Xt
nop.i 0
}
{ .mfi
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C01 = FR_C01,FR_C21,f0
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_rs4 = FR_rs2,FR_rs2,f0
(p12) cmp.lt.unc p8,p0 = 4,GR_Sig2 // should mul by FR_Rq3?
};;
(p12) cmp.lt.unc p9,p0 = 12,GR_Sig2 // should mul by FR_Rq7?
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C41 = FR_C41,FR_C61,f0
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p9) fma.s1 FR_Rq5 = FR_Rq5,FR_Rq7,f0
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C81 = FR_C81,FR_CA1,f0
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
mov GR_ExpOf1 = 0x2FFFF
}
{ .mfi
- nop.m 0
+ nop.m 0
(p6) fms.s1 FR_RqLin = FR_AbsX,f1,FR_RqLin
(p12) cmp.lt.unc p8,p0 = 8,GR_Sig2 // should mul by FR_Rq5?
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_C01 = FR_C01,FR_C41,f0
nop.i 0
}
}
{ .mfi
nop.m 0
-(p15) fcmp.lt.unc.s1 p0,p10 = FR_AbsX,FR_OvfBound // x >= overflow_boundary
+(p15) fcmp.lt.unc.s1 p0,p10 = FR_AbsX,FR_OvfBound // x >= overflow_boundary
nop.i 0
};;
{ .mfi
(p15) cmp.eq.unc p0,p11 = r0,GR_SigRqLin
}
{ .mfb
- nop.m 0
+ nop.m 0
fma.s1 FR_GAMMA = FR_C01,FR_C81,f0
(p11) br.cond.spnt tgamma_positives
};;
};;
.pred.rel "mutex",p8,p9
{ .mfi
- nop.m 0
+ nop.m 0
(p9) fma.s1 FR_GAMMA = FR_GAMMA,FR_Rq1,f0
tbit.z p6,p7 = GR_Sig,0 // p6 if sin<0, p7 if sin>0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p8) fma.s1 FR_GAMMA = FR_GAMMA,FR_RqLin,f0
nop.i 0
};;
};;
.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 0
-(p6) fnma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
- nop.i 0
+ nop.m 0
+(p6) fnma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p7) fma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
mov GR_Sig2 = 1
};;
{ .mfi
nop.m 0
fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
- nop.i 0
+ nop.i 0
};;
// NR method: ineration #2
{ .mfi
tgamma_positives:
.pred.rel "mutex",p8,p9
{ .mfi
- nop.m 0
+ nop.m 0
(p9) fma.d.s0 f8 = FR_GAMMA,FR_Rq1,f0
nop.i 0
}
{ .mfb
- nop.m 0
+ nop.m 0
(p8) fma.d.s0 f8 = FR_GAMMA,FR_RqLin,f0
br.ret.sptk b0
};;
};;
{ .mfi
(p6) getf.sig GR_Sig = FR_NormX
- nop.f 0
+ nop.f 0
(p6) shl GR_Sig2 = GR_Sig2,63
}
{ .mfi
(p6) mov GR_NzOvfBound = 0xFBFF
};;
{ .mfi
- cmp.eq p8,p0 = GR_Sign_Exp,GR_ExpOf05 // r02 >= 1/2
+ cmp.eq p8,p0 = GR_Sign_Exp,GR_ExpOf05 // r02 >= 1/2
nop.f 0
- cmp.eq p9,p10 = GR_Sign_Exp,GR_ExpOf025 // r02 >= 1/4
+ cmp.eq p9,p10 = GR_Sign_Exp,GR_ExpOf025 // r02 >= 1/4
}
{ .mfi
(p6) cmp.ltu.unc p11,p0 = GR_Sign_Exp,GR_NzOvfBound // p11 <- overflow
(p11) br.cond.spnt tgamma_ovf_near_0 //tgamma_spec_res
};;
{ .mfi
- ldfe FR_A15 = [GR_ad_Co],32
+ ldfe FR_A15 = [GR_ad_Co],32
nop.f 0
(p12) cmp.eq.unc p13,p0 = GR_Sig,GR_Sig2
}
{ .mfb
- ldfe FR_A14 = [GR_ad_Ce],32
+ ldfe FR_A14 = [GR_ad_Ce],32
nop.f 0
(p13) br.cond.spnt tgamma_ovf_near_0_boundary //tgamma_spec_res
};;
{ .mfi
- ldfe FR_A13 = [GR_ad_Co],32
+ ldfe FR_A13 = [GR_ad_Co],32
nop.f 0
nop.i 0
}
{ .mfi
- ldfe FR_A12 = [GR_ad_Ce],32
+ ldfe FR_A12 = [GR_ad_Ce],32
nop.f 0
nop.i 0
};;
.pred.rel "mutex",p9,p10
{ .mfi
- ldfe FR_A11 = [GR_ad_Co],32
-(p10) fma.s1 FR_r2 = FR_r02,FR_r02,f0
+ ldfe FR_A11 = [GR_ad_Co],32
+(p10) fma.s1 FR_r2 = FR_r02,FR_r02,f0
nop.i 0
}
{ .mfi
- ldfe FR_A10 = [GR_ad_Ce],32
-(p9) fma.s1 FR_r2 = FR_r,FR_r,f0
+ ldfe FR_A10 = [GR_ad_Ce],32
+(p9) fma.s1 FR_r2 = FR_r,FR_r,f0
nop.i 0
};;
{ .mfi
- ldfe FR_A9 = [GR_ad_Co],32
+ ldfe FR_A9 = [GR_ad_Co],32
(p6) fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
nop.i 0
}
{ .mfi
- ldfe FR_A8 = [GR_ad_Ce],32
+ ldfe FR_A8 = [GR_ad_Ce],32
(p10) fma.s1 FR_r = f0,f0,FR_r02
nop.i 0
};;
{ .mfi
- ldfe FR_A7 = [GR_ad_Co],32
+ ldfe FR_A7 = [GR_ad_Co],32
nop.f 0
nop.i 0
}
{ .mfi
- ldfe FR_A6 = [GR_ad_Ce],32
- nop.f 0
+ ldfe FR_A6 = [GR_ad_Ce],32
+ nop.f 0
nop.i 0
};;
{ .mfi
- ldfe FR_A5 = [GR_ad_Co],32
+ ldfe FR_A5 = [GR_ad_Co],32
nop.f 0
nop.i 0
}
{ .mfi
- ldfe FR_A4 = [GR_ad_Ce],32
+ ldfe FR_A4 = [GR_ad_Ce],32
nop.f 0
nop.i 0
};;
{ .mfi
- ldfe FR_A3 = [GR_ad_Co],32
+ ldfe FR_A3 = [GR_ad_Co],32
nop.f 0
nop.i 0
}
{ .mfi
- ldfe FR_A2 = [GR_ad_Ce],32
+ ldfe FR_A2 = [GR_ad_Ce],32
nop.f 0
nop.i 0
};;
{ .mfi
- ldfe FR_A1 = [GR_ad_Co],32
- fma.s1 FR_r4 = FR_r2,FR_r2,f0
+ ldfe FR_A1 = [GR_ad_Co],32
+ fma.s1 FR_r4 = FR_r2,FR_r2,f0
nop.i 0
}
{ .mfi
- ldfe FR_A0 = [GR_ad_Ce],32
+ ldfe FR_A0 = [GR_ad_Ce],32
nop.f 0
nop.i 0
};;
};;
{ .mfi
nop.m 0
- fma.s1 FR_r8 = FR_r4,FR_r4,f0
+ fma.s1 FR_r8 = FR_r4,FR_r4,f0
nop.i 0
};;
{ .mfi
};;
.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 0
+ nop.m 0
(p6) fma.s1 FR_A15 = FR_A15,FR_r8,FR_A7
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p7) fma.d.s0 f8 = FR_A15,FR_r8,FR_A7
nop.i 0
};;
{ .mfb
- nop.m 0
+ nop.m 0
(p6) fma.d.s0 f8 = FR_A15,FR_Rcp3,f0
br.ret.sptk b0
};;
{ .mfi
nop.m 0
nop.f 0
- shl r8 = r8,52
+ shl r8 = r8,52
};;
{ .mfi
sub r8 = r8,r0,1
.pred.rel "mutex",p14,p15
{ .mfi
// set p8 to 0 in case of overflow and to 1 otherwise
- // for negative arg:
+ // for negative arg:
// no overflow if rounding mode either Z or +Inf, i.e.
// GR_fpsr > 1
(p14) cmp.lt p8,p0 = 1,GR_fpsr
nop.f 0
- // for positive arg:
+ // for positive arg:
// no overflow if rounding mode either Z or -Inf, i.e.
// (GR_fpsr & 1) == 0
(p15) tbit.z p0,p8 = GR_fpsr,0
tgamma_ovf_near_0:
{ .mfi
mov r8 = 0x1FFFE
- nop.f 0
+ nop.f 0
nop.i 0
};;
{ .mfi
};;
.pred.rel "mutex",p14,p15
{ .mfi
- nop.m 0
+ nop.m 0
(p15) fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
- nop.i 0
+ nop.i 0
}
{ .mfb
- nop.m 0
+ nop.m 0
(p14) fnma.d.s0 f8 = f9,f9,f0 // Set I,O and -INF result
br.cond.sptk tgamma_libm_err
};;
};;
{ .mfb
(p11) cmp.ltu.unc p7,p8 = GR_0x30033,GR_Sign_Exp
- nop.f 0
+ nop.f 0
(p10) br.cond.spnt tgamma_singularity
};;
.pred.rel "mutex",p7,p8
mov GR_TAG = 258 // overflow
}
{ .mfb
- nop.m 0
+ nop.m 0
(p15) fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
br.cond.sptk tgamma_libm_err
};;
{ .mfi
(p7) mov GR_TAG = 259 // negative
(p7) frcpa.s0 f8,p0 = f1,f8
- nop.i 0
+ nop.i 0
}
{ .mib
nop.m 0
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code,and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//*********************************************************************
//
-// History:
+// History:
// 11/30/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/10/03 Reordered header: .section, .global, .proc, .align
// IEEE Special Conditions:
//
// tgammaf(+inf) = +inf
-// tgammaf(-inf) = QNaN
-// tgammaf(+/-0) = +/-inf
+// tgammaf(-inf) = QNaN
+// tgammaf(+/-0) = +/-inf
// tgammaf(x<0, x - integer) = QNaN
// tgammaf(SNaN) = QNaN
// tgammaf(QNaN) = QNaN
// Overview
//
// The method consists of three cases.
-//
+//
// If 2 <= x < OVERFLOW_BOUNDARY use case tgamma_regular;
// else if 0 < x < 2 use case tgamma_from_0_to_2;
// else if -(i+1) < x < -i, i = 0...43 use case tgamma_negatives;
// r = x - N, note 0 <= r < 1
//
// n = N & ~0xF - index of table that contains coefficient of
-// polynomial approximation
+// polynomial approximation
// i = N & 0xF - is used in recursive formula
-//
+//
//
// Step 2: Approximation
// ---------------------
// -----------------
// In case when i > 0 we need to multiply P12n(r) by product
// R(i,x)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
-// we can calculate R as follow:
+// we can calculate R as follow:
// R(i,x) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
// even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
// *(i-1) if i is odd. In both cases we need to calculate
// x_min is point of local minimum on [1; 2] interval.
// if 1.5 <= x < 1.75 than GAMMA(x) = P7(x-1.5)
// if 1.75 <= x < 2.0 than GAMMA(x) = P7(x-1.5)
-// and
+// and
// if 0 < x < 1 than GAMMA(x) = GAMMA(x+1)/x
//
// Case -(i+1) < x < -i, i = 0...43
//
// Step 1: Reduction
// -----------------
-// Note that period of sin(PI*x) is 2 and range reduction for
-// sin(PI*x) is like to range reduction for GAMMA(x)
+// Note that the period of sin(PI*x) is 2 and the range reduction for
+// sin(PI*x) is similar to the range reduction for GAMMA(x),
// i.e. rs = x - round(x) and |rs| <= 0.5.
//
// Step 2: Approximation
// ---------------------
-// To approximate sin(PI*x)/PI = sin(PI*(2*n+rs))/PI =
+// To approximate sin(PI*x)/PI = sin(PI*(2*n+rs))/PI =
// = (-1)^n*sin(PI*rs)/PI Taylor series is used.
// sin(PI*rs)/PI ~ S17(rs).
//
// ----------------
// To calculate 1/x and 1/(GAMMA(x)*S12(rs)) we use the frcpa
// instruction followed by Newton-Raphson iterations.
-//
+//
//
//*********************************************************************
FR_Y = f1
FR_RESULT = f8
-FR_iXt = f11
+FR_iXt = f11
FR_Xt = f12
FR_r = f13
FR_r2 = f14
{ .mfi
nop.m 0
(p14) fma.s1 FR_rs2 = FR_rs,FR_rs,f0
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
(p7) br.cond.spnt tgammaf_overflow_near0_bound
};;
{ .mfi
- nop.m 0
+ nop.m 0
(p6) fnma.s1 FR_Rq1 = FR_Rq1,FR_Rq0,f0
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p10) fma.s1 FR_Rq2 = FR_Rq2,FR_Rq3,f0
and GR_Sig = 0x7,GR_Sig
};;
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
(p9) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq2,f0
nop.i 0
}
{ .mfi
nop.m 0
fma.s1 FR_C51 = FR_C51,FR_r,FR_C50
- nop.i 0
+ nop.i 0
};;
{ .mfi
(p14) getf.exp GR_SignExp = FR_rs
fma.s1 FR_C01 = FR_C01,FR_C11,f0
- nop.i 0
+ nop.i 0
}
{ .mfi
nop.m 0
(p14) fma.s1 FR_S01 = FR_S01,FR_rs2,FR_S00
- nop.i 0
+ nop.i 0
};;
{ .mfi
nop.m 0
{ .mfi
nop.m 0
(p14) fma.s1 FR_S11 = FR_S11,FR_rs2,FR_S10
-(p14) tbit.z.unc p11,p12 = GR_SignExp,17
+(p14) tbit.z.unc p11,p12 = GR_SignExp,17
}
{ .mfi
nop.m 0
{ .mfi
nop.m 0
(p7) fma.s1 FR_An = FR_Rq1,FR_An,f0
- nop.i 0
+ nop.i 0
};;
{ .mfb
nop.m 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
(p14) fma.s1 FR_GAMMA = FR_C01,FR_C41,f0
(p14) tbit.z.unc p6,p7 = GR_Sig,0
}
{ .mfi
cmp.gt p9,p0 = GR_Arg,GR_ExpOf05
fma.s1 FR_r = f0,f0,FR_NormX // reduced arg for (0;1)
- mov GR_ExpOf025 = 0x7FA
+ mov GR_ExpOf025 = 0x7FA
};;
{ .mfi
getf.s GR_ArgNz = f8
(p6) mov GR_Tbl12Offs = 0x40 // 0.25 <= x < 0.5
}
{ .mfi
- add GR_ad_Ce = 0x2C0,GR_ad_Data
+ add GR_ad_Ce = 0x2C0,GR_ad_Data
nop.f 0
add GR_ad_Co = 0x2A0,GR_ad_Data
};;
ldfpd FR_A7,FR_A6 = [GR_ad_Co],16
ldfpd FR_A5,FR_A4 = [GR_ad_Ce],16
// jump if argument is close to 0 positive
-(p12) br.cond.spnt tgammaf_overflow
+(p12) br.cond.spnt tgammaf_overflow
};;
{ .mfi
ldfpd FR_A3,FR_A2 = [GR_ad_Co],16
{ .mfb
ldfpd FR_A1,FR_A0 = [GR_ad_Ce],16
nop.f 0
- br.cond.sptk tgamma_from_0_to_2
+ br.cond.sptk tgamma_from_0_to_2
};;
// here if 1 < x < 2
{ .mfi
nop.m 0
nop.f 0
- and GR_TblOffs = GR_TblOffs,GR_TblOffsMask
+ and GR_TblOffs = GR_TblOffs,GR_TblOffsMask
};;
{ .mfi
shladd GR_ad_Co = GR_TblOffs,3,GR_ad_Co
nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
(p10) fma.s1 FR_GAMMA = FR_A7,FR_r4,FR_A3
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p11) fma.s.s0 f8 = FR_A7,FR_r4,FR_A3
nop.i 0
};;
{ .mfb
- nop.m 0
+ nop.m 0
(p10) fma.s.s0 f8 = FR_GAMMA,FR_Rcp2,f0
br.ret.sptk b0
};;
{ .mfi
nop.m 0
nop.f 0
- shl r8 = r8,20
+ shl r8 = r8,20
};;
{ .mfi
sub r8 = r8,r0,1
.pred.rel "mutex",p14,p15
{ .mfi
// set p8 to 0 in case of overflow and to 1 otherwise
- // for negative arg:
+ // for negative arg:
// no overflow if rounding mode either Z or +Inf, i.e.
// GR_fpsr > 1
(p14) cmp.lt p8,p0 = 1,GR_fpsr
nop.f 0
- // for positive arg:
+ // for positive arg:
// no overflow if rounding mode either Z or -Inf, i.e.
// (GR_fpsr & 1) == 0
(p15) tbit.z p0,p8 = GR_fpsr,0
mov GR_TAG = 261 // overflow
}
{ .mfb
- nop.m 0
+ nop.m 0
(p15) fma.s.s0 f8 = f9,f9,f0 // set I,O and +INF result
br.cond.sptk tgammaf_libm_err
};;
{ .mfi
(p7) mov GR_TAG = 262 // negative
(p7) frcpa.s0 f8,p0 = f1,f8
- nop.i 0
+ nop.i 0
}
{ .mib
nop.m 0
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
{ .mmi
stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
// products derived from this software without specific prior written
// permission.
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
+// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
// Floating-Point Registers: f8-f15
// f32-f127
//
-// General Purpose Registers: r32-r67
+// General Purpose Registers: r32-r67
//
// Predicate Registers: p6-p15
//
// IEEE Special Conditions:
//
// tgammal(+inf) = +inf
-// tgammal(-inf) = QNaN
-// tgammal(+/-0) = +/-inf
+// tgammal(-inf) = QNaN
+// tgammal(+/-0) = +/-inf
// tgammal(x<0, x - integer) = QNaN
// tgammal(SNaN) = QNaN
// tgammal(QNaN) = QNaN
// Algorithm description
// ---------------------
//
-// There are 3 main paths in the implementation
+// There are 3 main paths in the implementation
// (and additional special values branches)
//
// 1) |X| >= 13 - Stirling formula computation
// a) Positive arguments:
-// TGAMMAL(X) = exp((X-0.5)*ln(X) - X + C + S(Z)),
-// where C = 0.5*ln(2*Pi) , Z = 1/Z, S(Z) - Bernulli polynomial
+// TGAMMAL(X) = exp((X-0.5)*ln(X) - X + C + S(Z)),
+// where C = 0.5*ln(2*Pi), Z = 1/X, S(Z) - Bernoulli polynomial
// (up to 'B18' term).
-// Some of these calculation done in multiprecision.
-// Ln returns multiprecision result too
+// Some of these calculations are done in multiprecision.
+// Ln returns a multiprecision result too
// and exp also accepts and returns pair of values.
-//
+//
// b) Negative arguments
// TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
// (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
// Here we use polynomial of 9th degree with 2 multiprecision steps.
-// Argument range reduction is:
+// Argument range reduction is:
// N = [x] with round to nearest, r = x - N, -0.5 <= r < 0.5
// After ((X-0.5)*ln(X) - X + C + S(Z)) completed we just invert
// its result and compute exp with negative argument (1/exp(x)=exp(-x))
// and first 6 multiprecision computations.
// Range reduction looks like
// N = [x] with truncate, r = x - N - 0.5, -0.5 <= r < 0.5
-// For odd intervals we use reccurent formula:
+// For odd intervals we use the recurrence formula:
// TGAMMAL(X) = TGAMMAL(X-1)*(X-1)
-// [1;2] interval is splitted to 3 subranges:
+// The [1;2] interval is split into 3 subranges:
// [1;1.25], [1.25;1.75], [1.75;2] with the same polynomial forms
//
// b) Negative arguments
//
// 3) 0 < |X| < 1 - Near 0 part
// a) Here we use the recurrence formula TGAMMAL(X) = TGAMMAL(X+1)/X
-// TGAMMAL(X+1) calculated as shown above,
+// TGAMMAL(X+1) calculated as shown above,
// 1/X result obtained in parallel. Then we just multiply these values.
// There is only one additional separate subrange, [0;0.125], with its own
// set of polynomial constants.
.align 16
LOCAL_OBJECT_START(Constants_Tgammal_log_80_Q)
-// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
+// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000
data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000
data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000
data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0x00000000,0x80000000,0x0000BFFE,0x00000000
+data4 0x00000000,0x80000000,0x0000BFFE,0x00000000
LOCAL_OBJECT_END(Constants_Tgammal_log_80_Q)
.align 64
LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h1)
-// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double
+// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double
data4 0x00008000,0x3F800000,0x00000000,0x00000000
-data4 0x00000000,0x00000000,0x00000000,0x00000000
+data4 0x00000000,0x00000000,0x00000000,0x00000000
data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
-data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
+data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
-data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
-data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
-data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
-data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
+data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
+data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
+data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
+data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
-data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
-data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
-data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
-data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
-data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
-data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
-data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
-data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
-data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
-data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
+data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
+data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
+data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
+data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
+data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
+data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
+data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
+data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
+data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
+data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
-data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
-data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
-data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
-data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
-data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
+data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
+data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
+data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
+data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
+data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
-data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
+data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h1)
.align 64
LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h2)
// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double
-data4 0x00008000,0x3F800000,0x00000000,0x00000000
-data4 0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
+data4 0x00008000,0x3F800000,0x00000000,0x00000000
+data4 0x00000000,0x00000000,0x00000000,0x00000000
+data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
-data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
-data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
-data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
-data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
-data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
-data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
-data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
+data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
+data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
+data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
+data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
+data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
+data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
+data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
-data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
-data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
+data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
+data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
-data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
-data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
-data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
-data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
-data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
-data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
-data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
-data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
-data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
-data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
-data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
-data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
-data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
-data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
-data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
-data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
-data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
+data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
+data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
+data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
+data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
+data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
+data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
+data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
+data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
+data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
+data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
+data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
+data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
+data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
+data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
+data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
+data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
+data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h2)
.align 64
LOCAL_OBJECT_START(Constants_Tgammal_log_80_h3_G_H)
-// h3 IEEE double extended, H3 and G3 IEEE single
-data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
+// h3 IEEE double extended, H3 and G3 IEEE single
+data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
-data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
-data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
+data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
+data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
-data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
-data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
-data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
-data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
-data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
-data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
+data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
+data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
+data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
+data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
+data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
+data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
-data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
-data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
-data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
-data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
-data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
-data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
-data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
-data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
-data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
-data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
-data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
+data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
+data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
+data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
+data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
+data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
+data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
+data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
+data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
+data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
+data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
+data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
-data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
-data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
-data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
-data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
-data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
-data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
-data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
-data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
-data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
-data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
+data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
+data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
+data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
+data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
+data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
+data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
+data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
+data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
+data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
+data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
-data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
-data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
-data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
-data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
-data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
+data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
+data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
+data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
+data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
+data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
LOCAL_OBJECT_END(Constants_Tgammal_log_80_h3_G_H)
-.align 64
+.align 64
LOCAL_OBJECT_START(Constants_Tgammal_stirling)
//0.5*ln(2*Pi)=9.1893853320467266954096885e-01 + 7.2239360881843238220057778e-17
data8 0x3FED67F1C864BEB4, 0x3C94D252F2400510
-// Bernulli numbers
+// Bernoulli numbers
data8 0xAAAAAAAAAAAAAAAB, 0x00003FFB //B2 = 8.3333333333333333333333333333e-02
data8 0xBF66C16C16C16C17 //B4 = -2.7777777777777777777777777778e-03
data8 0x3F4A01A01A01A01A //B6 = 7.9365079365079365079365079365e-04
data8 0x3FE0000000000000 // 0.5
LOCAL_OBJECT_END(Constants_Tgammal_stirling)
-.align 64
+.align 64
LOCAL_OBJECT_START(Constants_Tgammal_sin)
-// Polynomial coefficients for the sin(Pi*x)/Pi, 0 <= |x| < 0.5
+// Polynomial coefficients for the sin(Pi*x)/Pi, 0 <= |x| < 0.5
//A2 = 8.1174242528335360802316245099e-01 + 5.1302254650266899774269946201e-18
data8 0x3FE9F9CB402BC46C, 0x3C57A8B3819B7CEC
//A1 = -1.6449340668482264060656916627e+00 + -3.0210280454695477893051351574e-17
data8 0xC354723906D95E92, 0x0000BFFC //A3 = -1.9075182412208257558294507774e-01
LOCAL_OBJECT_END(Constants_Tgammal_sin)
-.align 64
+.align 64
LOCAL_OBJECT_START(Constants_Tgammal_exp_64_Arg)
data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 // L_hi = hi part log(2)/2^12
data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 // L_lo = lo part log(2)/2^12
LOCAL_OBJECT_END(Constants_Tgammal_exp_64_A)
LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T1)
-data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
-data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
+data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
+data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T1)
LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T2)
-data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
-data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
-data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
-data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
-data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
-data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
-data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
-data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
-data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
-data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
-data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
-data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
-data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
-data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
+data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
+data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
+data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
+data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
+data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
+data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
+data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
+data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
+data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
+data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
+data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
+data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
+data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
+data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
+data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T2)
LOCAL_OBJECT_START(Constants_Tgammal_poly)
-// Polynomial coefficients for the tgammal(x), 2 <= |x| < 3
+// Polynomial coefficients for the tgammal(x), 2 <= |x| < 3
//A5 = 2.8360780594841213109180699803e-02 + 2.2504152891014320704380000000e-19
data8 0x3F9D0A9BC49353D2, 0x3C109AEA0F23CE2D
//A4 = 1.0967323400216015538699565468e-01 + 9.9225166000430644587276000000e-18
data8 0xD2AF690725C62D88, 0x00003FF5 //A7 = 1.6074004848394703022110823298e-03
data8 0xAA44E635D4B7B682, 0x00003FF8 //A6 = 1.0392403425906843901680697839e-02
//
-// Polynomial coefficients for the tgammal(x), 4 <= |x| < 5
+// Polynomial coefficients for the tgammal(x), 4 <= |x| < 5
//A5 = 1.1600674810589555185913468449e+00 + 3.0229979112715124660731000000e-17
data8 0x3FF28FA2EB44D22E, 0x3C816D285234C815
//A4 = 3.1374268565470946334983182169e+00 + 1.3694868953995008497659600000e-16
data8 0xD696DF8D8389FE53, 0x00003FFB //A7 = 1.0477995539298934056097943975e-01
data8 0xBDD5C153048BC435, 0x00003FFD //A6 = 3.7077144754791605130056406006e-01
//
-// Polynomial coefficients for the tgammal(x), 6 <= |x| < 7
+// Polynomial coefficients for the tgammal(x), 6 <= |x| < 7
//A5 = 6.7169398121054200601065531373e+01 + 2.9481001527213915901489600000e-15
data8 0x4050CAD76B377BA0, 0x3CEA8DDB2B2DE93E
//A4 = 1.6115104376855398982115730178e+02 + 1.3422421925418824418257300000e-14
data8 0xEEC1371E265A2C3A, 0x00004001 //A7 = 7.4610858525146049022238037342e+00
data8 0xBF514B9BE68ED59D, 0x00004003 //A6 = 2.3914694993947572859629197920e+01
//
-// Polynomial coefficients for the tgammal(x), 8 <= |x| < 9
+// Polynomial coefficients for the tgammal(x), 8 <= |x| < 9
//A5 = 5.8487447114416836484451778233e+03 + 4.7365465221455983144182900000e-13
data8 0x40B6D8BEA568B6FD, 0x3D60AA4D44C2589B
//A4 = 1.2796464063087094473303295672e+04 + 1.2373341702514898266244200000e-12
data8 0xBF6CFEFD67F59845, 0x00004008 //A7 = 7.6570306334640770654588802417e+02
data8 0x8DB5D2F001635C29, 0x0000400A //A6 = 2.2673639984182571062068713002e+03
//
-// Polynomial coefficients for the tgammal(x), 10 <= |x| < 11
+// Polynomial coefficients for the tgammal(x), 10 <= |x| < 11
//A5 = 7.2546009516580589115619659424e+05 + 1.0343348865365065212891728822e-10
data8 0x412623A830B99290, 0x3DDC6E7C157611C4
//A4 = 1.4756292870840241666883230209e+06 + 8.1516565365333844166705674775e-11
data8 0xD3E5E8D6923910C1, 0x0000400F //A7 = 1.0849181904819284819615140521e+05
data8 0x930D70602F50B754, 0x00004011 //A6 = 3.0116351174131169193070583741e+05
//
-// Polynomial coefficients for the tgammal(x), 12 <= |x| < 13
+// Polynomial coefficients for the tgammal(x), 12 <= |x| < 13
//A5 = 1.2249876249976964294910430908e+08 + 6.0051348061679753770848000000e-09
data8 0x419D34BB29FFC39D, 0x3E39CAB72E01818D
//A4 = 2.3482765927605420351028442383e+08 + 1.1874729051592862323641700000e-08
LOCAL_OBJECT_START(Constants_Tgammal_poly_splitted)
-// Polynomial coefficients for the tgammal(x), 1 <= |x| < 1.25
+// Polynomial coefficients for the tgammal(x), 1 <= |x| < 1.25
//A5 = -9.8199506890310417350775651357e-01+ -3.2546247786122976510752200000e-17
data8 0xBFEF6C80EC38B509, 0xBC82C2FA7A3DE3BD
//A4 = 9.8172808683439960475425323239e-01 + 4.4847611775298520359811400000e-17
data8 0xFEF9F8AB891ABB24, 0x0000BFFE //A7 = -9.9600176036720260345608796766e-01
data8 0xFE3F0537573C8235, 0x00003FFE //A6 = 9.9314911461918778676646301341e-01
//
-// Polynomial coefficients for the tgammal(x), 1.25 <= |x| < 1.75
+// Polynomial coefficients for the tgammal(x), 1.25 <= |x| < 1.75
//A5 = -7.7523052299853054125655660300e-02+ -1.2693512521686721504433600000e-17
data8 0xBFB3D88CFE50601B, 0xBC6D44ED60EE2170
//A4 = 1.4464535904462152982041800442e-01 + 2.5426820829345729856648800000e-17
data8 0x9BA7EAE64C42FDF7, 0x0000BFFA //A7 = -3.8001935555045161419575037512e-02
data8 0xF0115BA1A77607E7, 0x00003FFA //A6 = 5.8610303817173477119764956736e-02
//
-// Polynomial coefficients for the tgammal(x), 1.75 <= |x| < 2.0
+// Polynomial coefficients for the tgammal(x), 1.75 <= |x| < 2.0
//A5 = 2.6698206874501426502654943818e-04 + 3.4033756836921062797887300000e-20
data8 0x3F317F3740FE2A68, 0x3BE417093234B06E
//A4 = 7.4249010753513894345090307070e-02 + 3.9810018444482764697014200000e-18
data8 0xBAF374824937A323, 0x00003FF6 //A7 = 2.8526458211545152218493600470e-03
data8 0xB6BF7564F52140C6, 0x00003FF8 //A6 = 1.1154045718131014476684982178e-02
//
-// Polynomial coefficients for the tgammal(x), 0.0 <= |x| < 0.125
+// Polynomial coefficients for the tgammal(x), 0.0 <= |x| < 0.125
//A5 = -9.8199506890314514073736518185e-01+ -5.9363811993837985890950900000e-17
data8 0xBFEF6C80EC38B67A, 0xBC911C46B447C81F
//A4 = 9.8172808683440015986576554496e-01 + 2.7457414262802803699834200000e-17
GR_l_BIAS = r34
GR_l_Index1 = r35
GR_l_Index2 = r36
-GR_l_signif_Z = r37
+GR_l_signif_Z = r37
GR_l_X_0 = r38
GR_l_X_1 = r39
GR_l_X_2 = r40
FR_l_rsq = f52
FR_l_Y_lo_res = f53
-FR_l_Y0 = f55
-FR_l_Q0 = f56
-FR_l_E0 = f57
-FR_l_E2 = f58
-FR_l_E1 = f59
-FR_l_Y1 = f60
-FR_l_E3 = f61
-FR_l_Y2 = f62
-
-FR_l_Z = f63
-FR_l_Z2 = f64
-FR_l_Z4 = f65
-FR_l_Z8 = f66
-
-FR_l_CH = f67
-FR_l_CL = f68
-
-FR_l_B2 = f69
-FR_l_B4 = f70
-FR_l_B6 = f71
-FR_l_B8 = f72
-FR_l_B10 = f73
-FR_l_B12 = f74
-FR_l_B14 = f75
-FR_l_B16 = f76
-FR_l_B18 = f77
-FR_l_Half = f78
+FR_l_Y0 = f55
+FR_l_Q0 = f56
+FR_l_E0 = f57
+FR_l_E2 = f58
+FR_l_E1 = f59
+FR_l_Y1 = f60
+FR_l_E3 = f61
+FR_l_Y2 = f62
+
+FR_l_Z = f63
+FR_l_Z2 = f64
+FR_l_Z4 = f65
+FR_l_Z8 = f66
+
+FR_l_CH = f67
+FR_l_CL = f68
+
+FR_l_B2 = f69
+FR_l_B4 = f70
+FR_l_B6 = f71
+FR_l_B8 = f72
+FR_l_B10 = f73
+FR_l_B12 = f74
+FR_l_B14 = f75
+FR_l_B16 = f76
+FR_l_B18 = f77
+FR_l_Half = f78
FR_l_SS = f79
FR_l_AbsX_m_Half = f80
FR_l_CXH = f81
FR_n_TH = f98
FR_n_TL = f99
-FR_n_A2H = f100
-FR_n_A2L = f101
-FR_n_A1H = f102
-FR_n_A1L = f103
-FR_n_A9 = f104
-FR_n_A8 = f105
-FR_n_A7 = f106
-FR_n_A6 = f107
-FR_n_A5 = f108
-FR_n_A4 = f109
-FR_n_A3 = f110
+FR_n_A2H = f100
+FR_n_A2L = f101
+FR_n_A1H = f102
+FR_n_A1L = f103
+FR_n_A9 = f104
+FR_n_A8 = f105
+FR_n_A7 = f106
+FR_n_A6 = f107
+FR_n_A5 = f108
+FR_n_A4 = f109
+FR_n_A3 = f110
FR_n_PolyH = f111
FR_n_PolyL = f112
FR_n_Y0 = f117
-FR_n_Q0 = f118
-FR_n_E0 = f119
-
-FR_n_E2 = f120
-FR_n_E1 = f121
-
-FR_n_Y1 = f55
-FR_n_E3 = f56
-
-FR_n_Y2 = f57
-FR_n_R0 = f58
+FR_n_Q0 = f118
+FR_n_E0 = f119
+
+FR_n_E2 = f120
+FR_n_E1 = f121
+
+FR_n_Y1 = f55
+FR_n_E3 = f56
+
+FR_n_Y2 = f57
+FR_n_R0 = f58
FR_n_E4 = f59
FR_n_RcpResH = f60
FR_p_AbsXM1 = f35
FR_p_2 = f36
-FR_p_A20 = f37
-FR_p_A19 = f38
-FR_p_A18 = f39
-FR_p_A17 = f40
-FR_p_A16 = f41
-FR_p_A15 = f42
-FR_p_A14 = f43
-FR_p_A13 = f44
-FR_p_A12 = f45
-FR_p_A11 = f46
-FR_p_A10 = f47
-FR_p_A9 = f48
-FR_p_A8 = f49
-FR_p_A7 = f50
-FR_p_A6 = f51
-FR_p_A5H = f52
-FR_p_A5L = f53
-FR_p_A4H = f54
-FR_p_A4L = f55
-FR_p_A3H = f56
-FR_p_A3L = f57
-FR_p_A2H = f58
-FR_p_A2L = f59
-FR_p_A1H = f60
-FR_p_A1L = f61
-FR_p_A0H = f62
-FR_p_A0L = f63
+FR_p_A20 = f37
+FR_p_A19 = f38
+FR_p_A18 = f39
+FR_p_A17 = f40
+FR_p_A16 = f41
+FR_p_A15 = f42
+FR_p_A14 = f43
+FR_p_A13 = f44
+FR_p_A12 = f45
+FR_p_A11 = f46
+FR_p_A10 = f47
+FR_p_A9 = f48
+FR_p_A8 = f49
+FR_p_A7 = f50
+FR_p_A6 = f51
+FR_p_A5H = f52
+FR_p_A5L = f53
+FR_p_A4H = f54
+FR_p_A4L = f55
+FR_p_A3H = f56
+FR_p_A3L = f57
+FR_p_A2H = f58
+FR_p_A2L = f59
+FR_p_A1H = f60
+FR_p_A1L = f61
+FR_p_A0H = f62
+FR_p_A0L = f63
FR_p_XR = f64
-FR_p_XR2 = f65
-FR_p_XR2L = f52
-
-FR_p_XR3 = f58
-FR_p_XR3L = f38
-
-FR_p_XR4 = f42
-FR_p_XR6 = f40
-FR_p_XR8 = f37
-
-FR_p_Poly5H = f66
-FR_p_Poly5L = f67
-FR_p_Poly4H = f53
-FR_p_Poly4L = f44
-FR_p_Poly3H = f41
-FR_p_Poly3L = f47
-FR_p_Poly2H = f68
-FR_p_Poly2L = f54
-FR_p_Poly1H = f55
-FR_p_Poly1L = f46
-FR_p_Poly0H = f39
-FR_p_Poly0L = f43
-
-FR_p_Temp5H = f69
-FR_p_Temp5L = f70
-FR_p_Temp4H = f71
-FR_p_Temp4L = f60
-FR_p_Temp2H = f72
-FR_p_Temp2L = f73
-FR_p_Temp1H = f59
-FR_p_Temp1L = f61
-FR_p_Temp0H = f49
-FR_p_Temp0L = f48
-FR_p_PolyTail = f45
-FR_p_OddPoly0H = f56
-FR_p_OddPoly0L = f51
+FR_p_XR2 = f65
+FR_p_XR2L = f52
+
+FR_p_XR3 = f58
+FR_p_XR3L = f38
+
+FR_p_XR4 = f42
+FR_p_XR6 = f40
+FR_p_XR8 = f37
+
+FR_p_Poly5H = f66
+FR_p_Poly5L = f67
+FR_p_Poly4H = f53
+FR_p_Poly4L = f44
+FR_p_Poly3H = f41
+FR_p_Poly3L = f47
+FR_p_Poly2H = f68
+FR_p_Poly2L = f54
+FR_p_Poly1H = f55
+FR_p_Poly1L = f46
+FR_p_Poly0H = f39
+FR_p_Poly0L = f43
+
+FR_p_Temp5H = f69
+FR_p_Temp5L = f70
+FR_p_Temp4H = f71
+FR_p_Temp4L = f60
+FR_p_Temp2H = f72
+FR_p_Temp2L = f73
+FR_p_Temp1H = f59
+FR_p_Temp1L = f61
+FR_p_Temp0H = f49
+FR_p_Temp0L = f48
+FR_p_PolyTail = f45
+FR_p_OddPoly0H = f56
+FR_p_OddPoly0L = f51
FR_p_0p25 = f73
//=======================================================
// Negative polynomial part registers
// General Purpose Registers
-GR_r_sin_Table = r47
-GR_r_sin_Table2 = r60
+GR_r_sin_Table = r47
+GR_r_sin_Table2 = r60
// Floating Point Registers
-FR_r_IXNS = FR_n_IXNS
-FR_r_IXN = FR_n_IXN
+FR_r_IXNS = FR_n_IXNS
+FR_r_IXN = FR_n_IXN
FR_r_AbsX = FR_l_AbsX
-FR_r_A9 = f74
-FR_r_A8 = f75
-FR_r_A7 = f76
-FR_r_A6 = f77
-FR_r_A5 = f78
-FR_r_A4 = f79
-FR_r_A3 = f80
-FR_r_A2H = f81
-FR_r_A2L = f82
-FR_r_A1H = f83
-FR_r_A1L = f84
-
-FR_r_XNS = f85
-FR_r_XS = f86
-FR_r_XS2 = f87
-FR_r_XS2L = f88
-FR_r_XS4 = f89
-FR_r_XS7 = f90
-FR_r_XS8 = f91
+FR_r_A9 = f74
+FR_r_A8 = f75
+FR_r_A7 = f76
+FR_r_A6 = f77
+FR_r_A5 = f78
+FR_r_A4 = f79
+FR_r_A3 = f80
+FR_r_A2H = f81
+FR_r_A2L = f82
+FR_r_A1H = f83
+FR_r_A1L = f84
+
+FR_r_XNS = f85
+FR_r_XS = f86
+FR_r_XS2 = f87
+FR_r_XS2L = f88
+FR_r_XS4 = f89
+FR_r_XS7 = f90
+FR_r_XS8 = f91
FR_r_Tail = f92
-FR_r_TT = f93
-FR_r_TH = f94
-FR_r_TL = f95
+FR_r_TT = f93
+FR_r_TH = f94
+FR_r_TL = f95
FR_r_ResH = f96
FR_r_ResL = f97
-FR_r_Res3H = f98
-FR_r_Res3L = f99
-
-FR_r_Res1H = f100
-FR_r_Res1L = f101
-
-
-
-FR_r_Y0 = f102
-FR_r_Q0 = f103
-FR_r_E0 = f104
-FR_r_E2 = f105
-FR_r_E1 = f106
-FR_r_Y1 = f107
-FR_r_E3 = f108
-FR_r_Y2 = f109
-FR_r_R0 = f110
-FR_r_E4 = f111
-FR_r_ZH = f112
-FR_r_Y3 = f113
-FR_r_R1 = f114
+FR_r_Res3H = f98
+FR_r_Res3L = f99
+
+FR_r_Res1H = f100
+FR_r_Res1L = f101
+
+
+
+FR_r_Y0 = f102
+FR_r_Q0 = f103
+FR_r_E0 = f104
+FR_r_E2 = f105
+FR_r_E1 = f106
+FR_r_Y1 = f107
+FR_r_E3 = f108
+FR_r_Y2 = f109
+FR_r_R0 = f110
+FR_r_E4 = f111
+FR_r_ZH = f112
+FR_r_Y3 = f113
+FR_r_R1 = f114
FR_r_ZHN = f115
FR_r_ZL = f115
FR_r_NegOne = f116
-FR_z_Y0 = f102
-FR_z_Q0 = f103
-FR_z_E0 = f104
-FR_z_E2 = f105
-FR_z_E1 = f106
-FR_z_Y1 = f107
-FR_z_E3 = f108
-FR_z_Y2 = f109
-FR_z_R0 = f110
-FR_z_E4 = f111
-FR_z_ZH = f112
-FR_z_Y3 = f113
-FR_z_R1 = f114
-FR_z_ZL = f115
+FR_z_Y0 = f102
+FR_z_Q0 = f103
+FR_z_E0 = f104
+FR_z_E2 = f105
+FR_z_E1 = f106
+FR_z_Y1 = f107
+FR_z_E3 = f108
+FR_z_Y2 = f109
+FR_z_R0 = f110
+FR_z_E4 = f111
+FR_z_ZH = f112
+FR_z_Y3 = f113
+FR_z_R1 = f114
+FR_z_ZL = f115
// General Purpose Registers
GR_DenOverflow = r33
GR_u_XN = r34
-GR_SAVE_B0 = r35
-GR_SAVE_GP = r36
-GR_SAVE_SP = r37
+GR_SAVE_B0 = r35
+GR_SAVE_GP = r36
+GR_SAVE_SP = r37
// Floating Point Registers
FR_u_IXN = f34
GR_Parameter_TAG = r67
FR_RESULT = f8
-FR_X = f32
+FR_X = f32
FR_Y = f1
{ .mfi
alloc r32 = ar.pfs,0,32,4,0
fabs FR_l_AbsX = f8 // Get absolute value of X
- addl GR_n_sin_Table = @ltoff(Constants_Tgammal_sin), gp
+ addl GR_n_sin_Table = @ltoff(Constants_Tgammal_sin), gp
}
-{ .mfi
+{ .mfi
addl GR_l_Log_Table=@ltoff(Constants_Tgammal_log_80_Z_G_H_h1#),gp
nop.f 0
addl GR_l_Stirling_Table = @ltoff(Constants_Tgammal_stirling), gp
};;
{ .mfi
- ld8 GR_n_sin_Table = [GR_n_sin_Table]
+ ld8 GR_n_sin_Table = [GR_n_sin_Table]
fclass.m p6,p0 = f8,0x1EF // Check x for NaN, 0, INF, denorm
// NatVal.
addl GR_c_NegSingularity = 0x1003E, r0
};;
{ .mfi
- ld8 GR_p_Table = [GR_p_Table]
+ ld8 GR_p_Table = [GR_p_Table]
fcmp.lt.s1 p15, p14 = f8,f0 // p14 - positive arg, p15 - negative
- shl GR_l_Index1 = GR_l_Index1,5 // Adjust Index1 ptr (x32)
+ shl GR_l_Index1 = GR_l_Index1,5 // Adjust Index1 ptr (x32)
}
{ .mfb
adds GR_c_NegUnderflow = 1765, r0
andcm GR_c_X = GR_c_X, GR_c_SignBit // Remove sign
};;
-{ .mfi
+{ .mfi
addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Z_G_H_h2#), gp
fcmp.lt.s1 p10, p0 = FR_l_AbsX, f1 // If |X|<1 then p10 = 1
nop.i 0
movl GR_l_BIAS = 0x000000000000FFFF // Bias for exponent
};;
-{ .mfi
+{ .mfi
ld8 GR_l_Log_Table = [GR_l_Log_Table]
frcpa.s1 FR_l_Y0, p0 = f1, FR_l_AbsX // y = frcpa(x)
nop.i 0
}
{ .mfi
- ldfs FR_l_G_1 = [GR_l_Index1],4 // Load G_1
+ ldfs FR_l_G_1 = [GR_l_Index1],4 // Load G_1
fsub.s1 FR_l_W = FR_l_AbsX, f1 // W = |X|-1
nop.i 0
};;
-{ .mfi
+{ .mfi
getf.exp GR_l_N_Unbiased= FR_l_AbsX // exponent of |X|
fmerge.se FR_l_S = f1, FR_l_AbsX // S = merging of X and 1.0
- cmp.gtu p11, p0 = GR_c_13, GR_c_X // If 1 <= |X| < 13
+ cmp.gtu p11, p0 = GR_c_13, GR_c_X // If 1 <= |X| < 13
// then p11 = 1
}
{ .mfb
(p10) br.cond.spnt tgamma_lt_1 // Branch to |X| < 1 path ///////////////////
};;
-{ .mfi
- ldfpd FR_n_A2H, FR_n_A2L = [GR_n_sin_Table], 16
+{ .mfi
+ ldfpd FR_n_A2H, FR_n_A2L = [GR_n_sin_Table], 16
nop.f 0
pmpyshr2.u GR_l_X_1 = GR_l_X_0,GR_l_Z_1,15 // Adjust Index2 (x32)
}
-{ .mfb
- ldfe FR_l_B2 = [GR_l_Stirling_Table], 16
+{ .mfb
+ ldfe FR_l_B2 = [GR_l_Stirling_Table], 16
nop.f 0
(p11) br.cond.spnt tgamma_lt_13 // Branch to 1 <= |X| < 13 path ///////////////
};;
-{ .mfi
- ldfe FR_l_h_1 = [GR_l_Index1],0
+{ .mfi
+ ldfe FR_l_h_1 = [GR_l_Index1],0
nop.f 0
sub GR_l_N = GR_l_N_Unbiased, GR_l_BIAS // N - BIAS
}
-{ .mib
+{ .mib
ldfpd FR_l_B4,FR_l_B6= [GR_l_Stirling_Table], 16 // Load C
(p15) cmp.geu.unc p8,p0 = GR_l_N_Unbiased, GR_c_NegSingularity
(p8) br.cond.spnt tgammal_singularity // Singularity for arg < to -2^63 //////
};;
-{ .mmi
-(p15) ldfpd FR_n_A1H, FR_n_A1L = [GR_n_sin_Table], 16
+{ .mmi
+(p15) ldfpd FR_n_A1H, FR_n_A1L = [GR_n_sin_Table], 16
ldfpd FR_l_B8, FR_l_B10 = [GR_l_Stirling_Table], 16
- add GR_c_Table = 0x20, GR_c_Table
+ add GR_c_Table = 0x20, GR_c_Table
};;
{ .mfi
-(p15) ldfe FR_n_A9 = [GR_n_sin_Table], 16
- fma.s1 FR_l_Q0 = f1,FR_l_Y0,f0 // Q0 = Y0
+(p15) ldfe FR_n_A9 = [GR_n_sin_Table], 16
+ fma.s1 FR_l_Q0 = f1,FR_l_Y0,f0 // Q0 = Y0
nop.i 0
}
-{ .mfi
- ldfpd FR_l_B12, FR_l_B14 = [GR_l_Stirling_Table], 16
- fnma.s1 FR_l_E0 = FR_l_Y0,FR_l_AbsX,f1 // e = 1-b*y
+{ .mfi
+ ldfpd FR_l_B12, FR_l_B14 = [GR_l_Stirling_Table], 16
+ fnma.s1 FR_l_E0 = FR_l_Y0,FR_l_AbsX,f1 // e = 1-b*y
nop.i 0
};;
-{ .mfi
-(p15) ldfe FR_n_A8 = [GR_n_sin_Table], 16
+{ .mfi
+(p15) ldfe FR_n_A8 = [GR_n_sin_Table], 16
fcvt.xf FR_c_XN = FR_n_IXN // Convert to FP repr. of int X
- extr.u GR_l_Index2 = GR_l_X_1, 6, 4 // Extract Index2
+ extr.u GR_l_Index2 = GR_l_X_1, 6, 4 // Extract Index2
}
-{ .mfi
+{ .mfi
ldfpd FR_l_B16, FR_l_B18 = [GR_l_Stirling_Table], 16
nop.f 0
nop.i 0
};;
-{ .mfi
-(p15) ldfe FR_n_A7 = [GR_n_sin_Table], 16
+{ .mfi
+(p15) ldfe FR_n_A7 = [GR_n_sin_Table], 16
fms.s1 FR_l_CXH = FR_l_CH, f1, FR_l_AbsX // CXH = CH+|X|
shl GR_l_Index2 = GR_l_Index2,5
}
-{ .mfi
+{ .mfi
ldfd FR_l_Half = [GR_l_Stirling_Table] // Load 0.5
nop.f 0
nop.i 0
};;
-{ .mfi
+{ .mfi
add GR_l_Index2 = GR_l_Index2, GR_l_Log_Table // Add offset
nop.f 0
nop.i 0
}
-{ .mfi
-(p15) ldfe FR_n_A6 = [GR_n_sin_Table], 16
+{ .mfi
+(p15) ldfe FR_n_A6 = [GR_n_sin_Table], 16
(p15) fma.s1 FR_n_XS = FR_l_AbsX , f1, FR_n_XNS // xs = x - int(x)
nop.i 0
};;
-{ .mmi
- ld2 GR_l_Z_2 = [GR_l_Index2],4
+{ .mmi
+ ld2 GR_l_Z_2 = [GR_l_Index2],4
addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_h3_G_H#),gp
nop.i 0
};;
-{ .mfi
+{ .mfi
ld8 GR_l_Log_Table = [GR_l_Log_Table]
fma.s1 FR_l_E2 = FR_l_E0,FR_l_E0,FR_l_E0 // e2 = e+e^2
nop.i 0
}
-{ .mfi
- ldfs FR_l_G_2 = [GR_l_Index2],4
+{ .mfi
+ ldfs FR_l_G_2 = [GR_l_Index2],4
fma.s1 FR_l_E1 = FR_l_E0,FR_l_E0,f0 // e1 = e^2
nop.i 0
};;
-{ .mmi
- ldfs FR_l_H_2 = [GR_l_Index2],8
-(p15) ldfe FR_n_A5 = [GR_n_sin_Table], 16
+{ .mmi
+ ldfs FR_l_H_2 = [GR_l_Index2],8
+(p15) ldfe FR_n_A5 = [GR_n_sin_Table], 16
nop.i 0
};;
-{ .mfi
+{ .mfi
setf.sig FR_l_float_N = GR_l_N // float_N = Make N a fp number
nop.f 0
- pmpyshr2.u GR_l_X_2 = GR_l_X_1,GR_l_Z_2,15 // X_2 = X_1 * Z_2
+ pmpyshr2.u GR_l_X_2 = GR_l_X_1,GR_l_Z_2,15 // X_2 = X_1 * Z_2
}
-{ .mfi
- ldfe FR_l_h_2 = [GR_l_Index2],0
+{ .mfi
+ ldfe FR_l_h_2 = [GR_l_Index2],0
fma.s1 FR_l_CXL = FR_l_AbsX, f1, FR_l_CXH // CXL = |X|+CXH
add GR_l_Log_Table1= 0x200, GR_l_Log_Table
};;
-{ .mfi
-(p15) ldfe FR_n_A4 = [GR_n_sin_Table], 16
+{ .mfi
+(p15) ldfe FR_n_A4 = [GR_n_sin_Table], 16
(p15) fcmp.eq.unc.s1 p9,p0 = FR_l_AbsX, FR_c_XN //if argument is integer
// and negative
nop.i 0
}
-{ .mfi
+{ .mfi
ldfe FR_c_PosOverflow = [GR_c_Table],16 //Load pos overflow value
(p15) fma.s1 FR_n_XS2 = FR_n_XS, FR_n_XS, f0 // xs^2 = xs*xs
nop.i 0
};;
-{ .mfi
-(p15) ldfe FR_n_A3 = [GR_n_sin_Table], 16
- nop.f 0
+{ .mfi
+(p15) ldfe FR_n_A3 = [GR_n_sin_Table], 16
+ nop.f 0
nop.i 0
};;
-{ .mfi
+{ .mfi
(p15) getf.sig GR_n_XN = FR_n_IXN // int(x) to general reg
fma.s1 FR_l_Y1 = FR_l_Y0,FR_l_E2,FR_l_Y0 // y1 = y+y*e2
- nop.i 0
+ nop.i 0
}
-{ .mfb
- nop.m 0
+{ .mfb
+ nop.m 0
fma.s1 FR_l_E3 = FR_l_E1,FR_l_E1,FR_l_E0 // e3 = e+e1^2
(p9) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
// and negative arguments //////////////
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 FR_l_AbsX_m_Half = FR_l_AbsX, f1, FR_l_Half // |x|-0.5
extr.u GR_l_Index2 = GR_l_X_2, 1, 5 // Get Index3
};;
-{ .mfi
- shladd GR_l_Log_Table1= GR_l_Index2, 2, GR_l_Log_Table1
+{ .mfi
+ shladd GR_l_Log_Table1= GR_l_Index2, 2, GR_l_Log_Table1
nop.f 0
shladd GR_l_Index3 = GR_l_Index2,4, GR_l_Log_Table // Index3
}
// at underflow domain (X < -1765) //////
};;
-{ .mfi
- addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Q#), gp
+{ .mfi
+ addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Q#), gp
(p15) fma.s1 FR_n_TT = FR_n_A2L, FR_n_XS2, f0 // T=A2L*x^2
- tbit.nz.unc p13, p12 = GR_n_XN, 0x0 // whether [X] odd or even
+ tbit.nz.unc p13, p12 = GR_n_XN, 0x0 // whether [X] odd or even
}
{ .mfi
nop.m 0
nop.i 0
};;
-{ .mfi
- ld8 GR_l_Log_Table = [GR_l_Log_Table]
+{ .mfi
+ ld8 GR_l_Log_Table = [GR_l_Log_Table]
(p15) fma.s1 FR_n_A7 = FR_n_A8, FR_n_XS2, FR_n_A7 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfe FR_l_h_3 = [GR_l_Index3],12
+{ .mfi
+ ldfe FR_l_h_3 = [GR_l_Index3],12
(p15) fma.s1 FR_n_XS4 = FR_n_XS2, FR_n_XS2, f0 // xs^4 = xs^2*xs^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfs FR_l_H_3 = [GR_l_Log_Table1], 0
+{ .mfi
+ ldfs FR_l_H_3 = [GR_l_Log_Table1], 0
fma.s1 FR_l_Y2 = FR_l_Y1, FR_l_E3, FR_l_Y0 // y2 = y+y1*e3
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfs FR_l_G_3 = [GR_l_Index3], 0
+{ .mfi
+ ldfs FR_l_G_3 = [GR_l_Index3], 0
fnma.s1 FR_l_Z = FR_l_AbsX,FR_l_Q0,f1 // r = a-b*q
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fmpy.s1 FR_l_G = FR_l_G_1, FR_l_G_2 // G = G1 * G_2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fadd.s1 FR_l_H = FR_l_H_1, FR_l_H_2 // H = H_1 + H_2
nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe FR_l_log2_hi = [GR_l_Log_Table],16 // load log2_hi part
- fadd.s1 FR_l_h = FR_l_h_1, FR_l_h_2 // h = h_1 + h_2
+ fadd.s1 FR_l_h = FR_l_h_1, FR_l_h_2 // h = h_1 + h_2
nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fcvt.xf FR_l_float_N = FR_l_float_N // int(N)
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe FR_l_log2_lo = [GR_l_Log_Table],16 // Load log2_lo part
fma.s1 FR_l_CXL = FR_l_CXL, f1, FR_l_CL
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p15) fma.s1 FR_n_TT = FR_n_A2H, FR_n_XS2L, FR_n_TT // T=A2H*x2L+T
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_l_Q_6 = [GR_l_Log_Table],16
+{ .mfi
+ ldfe FR_l_Q_6 = [GR_l_Log_Table],16
(p15) fma.s1 FR_n_A3 = FR_n_A4, FR_n_XS2, FR_n_A3 // poly tail
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p15) fma.s1 FR_n_A5 = FR_n_A6, FR_n_XS2, FR_n_A5 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_l_Q_5 = [GR_l_Log_Table],16
+{ .mfi
+ ldfe FR_l_Q_5 = [GR_l_Log_Table],16
(p15) fabs FR_n_XS = FR_n_XS // abs(xs)
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_l_Z = FR_l_Z,FR_l_Y2,FR_l_Q0 // x_hi = q+r*y2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_l_Q_4 = [GR_l_Log_Table],16
+{ .mfi
+ ldfe FR_l_Q_4 = [GR_l_Log_Table],16
(p15) fma.s1 FR_n_A7 = FR_n_A9, FR_n_XS4, FR_n_A7 // poly tail
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p15) fma.s1 FR_n_XS7 = FR_n_XS4, FR_n_XS2, f0 // = x^4*x^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_l_Q_3 = [GR_l_Log_Table],16
+{ .mfi
+ ldfe FR_l_Q_3 = [GR_l_Log_Table],16
fneg FR_n_NegOne = f1 // -1.0
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p15) fma.s1 FR_n_XS8 = FR_n_XS4, FR_n_XS4, f0 // xs^8 = xs^4*xs^4
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_l_Q_2 = [GR_l_Log_Table],16
- fadd.s1 FR_l_h = FR_l_h, FR_l_h_3 // h = h_1 + h_2 + h_3
- nop.i 0
+{ .mfi
+ ldfe FR_l_Q_2 = [GR_l_Log_Table],16
+ fadd.s1 FR_l_h = FR_l_h, FR_l_h_3 // h = h_1 + h_2 + h_3
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
(p15) fma.s1 FR_n_TH = FR_n_A2H, FR_n_XS2, FR_n_TT // A2H*xs2+T
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_l_Q_1 = [GR_l_Log_Table],16
- fmpy.s1 FR_l_G = FR_l_G, FR_l_G_3 // G = G_1 * G_2 * G_3
- nop.i 0
+{ .mfi
+ ldfe FR_l_Q_1 = [GR_l_Log_Table],16
+ fmpy.s1 FR_l_G = FR_l_G, FR_l_G_3 // G = G_1 * G_2 * G_3
+ nop.i 0
}
{ .mfi
nop.m 0
- fadd.s1 FR_l_H = FR_l_H, FR_l_H_3 // H = H_1 + H_2 + H_3
- nop.i 0
+ fadd.s1 FR_l_H = FR_l_H, FR_l_H_3 // H = H_1 + H_2 + H_3
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_Z2 = FR_l_Z, FR_l_Z, f0 // Z^2
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p14) fcmp.gt.unc.s1 p7,p0 = FR_l_AbsX, FR_c_PosOverflow //X > 1755.5483
// (overflow domain, result cannot be represented by normal value)
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_XS7 = FR_n_XS7, FR_n_XS, f0 // x^7 construction
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fms.s1 FR_n_TL = FR_n_A2H, FR_n_XS2, FR_n_TH // A2H*xs2+TH
nop.i 0
{ .mfi
nop.m 0
(p15) fma.s1 FR_n_PolyH = FR_n_TH, f1, FR_n_A1H // PolyH=TH+A1H
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fmpy.s1 FR_l_GS_hi = FR_l_G, FR_l_S // GS_hi = G*S
nop.i 0
(p7) br.cond.spnt tgammal_overflow // Overflow path for arg > 1755.5483 //////
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_B14 = FR_l_B16, FR_l_Z2, FR_l_B14// bernulli tail
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_Z4 = FR_l_Z2, FR_l_Z2, f0 // Z^4 = Z^2*Z^2
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_B2 = FR_l_B4, FR_l_Z2, FR_l_B2 // bernulli tail
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_B10 = FR_l_B12, FR_l_Z2, FR_l_B10// bernulli tail
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT // TL = TL+T
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_poly_lo = FR_l_r, FR_l_Q_6, FR_l_Q_5 // Q_5+r*Q_6
nop.i 0
{ .mfi
nop.m 0
fsub.s1 FR_l_r_cor = FR_l_GS_hi, f1 // r_cor = GS_hi -1
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 FR_l_GS_lo = FR_l_G, FR_l_S, FR_l_GS_hi // G*S-GS_hi
nop.i 0
{ .mfi
nop.m 0
fma.s1 FR_l_poly = FR_l_r, FR_l_Q_2, FR_l_Q_1 //poly=r*Q2+Q1
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fmpy.s1 FR_l_rsq = FR_l_r, FR_l_r // rsq = r * r
nop.i 0
nop.m 0
fma.s1 FR_l_G = FR_l_float_N, FR_l_log2_hi, FR_l_H // Tbl =
// float_N*log2_hi + H
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_l_Y_lo = FR_l_float_N, FR_l_log2_lo, FR_l_h // Y_lo=
// float_N*log2_lo + h
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_l_B14 = FR_l_B18, FR_l_Z4, FR_l_B14 //bernulli tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_l_B2 = FR_l_B6, FR_l_Z4, FR_l_B2 //bernulli tail
- nop.i 0
+ nop.i 0
}
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_l_Z8 = FR_l_Z4, FR_l_Z4, f0 //bernulli tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_4 // poly_lo =
// Q_4 + r * poly_lo
{ .mfi
nop.m 0
fsub.s1 FR_l_r_cor = FR_l_r_cor, FR_l_r // r_cor = r_cor - r
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TH // polyL+TH
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
- fadd.s1 FR_l_logl_YHi = FR_l_G, FR_l_r // Y_hi = Tbl + r
- nop.i 0
+ fadd.s1 FR_l_logl_YHi = FR_l_G, FR_l_r // Y_hi = Tbl + r
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_B10 = FR_l_B14, FR_l_Z4, FR_l_B10 //bernulli tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_3 // poly_lo =
// Q_3 + r * poly_lo
{ .mfi
nop.m 0
fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_GS_lo // r_cor=r_cor+GS_lo
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TT // polyL+TT
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
- fsub.s1 FR_l_Y_lo_res = FR_l_G, FR_l_logl_YHi // Y_lo = Tbl - Y_hi
+ fsub.s1 FR_l_Y_lo_res = FR_l_G, FR_l_logl_YHi // Y_lo = Tbl - Y_hi
nop.i 0
}
{ .mfi
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_SS = FR_l_B10, FR_l_Z8, FR_l_B2 // bernulli tail
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_Y_lo // r_cor = r_cor+Y_lo
nop.i 0
nop.m 0
fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly_lo, FR_l_poly //poly=
// r^2*polyLo+poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_n_XS2, f0 // T=polyL*xs^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fadd.s1 FR_l_Y_lo = FR_l_Y_lo_res, FR_l_r // Y_lo = Y_lo + r
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_SSCXH = FR_l_SS, FR_l_Z, FR_l_CXH // SS*Z+CXH
nop.i 0
nop.i 0
};;
-{ .mlx
+{ .mlx
nop.m 0
movl GR_e_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
}
{ .mfi
nop.m 0
- fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly, FR_l_r_cor // poly =
+ fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly, FR_l_r_cor // poly =
// rsq * poly + r_cor
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
addl GR_e_ad_Arg = @ltoff(Constants_Tgammal_exp_64_Arg#),gp
-(p15) fma.s1 FR_n_TT = FR_n_PolyH, FR_n_XS2L, FR_n_TT
+(p15) fma.s1 FR_n_TT = FR_n_PolyH, FR_n_XS2L, FR_n_TT
mov GR_e_exp_mask = 0x1FFFF // Form exponent mask
}
{ .mlx
};;
-{ .mmi
+{ .mmi
setf.sig FR_e_INV_LN2_2TO63 = GR_e_sig_inv_ln2 // form 1/ln2 * 2^63
setf.d FR_e_RSHF_2TO51 = GR_e_rshf_2to51 // 1.1000 * 2^(63+51)
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 FR_l_SSCXL = FR_l_CXH, f1, FR_l_SSCXH // CXH+SS*CXH
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_e_expl_Input_AbsX = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
+ fma.s1 FR_e_expl_Input_AbsX = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
nop.i 0
};;
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
(p14) fma.s1 FR_e_expl_Input_X = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
mov GR_e_exp_bias = 0x0FFFF // Set exponent bias
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fadd.s1 FR_l_logl_YLo = FR_l_Y_lo, FR_l_poly // YLo = YLo+poly
nop.i 0
};;
-{ .mfi
+{ .mfi
setf.exp FR_e_2TOM51 = GR_e_exp_2tom51 //2^-51 for scaling float_N
(p15) fma.s1 FR_n_TH = FR_n_PolyH, FR_n_XS2, FR_n_TT // TH=
// polyH*xs^2+T
nop.b 0
};;
-{ .mfi
+{ .mfi
add GR_e_ad_A = 0x20, GR_e_ad_Arg // Point to A table
nop.f 0
add GR_e_ad_T1 = 0x50, GR_e_ad_Arg // Point to T1 table
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_SSCXL = FR_l_SS, FR_l_Z, FR_l_SSCXL
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe FR_e_L_hi = [GR_e_ad_Arg],16 // Get L_hi
nop.f 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_XYL = FR_l_logl_YLo, FR_l_AbsX_m_Half, FR_l_XYL
// XYL = YLo*|x-0.5|+XYL
nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe FR_e_L_lo = [GR_e_ad_Arg],16 // Get L_lo
-(p15) fms.s1 FR_n_TL = FR_n_PolyH, FR_n_XS2, FR_n_TH // TL =
+(p15) fms.s1 FR_n_TL = FR_n_PolyH, FR_n_XS2, FR_n_TH // TL =
// = polyH*xs^2-TH
add GR_e_ad_W1 = 0x100, GR_e_ad_T2 // Point to W1 table
}
add GR_e_ad_W2 = 0x300, GR_e_ad_T2 // Point to W2 table
};;
-{ .mmi
+{ .mmi
getf.exp GR_e_signexp_x = FR_e_expl_Input_X // Extract sign and exp
- ldfe FR_e_A3 = [GR_e_ad_A],16 // Get A3
+ ldfe FR_e_A3 = [GR_e_ad_A],16 // Get A3
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_SSCXL = FR_l_SSCXL, f1, FR_l_CXL
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_e_N_signif=FR_e_expl_Input_X,FR_e_INV_LN2_2TO63,FR_e_RSHF_2TO51
and GR_e_exp_x = GR_e_signexp_x, GR_e_exp_mask
};;
-{ .mmi
+{ .mmi
sub GR_e_exp_x = GR_e_exp_x, GR_e_exp_bias // Get exponent
ldfe FR_e_A2 = [GR_e_ad_A],16 // Get A2 for main path
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_PolyH = FR_n_Poly1H, FR_n_XS, f0//sin(Pi*x) poly
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT//sin(Pi*x) poly
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_l_Temp = FR_l_XYL, f1, FR_l_SSCXL // XYL+SS*CXL
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe FR_e_A1 = [GR_e_ad_A],16 // Get A1
nop.f 0
nop.i 0
}
{ .mfi
nop.m 0
- fms.s1 FR_e_float_N = FR_e_N_signif, FR_e_2TOM51, FR_e_RSHF
+ fms.s1 FR_e_float_N = FR_e_N_signif, FR_e_2TOM51, FR_e_RSHF
// Get float N = signd*2^51-RSHIFTER
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TH //sin(Pi*x) poly
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
getf.sig GR_e_N_fix = FR_e_N_signif // Get N from significand
nop.f 0
nop.i 0
};;
.pred.rel "mutex",p14,p15
-{ .mfi
+{ .mfi
nop.m 0
-(p14) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
+(p14) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
-(p15) fms.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
+(p15) fms.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
// arguments for exp computation
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fnma.s1 FR_e_r = FR_e_L_hi, FR_e_float_N, FR_e_expl_Input_X
// r = -L_hi * float_N + x
extr.u GR_e_M1 = GR_e_N_fix, 6, 6 // Extract index M_1
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TL //sin(Pi*x) poly
nop.i 0
};;
-{ .mmf
+{ .mmf
nop.m 0
nop.m 0
- fma.s1 FR_e_r = FR_e_r, f1, FR_e_expl_Input_Y
+ fma.s1 FR_e_r = FR_e_r, f1, FR_e_expl_Input_Y
// r = r + FR_e_expl_Input_Y
};;
-{ .mmi
+{ .mmi
shladd GR_e_ad_W1 = GR_e_M1,3,GR_e_ad_W1 // Point to W1
shladd GR_e_ad_T1 = GR_e_M1,2,GR_e_ad_T1 // Point to T1
extr.u GR_e_M2 = GR_e_N_fix, 0, 6 // Extract index M_2
};;
-{ .mfi
+{ .mfi
ldfs FR_e_T1 = [GR_e_ad_T1],0 // Get T1
nop.f 0
extr GR_e_K = GR_e_N_fix, 12, 32 //Extract limit range K
shladd GR_e_ad_W2 = GR_e_M2,3,GR_e_ad_W2 // Point to W2
};;
-{ .mfi
+{ .mfi
ldfs FR_e_T2 = [GR_e_ad_T2],0 // Get T2
nop.f 0
add GR_e_exp_2_k = GR_e_exp_bias, GR_e_K // exp of 2^k
sub GR_e_exp_2_mk = GR_e_exp_bias, GR_e_K // exp of 2^-k
};;
-{ .mmi
+{ .mmi
ldfd FR_e_W2 = [GR_e_ad_W2],0 // Get W2
nop.m 0
nop.i 0
};;
-{ .mmf
+{ .mmf
setf.exp FR_e_scale = GR_e_exp_2_k // Set scale = 2^k
setf.exp FR_e_2_mk = GR_e_exp_2_mk // Form 2^-k
- fnma.s1 FR_e_r = FR_e_L_lo, FR_e_float_N, FR_e_r
+ fnma.s1 FR_e_r = FR_e_L_lo, FR_e_float_N, FR_e_r
// r = -L_lo * float_N + r
};;
-{ .mfi
+{ .mfi
nop.m 0
-(p15) fma.s1 FR_n_PolyL = FR_n_Tail, FR_n_XS7, FR_n_PolyL
+(p15) fma.s1 FR_n_PolyL = FR_n_Tail, FR_n_XS7, FR_n_PolyL
//sin(Pi*x) poly
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_e_poly = FR_e_r, FR_e_A3, FR_e_A2 // poly=r*A3+A2
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fmpy.s1 FR_e_T = FR_e_T1, FR_e_T2 // T = T1 * T2
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_l_AbsX, f0 //sin(Pi*x) poly
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 FR_e_poly = FR_e_r, FR_e_poly, FR_e_A1
+ fma.s1 FR_e_poly = FR_e_r, FR_e_poly, FR_e_A1
// poly = r * poly + A1
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_e_T_scale = FR_e_T, FR_e_scale, f0 // T_scale=T*scale
nop.i 0
}
{ .mfi
nop.m 0
- fma.s1 FR_e_W = FR_e_W2, FR_e_W1_p1, FR_e_W1
+ fma.s1 FR_e_W = FR_e_W2, FR_e_W1_p1, FR_e_W1
// W = W2 * (W1+1.0) + W1
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
-(p15) fma.s1 FR_n_SinxH = FR_n_PolyH, FR_l_AbsX, FR_n_TT
+(p15) fma.s1 FR_n_SinxH = FR_n_PolyH, FR_l_AbsX, FR_n_TT
// sin(Pi*x) poly
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
mov FR_e_Y_hi = FR_e_T // Assume Y_hi = T
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 FR_e_poly = FR_e_rsq, FR_e_poly, FR_e_r
+ fma.s1 FR_e_poly = FR_e_rsq, FR_e_poly, FR_e_r
// poly = rsq * poly + r
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
- fma.s1 FR_e_Wp1_T_scale = FR_e_W, FR_e_T_scale, FR_e_T_scale
+ fma.s1 FR_e_Wp1_T_scale = FR_e_W, FR_e_T_scale, FR_e_T_scale
// (W+1)*T*scale
nop.i 0
}
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fms.s1 FR_n_SinxL = FR_n_PolyH, FR_l_AbsX, FR_n_SinxH
// Low part of sin
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) frcpa.s1 FR_n_Y0, p0 = f1, FR_n_SinxH // y = frcpa(b)
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_e_result_lo = FR_e_Wp1_T_scale, FR_e_poly, FR_e_W_T_scale
// Low part of exp result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_SinxL = FR_n_SinxL, f1, FR_n_TT // sin low result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
(p15) fma.s1 FR_n_Q0 = f1,FR_n_Y0,f0 // q = y
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
-(p15) fnma.s1 FR_n_E0 = FR_n_Y0, FR_n_SinxH, f1 // e = 1-b*y
+(p15) fnma.s1 FR_n_E0 = FR_n_Y0, FR_n_SinxH, f1 // e = 1-b*y
nop.i 0
};;
-{ .mfb
+{ .mfb
nop.m 0
(p14) fma.s0 f8 = FR_e_Y_hi, FR_e_scale, FR_e_result_lo
(p14) br.ret.spnt b0 // Exit for positive Stirling path //////////////////////
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_e_expl_Output_X = FR_e_Y_hi, FR_e_scale, f0 // exp result
nop.i 0
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_E2 = FR_n_E0,FR_n_E0,FR_n_E0 // e2 = e+e^2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_E1 = FR_n_E0,FR_n_E0,f0 // e1 = e^2
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_Y1 = FR_n_Y0,FR_n_E2,FR_n_Y0 // y1 = y+y*e2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_E3 = FR_n_E1,FR_n_E1,FR_n_E0 // e3 = e+e1^2
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_Y2 = FR_n_Y1,FR_n_E3,FR_n_Y0 // y2 = y+y1*e3
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fnma.s1 FR_n_R0 = FR_n_SinxH,FR_n_Q0,f1 // r = a-b*q
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fnma.s1 FR_n_E4 = FR_n_SinxH,FR_n_Y2,f1 // e4 = 1-b*y2
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_RcpResH = FR_n_R0,FR_n_Y2,FR_n_Q0 // x = q+r*y2
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_Y3 = FR_n_Y2,FR_n_E4,FR_n_Y2 // y3 = y2+y2*e4
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fnma.s1 FR_n_R1 = FR_n_SinxH,FR_n_RcpResH,f1 // r1 = a-b*x
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
- fnma.s1 FR_n_R1 = FR_n_SinxL,FR_n_RcpResH,FR_n_R1
+ fnma.s1 FR_n_R1 = FR_n_SinxL,FR_n_RcpResH,FR_n_R1
// r1 = r1 - b_lo*X
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_RcpResL = FR_n_R1,FR_n_Y3,f0 // x_lo = r1*y3
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_Temp = FR_n_RcpResH, FR_e_expl_Output_Y, f0
// Multiplying exp and sin result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_Temp = FR_n_RcpResL, FR_e_expl_Output_X, FR_n_Temp
// Multiplying exp and sin result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_ResH = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_Temp
// Multiplying exp and sin result
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fms.s1 FR_n_ResL = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_ResH
// Multiplying exp and sin result
nop.i 0
}
-{ .mfi
+{ .mfi
nop.m 0
(p12) fma.s1 FR_n_ResH = FR_n_ResH, FR_n_NegOne, f0 // Negate
nop.i 0
};;
-{ .mfi
+{ .mfi
nop.m 0
fma.s1 FR_n_ResL = FR_n_ResL, f1, FR_n_Temp
// Multiplying exp and sin result - low result obtained
};;
.pred.rel "mutex",p12,p13
-{ .mfi
+{ .mfi
nop.m 0
(p13) fma.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For odd
nop.i 0
}
-{ .mfb
+{ .mfb
nop.m 0
(p12) fms.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For even
br.ret.sptk b0 // Exit for negative Stirling path //////////////////////
//------------------------------------------------------------------------------
.align 64
tgamma_lt_13:
-{ .mfi
+{ .mfi
getf.sig GR_p_XN = FR_p_IXN // Get significand
fcvt.xf FR_p_XN = FR_p_IXN // xn = [x]
add GR_r_sin_Table2= 0x40, GR_r_sin_Table // Shifted table addr.
-}
+}
{ .mfi
ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16 // 0.5 & 1.5
fms.s1 FR_p_AbsXM1 = FR_p_AbsX, f1, f1 // X-1
add GR_p_Table2 = 0xB0, GR_p_Table
-};;
+};;
-{ .mfi
+{ .mfi
add GR_r_sin_Table = -16, GR_r_sin_Table // For compensation
fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 59 // Get only 5 bit of signd
-};;
+};;
-{ .mfi
+{ .mfi
ldfpd FR_r_A2H,FR_r_A2L = [GR_r_sin_Table], 16 // Load A2
nop.f 0
- add GR_p_Int = -2, GR_p_XN // int = int - 2
-}
+ add GR_p_Int = -2, GR_p_XN // int = int - 2
+}
{ .mfi
- ldfe FR_r_A6 = [GR_r_sin_Table2], 16
+ ldfe FR_r_A6 = [GR_r_sin_Table2], 16
nop.f 0
cmp.gtu p11, p12 = 0x2, GR_p_XN // p11: x < 2 (splitted intervals),
// p12: x > 2 (base intervals)
-};;
+};;
-{ .mfi
- ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
+{ .mfi
+ ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
nop.f 0
shr GR_p_Int = GR_p_Int, 1 // int/2
-}
+}
{ .mfi
- ldfe FR_r_A5 = [GR_r_sin_Table2], 16
+ ldfe FR_r_A5 = [GR_r_sin_Table2], 16
nop.f 0
(p11) cmp.gtu.unc p10, p11 = 0x1C, GR_p_X_Sgnd // sgnd(x) < 0.75
-};;
+};;
-{ .mfi
- ldfe FR_r_A9 = [GR_r_sin_Table], 16
+{ .mfi
+ ldfe FR_r_A9 = [GR_r_sin_Table], 16
nop.f 0
shl GR_p_Offset = GR_p_Int, 4 // offset = int*16
-}
+}
{ .mfi
- ldfe FR_r_A4 = [GR_r_sin_Table2], 16
+ ldfe FR_r_A4 = [GR_r_sin_Table2], 16
nop.f 0
(p10) cmp.gtu.unc p9, p10 = 0x14, GR_p_X_Sgnd // sgnd(x) < 0.25
-};;
+};;
-{ .mfi
- ldfe FR_r_A8 = [GR_r_sin_Table], 16
+{ .mfi
+ ldfe FR_r_A8 = [GR_r_sin_Table], 16
nop.f 0
(p12) tbit.nz.unc p13, p12 = GR_p_XN, 0x0 // p13: reccurent computations
// X is at [3;4], [5;6], [7;8]... interval
-}
+}
{ .mfi
- ldfe FR_r_A3 = [GR_r_sin_Table2], 16
+ ldfe FR_r_A3 = [GR_r_sin_Table2], 16
nop.f 0
shladd GR_p_Offset = GR_p_Int, 2, GR_p_Offset // +int*4
-};;
+};;
.pred.rel "mutex",p9,p11
-{ .mfi
- add GR_p_Offset = GR_p_Int, GR_p_Offset
+{ .mfi
+ add GR_p_Offset = GR_p_Int, GR_p_Offset
// +int, so offset = int*21
(p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f1 // r = x-1
- nop.i 0
-}
+ nop.i 0
+}
{ .mfi
- ldfe FR_r_A7 = [GR_r_sin_Table], 16
-(p11) fms.s1 FR_p_XR = FR_p_2, f1, FR_p_AbsX
+ ldfe FR_r_A7 = [GR_r_sin_Table], 16
+(p11) fms.s1 FR_p_XR = FR_p_2, f1, FR_p_AbsX
// r = 2-x for 1.75 < x < 2
- nop.i 0
-};;
+ nop.i 0
+};;
.pred.rel "mutex",p9,p10
.pred.rel "mutex",p10,p11
.pred.rel "mutex",p9,p11
-{ .mfi
+{ .mfi
(p9) add GR_p_Offset = 126, r0 // 1.0 < x < 1.25 table
-(p15) fcmp.eq.unc.s1 p7,p0 = FR_p_AbsX, FR_p_XN
+(p15) fcmp.eq.unc.s1 p7,p0 = FR_p_AbsX, FR_p_XN
// If arg is integer and negative - singularity branch
- nop.i 0
+ nop.i 0
}
-{ .mfi
+{ .mfi
(p10) add GR_p_Offset = 147, r0 // 1.25 < x < 1.75 table
nop.f 0
(p11) add GR_p_Offset = 168, r0 // 1.75 < x < 2.0 table
-};;
+};;
-{ .mmf
- shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table
+{ .mmf
+ shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table
shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = x - [x]
-};;
+};;
-{ .mmb
- ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
- ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
+{ .mmb
+ ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
+ ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
(p7) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
// and negative argument ///////////////
};;
-{ .mfi
- ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
+{ .mfi
+ ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
fma.s1 FR_p_XN = FR_p_XN, f1, FR_p_0p5 // xn = xn+0.5
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
+{ .mfi
+ ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
(p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_1p5 // r = x - 1.5
- nop.i 0
+ nop.i 0
};;
-{ .mmi
- ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
- ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
+{ .mmi
+ ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
+ ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
nop.i 0
};;
-{ .mmi
- ldfe FR_p_A20 = [GR_p_Table], 16
- ldfe FR_p_A12 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A20 = [GR_p_Table], 16
+ ldfe FR_p_A12 = [GR_p_Table2], 16
nop.i 0
};;
-{ .mmf
- ldfe FR_p_A19 = [GR_p_Table], 16
- ldfe FR_p_A11 = [GR_p_Table2], 16
+{ .mmf
+ ldfe FR_p_A19 = [GR_p_Table], 16
+ ldfe FR_p_A11 = [GR_p_Table2], 16
fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs2 = xs*xs
};;
-{ .mmi
- ldfe FR_p_A18 = [GR_p_Table], 16
- ldfe FR_p_A10 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A18 = [GR_p_Table], 16
+ ldfe FR_p_A10 = [GR_p_Table2], 16
nop.i 0
};;
.pred.rel "mutex",p12,p13
-{ .mfi
- ldfe FR_p_A17 = [GR_p_Table], 16
+{ .mfi
+ ldfe FR_p_A17 = [GR_p_Table], 16
(p12) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN // r = x - xn
- nop.i 0
+ nop.i 0
}
-{ .mfi
+{ .mfi
ldfe FR_p_A9 = [GR_p_Table2], 16
(p13) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN
- nop.i 0
+ nop.i 0
};;
-{ .mmi
- ldfe FR_p_A16 = [GR_p_Table], 16
- ldfe FR_p_A8 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A16 = [GR_p_Table], 16
+ ldfe FR_p_A8 = [GR_p_Table2], 16
(p9) cmp.eq p12, p0 = r0, r0 // clear p12
};;
-{ .mmi
- ldfe FR_p_A15 = [GR_p_Table], 16
- ldfe FR_p_A7 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A15 = [GR_p_Table], 16
+ ldfe FR_p_A7 = [GR_p_Table2], 16
(p10) cmp.eq p12, p0 = r0, r0 // clear p12
};;
-{ .mfi
- ldfe FR_p_A14 = [GR_p_Table], 16
+{ .mfi
+ ldfe FR_p_A14 = [GR_p_Table], 16
fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // sin for neg
(p11) cmp.eq p12, p0 = r0, r0 // clear p12
}
-{ .mfi
+{ .mfi
ldfe FR_p_A6 = [GR_p_Table2], 16
fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
ldfe FR_p_A13 = [GR_p_Table], 16
fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // x2Lo part
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // A5H*r
// 'Low poly'
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // r^2 = r*r
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fabs FR_r_XS = FR_r_XS // abs(xs)
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
- fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // A2H*r
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // A2H*r
// 'High poly'
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp5L = FR_p_A5H,FR_p_XR,FR_p_Temp5H //A5H*r delta
// 'Low poly'
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // A5H*r+A4H
// 'Low poly'
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H//A2H*r delta
//'High poly'
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // A2H*r+A1H
//'High poly'
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3 = r^2*r
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // Poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // r^4 = r^2*r^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L// Low part
// of A5*r+A4
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low part
// of A5*r+A4
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // (A5H*r+A4H)*r
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // A2*r low
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp4L = FR_p_Poly5H,FR_p_XR,FR_p_Temp4H //Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // Poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // Poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // Poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // Poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly0H = FR_p_Poly3H,f1,FR_p_Poly1H //Low & High add
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // Poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fms.s1 FR_p_Poly0L = FR_p_Poly1H,f1,FR_p_Poly0H //Low & High add
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
-(p13) fma.s1 FR_p_OddPoly0H = FR_p_Poly0H, FR_p_AbsXM1, f0
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_OddPoly0H = FR_p_Poly0H, FR_p_AbsXM1, f0
// Reccurent computations - multiplying by X-1
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L//High
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH//sin for neg
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // sin for neg
- nop.i 0
+ nop.i 0
};;
{ .mfi
- nop.m 0
+ nop.m 0
fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Poly3H //Low & High add
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p13) fms.s1 FR_p_OddPoly0L = FR_p_Poly0H, FR_p_AbsXM1, FR_p_OddPoly0H
// Reccurent computations - multiplying by X-1 (low part)
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp0H = FR_p_Poly3L,f1,FR_p_Poly1L //Low & High add
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL//sin for neg
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // sin tail res
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Temp0H //Low & High add
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_Tail,FR_r_XS7,FR_r_ResL //sin for neg
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p13) fma.s1 FR_p_OddPoly0L = FR_p_Poly0L, FR_p_AbsXM1, FR_p_OddPoly0L
// Reccurent computations - multiplying by X-1 (low part)
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TT = FR_r_ResL, FR_r_AbsX, f0 // X*sin
- nop.i 0
+ nop.i 0
};;
.pred.rel "mutex",p12,p13
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p12) fma.s0 f8 = FR_p_Poly0H, f1, FR_p_Poly0L // Even
- nop.i 0
+ nop.i 0
}
-{ .mfb
- nop.m 0
+{ .mfb
+ nop.m 0
(p13) fma.s0 f8 = FR_p_OddPoly0H, f1, FR_p_OddPoly0L // Odd
(p14) br.ret.spnt b0 // Exit for 1 <= |X| < 13 path (positive arguments)/////
};;
-{ .mfi
- nop.m 0
-(p13) fma.s1 FR_p_Poly0H = FR_p_OddPoly0H, f1, f0
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_Poly0H = FR_p_OddPoly0H, f1, f0
// Reccurent computations
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
-(p13) fma.s1 FR_p_Poly0L = FR_p_OddPoly0L, f1, f0
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_Poly0L = FR_p_OddPoly0L, f1, f0
// Reccurent computations
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res1H = FR_r_ResH, FR_r_AbsX, FR_r_TT // X*sin
(p11) cmp.eq p13, p12 = r0, r0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_Res1L = FR_r_ResH,FR_r_AbsX,FR_r_Res1H// X*sin
(p9) cmp.eq p13, p12 = r0, r0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // sin for neg
(p10) cmp.eq p13, p12 = r0, r0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_p_Poly0L, FR_r_Res1H, f0 // mult by sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_p_Poly0H,FR_r_Res1L,FR_r_TL//mult by sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResH = FR_p_Poly0H,FR_r_Res1H,FR_r_TL//mult by sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH//sin mult
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b)
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fneg FR_r_NegOne = f1 // Form -1.0
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL //Low result of mult
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
- fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
- nop.i 0
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1-b_lo*X
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p12) fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate for evens
- nop.i 0
+ nop.i 0
};;
.pred.rel "mutex",p13,p12
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p13) fma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZH // Final result
- nop.i 0
+ nop.i 0
}
-{ .mfb
- nop.m 0
+{ .mfb
+ nop.m 0
(p12) fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Final result
br.ret.sptk b0 // Exit for 1 <= |X| < 13 path (negative arguments)//////
};;
//------------------------------------------------------------------------------
.align 64
tgamma_lt_1:
-{ .mfi
+{ .mfi
getf.exp GR_p_Exp = FR_p_AbsX // exp of abs X
fma.s1 FR_z_Q0 = f1,FR_z_Y0,f0 // q = a*y
add GR_r_sin_Table2= 0x50, GR_r_sin_Table
-}
+}
{ .mfi
- ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16
- fnma.s1 FR_z_E0 = FR_z_Y0,f8,f1 // e = 1-b*y
+ ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16
+ fnma.s1 FR_z_E0 = FR_z_Y0,f8,f1 // e = 1-b*y
add GR_p_Table2 = 0xB0, GR_p_Table
-};;
+};;
-{ .mfi
+{ .mfi
ldfd FR_p_0p25 = [GR_c_Table]
fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
- shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 60
+ shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 60
// Obtain only 4 bits of significand
}
-{ .mfi
+{ .mfi
nop.m 0
nop.f 0
add GR_p_Bias = 0xffff, r0 // Set bias
-};;
+};;
-{ .mfi
- ldfpd FR_r_A2H, FR_r_A2L = [GR_r_sin_Table], 16
+{ .mfi
+ ldfpd FR_r_A2H, FR_r_A2L = [GR_r_sin_Table], 16
nop.f 0
shl GR_p_XN = GR_p_Exp, 4
// Shift exp to 4 bits left to set place for significand
-}
+}
{ .mlx
ldfe FR_r_A6 = [GR_r_sin_Table2], 16
movl GR_p_0p75 = 0xfffec // 0.75
-};;
+};;
-{ .mfi
- ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
+{ .mfi
+ ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
nop.f 0
- or GR_p_XN = GR_p_XN, GR_p_X_Sgnd
+ or GR_p_XN = GR_p_XN, GR_p_X_Sgnd
// Combine exp with 4 high bits of significand
-}
+}
{ .mfi
- ldfe FR_r_A5 = [GR_r_sin_Table2], 16
+ ldfe FR_r_A5 = [GR_r_sin_Table2], 16
nop.f 0
sub GR_p_Exp = GR_p_Exp, GR_p_Bias // Unbiased exp
-};;
+};;
-{ .mmi
- ldfe FR_r_A9 = [GR_r_sin_Table], 16
- ldfe FR_r_A4 = [GR_r_sin_Table2], 16
+{ .mmi
+ ldfe FR_r_A9 = [GR_r_sin_Table], 16
+ ldfe FR_r_A4 = [GR_r_sin_Table2], 16
cmp.gtu.unc p10, p11 = GR_p_0p75, GR_p_XN // sgnd(x) < 0.75
-};;
+};;
-{ .mfi
- ldfe FR_r_A8 = [GR_r_sin_Table], 16
+{ .mfi
+ ldfe FR_r_A8 = [GR_r_sin_Table], 16
fma.s1 FR_z_E2 = FR_z_E0,FR_z_E0,FR_z_E0 // e2 = e+e^2
(p10) cmp.gt.unc p9, p10 = -2, GR_p_Exp // x < 0.25
-}
+}
{ .mfi
- ldfe FR_r_A3 = [GR_r_sin_Table2], 16
+ ldfe FR_r_A3 = [GR_r_sin_Table2], 16
fma.s1 FR_z_E1 = FR_z_E0,FR_z_E0,f0 // e1 = e^2
(p11) add GR_p_Offset = 168, r0 // [0.75;1] interval
-};;
+};;
-{ .mmi
+{ .mmi
(p10) add GR_p_Offset = 147, r0 // [0.25;0.75] interval
- ldfe FR_r_A7 = [GR_r_sin_Table], 16
+ ldfe FR_r_A7 = [GR_r_sin_Table], 16
(p9) cmp.gt.unc p8, p9 = -3, GR_p_Exp // x < 0.125
-};;
+};;
.pred.rel "mutex",p9,p8
-{ .mmi
+{ .mmi
(p9) add GR_p_Offset = 126, r0 // [0.125;0.25] interval
(p8) add GR_p_Offset = 189, r0 // [0.;0.125] interval
- nop.i 0
-};;
+ nop.i 0
+};;
-{ .mmf
+{ .mmf
shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table //Make addresses
shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = |x|-[x]
-};;
+};;
.pred.rel "mutex",p8,p11
-{ .mfi
- ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
+{ .mfi
+ ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
(p11) fms.s1 FR_p_XR = f1, f1, FR_p_AbsX // r = 1 - |x|
// for [0.75;1] interval
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
+{ .mfi
+ ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
(p8) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
// for [0.;0.125] interval
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
+{ .mfi
+ ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
fma.s1 FR_z_Y1 = FR_z_Y0,FR_z_E2,FR_z_Y0 // y1 = y+y*e2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
+{ .mfi
+ ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
fma.s1 FR_z_E3 = FR_z_E1,FR_z_E1,FR_z_E0 // e3 = e+e1^2
- nop.i 0
+ nop.i 0
};;
.pred.rel "mutex",p9,p10
-{ .mfi
- ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
+{ .mfi
+ ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
(p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
// for [0.125;0.25] interval
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
+{ .mfi
+ ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
(p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_0p5 // r = |x| - 0.5
// for [0.25;0.75] interval
- nop.i 0
+ nop.i 0
};;
-{ .mmi
- ldfe FR_p_A20 = [GR_p_Table], 16
- ldfe FR_p_A12 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A20 = [GR_p_Table], 16
+ ldfe FR_p_A12 = [GR_p_Table2], 16
nop.i 0
};;
-{ .mfi
- ldfe FR_p_A19 = [GR_p_Table], 16
+{ .mfi
+ ldfe FR_p_A19 = [GR_p_Table], 16
fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs^2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfe FR_p_A11 = [GR_p_Table2], 16
+{ .mfi
+ ldfe FR_p_A11 = [GR_p_Table2], 16
nop.f 0
- nop.i 0
+ nop.i 0
};;
-{ .mmi
- ldfe FR_p_A18 = [GR_p_Table], 16
- ldfe FR_p_A10 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A18 = [GR_p_Table], 16
+ ldfe FR_p_A10 = [GR_p_Table2], 16
nop.i 0
};;
.pred.rel "mutex",p12,p13
-{ .mfi
- ldfe FR_p_A17 = [GR_p_Table], 16
+{ .mfi
+ ldfe FR_p_A17 = [GR_p_Table], 16
fma.s1 FR_z_Y2 = FR_z_Y1,FR_z_E3,FR_z_Y0 // y2 = y+y1*e3
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfe FR_p_A9 = [GR_p_Table2], 16
+{ .mfi
+ ldfe FR_p_A9 = [GR_p_Table2], 16
fnma.s1 FR_z_R0 = f8,FR_z_Q0,f1 // r = a-b*q
- nop.i 0
+ nop.i 0
};;
-{ .mmi
- ldfe FR_p_A16 = [GR_p_Table], 16
- ldfe FR_p_A8 = [GR_p_Table2], 16
- nop.i 0
+{ .mmi
+ ldfe FR_p_A16 = [GR_p_Table], 16
+ ldfe FR_p_A8 = [GR_p_Table2], 16
+ nop.i 0
};;
-{ .mmi
- ldfe FR_p_A15 = [GR_p_Table], 16
- ldfe FR_p_A7 = [GR_p_Table2], 16
+{ .mmi
+ ldfe FR_p_A15 = [GR_p_Table], 16
+ ldfe FR_p_A7 = [GR_p_Table2], 16
nop.i 0
};;
-{ .mfi
- ldfe FR_p_A14 = [GR_p_Table], 16
+{ .mfi
+ ldfe FR_p_A14 = [GR_p_Table], 16
fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- ldfe FR_p_A6 = [GR_p_Table2], 16
+{ .mfi
+ ldfe FR_p_A6 = [GR_p_Table2], 16
fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- ldfe FR_p_A13 = [GR_p_Table], 16
+{ .mfi
+ ldfe FR_p_A13 = [GR_p_Table], 16
fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // xs^2 delta
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fabs FR_r_XS = FR_r_XS // Absolute value of xs
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_z_E4 = f8,FR_z_Y2,f1 // e4 = 1-b*y2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_z_ZH = FR_z_R0,FR_z_Y2,FR_z_Q0 // 1/x = q+r*y2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp5L = FR_p_A5H, FR_p_XR, FR_p_Temp5H // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_z_Y3 = FR_z_Y2,FR_z_E4,FR_z_Y2 // y3 = y2+y2*e4
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 //poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H //Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp4L = FR_p_Poly5H, FR_p_XR, FR_p_Temp4H//Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L //Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 //poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H /// Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // r^6
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L //Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // poly tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // r^8
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_z_R1 = f8,FR_z_ZH,f1 // r1 = a-b*x
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly0H = FR_p_Poly3H, f1, FR_p_Poly1H // Result
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // xs^4
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // poly tail
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_p_Poly0L = FR_p_Poly1H, f1, FR_p_Poly0H // Result
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_z_ZL = FR_z_R1,FR_z_Y3, f0 // x_lo = r1*y3
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 /// neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L // High
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // neg sin
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // neg sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Poly3H // result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p14) fma.s1 f8 = FR_p_Poly0H, FR_z_ZH, f0 // z*poly
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp1L = FR_p_Poly0H, FR_z_ZL, f0 // z*poly low
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin tail
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin low
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // xs^8
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Temp0H = FR_p_Poly3L, f1, FR_p_Poly1L // result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p14) fms.s1 FR_p_Temp1H = FR_p_Poly0H, FR_z_ZH, f8 // hi result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL // lo result
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // tail result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Temp0H // lo result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_Tail, FR_r_XS7, FR_r_ResL // lo result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p14) fma.s1 FR_p_Temp1L = FR_p_Poly0L,FR_z_ZH,FR_p_Temp1L //hi result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TT = FR_r_ResL, f1, f0 // for low result
- nop.i 0
+ nop.i 0
};;
.pred.rel "mutex",p12,p13
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
(p14) fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_Temp1H // for lo res
- nop.i 0
+ nop.i 0
};;
-{ .mfi
+{ .mfi
(p10) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
fma.s1 FR_r_Res1H = FR_r_ResH, f1, FR_r_TT // hi res
- nop.i 0
+ nop.i 0
};;
-{ .mfb
+{ .mfb
(p9) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
(p14) fma.s0 f8 = f8, f1, FR_p_Temp1L // Final result
(p14) br.ret.spnt b0 // Exit for 0 < |X| < 1 path (positive arguments)///////
};;
-{ .mfi
+{ .mfi
(p11) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
fms.s1 FR_r_Res1L = FR_r_ResH, f1, FR_r_Res1H // Low sin result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // Low sin result
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_p_Poly0L,FR_r_Res1H,f0 //Low sin result
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_TL = FR_p_Poly0H, FR_r_Res1L, FR_r_TL //Low sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResH = FR_p_Poly0H, FR_r_Res1H, FR_r_TL //High sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH //Low res
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b)
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fneg FR_r_NegOne = f1 // Construct -1.0
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL // low sin
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
- fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
- nop.i 0
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
- nop.i 0
+ nop.i 0
};;
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1 - b_lo*X
- nop.i 0
+ nop.i 0
}
-{ .mfi
- nop.m 0
+{ .mfi
+ nop.m 0
fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate
- nop.i 0
+ nop.i 0
};;
.pred.rel "mutex",p13,p12
-{ .mfb
- nop.m 0
+{ .mfb
+ nop.m 0
fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Result for neg
br.ret.sptk b0 // Exit for 0 < |X| < 1 path (negative arguments)//////
};;
.align 32
tgammal_spec:
{ .mlx
- nop.m 0
+ nop.m 0
movl GR_DenOverflow = 0x2000000000000001
}
{ .mfi
}
{ .mfi
-(p9) cmp.ltu.unc p10,p11 = GR_l_signif_Z, GR_DenOverflow
+(p9) cmp.ltu.unc p10,p11 = GR_l_signif_Z, GR_DenOverflow
(p9) fnorm.s0 f8 = f8
- nop.i 0
+ nop.i 0
};;
{ .mfb
- nop.m 0
+ nop.m 0
(p9) fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Round by truncate
(p11) br.cond.sptk tgamma_lt_1 // Return to gamma ('good' denormal)////////////
};;
{ .mfb
- nop.m 0
- nop.f 0
+ nop.m 0
+ nop.f 0
(p10) br.cond.spnt tgammal_overflow // "Bad" denormal - overflow! /////////////
};;
{ .mfi
(p7) mov GR_Parameter_TAG = 256 // negative
(p7) frcpa.s0 f8,p0 = f1,f8 // Raise V flag
- nop.i 0
+ nop.i 0
}
{ .mfb
nop.m 0
- nop.f 0
+ nop.f 0
(p8) br.cond.spnt tgammal_singularity // Branch for +ZERO ////////////////////
};;
{ .mfb
- nop.m 0
- nop.f 0
+ nop.m 0
+ nop.f 0
br.cond.spnt tgammal_libm_err // Branch for -ZERO ///////////////////////
};;
{ .mfi
addl r8 = 0x1FFFE, r0 // Exp of INF
fcmp.lt.s1 p15,p14 = f8,f0 // p14 - pos arg, p15 - neg arg
- nop.i 0
+ nop.i 0
};;
{ .mfi
.pred.rel "mutex",p14,p15
{ .mfi
- nop.m 0
+ nop.m 0
(p14) fma.s0 f8 = f9,f9,f0 // Set I,O and +INF result
- nop.i 0
+ nop.i 0
}
{ .mfb
- nop.m 0
+ nop.m 0
(p15) fnma.s0 f8 = f9,f9,f0 // Set I,O and -INF result
br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
// with overflow error ////////////////////
.align 32
tgammal_underflow:
{ .mfi
- nop.m 0
+ nop.m 0
fcvt.fx.trunc.s1 FR_u_IXN = f8 // Convert arg to int repres. in FR
- nop.i 0
+ nop.i 0
};;
{ .mmi
/* The latency of a memory load assumed by the assembly implementation
of the mem and str functions. Since we don't have any clue about
- where the data might be, let's assume it's in the L2 cache.
+ where the data might be, let's assume it's in the L2 cache.
Assuming L3 would be too pessimistic :-)
Some functions define MEMLAT as 2, because they expect their data
.save ar.lc, saved_lc
mov saved_lc = ar.lc // save the loop counter
.body
- mov ret0 = str
+ mov ret0 = str
and tmp = 7, str // tmp = str % 8
mux1 chrx8 = chr, @brcst
extr.u chr = chr, 0, 8 // retain only the last byte
ld8 val1 = [ret0], 8;;
nop.b 0
nop.b 0
-.l2:
+.l2:
ld8.s val2 = [ret0], 8 // don't bomb out here
- czx1.r pos0 = val1
+ czx1.r pos0 = val1
xor tmp = val1, chrx8 // if val1 contains chr, tmp will
;; // contain a zero in its position
czx1.r poschr = tmp
(p6) br.cond.spnt .notfound
chk.s val2, .recovery
.back:
- mov val1 = val2
+ mov val1 = val2
br.cond.dptk .l2
.foundit:
(p6) cmp.lt p8, p0 = pos0, poschr // we found chr and null in the word
.save ar.lc, saved_lc
mov saved_lc = ar.lc // save the loop counter
.body
- mov str = in0
+ mov str = in0
mov len = r0 // len = 0
and tmp = 7, in0 // tmp = str % 8
;;
nop.b 0
nop.b 0
.l2: ld8.s val2 = [str], 8 // don't bomb out here
- czx1.r pos0 = val1
+ czx1.r pos0 = val1
;;
cmp.ne p6, p0 = 8, pos0
(p6) br.cond.spnt .foundit
chk.s val2, .recovery
.back:
- mov val1 = val2
+ mov val1 = val2
br.cond.dptk .l2
.foundit:
sub tmp = str, origadd // tmp = crt address - orig
sub ret0 = val1, val2
.restore_and_exit:
br.ret.sptk.many b0
-END(strncmp)
+END(strncmp)
libc_hidden_builtin_def (strncmp)
for (i = 0; i < 3; ++i)
ADD_MEM (bpregs[i], sizeof (bpregs[0]) - 1);
-
+
ADD_STRING ("\n\n IP: ");
ADD_MEM (spregs[0], sizeof (spregs[0]));
ADD_STRING (" RSC: ");