emit_insn (gen_truncrfxf2 (operands[0], q_res));
DONE;
})
+
+
+;; SQRT operations
+
+
+;; Reciprocal square root approximation in RFmode, emitted as "frsqrta".
+;; Operand 0 is the floating-point result, operand 1 the input, operand 2
+;; the BImode register that the instruction writes alongside the result,
+;; and operand 3 a constant integer printed after ".s" in the mnemonic.
+;; The insn is marked as not predicable.
+(define_insn "sqrt_approx_rf"
+ [(set (match_operand:RF 0 "fr_register_operand" "=f")
+ (unspec:RF [(match_operand:RF 1 "fr_reg_or_fp01_operand" "fG")]
+ UNSPEC_FR_SQRT_RECIP_APPROX_RES))
+ (set (match_operand:BI 2 "register_operand" "=c")
+ (unspec:BI [(match_dup 1)] UNSPEC_FR_SQRT_RECIP_APPROX))
+ (use (match_operand:SI 3 "const_int_operand" ""))]
+ ""
+ "frsqrta.s%3 %0, %2 = %F1"
+ [(set_attr "itanium_class" "fmisc")
+ (set_attr "predicable" "no")])
+
+;; Inline SFmode square root, "thr" variant (selected by sqrtsf2 whenever
+;; TARGET_INLINE_SQRT is not INL_MIN_LAT).  The input is widened to RFmode,
+;; a reciprocal square root approximation is taken, and a fixed sequence of
+;; conditional RFmode multiply / multiply-add steps refines it before the
+;; result is truncated back to SFmode.
+(define_expand "sqrtsf2_internal_thr"
+ [(set (match_operand:SF 0 "fr_register_operand" "")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx g = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx s = gen_reg_rtx (RFmode);
+ rtx f = gen_reg_rtx (RFmode);
+ rtx y1 = gen_reg_rtx (RFmode);
+ rtx g1 = gen_reg_rtx (RFmode);
+ rtx h = gen_reg_rtx (RFmode);
+ rtx d = gen_reg_rtx (RFmode);
+ rtx g2 = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx c1 = ia64_dconst_0_5();
+ rtx c2 = ia64_dconst_0_375();
+ rtx reg_df_c1 = gen_reg_rtx (DFmode);
+ rtx reg_df_c2 = gen_reg_rtx (DFmode);
+ rtx reg_rf_c1 = gen_reg_rtx (RFmode);
+ rtx reg_rf_c2 = gen_reg_rtx (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_sgl = CONST0_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Put needed constants into registers. */
+ emit_insn (gen_movdf (reg_df_c1, c1));
+ emit_insn (gen_movdf (reg_df_c2, c2));
+ emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
+ emit_insn (gen_extenddfrf2 (reg_rf_c2, reg_df_c2));
+ /* Empty conversion to put input into RFmode. */
+ emit_insn (gen_extendsfrf2 (b, operands[1]));
+ /* y = sqrt (1 / b) */
+ emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
+ /* g = b * y */
+ emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
+ /* e = 1 - (g * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, g, y, zero, status1, trunc_off));
+ /* s = 0.5 + (0.375 * e) */
+ emit_insn (gen_m2addrf4_cond (s, cond, reg_rf_c1, reg_rf_c2, e, zero, status1, trunc_off));
+ /* f = y * e */
+ emit_insn (gen_mulrf3_cond (f, cond, y, e, zero, status1, trunc_off));
+ /* y1 = y + (f * s) */
+ emit_insn (gen_m2addrf4_cond (y1, cond, y, f, s, zero, status1, trunc_off));
+ /* g1 = single (b * y1) */
+ emit_insn (gen_mulrf3_cond (g1, cond, b, y1, zero, status1, trunc_sgl));
+ /* h = 0.5 * y1 */
+ emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y1, zero, status1, trunc_off));
+ /* d = b - g1 * g1 */
+ emit_insn (gen_m2subrf4_cond (d, cond, b, g1, g1, zero, status1, trunc_off));
+ /* g2 = single(g1 + (d * h)) */
+ /* Unlike the intermediate steps, the final step passes y instead of zero
+ and uses status0.  NOTE(review): presumably that argument is the value
+ kept when cond is false -- confirm against m2addrf4_cond. */
+ emit_insn (gen_m2addrf4_cond (g2, cond, g1, d, h, y, status0, trunc_sgl));
+ /* Conversion back into SFmode. */
+ emit_insn (gen_truncrfsf2 (operands[0], g2));
+ DONE;
+})
+
+;; Inline SFmode square root, "lat" variant (selected by sqrtsf2 when
+;; TARGET_INLINE_SQRT == INL_MIN_LAT).  Same overall shape as the "thr"
+;; variant: widen to RFmode, take a reciprocal square root approximation,
+;; refine with conditional RFmode multiply / multiply-add steps, truncate
+;; back to SFmode.  Here g, h, s and f are all computed directly from the
+;; initial approximation y and the error term e.
+(define_expand "sqrtsf2_internal_lat"
+ [(set (match_operand:SF 0 "fr_register_operand" "")
+ (sqrt:SF (match_operand:SF 1 "fr_register_operand" "")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx g = gen_reg_rtx (RFmode);
+ rtx g1 = gen_reg_rtx (RFmode);
+ rtx g2 = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx s = gen_reg_rtx (RFmode);
+ rtx f = gen_reg_rtx (RFmode);
+ rtx f1 = gen_reg_rtx (RFmode);
+ rtx h = gen_reg_rtx (RFmode);
+ rtx h1 = gen_reg_rtx (RFmode);
+ rtx d = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx one = CONST1_RTX (RFmode);
+ rtx c1 = ia64_dconst_0_5();
+ rtx c2 = ia64_dconst_0_375();
+ rtx reg_df_c1 = gen_reg_rtx (DFmode);
+ rtx reg_df_c2 = gen_reg_rtx (DFmode);
+ rtx reg_rf_c1 = gen_reg_rtx (RFmode);
+ rtx reg_rf_c2 = gen_reg_rtx (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_sgl = CONST0_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Put needed constants into registers. */
+ emit_insn (gen_movdf (reg_df_c1, c1));
+ emit_insn (gen_movdf (reg_df_c2, c2));
+ emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
+ emit_insn (gen_extenddfrf2 (reg_rf_c2, reg_df_c2));
+ /* Empty conversion to put input into RFmode. */
+ emit_insn (gen_extendsfrf2 (b, operands[1]));
+ /* y = sqrt (1 / b) */
+ emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
+ /* g = b * y */
+ emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
+ /* e = 1 - (g * y) */
+ emit_insn (gen_m2subrf4_cond (e, cond, one, g, y, zero, status1, trunc_off));
+ /* h = 0.5 * y */
+ emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y, zero, status1, trunc_off));
+ /* s = 0.5 + (0.375 * e) */
+ emit_insn (gen_m2addrf4_cond (s, cond, reg_rf_c1, reg_rf_c2, e, zero, status1, trunc_off));
+ /* f = e * g */
+ emit_insn (gen_mulrf3_cond (f, cond, e, g, zero, status1, trunc_off));
+ /* g1 = single (g + (f * s)) */
+ emit_insn (gen_m2addrf4_cond (g1, cond, g, f, s, zero, status1, trunc_sgl));
+ /* f1 = e * h */
+ emit_insn (gen_mulrf3_cond (f1, cond, e, h, zero, status1, trunc_off));
+ /* d = b - g1 * g1 */
+ emit_insn (gen_m2subrf4_cond (d, cond, b, g1, g1, zero, status1, trunc_off));
+ /* h1 = h + (f1 * s) */
+ emit_insn (gen_m2addrf4_cond (h1, cond, h, f1, s, zero, status1, trunc_off));
+ /* g2 = single(g1 + (d * h1)) */
+ /* Unlike the intermediate steps, the final step passes y instead of zero
+ and uses status0.  NOTE(review): presumably that argument is the value
+ kept when cond is false -- confirm against m2addrf4_cond. */
+ emit_insn (gen_m2addrf4_cond (g2, cond, g1, d, h1, y, status0, trunc_sgl));
+ /* Conversion back into SFmode. */
+ emit_insn (gen_truncrfsf2 (operands[0], g2));
+ DONE;
+})
+
+;; Inline DFmode square root, "thr" variant.  The input is widened to
+;; RFmode, a reciprocal square root approximation y is taken, and g (the
+;; root estimate) and h (half the reciprocal-root estimate) are refined
+;; through conditional RFmode multiply-add steps.  The last step rounds to
+;; double precision and the result is truncated back to DFmode.
+(define_expand "sqrtdf2_internal_thr"
+ [(set (match_operand:DF 0 "fr_register_operand" "")
+ (sqrt:DF (match_operand:DF 1 "fr_register_operand" "")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx g = gen_reg_rtx (RFmode);
+ rtx g1 = gen_reg_rtx (RFmode);
+ rtx g2 = gen_reg_rtx (RFmode);
+ rtx g3 = gen_reg_rtx (RFmode);
+ rtx g4 = gen_reg_rtx (RFmode);
+ rtx r = gen_reg_rtx (RFmode);
+ rtx r1 = gen_reg_rtx (RFmode);
+ rtx h = gen_reg_rtx (RFmode);
+ rtx h1 = gen_reg_rtx (RFmode);
+ rtx h2 = gen_reg_rtx (RFmode);
+ rtx d = gen_reg_rtx (RFmode);
+ rtx d1 = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx c1 = ia64_dconst_0_5();
+ rtx reg_df_c1 = gen_reg_rtx (DFmode);
+ rtx reg_rf_c1 = gen_reg_rtx (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_dbl = CONST1_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Put needed constants into registers. */
+ emit_insn (gen_movdf (reg_df_c1, c1));
+ emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
+ /* Empty conversion to put input into RFmode. */
+ emit_insn (gen_extenddfrf2 (b, operands[1]));
+ /* y = sqrt (1 / b) */
+ emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
+ /* g = b * y */
+ emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
+ /* h = 0.5 * y */
+ emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y, zero, status1, trunc_off));
+ /* r = 0.5 - (g * h) */
+ emit_insn (gen_m2subrf4_cond (r, cond, reg_rf_c1, g, h, zero, status1, trunc_off));
+ /* g1 = g + (g * r) */
+ emit_insn (gen_m2addrf4_cond (g1, cond, g, g, r, zero, status1, trunc_off));
+ /* h1 = h + (h * r) */
+ emit_insn (gen_m2addrf4_cond (h1, cond, h, h, r, zero, status1, trunc_off));
+ /* r1 = 0.5 - (g1 * h1) */
+ emit_insn (gen_m2subrf4_cond (r1, cond, reg_rf_c1, g1, h1, zero, status1, trunc_off));
+ /* g2 = g1 + (g1 * r1) */
+ emit_insn (gen_m2addrf4_cond (g2, cond, g1, g1, r1, zero, status1, trunc_off));
+ /* h2 = h1 + (h1 * r1) */
+ emit_insn (gen_m2addrf4_cond (h2, cond, h1, h1, r1, zero, status1, trunc_off));
+ /* d = b - (g2 * g2) */
+ emit_insn (gen_m2subrf4_cond (d, cond, b, g2, g2, zero, status1, trunc_off));
+ /* g3 = g2 + (d * h2) */
+ emit_insn (gen_m2addrf4_cond (g3, cond, g2, d, h2, zero, status1, trunc_off));
+ /* d1 = b - (g3 * g3) */
+ emit_insn (gen_m2subrf4_cond (d1, cond, b, g3, g3, zero, status1, trunc_off));
+ /* g4 = double (g3 + (d1 * h2)) */
+ /* The final step uses status0, matching the final step of the SFmode
+ expanders; only the intermediate steps use status1.  It also passes y
+ instead of zero.  NOTE(review): presumably that argument is the value
+ kept when cond is false -- confirm against m2addrf4_cond. */
+ emit_insn (gen_m2addrf4_cond (g4, cond, g3, d1, h2, y, status0, trunc_dbl));
+ /* Conversion back into DFmode. */
+ emit_insn (gen_truncrfdf2 (operands[0], g4));
+ DONE;
+})
+
+;; Inline XFmode square root.  The input is moved into RFmode, a
+;; reciprocal square root approximation y is taken, and g (the root
+;; estimate) and h (half the reciprocal-root estimate) are refined through
+;; conditional RFmode multiply-add steps.  No intermediate truncation is
+;; requested (trunc_off throughout); the result is converted back to XFmode.
+(define_expand "sqrtxf2_internal"
+ [(set (match_operand:XF 0 "fr_register_operand" "")
+ (sqrt:XF (match_operand:XF 1 "fr_register_operand" "")))]
+ "TARGET_INLINE_SQRT"
+{
+ rtx y = gen_reg_rtx (RFmode);
+ rtx b = gen_reg_rtx (RFmode);
+ rtx g = gen_reg_rtx (RFmode);
+ rtx g1 = gen_reg_rtx (RFmode);
+ rtx g2 = gen_reg_rtx (RFmode);
+ rtx g3 = gen_reg_rtx (RFmode);
+ rtx g4 = gen_reg_rtx (RFmode);
+ rtx e = gen_reg_rtx (RFmode);
+ rtx e1 = gen_reg_rtx (RFmode);
+ rtx e2 = gen_reg_rtx (RFmode);
+ rtx h = gen_reg_rtx (RFmode);
+ rtx h1 = gen_reg_rtx (RFmode);
+ rtx h2 = gen_reg_rtx (RFmode);
+ rtx h3 = gen_reg_rtx (RFmode);
+ rtx d = gen_reg_rtx (RFmode);
+ rtx d1 = gen_reg_rtx (RFmode);
+ rtx cond = gen_reg_rtx (BImode);
+ rtx zero = CONST0_RTX (RFmode);
+ rtx c1 = ia64_dconst_0_5();
+ rtx reg_df_c1 = gen_reg_rtx (DFmode);
+ rtx reg_rf_c1 = gen_reg_rtx (RFmode);
+ rtx status0 = CONST0_RTX (SImode);
+ rtx status1 = CONST1_RTX (SImode);
+ rtx trunc_off = CONST2_RTX (SImode);
+
+ /* Put needed constants into registers. */
+ emit_insn (gen_movdf (reg_df_c1, c1));
+ emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
+ /* Empty conversion to put input into RFmode. */
+ emit_insn (gen_extendxfrf2 (b, operands[1]));
+ /* y = sqrt (1 / b) */
+ emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
+ /* g = b * y */
+ emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
+ /* h = 0.5 * y */
+ emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y, zero, status1, trunc_off));
+ /* e = 0.5 - (g * h) */
+ emit_insn (gen_m2subrf4_cond (e, cond, reg_rf_c1, g, h, zero, status1, trunc_off));
+ /* g1 = g + (g * e) */
+ emit_insn (gen_m2addrf4_cond (g1, cond, g, g, e, zero, status1, trunc_off));
+ /* h1 = h + (h * e) */
+ emit_insn (gen_m2addrf4_cond (h1, cond, h, h, e, zero, status1, trunc_off));
+ /* e1 = 0.5 - (g1 * h1) */
+ emit_insn (gen_m2subrf4_cond (e1, cond, reg_rf_c1, g1, h1, zero, status1, trunc_off));
+ /* g2 = g1 + (g1 * e1) */
+ emit_insn (gen_m2addrf4_cond (g2, cond, g1, g1, e1, zero, status1, trunc_off));
+ /* h2 = h1 + (h1 * e1) */
+ emit_insn (gen_m2addrf4_cond (h2, cond, h1, h1, e1, zero, status1, trunc_off));
+ /* d = b - (g2 * g2) */
+ emit_insn (gen_m2subrf4_cond (d, cond, b, g2, g2, zero, status1, trunc_off));
+ /* e2 = 0.5 - (g2 * h2) */
+ emit_insn (gen_m2subrf4_cond (e2, cond, reg_rf_c1, g2, h2, zero, status1, trunc_off));
+ /* g3 = g2 + (d * h2) */
+ emit_insn (gen_m2addrf4_cond (g3, cond, g2, d, h2, zero, status1, trunc_off));
+ /* h3 = h2 + (e2 * h2) */
+ emit_insn (gen_m2addrf4_cond (h3, cond, h2, e2, h2, zero, status1, trunc_off));
+ /* d1 = b - (g3 * g3) */
+ emit_insn (gen_m2subrf4_cond (d1, cond, b, g3, g3, zero, status1, trunc_off));
+ /* g4 = g3 + (d1 * h3) */
+ /* The final step uses status0, matching the final step of the SFmode
+ expanders; only the intermediate steps use status1.  It also passes y
+ instead of zero.  NOTE(review): presumably that argument is the value
+ kept when cond is false -- confirm against m2addrf4_cond. */
+ emit_insn (gen_m2addrf4_cond (g4, cond, g3, d1, h3, y, status0, trunc_off));
+ /* Conversion back into XFmode. */
+ emit_insn (gen_truncrfxf2 (operands[0], g4));
+ DONE;
+})
DONE;
})
-;; Inline square root.
-
-(define_insn "*sqrt_approx"
- [(set (match_operand:XF 0 "fr_register_operand" "=f")
- (div:XF (const_int 1)
- (unspec:XF [(match_operand:XF 2 "fr_reg_or_fp01_operand" "fG")]
- UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
- (set (match_operand:BI 1 "register_operand" "=c")
- (unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
- (use (match_operand:SI 3 "const_int_operand" "")) ]
- ""
- "frsqrta.s%3 %0, %1 = %2"
- [(set_attr "itanium_class" "fmisc")
- (set_attr "predicable" "no")])
-
(define_insn "setf_exp_xf"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(unspec:XF [(match_operand:DI 1 "register_operand" "r")]
"setf.exp %0 = %1"
[(set_attr "itanium_class" "frfr")])
+
+;; Inline square root.
+
(define_expand "sqrtsf2"
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
(sqrt:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
+ /* INL_MIN_LAT selects the "lat" expander; any other enabled setting
+ selects the "thr" expander. */
-#if 0
if (TARGET_INLINE_SQRT == INL_MIN_LAT)
insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
else
-#else
- gcc_assert (TARGET_INLINE_SQRT != INL_MIN_LAT);
-#endif
- insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
+ insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
-
-;; Latency-optimized square root.
-;; FIXME: Implement.
-
-;; Throughput-optimized square root.
-
-(define_insn_and_split "sqrtsf2_internal_thr"
- [(set (match_operand:SF 0 "fr_register_operand" "=&f")
- (sqrt:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")))
- ;; Register r2 in optimization guide.
- (clobber (match_scratch:DI 2 "=r"))
- ;; Register f8 in optimization guide
- (clobber (match_scratch:XF 3 "=&f"))
- ;; Register f9 in optimization guide
- (clobber (match_scratch:XF 4 "=&f"))
- ;; Register f10 in optimization guide
- (clobber (match_scratch:XF 5 "=&f"))
- ;; Register p6 in optimization guide.
- (clobber (match_scratch:BI 6 "=c"))]
- "TARGET_INLINE_SQRT == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [ ;; exponent of +1/2 in r2
- (set (match_dup 2) (const_int 65534))
- ;; +1/2 in f8
- (set (match_dup 3)
- (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
- ;; Step 1
- ;; y0 = 1/sqrt(a) in f7
- (parallel [(set (match_dup 7)
- (div:XF (const_int 1)
- (unspec:XF [(match_dup 8)]
- UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
- (set (match_dup 6)
- (unspec:BI [(match_dup 8)]
- UNSPEC_FR_SQRT_RECIP_APPROX))
- (use (const_int 0))])
- ;; Step 2
- ;; H0 = 1/2 * y0 in f9
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 4)
- (plus:XF (mult:XF (match_dup 3) (match_dup 7))
- (match_dup 9)))
- (use (const_int 1))]))
- ;; Step 3
- ;; S0 = a * y0 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 7)
- (plus:XF (mult:XF (match_dup 8) (match_dup 7))
- (match_dup 9)))
- (use (const_int 1))]))
- ;; Step 4
- ;; d = 1/2 - S0 * H0 in f10
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 5)
- (minus:XF (match_dup 3)
- (mult:XF (match_dup 7) (match_dup 4))))
- (use (const_int 1))]))
- ;; Step 5
- ;; d' = d + 1/2 * d in f8
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 5))
- (match_dup 5)))
- (use (const_int 1))]))
- ;; Step 6
- ;; e = d + d * d' in f8
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 5) (match_dup 3))
- (match_dup 5)))
- (use (const_int 1))]))
- ;; Step 7
- ;; S1 = S0 + e * S0 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 3) (match_dup 7))
- (match_dup 7))))
- (use (const_int 1))]))
- ;; Step 8
- ;; H1 = H0 + e * H0 in f8
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 3) (match_dup 4))
- (match_dup 4)))
- (use (const_int 1))]))
- ;; Step 9
- ;; d1 = a - S1 * S1 in f9
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 8)
- (mult:XF (match_dup 7) (match_dup 7))))
- (use (const_int 1))]))
- ;; Step 10
- ;; S = S1 + d1 * H1 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 0)
- (float_truncate:SF
- (plus:XF (mult:XF (match_dup 4) (match_dup 3))
- (match_dup 7))))
- (use (const_int 0))]))]
-{
- /* Generate 82-bit versions of the input and output operands. */
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- /* Generate required floating-point constants. */
- operands[9] = CONST0_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
\f
;; ::::::::::::::::::::
;; ::
if (TARGET_INLINE_SQRT == INL_MIN_LAT)
insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
else
-#else
- gcc_assert (TARGET_INLINE_SQRT != INL_MIN_LAT);
#endif
insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
-
-;; Latency-optimized square root.
-;; FIXME: Implement.
-
-;; Throughput-optimized square root.
-
-(define_insn_and_split "sqrtdf2_internal_thr"
- [(set (match_operand:DF 0 "fr_register_operand" "=&f")
- (sqrt:DF (match_operand:DF 1 "fr_reg_or_fp01_operand" "fG")))
- ;; Register r2 in optimization guide.
- (clobber (match_scratch:DI 2 "=r"))
- ;; Register f8 in optimization guide
- (clobber (match_scratch:XF 3 "=&f"))
- ;; Register f9 in optimization guide
- (clobber (match_scratch:XF 4 "=&f"))
- ;; Register f10 in optimization guide
- (clobber (match_scratch:XF 5 "=&f"))
- ;; Register p6 in optimization guide.
- (clobber (match_scratch:BI 6 "=c"))]
- "TARGET_INLINE_SQRT == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [ ;; exponent of +1/2 in r2
- (set (match_dup 2) (const_int 65534))
- ;; +1/2 in f10
- (set (match_dup 5)
- (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
- ;; Step 1
- ;; y0 = 1/sqrt(a) in f7
- (parallel [(set (match_dup 7)
- (div:XF (const_int 1)
- (unspec:XF [(match_dup 8)]
- UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
- (set (match_dup 6)
- (unspec:BI [(match_dup 8)]
- UNSPEC_FR_SQRT_RECIP_APPROX))
- (use (const_int 0))])
- ;; Step 2
- ;; H0 = 1/2 * y0 in f8
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 5) (match_dup 7))
- (match_dup 9)))
- (use (const_int 1))]))
- ;; Step 3
- ;; G0 = a * y0 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 7)
- (plus:XF (mult:XF (match_dup 8) (match_dup 7))
- (match_dup 9)))
- (use (const_int 1))]))
- ;; Step 4
- ;; r0 = 1/2 - G0 * H0 in f9
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 5)
- (mult:XF (match_dup 7) (match_dup 3))))
- (use (const_int 1))]))
- ;; Step 5
- ;; H1 = H0 + r0 * H0 in f8
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 4) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- ;; Step 6
- ;; G1 = G0 + r0 * G0 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 7)
- (plus:XF (mult:XF (match_dup 4) (match_dup 7))
- (match_dup 7)))
- (use (const_int 1))]))
- ;; Step 7
- ;; r1 = 1/2 - G1 * H1 in f9
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 5)
- (mult:XF (match_dup 7) (match_dup 3))))
- (use (const_int 1))]))
- ;; Step 8
- ;; H2 = H1 + r1 * H1 in f8
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 3)
- (plus:XF (mult:XF (match_dup 4) (match_dup 3))
- (match_dup 3)))
- (use (const_int 1))]))
- ;; Step 9
- ;; G2 = G1 + r1 * G1 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 7)
- (plus:XF (mult:XF (match_dup 4) (match_dup 7))
- (match_dup 7)))
- (use (const_int 1))]))
- ;; Step 10
- ;; d2 = a - G2 * G2 in f9
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 8)
- (mult:XF (match_dup 7) (match_dup 7))))
- (use (const_int 1))]))
- ;; Step 11
- ;; G3 = G2 + d2 * H2 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 7)
- (plus:XF (mult:XF (match_dup 4) (match_dup 3))
- (match_dup 7)))
- (use (const_int 1))]))
- ;; Step 12
- ;; d3 = a - G3 * G3 in f9
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 4)
- (minus:XF (match_dup 8)
- (mult:XF (match_dup 7) (match_dup 7))))
- (use (const_int 1))]))
- ;; Step 13
- ;; S = G3 + d3 * H2 in f7
- (cond_exec (ne (match_dup 6) (const_int 0))
- (parallel [(set (match_dup 0)
- (float_truncate:DF
- (plus:XF (mult:XF (match_dup 4) (match_dup 3))
- (match_dup 7))))
- (use (const_int 0))]))]
-{
- /* Generate 82-bit versions of the input and output operands. */
- operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- /* Generate required floating-point constants. */
- operands[9] = CONST0_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
\f
;; ::::::::::::::::::::
;; ::
"TARGET_INLINE_SQRT"
{
rtx insn;
-#if 0
- if (TARGET_INLINE_SQRT == INL_MIN_LAT)
- insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
- else
-#else
- gcc_assert (TARGET_INLINE_SQRT != INL_MIN_LAT);
-#endif
- insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
+ insn = gen_sqrtxf2_internal (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
-;; Latency-optimized square root.
-;; FIXME: Implement.
-
-;; Throughput-optimized square root.
-
-(define_insn_and_split "sqrtxf2_internal_thr"
- [(set (match_operand:XF 0 "fr_register_operand" "=&f")
- (sqrt:XF (match_operand:XF 1 "fr_reg_or_fp01_operand" "fG")))
- ;; Register r2 in optimization guide.
- (clobber (match_scratch:DI 2 "=r"))
- ;; Register f8 in optimization guide
- (clobber (match_scratch:XF 3 "=&f"))
- ;; Register f9 in optimization guide
- (clobber (match_scratch:XF 4 "=&f"))
- ;; Register f10 in optimization guide
- (clobber (match_scratch:XF 5 "=&f"))
- ;; Register f11 in optimization guide
- (clobber (match_scratch:XF 6 "=&f"))
- ;; Register p6 in optimization guide.
- (clobber (match_scratch:BI 7 "=c"))]
- "TARGET_INLINE_SQRT == INL_MAX_THR"
- "#"
- "&& reload_completed"
- [ ;; exponent of +1/2 in r2
- (set (match_dup 2) (const_int 65534))
- ;; +1/2 in f8. The Intel manual mistakenly specifies f10.
- (set (match_dup 3)
- (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
- ;; Step 1
- ;; y0 = 1/sqrt(a) in f7
- (parallel [(set (match_dup 8)
- (div:XF (const_int 1)
- (unspec:XF [(match_dup 9)]
- UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
- (set (match_dup 7)
- (unspec:BI [(match_dup 9)]
- UNSPEC_FR_SQRT_RECIP_APPROX))
- (use (const_int 0))])
- ;; Step 2
- ;; H0 = 1/2 * y0 in f9
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 4)
- (plus:XF (mult:XF (match_dup 3) (match_dup 8))
- (match_dup 10)))
- (use (const_int 1))]))
- ;; Step 3
- ;; S0 = a * y0 in f7
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 8)
- (plus:XF (mult:XF (match_dup 9) (match_dup 8))
- (match_dup 10)))
- (use (const_int 1))]))
- ;; Step 4
- ;; d0 = 1/2 - S0 * H0 in f10
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 5)
- (minus:XF (match_dup 3)
- (mult:XF (match_dup 8) (match_dup 4))))
- (use (const_int 1))]))
- ;; Step 5
- ;; H1 = H0 + d0 * H0 in f9
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 4)
- (plus:XF (mult:XF (match_dup 5) (match_dup 4))
- (match_dup 4)))
- (use (const_int 1))]))
- ;; Step 6
- ;; S1 = S0 + d0 * S0 in f7
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 8)
- (plus:XF (mult:XF (match_dup 5) (match_dup 8))
- (match_dup 8)))
- (use (const_int 1))]))
- ;; Step 7
- ;; d1 = 1/2 - S1 * H1 in f10
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 5)
- (minus:XF (match_dup 3)
- (mult:XF (match_dup 8) (match_dup 4))))
- (use (const_int 1))]))
- ;; Step 8
- ;; H2 = H1 + d1 * H1 in f9
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 4)
- (plus:XF (mult:XF (match_dup 5) (match_dup 4))
- (match_dup 4)))
- (use (const_int 1))]))
- ;; Step 9
- ;; S2 = S1 + d1 * S1 in f7
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 8)
- (plus:XF (mult:XF (match_dup 5) (match_dup 8))
- (match_dup 8)))
- (use (const_int 1))]))
- ;; Step 10
- ;; d2 = 1/2 - S2 * H2 in f10
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 5)
- (minus:XF (match_dup 3)
- (mult:XF (match_dup 8) (match_dup 4))))
- (use (const_int 1))]))
- ;; Step 11
- ;; e2 = a - S2 * S2 in f8
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 9)
- (mult:XF (match_dup 8) (match_dup 8))))
- (use (const_int 1))]))
- ;; Step 12
- ;; S3 = S2 + e2 * H2 in f7
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 8)
- (plus:XF (mult:XF (match_dup 3) (match_dup 4))
- (match_dup 8)))
- (use (const_int 1))]))
- ;; Step 13
- ;; H3 = H2 + d2 * H2 in f9
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 4)
- (plus:XF (mult:XF (match_dup 5) (match_dup 4))
- (match_dup 4)))
- (use (const_int 1))]))
- ;; Step 14
- ;; e3 = a - S3 * S3 in f8
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 3)
- (minus:XF (match_dup 9)
- (mult:XF (match_dup 8) (match_dup 8))))
- (use (const_int 1))]))
- ;; Step 15
- ;; S = S3 + e3 * H3 in f7
- (cond_exec (ne (match_dup 7) (const_int 0))
- (parallel [(set (match_dup 0)
- (plus:XF (mult:XF (match_dup 3) (match_dup 4))
- (match_dup 8)))
- (use (const_int 0))]))]
-{
- /* Generate 82-bit versions of the input and output operands. */
- operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
- operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
- /* Generate required floating-point constants. */
- operands[10] = CONST0_RTX (XFmode);
-}
- [(set_attr "predicable" "no")])
-
;; ??? frcpa works like cmp.foo.unc.
(define_insn "*recip_approx"