From 5d50fab3144e5114b3a848749da6717a94117be1 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Mon, 6 May 2002 10:53:34 -0600 Subject: [PATCH] pa-protos.h (hppa_fpstore_bypass_p): Declare. * pa-protos.h (hppa_fpstore_bypass_p): Declare. * pa.c (pa_adjust_cost): Remove all true dependency cost adjustments. Also remove support for non-DFA scheduling. * pa.md (700, 7100, 7100lc, 7200, 7300): Use bypass mechanism to adjust true dependency costs. Update various comments. (7100lc, 7200, 7300 scheduling): Simplify by combining the FP ALU & MPY units into a single unit. From-SVN: r53227 --- gcc/ChangeLog | 10 ++++ gcc/config/pa/pa-protos.h | 1 + gcc/config/pa/pa.c | 112 ++++++++++++-------------------------- gcc/config/pa/pa.md | 133 ++++++++++++++++++++++++++++++++++------------ 4 files changed, 144 insertions(+), 112 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7d0a02c..bb742b6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2002-05-06 Jeff Law + + * pa-protos.h (hppa_fpstore_bypass_p): Declare. + * pa.c (pa_adjust_cost): Remove all true dependency cost + adjustments. Also remove support for non-DFA scheduling. + * pa.md (700, 7100, 7100lc, 7200, 7300): Use bypass mechanism + to adjust true dependency costs. Update various comments. + (7100lc, 7200, 7300 scheduling): Simplify by combining the + FP ALU & MPY units into a single unit. + 2002-05-06 Catherine Moore * config/v850/v850.c (compute_register_save_size): Make sure diff --git a/gcc/config/pa/pa-protos.h b/gcc/config/pa/pa-protos.h index 0c81ae0..57d9d41 100644 --- a/gcc/config/pa/pa-protos.h +++ b/gcc/config/pa/pa-protos.h @@ -103,6 +103,7 @@ extern int is_function_label_plus_const PARAMS ((rtx)); extern int jump_in_call_delay PARAMS ((rtx)); extern enum reg_class secondary_reload_class PARAMS ((enum reg_class, enum machine_mode, rtx)); +extern int hppa_fpstore_bypass_p PARAMS ((rtx, rtx)); /* Declare functions defined in pa.c and used in templates. 
*/ diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index 71c20d5..f5852b9 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -60,6 +60,33 @@ hppa_use_dfa_pipeline_interface () return 1; } +/* Return nonzero if the bypass from OUT_INSN to the fp store IN_INSN applies: + both are recognized and their SET_SRC modes have the same size. */ +int +hppa_fpstore_bypass_p (out_insn, in_insn) + rtx out_insn, in_insn; +{ + enum machine_mode store_mode; + enum machine_mode other_mode; + rtx set; + + if (recog_memoized (in_insn) < 0 + || get_attr_type (in_insn) != TYPE_FPSTORE + || recog_memoized (out_insn) < 0) + return 0; + /* NOTE(review): assumes PATTERN (in_insn) is a SET, not a PARALLEL. */ + store_mode = GET_MODE (SET_SRC (PATTERN (in_insn))); + + set = single_set (out_insn); + if (!set) + return 0; + + other_mode = GET_MODE (SET_SRC (set)); + + return (GET_MODE_SIZE (store_mode) == GET_MODE_SIZE (other_mode)); +} + + #ifndef DO_FRAME_NOTES #ifdef INCOMING_RETURN_ADDR_RTX #define DO_FRAME_NOTES 1 @@ -3907,8 +3934,9 @@ pa_adjust_cost (insn, link, dep_insn, cost) { enum attr_type attr_type; - /* Don't adjust costs for a pa8000 chip. */ - if (pa_cpu >= PROCESSOR_8000) + /* Don't adjust costs for a pa8000 chip. Also do not adjust any + true dependencies; they are described with bypasses now. */ + if (pa_cpu >= PROCESSOR_8000 || REG_NOTE_KIND (link) == 0) return cost; if (! recog_memoized (insn)) @@ -3916,65 +3944,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) attr_type = get_attr_type (insn); - if (REG_NOTE_KIND (link) == 0) - { - /* Data dependency; DEP_INSN writes a register that INSN reads some - cycles later. */ - - if (attr_type == TYPE_FPSTORE) - { - rtx pat = PATTERN (insn); - rtx dep_pat = PATTERN (dep_insn); - if (GET_CODE (pat) == PARALLEL) - { - /* This happens for the fstXs,mb patterns. */ - pat = XVECEXP (pat, 0, 0); - } - if (GET_CODE (pat) != SET || GET_CODE (dep_pat) != SET) - /* If this happens, we have to extend this to schedule - optimally. Return 0 for now. */ - return 0; - - if (rtx_equal_p (SET_DEST (dep_pat), SET_SRC (pat))) - { - if (!
recog_memoized (dep_insn)) - return 0; - /* DEP_INSN is writing its result to the register - being stored in the fpstore INSN. */ - switch (get_attr_type (dep_insn)) - { - case TYPE_FPLOAD: - /* This cost 3 cycles, not 2 as the md says for the - 700 and 7100, 7100lc, 7200 and 7300. */ - return cost + 1; - - case TYPE_FPALU: - case TYPE_FPMULSGL: - case TYPE_FPMULDBL: - case TYPE_FPDIVSGL: - case TYPE_FPDIVDBL: - case TYPE_FPSQRTSGL: - case TYPE_FPSQRTDBL: - /* In these important cases, we save one cycle compared to - when flop instruction feed each other. */ - return cost - 1; - - default: - return cost; - } - } - - /* A flop-flop true depenendency where the sizes of the operand - carrying the dependency is difference causes an additional - cycle stall on the 7100lc, 7200, and 7300. Similarly for - a fpload-flop true dependency. */ - } - - /* For other data dependencies, the default cost specified in the - md is correct. */ - return cost; - } - else if (REG_NOTE_KIND (link) == REG_DEP_ANTI) + if (REG_NOTE_KIND (link) == REG_DEP_ANTI) { /* Anti dependency; DEP_INSN reads a register that INSN writes some cycles later. */ @@ -4010,10 +3980,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) preceding arithmetic operation has finished if the target of the fpload is any of the sources (or destination) of the arithmetic operation. */ - if (hppa_use_dfa_pipeline_interface ()) - return insn_default_latency (dep_insn) - 1; - else - return cost - 1; + return insn_default_latency (dep_insn) - 1; default: return 0; @@ -4048,10 +4015,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) preceding divide or sqrt operation has finished if the target of the ALU flop is any of the sources (or destination) of the divide or sqrt operation. 
*/ - if (hppa_use_dfa_pipeline_interface ()) - return insn_default_latency (dep_insn) - 2; - else - return cost - 2; + return insn_default_latency (dep_insn) - 2; default: return 0; @@ -4101,10 +4065,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) Exception: For PA7100LC, PA7200 and PA7300, the cost is 3 cycles, unless they bundle together. We also pay the penalty if the second insn is a fpload. */ - if (hppa_use_dfa_pipeline_interface ()) - return insn_default_latency (dep_insn) - 1; - else - return cost - 1; + return insn_default_latency (dep_insn) - 1; default: return 0; @@ -4139,10 +4100,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) preceding divide or sqrt operation has finished if the target of the ALU flop is also the target of the divide or sqrt operation. */ - if (hppa_use_dfa_pipeline_interface ()) - return insn_default_latency (dep_insn) - 2; - else - return cost - 2; + return insn_default_latency (dep_insn) - 2; default: return 0; diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md index 2a6dea3..e1b952b 100644 --- a/gcc/config/pa/pa.md +++ b/gcc/config/pa/pa.md @@ -206,20 +206,41 @@ "fpmpy_700*18") (define_insn_reservation "W7" 2 - (and (eq_attr "type" "load,fpload") + (and (eq_attr "type" "load") (eq_attr "cpu" "700")) "mem_700") -(define_insn_reservation "W8" 3 - (and (eq_attr "type" "store,fpstore") +(define_insn_reservation "W8" 2 + (and (eq_attr "type" "fpload") + (eq_attr "cpu" "700")) + "mem_700") + +(define_insn_reservation "W9" 3 + (and (eq_attr "type" "store") + (eq_attr "cpu" "700")) + "mem_700*3") + +(define_insn_reservation "W10" 3 + (and (eq_attr "type" "fpstore") (eq_attr "cpu" "700")) "mem_700*3") -(define_insn_reservation "W9" 1 +(define_insn_reservation "W11" 1 (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore") (eq_attr "cpu" "700")) "dummy_700") +;; We have a bypass for all computations in the FP unit which feed an +;; FP store as long as the sizes are the 
same. +(define_bypass 2 "W1,W2" "W10" "hppa_fpstore_bypass_p") +(define_bypass 9 "W3" "W10" "hppa_fpstore_bypass_p") +(define_bypass 11 "W4" "W10" "hppa_fpstore_bypass_p") +(define_bypass 13 "W5" "W10" "hppa_fpstore_bypass_p") +(define_bypass 17 "W6" "W10" "hppa_fpstore_bypass_p") + +;; We have an "anti-bypass" for FP loads which feed an FP store. +(define_bypass 4 "W8" "W10" "hppa_fpstore_bypass_p") + ;; Function units for the 7100 and 7150. The 7100/7150 can dual-issue ;; floating point computations with non-floating point computations (fp loads ;; and stores are not fp computations). @@ -228,8 +249,12 @@ ;; take two cycles, during which no Dcache operations should be scheduled. ;; Any special cases are handled in pa_adjust_cost. The 7100, 7150 and 7100LC ;; all have the same memory characteristics if one disregards cache misses. - +;; ;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV. +;; There's no value in modeling the ALU and MUL separately though +;; since there can never be a functional unit conflict given the +;; latency and issue rates for those units. +;; ;; Timings: ;; Instruction Time Unit Minimum Distance (unit contention) ;; fcpy 2 ALU 1 @@ -247,11 +272,6 @@ ;; fdiv,dbl 15 DIV 15 ;; fsqrt,sgl 8 DIV 8 ;; fsqrt,dbl 15 DIV 15 -;; -;; We don't really model the FP ALU/MPY units properly (they are -;; distinct subunits in the FP unit). However, there can never be -;; a functional unit; conflict given the latency and issue rates -;; for those units. 
(define_automaton "pa7100") (define_cpu_unit "i_7100, f_7100,fpmac_7100,fpdivsqrt_7100,mem_7100" "pa7100") @@ -272,21 +292,45 @@ "f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14") (define_insn_reservation "X3" 2 - (and (eq_attr "type" "load,fpload") + (and (eq_attr "type" "load") (eq_attr "cpu" "7100")) "i_7100+mem_7100") (define_insn_reservation "X4" 2 - (and (eq_attr "type" "store,fpstore") + (and (eq_attr "type" "fpload") + (eq_attr "cpu" "7100")) + "i_7100+mem_7100") + +(define_insn_reservation "X5" 2 + (and (eq_attr "type" "store") + (eq_attr "cpu" "7100")) + "i_7100+mem_7100,mem_7100") + +(define_insn_reservation "X6" 2 + (and (eq_attr "type" "fpstore") (eq_attr "cpu" "7100")) "i_7100+mem_7100,mem_7100") -(define_insn_reservation "X5" 1 +(define_insn_reservation "X7" 1 (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore") (eq_attr "cpu" "7100")) "i_7100") +;; We have a bypass for all computations in the FP unit which feed an +;; FP store as long as the sizes are the same. +(define_bypass 1 "X0" "X6" "hppa_fpstore_bypass_p") +(define_bypass 7 "X1" "X6" "hppa_fpstore_bypass_p") +(define_bypass 14 "X2" "X6" "hppa_fpstore_bypass_p") + +;; We have an "anti-bypass" for FP loads which feed an FP store. +(define_bypass 3 "X4" "X6" "hppa_fpstore_bypass_p") + ;; The 7100LC has three floating-point units: ALU, MUL, and DIV. +;; There's no value in modeling the ALU and MUL separately though +;; since there can never be a functional unit conflict that +;; can be avoided given the latency, issue rates and mandatory +;; one cycle cpu-wide lock for a double precision fp multiply. 
+;; ;; Timings: ;; Instruction Time Unit Minimum Distance (unit contention) ;; fcpy 2 ALU 1 @@ -321,29 +365,25 @@ ;; ;; load-load pairs ;; store-store pairs -;; fmpyadd,dbl -;; fmpysub,dbl ;; other issue modeling (define_automaton "pa7100lc") (define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc") -(define_cpu_unit "fpalu_7100lc,fpmul_7100lc" "pa7100lc") +(define_cpu_unit "fpmac_7100lc" "pa7100lc") (define_cpu_unit "mem_7100lc" "pa7100lc") -(define_insn_reservation "Y0" 2 - (and (eq_attr "type" "fpcc,fpalu") - (eq_attr "cpu" "7100LC,7200,7300")) - "f_7100lc,fpalu_7100lc") - ;; Double precision multiplies lock the entire CPU for one ;; cycle. There is no way to avoid this lock and trying to ;; schedule around the lock is pointless and thus there is no -;; value in trying to model this lock. Not modeling the lock -;; allows for a smaller DFA and may reduce register pressure. -(define_insn_reservation "Y1" 2 - (and (eq_attr "type" "fpmulsgl,fpmuldbl") +;; value in trying to model this lock. +;; +;; Not modeling the lock allows us to treat fp multiplies just +;; like any other FP alu instruction. It allows for a smaller +;; DFA and may reduce register pressure. +(define_insn_reservation "Y0" 2 + (and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl") (eq_attr "cpu" "7100LC,7200,7300")) - "f_7100lc,fpmul_7100lc") + "f_7100lc,fpmac_7100lc") ;; fp division and sqrt instructions lock the entire CPU for ;; 7 cycles (single precision) or 14 cycles (double precision). @@ -351,43 +391,66 @@ ;; around the lock is pointless and thus there is no value in ;; trying to model this lock. Not modeling the lock allows ;; for a smaller DFA and may reduce register pressure. 
-(define_insn_reservation "Y2" 1 +(define_insn_reservation "Y1" 1 (and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl") (eq_attr "cpu" "7100LC,7200,7300")) "f_7100lc") +(define_insn_reservation "Y2" 2 + (and (eq_attr "type" "load") + (eq_attr "cpu" "7100LC,7200,7300")) + "i1_7100lc+mem_7100lc") + (define_insn_reservation "Y3" 2 - (and (eq_attr "type" "load,fpload") + (and (eq_attr "type" "fpload") (eq_attr "cpu" "7100LC,7200,7300")) "i1_7100lc+mem_7100lc") (define_insn_reservation "Y4" 2 - (and (eq_attr "type" "store,fpstore") + (and (eq_attr "type" "store") + (eq_attr "cpu" "7100LC")) + "i1_7100lc+mem_7100lc,mem_7100lc") + +(define_insn_reservation "Y5" 2 + (and (eq_attr "type" "fpstore") (eq_attr "cpu" "7100LC")) "i1_7100lc+mem_7100lc,mem_7100lc") -(define_insn_reservation "Y5" 1 +(define_insn_reservation "Y6" 1 (and (eq_attr "type" "shift,nullshift") (eq_attr "cpu" "7100LC,7200,7300")) "i1_7100lc") -(define_insn_reservation "Y6" 1 +(define_insn_reservation "Y7" 1 (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift") (eq_attr "cpu" "7100LC,7200,7300")) "(i0_7100lc|i1_7100lc)") ;; The 7200 has a store-load penalty -(define_insn_reservation "Y7" 2 - (and (eq_attr "type" "store,fpstore") +(define_insn_reservation "Y8" 2 + (and (eq_attr "type" "store") + (eq_attr "cpu" "7200")) + "i1_7100lc,mem_7100lc") + +(define_insn_reservation "Y9" 2 + (and (eq_attr "type" "fpstore") (eq_attr "cpu" "7200")) "i1_7100lc,mem_7100lc") ;; The 7300 has no penalty for store-store or store-load -(define_insn_reservation "Y8" 2 - (and (eq_attr "type" "store,fpstore") +(define_insn_reservation "Y10" 2 + (and (eq_attr "type" "store") (eq_attr "cpu" "7300")) "i1_7100lc") +(define_insn_reservation "Y11" 2 + (and (eq_attr "type" "fpstore") + (eq_attr "cpu" "7300")) + "i1_7100lc") + +;; We have an "anti-bypass" for FP loads which feed an FP store. 
+(define_bypass 3 "Y3" "Y5,Y9,Y11" "hppa_fpstore_bypass_p") + ;; Scheduling for the PA8000 is somewhat different than scheduling for a ;; traditional architecture. ;; -- 2.7.4