From 4cad3320a449d20eeda8892fdad37e37f0a5fd56 Mon Sep 17 00:00:00 2001 From: steven Date: Thu, 4 Mar 2004 20:11:08 +0000 Subject: [PATCH] * ppro.md: Rewrite as a DFA pipeline description. * i386.md: Remove all uses of the ppro_uops attribute. * i386.c: (ix86_safe_ppro_uops, ix86_dump_ppro_packet, ix86_reorder_insn, ix86_sched_reorder_ppro, ix86_sched_init, ix86_sched_reorder, ix86_variable_issue, struct ix86_sched_data, TARGET_SCHED_VARIABLE_ISSUE, TARGET_SCHED_INIT, TARGET_SCHED_REORDER): Remove. (ia32_use_dfa_pipeline_interface): Add TARGET_PENTIUMPRO. (ia32_multipass_dfa_lookahead): Add TARGET_PENTIUMPRO. * athlon.md (athlon_ssecmp_load): Fix comment git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@78933 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 15 + gcc/config/i386/athlon.md | 2 +- gcc/config/i386/i386.c | 257 +------------- gcc/config/i386/i386.md | 124 ++----- gcc/config/i386/ppro.md | 877 +++++++++++++++++++++++++++++++++++++++------- 5 files changed, 804 insertions(+), 471 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f1e583d..85b7c0e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,7 +1,22 @@ +2004-03-04 Steven Bosscher + + * ppro.md: Rewrite as a DFA pipeline description. + * i386.md: Remove all uses of the ppro_uops attribute. + * i386.c: (ix86_safe_ppro_uops, ix86_dump_ppro_packet, + ix86_reorder_insn, ix86_sched_reorder_ppro, ix86_sched_init, + ix86_sched_reorder, ix86_variable_issue, + struct ix86_sched_data, TARGET_SCHED_VARIABLE_ISSUE, + TARGET_SCHED_INIT, TARGET_SCHED_REORDER): Remove. + (ia32_use_dfa_pipeline_interface): Add TARGET_PENTIUMPRO. + (ia32_multipass_dfa_lookahead): Add TARGET_PENTIUMPRO. + * athlon.md (athlon_ssecmp_load): Fix comment + 2004-03-04 Stuart Hastings + * gcc/doc/invoke.texi: Document -mlongcall for Darwin/PPC. 
2004-03-04 Stuart Hastings + * gcc/config/i386/darwin.h: Darwin/x86 doesn't support CPUs before 686, tell Darwin assembler to allow prefetch insns, non-empty def of SUBTARGET_OPTION_TRANSLATE_TABLE. diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md index 4ce9a38..308ae1e 100644 --- a/gcc/config/i386/athlon.md +++ b/gcc/config/i386/athlon.md @@ -581,7 +581,7 @@ (and (eq_attr "cpu" "k8") (eq_attr "type" "sselog")) "athlon-double,athlon-fpsched,athlon-fmul") -;; ??? pcmp executes in addmul, probably not wortwhile to brother about that. +;; ??? pcmp executes in addmul, probably not worthwhile to bother about that. (define_insn_reservation "athlon_ssecmp_load" 2 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "ssecmp") diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2a12bd8..58e2633 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -806,9 +806,6 @@ static rtx gen_push (rtx); static int memory_address_length (rtx addr); static int ix86_flags_dependant (rtx, rtx, enum attr_type); static int ix86_agi_dependant (rtx, rtx, enum attr_type); -static enum attr_ppro_uops ix86_safe_ppro_uops (rtx); -static void ix86_dump_ppro_packet (FILE *); -static void ix86_reorder_insn (rtx *, rtx *); static struct machine_function * ix86_init_machine_status (void); static int ix86_split_to_parts (rtx, rtx *, enum machine_mode); static int ix86_nsaved_regs (void); @@ -816,16 +813,12 @@ static void ix86_emit_save_regs (void); static void ix86_emit_save_regs_using_mov (rtx, HOST_WIDE_INT); static void ix86_emit_restore_regs_using_mov (rtx, HOST_WIDE_INT, int); static void ix86_output_function_epilogue (FILE *, HOST_WIDE_INT); -static void ix86_sched_reorder_ppro (rtx *, rtx *); static HOST_WIDE_INT ix86_GOT_alias_set (void); static void ix86_adjust_counter (rtx, HOST_WIDE_INT); static rtx ix86_expand_aligntest (rtx, int); static void ix86_expand_strlensi_unroll_1 (rtx, rtx, rtx); static int ix86_issue_rate (void); static int 
ix86_adjust_cost (rtx, rtx, rtx, int); -static void ix86_sched_init (FILE *, int, int); -static int ix86_sched_reorder (FILE *, int, rtx *, int *, int); -static int ix86_variable_issue (FILE *, int, rtx, int); static int ia32_use_dfa_pipeline_interface (void); static int ia32_multipass_dfa_lookahead (void); static void ix86_init_mmx_sse_builtins (void); @@ -975,12 +968,6 @@ static void init_ext_80387_constants (void); #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost #undef TARGET_SCHED_ISSUE_RATE #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate -#undef TARGET_SCHED_VARIABLE_ISSUE -#define TARGET_SCHED_VARIABLE_ISSUE ix86_variable_issue -#undef TARGET_SCHED_INIT -#define TARGET_SCHED_INIT ix86_sched_init -#undef TARGET_SCHED_REORDER -#define TARGET_SCHED_REORDER ix86_sched_reorder #undef TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE #define TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE \ ia32_use_dfa_pipeline_interface @@ -12321,244 +12308,12 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) return cost; } -static union -{ - struct ppro_sched_data - { - rtx decode[3]; - int issued_this_cycle; - } ppro; -} ix86_sched_data; - -static enum attr_ppro_uops -ix86_safe_ppro_uops (rtx insn) -{ - if (recog_memoized (insn) >= 0) - return get_attr_ppro_uops (insn); - else - return PPRO_UOPS_MANY; -} - -static void -ix86_dump_ppro_packet (FILE *dump) -{ - if (ix86_sched_data.ppro.decode[0]) - { - fprintf (dump, "PPRO packet: %d", - INSN_UID (ix86_sched_data.ppro.decode[0])); - if (ix86_sched_data.ppro.decode[1]) - fprintf (dump, " %d", INSN_UID (ix86_sched_data.ppro.decode[1])); - if (ix86_sched_data.ppro.decode[2]) - fprintf (dump, " %d", INSN_UID (ix86_sched_data.ppro.decode[2])); - fputc ('\n', dump); - } -} - -/* We're beginning a new block. Initialize data structures as necessary. 
*/ - -static void -ix86_sched_init (FILE *dump ATTRIBUTE_UNUSED, - int sched_verbose ATTRIBUTE_UNUSED, - int veclen ATTRIBUTE_UNUSED) -{ - memset (&ix86_sched_data, 0, sizeof (ix86_sched_data)); -} - -/* Shift INSN to SLOT, and shift everything else down. */ - -static void -ix86_reorder_insn (rtx *insnp, rtx *slot) -{ - if (insnp != slot) - { - rtx insn = *insnp; - do - insnp[0] = insnp[1]; - while (++insnp != slot); - *insnp = insn; - } -} - -static void -ix86_sched_reorder_ppro (rtx *ready, rtx *e_ready) -{ - rtx decode[3]; - enum attr_ppro_uops cur_uops; - int issued_this_cycle; - rtx *insnp; - int i; - - /* At this point .ppro.decode contains the state of the three - decoders from last "cycle". That is, those insns that were - actually independent. But here we're scheduling for the - decoder, and we may find things that are decodable in the - same cycle. */ - - memcpy (decode, ix86_sched_data.ppro.decode, sizeof (decode)); - issued_this_cycle = 0; - - insnp = e_ready; - cur_uops = ix86_safe_ppro_uops (*insnp); - - /* If the decoders are empty, and we've a complex insn at the - head of the priority queue, let it issue without complaint. */ - if (decode[0] == NULL) - { - if (cur_uops == PPRO_UOPS_MANY) - { - decode[0] = *insnp; - goto ppro_done; - } - - /* Otherwise, search for a 2-4 uop unsn to issue. */ - while (cur_uops != PPRO_UOPS_FEW) - { - if (insnp == ready) - break; - cur_uops = ix86_safe_ppro_uops (*--insnp); - } - - /* If so, move it to the head of the line. */ - if (cur_uops == PPRO_UOPS_FEW) - ix86_reorder_insn (insnp, e_ready); - - /* Issue the head of the queue. */ - issued_this_cycle = 1; - decode[0] = *e_ready--; - } - - /* Look for simple insns to fill in the other two slots. 
*/ - for (i = 1; i < 3; ++i) - if (decode[i] == NULL) - { - if (ready > e_ready) - goto ppro_done; - - insnp = e_ready; - cur_uops = ix86_safe_ppro_uops (*insnp); - while (cur_uops != PPRO_UOPS_ONE) - { - if (insnp == ready) - break; - cur_uops = ix86_safe_ppro_uops (*--insnp); - } - - /* Found one. Move it to the head of the queue and issue it. */ - if (cur_uops == PPRO_UOPS_ONE) - { - ix86_reorder_insn (insnp, e_ready); - decode[i] = *e_ready--; - issued_this_cycle++; - continue; - } - - /* ??? Didn't find one. Ideally, here we would do a lazy split - of 2-uop insns, issue one and queue the other. */ - } - - ppro_done: - if (issued_this_cycle == 0) - issued_this_cycle = 1; - ix86_sched_data.ppro.issued_this_cycle = issued_this_cycle; -} - -/* We are about to being issuing insns for this clock cycle. - Override the default sort algorithm to better slot instructions. */ -static int -ix86_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, - int sched_verbose ATTRIBUTE_UNUSED, rtx *ready, - int *n_readyp, int clock_var ATTRIBUTE_UNUSED) -{ - int n_ready = *n_readyp; - rtx *e_ready = ready + n_ready - 1; - - /* Make sure to go ahead and initialize key items in - ix86_sched_data if we are not going to bother trying to - reorder the ready queue. */ - if (n_ready < 2) - { - ix86_sched_data.ppro.issued_this_cycle = 1; - goto out; - } - - switch (ix86_tune) - { - default: - break; - - case PROCESSOR_PENTIUMPRO: - ix86_sched_reorder_ppro (ready, e_ready); - break; - } - -out: - return ix86_issue_rate (); -} - -/* We are about to issue INSN. Return the number of insns left on the - ready queue that can be issued this cycle. 
*/ - -static int -ix86_variable_issue (FILE *dump, int sched_verbose, rtx insn, - int can_issue_more) -{ - int i; - switch (ix86_tune) - { - default: - return can_issue_more - 1; - - case PROCESSOR_PENTIUMPRO: - { - enum attr_ppro_uops uops = ix86_safe_ppro_uops (insn); - - if (uops == PPRO_UOPS_MANY) - { - if (sched_verbose) - ix86_dump_ppro_packet (dump); - ix86_sched_data.ppro.decode[0] = insn; - ix86_sched_data.ppro.decode[1] = NULL; - ix86_sched_data.ppro.decode[2] = NULL; - if (sched_verbose) - ix86_dump_ppro_packet (dump); - ix86_sched_data.ppro.decode[0] = NULL; - } - else if (uops == PPRO_UOPS_FEW) - { - if (sched_verbose) - ix86_dump_ppro_packet (dump); - ix86_sched_data.ppro.decode[0] = insn; - ix86_sched_data.ppro.decode[1] = NULL; - ix86_sched_data.ppro.decode[2] = NULL; - } - else - { - for (i = 0; i < 3; ++i) - if (ix86_sched_data.ppro.decode[i] == NULL) - { - ix86_sched_data.ppro.decode[i] = insn; - break; - } - if (i == 3) - abort (); - if (i == 2) - { - if (sched_verbose) - ix86_dump_ppro_packet (dump); - ix86_sched_data.ppro.decode[0] = NULL; - ix86_sched_data.ppro.decode[1] = NULL; - ix86_sched_data.ppro.decode[2] = NULL; - } - } - } - return --ix86_sched_data.ppro.issued_this_cycle; - } -} - static int ia32_use_dfa_pipeline_interface (void) { - if (TARGET_PENTIUM || TARGET_ATHLON_K8) + if (TARGET_PENTIUM + || TARGET_PENTIUMPRO + || TARGET_ATHLON_K8) return 1; return 0; } @@ -12572,8 +12327,12 @@ ia32_multipass_dfa_lookahead (void) { if (ix86_tune == PROCESSOR_PENTIUM) return 2; + + if (ix86_tune == PROCESSOR_PENTIUMPRO) + return 1; + else - return 0; + return 0; } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 56c1a8c..e957552 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -246,7 +246,7 @@ (const_int 1) (const_int 0))) -;; Set when 0f opcode prefix is used. +;; Set when REX opcode prefix is used. 
(define_attr "prefix_rex" "" (cond [(and (eq_attr "mode" "DI") (eq_attr "type" "!push,pop,call,callv,leave,ibr")) @@ -939,8 +939,7 @@ "fnstsw\t%0" [(set_attr "length" "2") (set_attr "mode" "SI") - (set_attr "unit" "i387") - (set_attr "ppro_uops" "few")]) + (set_attr "unit" "i387")]) ;; FP compares, step 3 ;; Get ax into flags, general case. @@ -952,8 +951,7 @@ "sahf" [(set_attr "length" "1") (set_attr "athlon_decode" "vector") - (set_attr "mode" "SI") - (set_attr "ppro_uops" "one")]) + (set_attr "mode" "SI")]) ;; Pentium Pro can do steps 1 through 3 in one go. @@ -1264,8 +1262,7 @@ (set_attr "pent_pair" "np") (set_attr "athlon_decode" "vector") (set_attr "mode" "SI") - (set_attr "modrm" "0") - (set_attr "ppro_uops" "few")]) + (set_attr "modrm" "0")]) (define_expand "movhi" [(set (match_operand:HI 0 "nonimmediate_operand" "") @@ -1384,8 +1381,7 @@ [(set_attr "type" "imov") (set_attr "pent_pair" "np") (set_attr "mode" "HI") - (set_attr "modrm" "0") - (set_attr "ppro_uops" "few")]) + (set_attr "modrm" "0")]) (define_insn "*swaphi_2" [(set (match_operand:HI 0 "register_operand" "+r") @@ -1397,8 +1393,7 @@ [(set_attr "type" "imov") (set_attr "pent_pair" "np") (set_attr "mode" "SI") - (set_attr "modrm" "0") - (set_attr "ppro_uops" "few")]) + (set_attr "modrm" "0")]) (define_expand "movstricthi" [(set (strict_low_part (match_operand:HI 0 "nonimmediate_operand" "")) @@ -1557,8 +1552,7 @@ [(set_attr "type" "imov") (set_attr "pent_pair" "np") (set_attr "mode" "QI") - (set_attr "modrm" "0") - (set_attr "ppro_uops" "few")]) + (set_attr "modrm" "0")]) (define_expand "movstrictqi" [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "")) @@ -2115,8 +2109,7 @@ (set_attr "pent_pair" "np") (set_attr "athlon_decode" "vector") (set_attr "mode" "DI") - (set_attr "modrm" "0") - (set_attr "ppro_uops" "few")]) + (set_attr "modrm" "0")]) (define_expand "movsf" @@ -4450,8 +4443,7 @@ "fnstcw\t%0" [(set_attr "length" "2") (set_attr "mode" "HI") - (set_attr "unit" "i387") - 
(set_attr "ppro_uops" "few")]) + (set_attr "unit" "i387")]) (define_insn "x86_fldcw_1" [(set (reg:HI 18) @@ -4461,8 +4453,7 @@ [(set_attr "length" "2") (set_attr "mode" "HI") (set_attr "unit" "i387") - (set_attr "athlon_decode" "vector") - (set_attr "ppro_uops" "few")]) + (set_attr "athlon_decode" "vector")]) ;; Conversion between fixed point and floating point. @@ -4972,8 +4963,7 @@ "adc{q}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "mode" "DI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "DI")]) (define_insn "*adddi3_cc_rex64" [(set (reg:CC 17) @@ -4997,8 +4987,7 @@ "adc{b}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "mode" "QI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "QI")]) (define_insn "addhi3_carry" [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r") @@ -5010,8 +4999,7 @@ "adc{w}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "mode" "HI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "HI")]) (define_insn "addsi3_carry" [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r") @@ -5023,8 +5011,7 @@ "adc{l}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "mode" "SI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "SI")]) (define_insn "*addsi3_carry_zext" [(set (match_operand:DI 0 "register_operand" "=r") @@ -5037,8 +5024,7 @@ "adc{l}\t{%2, %k0|%k0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "mode" "SI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "SI")]) (define_insn "*addsi3_cc" [(set (reg:CC 17) @@ -6645,7 +6631,6 @@ "sbb{q}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "ppro_uops" "few") (set_attr "mode" "DI")]) (define_insn "*subdi_1_rex64" @@ -6694,7 +6679,6 @@ "sbb{b}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "ppro_uops" "few") (set_attr "mode" "QI")]) 
(define_insn "subhi3_carry" @@ -6707,7 +6691,6 @@ "sbb{w}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "ppro_uops" "few") (set_attr "mode" "HI")]) (define_insn "subsi3_carry" @@ -6720,7 +6703,6 @@ "sbb{l}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "ppro_uops" "few") (set_attr "mode" "SI")]) (define_insn "subsi3_carry_zext" @@ -6734,7 +6716,6 @@ "sbb{l}\t{%2, %k0|%k0, %2}" [(set_attr "type" "alu") (set_attr "pent_pair" "pu") - (set_attr "ppro_uops" "few") (set_attr "mode" "SI")]) (define_expand "subsi3" @@ -7155,7 +7136,6 @@ && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" "mul{q}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set_attr "length_immediate" "0") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") @@ -7183,7 +7163,6 @@ && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" "mul{l}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set_attr "length_immediate" "0") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") @@ -7272,7 +7251,6 @@ && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" "mul{q}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set_attr "length_immediate" "0") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") @@ -7308,7 +7286,6 @@ "GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM" "mul{l}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set_attr "length_immediate" "0") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") @@ -7331,7 +7308,6 @@ && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" "mul{l}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set_attr "length_immediate" "0") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") @@ -7368,7 +7344,6 @@ && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" "imul{q}\t%2" 
[(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") @@ -7403,7 +7378,6 @@ "GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM" "imul{l}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") @@ -7425,7 +7399,6 @@ && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" "imul{l}\t%2" [(set_attr "type" "imul") - (set_attr "ppro_uops" "few") (set (attr "athlon_decode") (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") @@ -7465,8 +7438,7 @@ "TARGET_QIMODE_MATH" "idiv{b}\t%2" [(set_attr "type" "idiv") - (set_attr "mode" "QI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "QI")]) (define_insn "udivqi3" [(set (match_operand:QI 0 "register_operand" "=a") @@ -7476,8 +7448,7 @@ "TARGET_QIMODE_MATH" "div{b}\t%2" [(set_attr "type" "idiv") - (set_attr "mode" "QI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "QI")]) ;; The patterns that match these are at the end of this file. 
@@ -7550,8 +7521,7 @@ "TARGET_64BIT" "idiv{q}\t%2" [(set_attr "type" "idiv") - (set_attr "mode" "DI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "DI")]) (define_split [(set (match_operand:DI 0 "register_operand" "") @@ -7635,8 +7605,7 @@ "" "idiv{l}\t%2" [(set_attr "type" "idiv") - (set_attr "mode" "SI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "SI")]) (define_split [(set (match_operand:SI 0 "register_operand" "") @@ -7710,7 +7679,6 @@ "TARGET_64BIT" "div{q}\t%2" [(set_attr "type" "idiv") - (set_attr "ppro_uops" "few") (set_attr "mode" "DI")]) (define_split @@ -7754,7 +7722,6 @@ "" "div{l}\t%2" [(set_attr "type" "idiv") - (set_attr "ppro_uops" "few") (set_attr "mode" "SI")]) (define_split @@ -7797,8 +7764,7 @@ "" "div{w}\t%2" [(set_attr "type" "idiv") - (set_attr "mode" "HI") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "HI")]) ;; We can not use div/idiv for double division, because it causes ;; "division by zero" on the overflow and that's not what we expect @@ -7817,8 +7783,7 @@ ; (clobber (reg:CC 17))] ; "" ; "div{l}\t{%2, %0|%0, %2}" -; [(set_attr "type" "idiv") -; (set_attr "ppro_uops" "few")]) +; [(set_attr "type" "idiv")]) ;;- Logical AND instructions @@ -9932,8 +9897,7 @@ "TARGET_80387 && reload_completed" "fchs" [(set_attr "type" "fsgn") - (set_attr "mode" "SF") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "SF")]) (define_insn "*negdf2_1" [(set (match_operand:DF 0 "register_operand" "=f") @@ -9941,8 +9905,7 @@ "TARGET_80387 && reload_completed" "fchs" [(set_attr "type" "fsgn") - (set_attr "mode" "DF") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "DF")]) (define_insn "*negextendsfdf2" [(set (match_operand:DF 0 "register_operand" "=f") @@ -9951,8 +9914,7 @@ "TARGET_80387" "fchs" [(set_attr "type" "fsgn") - (set_attr "mode" "DF") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "DF")]) (define_insn "*negxf2_1" [(set (match_operand:XF 0 "register_operand" "=f") @@ -9960,8 +9922,7 @@ "TARGET_80387 && reload_completed" 
"fchs" [(set_attr "type" "fsgn") - (set_attr "mode" "XF") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "XF")]) (define_insn "*negextenddfxf2" [(set (match_operand:XF 0 "register_operand" "=f") @@ -9970,8 +9931,7 @@ "TARGET_80387" "fchs" [(set_attr "type" "fsgn") - (set_attr "mode" "XF") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "XF")]) (define_insn "*negextendsfxf2" [(set (match_operand:XF 0 "register_operand" "=f") @@ -9980,8 +9940,7 @@ "TARGET_80387" "fchs" [(set_attr "type" "fsgn") - (set_attr "mode" "XF") - (set_attr "ppro_uops" "few")]) + (set_attr "mode" "XF")]) ;; Absolute value instructions @@ -10762,8 +10721,7 @@ (set_attr "prefix_0f" "1") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector") - (set_attr "ppro_uops" "few")]) + (set_attr "athlon_decode" "vector")]) (define_expand "x86_shift_adj_1" [(set (reg:CCZ 17) @@ -11466,7 +11424,6 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "pent_pair" "np") - (set_attr "ppro_uops" "few") (set_attr "mode" "SI")]) (define_expand "x86_shift_adj_3" @@ -13484,8 +13441,7 @@ else return "dec{l}\t%1\;%+jne\t%l0"; } - [(set_attr "ppro_uops" "many") - (set (attr "length") + [(set (attr "length") (if_then_else (and (eq_attr "alternative" "0") (and (ge (minus (match_dup 0) (pc)) (const_int -126)) @@ -13871,8 +13827,7 @@ "nop" [(set_attr "length" "1") (set_attr "length_immediate" "0") - (set_attr "modrm" "0") - (set_attr "ppro_uops" "one")]) + (set_attr "modrm" "0")]) ;; Align to 16-byte boundary, max skip in op0. 
Used to avoid ;; branch prediction penalty for the third jump in a 16-byte @@ -14033,8 +13988,7 @@ (ctz:SI (match_dup 1)))] "" "bsf{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1") - (set_attr "ppro_uops" "few")]) + [(set_attr "prefix_0f" "1")]) (define_insn "ctzsi2" [(set (match_operand:SI 0 "register_operand" "=r") @@ -14042,8 +13996,7 @@ (clobber (reg:CC 17))] "" "bsf{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1") - (set_attr "ppro_uops" "few")]) + [(set_attr "prefix_0f" "1")]) (define_expand "clzsi2" [(parallel @@ -14064,8 +14017,7 @@ (clobber (reg:CC 17))] "" "bsr{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1") - (set_attr "ppro_uops" "few")]) + [(set_attr "prefix_0f" "1")]) ;; Thread-local storage patterns for ELF. ;; @@ -14482,7 +14434,6 @@ ] (const_string "fop"))) (set_attr "fp_int_src" "true") - (set_attr "ppro_uops" "many") (set_attr "mode" "SI")]) (define_insn "*fop_sf_3" @@ -14500,7 +14451,6 @@ ] (const_string "fop"))) (set_attr "fp_int_src" "true") - (set_attr "ppro_uops" "many") (set_attr "mode" "SI")]) (define_insn "*fop_df_1_nosse" @@ -14581,7 +14531,6 @@ ] (const_string "fop"))) (set_attr "fp_int_src" "true") - (set_attr "ppro_uops" "many") (set_attr "mode" "SI")]) (define_insn "*fop_df_3" @@ -14599,7 +14548,6 @@ ] (const_string "fop"))) (set_attr "fp_int_src" "true") - (set_attr "ppro_uops" "many") (set_attr "mode" "SI")]) (define_insn "*fop_df_4" @@ -14686,8 +14634,7 @@ ] (const_string "fop"))) (set_attr "fp_int_src" "true") - (set_attr "mode" "SI") - (set_attr "ppro_uops" "many")]) + (set_attr "mode" "SI")]) (define_insn "*fop_xf_3" [(set (match_operand:XF 0 "register_operand" "=f,f") @@ -14704,8 +14651,7 @@ ] (const_string "fop"))) (set_attr "fp_int_src" "true") - (set_attr "mode" "SI") - (set_attr "ppro_uops" "many")]) + (set_attr "mode" "SI")]) (define_insn "*fop_xf_4" [(set (match_operand:XF 0 "register_operand" "=f,f") diff --git a/gcc/config/i386/ppro.md b/gcc/config/i386/ppro.md index 911bf3b..fc1374d 100644 --- 
a/gcc/config/i386/ppro.md +++ b/gcc/config/i386/ppro.md @@ -1,5 +1,5 @@ -;; Pentium Pro/PII Scheduling -;; Copyright (C) 2002 Free Software Foundation, Inc. +;; Scheduling for the Intel P6 family of processors +;; Copyright (C) 2004 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; @@ -18,133 +18,746 @@ ;; the Free Software Foundation, 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. */ -;; Categorize how many uops an ia32 instruction evaluates to: -;; one -- an instruction with 1 uop can be decoded by any of the -;; three decoders. -;; few -- an instruction with 1 to 4 uops can be decoded only by -;; decoder 0. -;; many -- a complex instruction may take an unspecified number of -;; cycles to decode in decoder 0. - -(define_attr "ppro_uops" "one,few,many" - (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str") - (const_string "many") - (eq_attr "type" "icmov,fcmov,str,cld,leave") - (const_string "few") - (eq_attr "type" "imov") - (if_then_else (eq_attr "memory" "store,both") - (const_string "few") - (const_string "one")) - (eq_attr "memory" "!none") - (const_string "few") - ] - (const_string "one"))) - -;; -;; The PPro has an out-of-order core, but the instruction decoders are -;; naturally in-order and asymmetric. We get best performance by scheduling -;; for the decoders, for in doing so we give the oo execution unit the -;; most choices. -;; -;; Rough readiness numbers. Fine tuning happens in i386.c. -;; -;; p0 describes port 0. -;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either. -;; p2 describes port 2 for loads. -;; p34 describes ports 3 and 4 for stores. -;; fpu describes the fpu accessed via port 0. -;; ??? It is less than clear if there are separate fadd and fmul units -;; that could operate in parallel. -;; -;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. 
- -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "ishift,rotate,ishift1,rotate1,lea,ibr,cld")) - 1 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "imul")) - 4 1) - -;; ??? Does the divider lock out the pipe while it works, -;; or is there a disconnected unit? -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "idiv")) - 17 17) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fop,fsgn,fistp")) - 3 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fcmov")) - 2 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fcmp")) - 1 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fmov")) - 1 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fmul")) - 5 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fdiv,fpspc")) - 56 1) - -(define_function_unit "ppro_p01" 2 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "!imov,fmov")) - 1 1) - -(define_function_unit "ppro_p01" 2 0 - (and (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "imov,fmov")) - (eq_attr "memory" "none")) - 1 1) - -(define_function_unit "ppro_p2" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (ior (eq_attr "type" "pop,leave") - (eq_attr "memory" "load,both"))) - 3 1) - -(define_function_unit "ppro_p34" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (ior (eq_attr "type" "push") - (eq_attr "memory" "store,both"))) - 1 1) - -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp")) - 1 1) - -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fmul")) - 5 2) - -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" 
"pentiumpro") - (eq_attr "type" "fdiv,fpspc")) - 56 56) - -;; imul uses the fpu. ??? does it have the same throughput as fmul? -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "imul")) - 4 1) +;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron +;; and Xeon lines of CPUs. The DFA scheduler description in this file is +;; based on information that can be found in the following three documents: +;; +;; "P6 Family of Processors Hardware Developer's Manual", +;; Intel, September 1999. +;; +;; "Intel Architecture Optimization Manual", +;; Intel, 1999 (Order Number: 245127-001). +;; +;; "How to optimize for the Pentium family of microprocessors", +;; by Agner Fog, PhD. +;; +;; The P6 pipeline has three major components: +;; 1) the FETCH/DECODE unit, an in-order issue front-end +;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core +;; 3) the RETIRE unit, an in-order retirement unit +;; +;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and +;; retirement unit are naturally in-order. +;; +;; BUS INTERFACE UNIT +;; / \ +;; L1 ICACHE L1 DCACHE +;; / | \ | \ +;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE +;; \ | / | | +;; INSTRUCTION POOL __________|_______/ +;; (inc. reorder buffer) +;; +;; Since the P6 CPUs execute instructions out-of-order, the most important +;; consideration in performance tuning is making sure enough micro-ops are +;; ready for execution in the out-of-order core, while not stalling the +;; decoder. +;; +;; TODO: +;; - Find a less crude way to model complex instructions, in +;; particular how many cycles they take to be decoded. +;; - Include decoder latencies in the total reservation latencies. +;; This isn't necessary right now because we assume for every +;; instruction that it never blocks a decoder. +;; - Figure out where the p0 and p1 reservations come from. These +;; appear not to be in the manual (e.g. 
why is cld "(p0+p1)*2" +;; better than "(p0|p1)*4" ???) +;; - Lots more because I'm sure this is still far from optimal :-) + +;; The ppro_idiv and ppro_fdiv automata are used to model issue +;; latencies of idiv and fdiv type insns. +(define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store") + +;; Simple instructions of the register-register form have only one uop. +;; Load instructions are also only one uop. Store instructions decode to +;; two uops, and simple read-modify instructions also take two uops. +;; Simple instructions of the register-memory form have two to three uops. +;; Simple read-modify-write instructions have four uops. The rules for +;; the decoder are simple: +;; - an instruction with 1 uop can be decoded by any of the three +;; decoders in one cycle. +;; - an instruction with 1 to 4 uops can be decoded only by decoder 0 +;; but still in only one cycle. +;; - a complex (microcode) instruction can also only be decoded by +;; decoder 0, and this takes an unspecified number of cycles. +;; +;; The goal is to schedule such that we have a few-one-one uops sequence +;; in each cycle, to decode as many instructions per cycle as possible. +(define_cpu_unit "decoder0" "ppro_decoder") +(define_cpu_unit "decoder1" "ppro_decoder") +(define_cpu_unit "decoder2" "ppro_decoder") + +;; We first wish to find an instruction for decoder0, so exclude +;; decoder1 and decoder2 from being reserved until decoder 0 is +;; reserved. +(presence_set "decoder1" "decoder0") +(presence_set "decoder2" "decoder0") + +;; Most instructions can be decoded on any of the three decoders. +(define_reservation "decodern" "(decoder0|decoder1|decoder2)") + +;; The out-of-order core has five pipelines. During each cycle, the core +;; may dispatch zero or one uop on the port of any of the five pipelines +;; so the maximum number of dispatched uops per cycle is 5. In practice, +;; 3 uops per cycle is more realistic. 
+;; +;; Two of the five pipelines contain several execution units: +;; +;; Port 0 Port 1 Port 2 Port 3 Port 4 +;; ALU ALU LOAD SAC SDA +;; FPU JUE +;; AGU MMX +;; MMX P3FPU +;; P3FPU +;; +;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit, +;; JUE = Jump Execution Unit, AGU = Address Generation Unit) +;; +(define_cpu_unit "p0,p1" "ppro_core") +(define_cpu_unit "p2" "ppro_load") +(define_cpu_unit "p3,p4" "ppro_store") +(define_cpu_unit "idiv" "ppro_idiv") +(define_cpu_unit "fdiv" "ppro_fdiv") + +;; Only the irregular instructions have to be modeled here. A load +;; increases the latency by 2 or 3, or by nothing if the manual gives +;; a latency already. Store latencies are not accounted for. +;; +;; The simple instructions follow a very regular pattern of 1 uop per +;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store +;; on port 4 and port 3. These instructions are modelled at the bottom +;; of this file. +;; +;; For microcoded instructions we don't know how many uops are produced. +;; These instructions are the "complex" ones in the Intel manuals. All +;; we _do_ know is that they typically produce four or more uops, so +;; they can only be decoded on decoder0. Modelling their latencies +;; doesn't make sense because we don't know how these instructions are +;; executed in the core. So we just model that they can only be decoded +;; on decoder 0, and say that it takes a little while before the result +;; is available. +(define_insn_reservation "ppro_complex_insn" 6 + (eq_attr "type" "other,multi,call,callv,str") + "decoder0") + +;; imov with memory operands does not use the integer units. 
+(define_insn_reservation "ppro_imov" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "imov"))) + "decodern,(p0|p1)") + +(define_insn_reservation "ppro_imov_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "imov"))) + "decodern,p2") + +(define_insn_reservation "ppro_imov_store" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (eq_attr "type" "imov"))) + "decoder0,p4+p3") + +;; imovx always decodes to one uop, and also doesn't use the integer +;; units if it has memory operands. +(define_insn_reservation "ppro_imovx" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "imovx"))) + "decodern,(p0|p1)") + +(define_insn_reservation "ppro_imovx_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "imovx"))) + "decodern,p2") + +;; lea executes on port 0 with latency one and throughput 1. +(define_insn_reservation "ppro_lea" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "lea"))) + "decodern,p0") + +;; Shift and rotate execute on port 0 with latency and throughput 1. +;; The load and store units need to be reserved when memory operands +;; are involved. +(define_insn_reservation "ppro_shift_rotate" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "decodern,p0") + +(define_insn_reservation "ppro_shift_rotate_mem" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "decoder0,p2+p0,p4+p3") + +(define_insn_reservation "ppro_cld" 2 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "cld")) + "decoder0,(p0+p1)*2") + +;; The P6 has a sophisticated branch prediction mechanism to minimize +;; latencies due to branching.
In particular, it has a fast way to +;; execute branches that are taken multiple times (such as in loops). +;; Branches not taken suffer no penalty, and correctly predicted +;; branches cost only one fetch cycle. Mispredicted branches are very +;; costly: typically 15 cycles and possibly as many as 26 cycles. +;; +;; Unfortunately all this makes it quite difficult to properly model +;; the latencies for the compiler. Here I've made the choice to be +;; optimistic and assume branches are often predicted correctly, so +;; they have latency 1, and the decoders are not blocked. +;; +;; In addition, the model assumes a branch always decodes to only 1 uop, +;; which is not exactly true because there are a few instructions that +;; decode to 2 uops or microcode. But this probably gives the best +;; results because we can assume these instructions can decode on all +;; decoders. +(define_insn_reservation "ppro_branch" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "ibr"))) + "decodern,p1") + +;; ??? Indirect branches probably have worse latency than this. +(define_insn_reservation "ppro_indirect_branch" 6 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ibr"))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_leave" 4 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "leave")) + "decoder0,p2+(p0|p1),(p0|p1)") + +;; imul has throughput one, but latency 4, and can only execute on port 0. +(define_insn_reservation "ppro_imul" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "imul"))) + "decodern,p0") + +(define_insn_reservation "ppro_imul_mem" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none") + (eq_attr "type" "imul"))) + "decoder0,p2+p0") + +;; div and idiv are very similar, so we model them the same. +;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
+;; These issue latencies are modelled via the ppro_idiv automaton. +(define_insn_reservation "ppro_idiv_QI" 19 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9") + +(define_insn_reservation "ppro_idiv_QI_load" 19 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9") + +(define_insn_reservation "ppro_idiv_HI" 23 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17") + +(define_insn_reservation "ppro_idiv_HI_load" 23 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18") + +(define_insn_reservation "ppro_idiv_SI" 39 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33") + +(define_insn_reservation "ppro_idiv_SI_load" 39 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34") + +;; Floating point operations always execute on port 0. +;; ??? where do these latencies come from? fadd has latency 3 and +;; has throughput "1/cycle (align with FADD)". What do they +;; mean and how can we model that?
+(define_insn_reservation "ppro_fop" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "fop"))) + "decodern,p0") + +(define_insn_reservation "ppro_fop_load" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "fop"))) + "decoder0,p2+p0,p0") + +(define_insn_reservation "ppro_fop_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (eq_attr "type" "fop"))) + "decoder0,p0,p0,p0+p4+p3") + +(define_insn_reservation "ppro_fop_both" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "both") + (eq_attr "type" "fop"))) + "decoder0,p2+p0,p0+p4+p3") + +(define_insn_reservation "ppro_fsgn" 1 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fsgn")) + "decodern,p0") + +(define_insn_reservation "ppro_fistp" 5 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fistp")) + "decoder0,p0*2,p4+p3") + +(define_insn_reservation "ppro_fcmov" 2 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fcmov")) + "decoder0,p0*2") + +(define_insn_reservation "ppro_fcmp" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "fcmp"))) + "decodern,p0") + +(define_insn_reservation "ppro_fcmp_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "fcmp"))) + "decoder0,p2+p0") + +(define_insn_reservation "ppro_fmov" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmov"))) + "decodern,p0") + +(define_insn_reservation "ppro_fmov_load" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "decodern,p2") + +(define_insn_reservation "ppro_fmov_XF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "decoder0,(p2+p0)*2") + +;; NOTE(review): this store reserves only p0 and neither of the store +;; ports p4/p3, unlike ppro_fmov_XF_store below -- confirm intended. +(define_insn_reservation "ppro_fmov_store" 1 + (and (eq_attr "cpu" 
"pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "decodern,p0") + +(define_insn_reservation "ppro_fmov_XF_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "decoder0,(p0+p4),(p0+p3)") + +;; fmul executes on port 0 with latency 5. It has issue latency 2, +;; but we don't model this. +(define_insn_reservation "ppro_fmul" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmul"))) + "decoder0,p0*2") + +(define_insn_reservation "ppro_fmul_load" 6 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul"))) + "decoder0,p2+p0,p0") + +;; fdiv latencies depend on the mode of the operands. XFmode gives +;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18. +;; Division by a power of 2 takes only 9 cycles, but we cannot model +;; that. Throughput is equal to latency - 1, which we model using the +;; ppro_fdiv automaton.
+;; The _load variants below have one cycle more latency than the +;; register forms but reserve fdiv for the same number of cycles. +(define_insn_reservation "ppro_fdiv_SF" 18 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "decodern,p0+fdiv,fdiv*16") + +(define_insn_reservation "ppro_fdiv_SF_load" 19 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "decoder0,p2+p0+fdiv,fdiv*16") + +(define_insn_reservation "ppro_fdiv_DF" 32 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "decodern,p0+fdiv,fdiv*30") + +(define_insn_reservation "ppro_fdiv_DF_load" 33 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "decoder0,p2+p0+fdiv,fdiv*30") + +(define_insn_reservation "ppro_fdiv_XF" 38 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "decodern,p0+fdiv,fdiv*36") + +(define_insn_reservation "ppro_fdiv_XF_load" 39 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "decoder0,p2+p0+fdiv,fdiv*36") + +;; MMX instructions can execute on either port 0 or port 1 with a +;; throughput of 1/cycle. +;; on port 0: - ALU (latency 1) +;; - Multiplier Unit (latency 3) +;; on port 1: - ALU (latency 1) +;; - Shift Unit (latency 1) +;; +;; MMX instructions are either of the type reg-reg, or read-modify, and +;; except for mmxshft and mmxmul they can execute on port 0 or port 1, +;; so they behave as "simple" instructions that need no special modelling. +;; We only have to model mmxshft and mmxmul.
+(define_insn_reservation "ppro_mmx_shft" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxshft"))) + "decodern,p1") + +;; The load forms must test for memory "load"; they also reserve the +;; load port p2 and can only be decoded on decoder0. +(define_insn_reservation "ppro_mmx_shft_load" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxshft"))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_mmx_mul" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul"))) + "decodern,p0") + +(define_insn_reservation "ppro_mmx_mul_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxmul"))) + "decoder0,p2+p0") + +(define_insn_reservation "ppro_sse_mmxcvt" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "DI") + (eq_attr "type" "mmxcvt"))) + "decodern,p1") + +;; FIXME: These are Pentium III only, but we cannot tell here if +;; we're generating code for PentiumPro/Pentium II or Pentium III +;; (define_insn_reservation "ppro_sse_mmxshft" 2 +;; (and (eq_attr "cpu" "pentiumpro") +;; (and (eq_attr "mode" "DI") +;; (eq_attr "type" "mmxshft"))) +;; "decodern,p0") + +;; SSE is very complicated, and takes a bit more effort. +;; ??? I assumed that all SSE instructions decode on decoder0, +;; but is this correct? + +;; The sfence instruction. It is recognized here as an insn of type +;; "sse" whose memory attribute is "unknown". +(define_insn_reservation "ppro_sse_sfence" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "unknown") + (eq_attr "type" "sse"))) + "decoder0,p4+p3") + +;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
+(define_insn_reservation "ppro_sse_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sse"))) + "decodern,p0") + +(define_insn_reservation "ppro_sse_add_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseadd")))) + "decodern,p1") + +(define_insn_reservation "ppro_sse_add_SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseadd")))) + "decoder0,p2+p1") + +;; NOTE(review): unlike ppro_sse_add_SF, this register form is +;; restricted to decoder0 rather than decodern -- confirm intended. +(define_insn_reservation "ppro_sse_cmp_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p1") + +(define_insn_reservation "ppro_sse_cmp_SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p2+p1") + +(define_insn_reservation "ppro_sse_comi_SF" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecomi")))) + "decodern,p0") + +(define_insn_reservation "ppro_sse_comi_SF_load" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssecomi")))) + "decoder0,p2+p0") + +(define_insn_reservation "ppro_sse_mul_SF" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemul")))) + "decodern,p0") + +(define_insn_reservation "ppro_sse_mul_SF_load" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemul")))) + "decoder0,p2+p0") + +;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
+(define_insn_reservation "ppro_sse_div_SF" 18 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "decoder0,p0*17") + +;; The load form also reserves the load port p2 in its first cycle. +(define_insn_reservation "ppro_sse_div_SF_load" 18 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "decoder0,(p2+p0),p0*16") + +(define_insn_reservation "ppro_sse_icvt_SF" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseicvt"))) + "decoder0,(p2+p1)*2") + +(define_insn_reservation "ppro_sse_icvt_SI" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "SI") + (eq_attr "type" "sseicvt"))) + "decoder0,(p2+p1)") + +(define_insn_reservation "ppro_sse_mov_SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemov")))) + "decoder0,(p0|p1)") + +(define_insn_reservation "ppro_sse_mov_SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemov")))) + "decoder0,p2+(p0|p1)") + +(define_insn_reservation "ppro_sse_mov_SF_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssemov")))) + "decoder0,p4+p3") + +(define_insn_reservation "ppro_sse_V4SF" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sse"))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_add_V4SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sseadd")))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_add_V4SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sseadd")))) + "decoder0,(p2+p1)*2") + +(define_insn_reservation "ppro_sse_cmp_V4SF" 3 + (and (eq_attr "cpu" 
"pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecmp")))) + "decoder0,p1*2") + +(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecmp")))) + "decoder0,(p2+p1)*2") + +(define_insn_reservation "ppro_sse_cvt_V4SF" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none,unknown") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecvt")))) + "decoder0,p1*2") + +;; V4SF conversions whose memory attribute is neither "none" nor +;; "unknown". +(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "!none,unknown") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssecvt")))) + "decoder0,p1,p4+p3") + +(define_insn_reservation "ppro_sse_mul_V4SF" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemul")))) + "decoder0,p0*2") + +(define_insn_reservation "ppro_sse_mul_V4SF_load" 5 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemul")))) + "decoder0,(p2+p0)*2") + +;; FIXME: p0 really closed this long???
+(define_insn_reservation "ppro_sse_div_V4SF" 48 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "decoder0,p0*34") + +;; The load form also reserves the load port p2 for its first two +;; cycles. +(define_insn_reservation "ppro_sse_div_V4SF_load" 48 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "decoder0,(p2+p0)*2,p0*32") + +(define_insn_reservation "ppro_sse_log_V4SF" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sselog")))) + "decodern,p1") + +(define_insn_reservation "ppro_sse_log_V4SF_load" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sselog")))) + "decoder0,(p2+p1)") + +(define_insn_reservation "ppro_sse_mov_V4SF" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemov")))) + "decoder0,(p0|p1)*2") + +(define_insn_reservation "ppro_sse_mov_V4SF_load" 2 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemov")))) + "decoder0,p2*2") + +(define_insn_reservation "ppro_sse_mov_V4SF_store" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssemov")))) + "decoder0,(p4+p3)*2") + +;; All other instructions are modelled as simple instructions. +;; We have already modelled all i387 floating point instructions, so all +;; other instructions execute on either port 0 or port 1. This includes +;; the ALU units, and the MMX units. +;; +;; reg-reg instructions produce 1 uop so they can be decoded on any of +;; the three decoders.
+(define_insn_reservation "ppro_insn" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp"))) + "decodern,(p0|p1)") + +;; read-modify and register-memory instructions have two or three uops, +;; so they have to be decoded on decoder0. +(define_insn_reservation "ppro_insn_load" 3 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp"))) + "decoder0,p2+(p0|p1)") + +;; Stores reserve an ALU port first and then both store ports. +(define_insn_reservation "ppro_insn_store" 1 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp"))) + "decoder0,(p0|p1),p4+p3") + +;; read-modify-store instructions produce 4 uops so they have to be +;; decoded on decoder0 as well. +(define_insn_reservation "ppro_insn_both" 4 + (and (eq_attr "cpu" "pentiumpro") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp"))) + "decoder0,p2+(p0|p1),p4+p3") + -- 2.7.4