From b16abbcb8530ab4601873c978c50422960d0faee Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Fri, 14 Nov 2014 02:32:38 +0000 Subject: [PATCH] timevar.def (TV_SCHED_FUSION): New time var. * timevar.def (TV_SCHED_FUSION): New time var. * passes.def (pass_sched_fusion): New pass. * config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New. (extract_base_offset_in_addr, fusion_load_store): New. (arm_sched_fusion_priority): New. (arm_option_override): Disable scheduling fusion by default on non-armv7 processors or ldrd/strd isn't preferred. * sched-int.h (struct _haifa_insn_data): New field. (INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New. * sched-rgn.c (rest_of_handle_sched_fusion): New. (pass_data_sched_fusion, pass_sched_fusion): New. (make_pass_sched_fusion): New. * haifa-sched.c (sched_fusion): New. (insn_cost): Handle sched_fusion. (priority): Handle sched_fusion by calling target hook. (enum rfs_decision): New enum value. (rfs_str): New element for RFS_FUSION. (rank_for_schedule): Support sched_fusion. (schedule_insn, max_issue, prune_ready_list): Handle sched_fusion. (schedule_block, fix_tick_ready): Handle sched_fusion. * common.opt (flag_schedule_fusion): New. * tree-pass.h (make_pass_sched_fusion): New. * target.def (fusion_priority): New. * doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New. * doc/tm.texi: Regenerated. * doc/invoke.texi (-fschedule-fusion): New. testsuite: * gcc.target/arm/ldrd-strd-pair-1.c: New test. * gcc.target/arm/vfp-1.c: Improve scanning string. 
From-SVN: r217533 --- gcc/ChangeLog | 29 ++++++ gcc/common.opt | 4 + gcc/config/arm/arm.c | 131 ++++++++++++++++++++++++ gcc/doc/invoke.texi | 10 +- gcc/doc/tm.texi | 70 +++++++++++++ gcc/doc/tm.texi.in | 2 + gcc/haifa-sched.c | 84 +++++++++++++-- gcc/passes.def | 1 + gcc/sched-int.h | 8 ++ gcc/sched-rgn.c | 63 ++++++++++++ gcc/target.def | 73 +++++++++++++ gcc/testsuite/ChangeLog | 5 + gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c | 23 +++++ gcc/testsuite/gcc.target/arm/vfp-1.c | 2 +- gcc/timevar.def | 1 + gcc/tree-pass.h | 1 + 16 files changed, 497 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ff92bb2..1d20459 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,32 @@ +2014-11-14 Bin Cheng + + * timevar.def (TV_SCHED_FUSION): New time var. + * passes.def (pass_sched_fusion): New pass. + * config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New. + (extract_base_offset_in_addr, fusion_load_store): New. + (arm_sched_fusion_priority): New. + (arm_option_override): Disable scheduling fusion by default + on non-armv7 processors or ldrd/strd isn't preferred. + * sched-int.h (struct _haifa_insn_data): New field. + (INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New. + * sched-rgn.c (rest_of_handle_sched_fusion): New. + (pass_data_sched_fusion, pass_sched_fusion): New. + (make_pass_sched_fusion): New. + * haifa-sched.c (sched_fusion): New. + (insn_cost): Handle sched_fusion. + (priority): Handle sched_fusion by calling target hook. + (enum rfs_decision): New enum value. + (rfs_str): New element for RFS_FUSION. + (rank_for_schedule): Support sched_fusion. + (schedule_insn, max_issue, prune_ready_list): Handle sched_fusion. + (schedule_block, fix_tick_ready): Handle sched_fusion. + * common.opt (flag_schedule_fusion): New. + * tree-pass.h (make_pass_sched_fusion): New. + * target.def (fusion_priority): New. 
+ * doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New. + * doc/tm.texi: Regenerated. + * doc/invoke.texi (-fschedule-fusion): New. + 2014-11-13 Rong Xu PR debug/63581 diff --git a/gcc/common.opt b/gcc/common.opt index e57c457..06daa43 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1848,6 +1848,10 @@ frename-registers Common Report Var(flag_rename_registers) Init(2) Optimization Perform a register renaming optimization pass +fschedule-fusion +Common Report Var(flag_schedule_fusion) Init(2) Optimization +Perform a target dependent instruction fusion optimization pass + freorder-blocks Common Report Var(flag_reorder_blocks) Optimization Reorder basic blocks to improve code placement diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 3f2ddd4..f9c98ac 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -311,6 +311,8 @@ static unsigned arm_add_stmt_cost (void *data, int count, static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1, bool op0_preserve_value); static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void); + +static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*); /* Table of machine attributes. */ static const struct attribute_spec arm_attribute_table[] = @@ -708,6 +710,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true +#undef TARGET_SCHED_FUSION_PRIORITY +#define TARGET_SCHED_FUSION_PRIORITY arm_sched_fusion_priority + struct gcc_target targetm = TARGET_INITIALIZER; /* Obstack for minipool constant handling. */ @@ -3168,6 +3173,12 @@ arm_option_override (void) if (TARGET_THUMB2) inline_asm_unified = 1; + /* Disable scheduling fusion by default if it's not armv7 processor + or doesn't prefer ldrd/strd. 
*/ + if (flag_schedule_fusion == 2 + && (!arm_arch7 || !current_tune->prefer_ldrd_strd)) + flag_schedule_fusion = 0; + /* Register global variables with the garbage collector. */ arm_add_gc_roots (); } @@ -32350,4 +32361,124 @@ arm_is_constant_pool_ref (rtx x) && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))); } +/* If MEM is in the form of [base+offset], extract the two parts + of address and set to BASE and OFFSET, otherwise return false + after clearing BASE and OFFSET. */ + +static bool +extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset) +{ + rtx addr; + + gcc_assert (MEM_P (mem)); + + addr = XEXP (mem, 0); + + /* Strip off const from addresses like (const (addr)). */ + if (GET_CODE (addr) == CONST) + addr = XEXP (addr, 0); + + if (GET_CODE (addr) == REG) + { + *base = addr; + *offset = const0_rtx; + return true; + } + + if (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == REG + && CONST_INT_P (XEXP (addr, 1))) + { + *base = XEXP (addr, 0); + *offset = XEXP (addr, 1); + return true; + } + + *base = NULL_RTX; + *offset = NULL_RTX; + + return false; +} + +/* If INSN is a load or store of address in the form of [base+offset], + extract the two parts and set to BASE and OFFSET. IS_LOAD is set + to TRUE if it's a load. Return TRUE if INSN is such an instruction, + otherwise return FALSE. */ + +static bool +fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load) +{ + rtx x, dest, src; + + gcc_assert (INSN_P (insn)); + x = PATTERN (insn); + if (GET_CODE (x) != SET) + return false; + + src = SET_SRC (x); + dest = SET_DEST (x); + if (GET_CODE (src) == REG && GET_CODE (dest) == MEM) + { + *is_load = false; + extract_base_offset_in_addr (dest, base, offset); + } + else if (GET_CODE (src) == MEM && GET_CODE (dest) == REG) + { + *is_load = true; + extract_base_offset_in_addr (src, base, offset); + } + else + return false; + + return (*base != NULL_RTX && *offset != NULL_RTX); +} + +/* Implement the TARGET_SCHED_FUSION_PRIORITY hook. 
+ + Currently we only support to fuse ldr or str instructions, so FUSION_PRI + and PRI are only calculated for these instructions. For other instructions, + FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other kind + instruction fusion can be supported by returning different priorities. + + It's important that irrelevant instructions get the largest FUSION_PRI. */ + +static void +arm_sched_fusion_priority (rtx_insn *insn, int max_pri, + int *fusion_pri, int *pri) +{ + int tmp, off_val; + bool is_load; + rtx base, offset; + + gcc_assert (INSN_P (insn)); + + tmp = max_pri - 1; + if (!fusion_load_store (insn, &base, &offset, &is_load)) + { + *pri = tmp; + *fusion_pri = tmp; + return; + } + + /* Load goes first. */ + if (is_load) + *fusion_pri = tmp - 1; + else + *fusion_pri = tmp - 2; + + tmp /= 2; + + /* INSN with smaller base register goes first. */ + tmp -= ((REGNO (base) & 0xff) << 20); + + /* INSN with smaller offset goes first. */ + off_val = (int)(INTVAL (offset)); + if (off_val >= 0) + tmp -= (off_val & 0xfffff); + else + tmp += ((- off_val) & 0xfffff); + + *pri = tmp; + return; +} #include "gt-arm.h" diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 12dbb27..b7049d8 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -406,7 +406,7 @@ Objective-C and Objective-C++ Dialects}. -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol -fprofile-generate=@var{path} @gol -fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol --freciprocal-math -free -frename-registers -freorder-blocks @gol +-freciprocal-math -free -frename-registers -fschedule-fusion -freorder-blocks @gol -freorder-blocks-and-partition -freorder-functions @gol -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol -frounding-math -fsched2-use-superblocks -fsched-pressure @gol @@ -9575,6 +9575,14 @@ a ``home register''. Enabled by default with @option{-funroll-loops} and @option{-fpeel-loops}.
+@item -fschedule-fusion +@opindex fschedule-fusion +Performs a target dependent pass over the instruction stream to schedule +instructions of same type together because target machine can execute them +more efficiently if they are adjacent to each other in the instruction flow. + +Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. + @item -ftracer @opindex ftracer Perform tail duplication to enlarge superblock size. This transformation diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 6e2825f..8d137f5 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6771,6 +6771,76 @@ This hook is called by tree reassociator to determine a level of parallelism required in output calculations chain. @end deftypefn +@deftypefn {Target Hook} void TARGET_SCHED_FUSION_PRIORITY (rtx_insn *@var{insn}, int @var{max_pri}, int *@var{fusion_pri}, int *@var{pri}) +This hook is called by scheduling fusion pass. It calculates fusion +priorities for each instruction passed in by parameter. The priorities +are returned via pointer parameters. + +@var{insn} is the instruction whose priorities need to be calculated. +@var{max_pri} is the maximum priority can be returned in any cases. +@var{fusion_pri} is the pointer parameter through which @var{insn}'s +fusion priority should be calculated and returned. +@var{pri} is the pointer parameter through which @var{insn}'s priority +should be calculated and returned. + +Same @var{fusion_pri} should be returned for instructions which should +be scheduled together. Different @var{pri} should be returned for +instructions with same @var{fusion_pri}. @var{fusion_pri} is the major +sort key, @var{pri} is the minor sort key. All instructions will be +scheduled according to the two priorities. All priorities calculated +should be between 0 (exclusive) and @var{max_pri} (inclusive). 
To avoid +false dependencies, @var{fusion_pri} of instructions which need to be +scheduled together should be smaller than @var{fusion_pri} of irrelevant +instructions. + +Given below example: + + ldr r10, [r1, 4] + add r4, r4, r10 + ldr r15, [r2, 8] + sub r5, r5, r15 + ldr r11, [r1, 0] + add r4, r4, r11 + ldr r16, [r2, 12] + sub r5, r5, r16 + +On targets like ARM/AArch64, the two pairs of consecutive loads should be +merged. Since peephole2 pass can't help in this case unless consecutive +loads are actually next to each other in instruction flow. That's where +this scheduling fusion pass works. This hook calculates priority for each +instruction based on its fusion type, like: + + ldr r10, [r1, 4] ; fusion_pri=99, pri=96 + add r4, r4, r10 ; fusion_pri=100, pri=100 + ldr r15, [r2, 8] ; fusion_pri=98, pri=92 + sub r5, r5, r15 ; fusion_pri=100, pri=100 + ldr r11, [r1, 0] ; fusion_pri=99, pri=100 + add r4, r4, r11 ; fusion_pri=100, pri=100 + ldr r16, [r2, 12] ; fusion_pri=98, pri=88 + sub r5, r5, r16 ; fusion_pri=100, pri=100 + +Scheduling fusion pass then sorts all ready to issue instructions according +to the priorities. As a result, instructions of same fusion type will be +pushed together in instruction flow, like: + + ldr r11, [r1, 0] + ldr r10, [r1, 4] + ldr r15, [r2, 8] + ldr r16, [r2, 12] + add r4, r4, r10 + sub r5, r5, r15 + add r4, r4, r11 + sub r5, r5, r16 + +Now peephole2 pass can simply merge the two pairs of loads. + +Since scheduling fusion pass relies on peephole2 to do real fusion +work, it is only enabled by default when peephole2 is in effect. + +This is firstly introduced on ARM/AArch64 targets, please refer to +the hook implementation for how different fusion types are supported. +@end deftypefn + @node Sections @section Dividing the Output into Sections (Texts, Data, @dots{}) @c the above section title is WAY too long.
maybe cut the part between diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 929ee85..7c58a32 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4811,6 +4811,8 @@ them: try the first ones in this list first. @hook TARGET_SCHED_REASSOCIATION_WIDTH +@hook TARGET_SCHED_FUSION_PRIORITY + @node Sections @section Dividing the Output into Sections (Texts, Data, @dots{}) @c the above section title is WAY too long. maybe cut the part between diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c index db3187e..4fb97fb 100644 --- a/gcc/haifa-sched.c +++ b/gcc/haifa-sched.c @@ -1391,6 +1391,9 @@ insn_cost (rtx_insn *insn) { int cost; + if (sched_fusion) + return 0; + if (sel_sched_p ()) { if (recog_memoized (insn) < 0) @@ -1603,6 +1606,8 @@ dep_list_size (rtx insn, sd_list_types_def list) return nodbgcount; } +bool sched_fusion; + /* Compute the priority number for INSN. */ static int priority (rtx_insn *insn) @@ -1617,7 +1622,15 @@ priority (rtx_insn *insn) { int this_priority = -1; - if (dep_list_size (insn, SD_LIST_FORW) == 0) + if (sched_fusion) + { + int this_fusion_priority; + + targetm.sched.fusion_priority (insn, FUSION_MAX_PRIORITY, + &this_fusion_priority, &this_priority); + INSN_FUSION_PRIORITY (insn) = this_fusion_priority; + } + else if (dep_list_size (insn, SD_LIST_FORW) == 0) /* ??? We should set INSN_PRIORITY to insn_cost when and insn has some forward deps but all of them are ignored by contributes_to_priority hook. At the moment we set priority of @@ -2548,7 +2561,7 @@ enum rfs_decision { RFS_SCHED_GROUP, RFS_PRESSURE_DELAY, RFS_PRESSURE_TICK, RFS_FEEDS_BACKTRACK_INSN, RFS_PRIORITY, RFS_SPECULATION, RFS_SCHED_RANK, RFS_LAST_INSN, RFS_PRESSURE_INDEX, - RFS_DEP_COUNT, RFS_TIE, RFS_N }; + RFS_DEP_COUNT, RFS_TIE, RFS_FUSION, RFS_N }; /* Corresponding strings for print outs. 
*/ static const char *rfs_str[RFS_N] = { @@ -2556,7 +2569,7 @@ static const char *rfs_str[RFS_N] = { "RFS_SCHED_GROUP", "RFS_PRESSURE_DELAY", "RFS_PRESSURE_TICK", "RFS_FEEDS_BACKTRACK_INSN", "RFS_PRIORITY", "RFS_SPECULATION", "RFS_SCHED_RANK", "RFS_LAST_INSN", "RFS_PRESSURE_INDEX", - "RFS_DEP_COUNT", "RFS_TIE" }; + "RFS_DEP_COUNT", "RFS_TIE", "RFS_FUSION" }; /* Statistical breakdown of rank_for_schedule decisions. */ typedef struct { unsigned stats[RFS_N]; } rank_for_schedule_stats_t; @@ -2627,6 +2640,55 @@ rank_for_schedule (const void *x, const void *y) /* Make sure that priority of TMP and TMP2 are initialized. */ gcc_assert (INSN_PRIORITY_KNOWN (tmp) && INSN_PRIORITY_KNOWN (tmp2)); + if (sched_fusion) + { + /* The instruction that has the same fusion priority as the last + instruction is the instruction we picked next. If that is not + the case, we sort ready list firstly by fusion priority, then + by priority, and at last by INSN_LUID. */ + int a = INSN_FUSION_PRIORITY (tmp); + int b = INSN_FUSION_PRIORITY (tmp2); + int last = -1; + + if (last_nondebug_scheduled_insn + && !NOTE_P (last_nondebug_scheduled_insn) + && BLOCK_FOR_INSN (tmp) + == BLOCK_FOR_INSN (last_nondebug_scheduled_insn)) + last = INSN_FUSION_PRIORITY (last_nondebug_scheduled_insn); + + if (a != last && b != last) + { + if (a == b) + { + a = INSN_PRIORITY (tmp); + b = INSN_PRIORITY (tmp2); + } + if (a != b) + return rfs_result (RFS_FUSION, b - a, tmp, tmp2); + else + return rfs_result (RFS_FUSION, + INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2); + } + else if (a == b) + { + gcc_assert (last_nondebug_scheduled_insn + && !NOTE_P (last_nondebug_scheduled_insn)); + last = INSN_PRIORITY (last_nondebug_scheduled_insn); + + a = abs (INSN_PRIORITY (tmp) - last); + b = abs (INSN_PRIORITY (tmp2) - last); + if (a != b) + return rfs_result (RFS_FUSION, a - b, tmp, tmp2); + else + return rfs_result (RFS_FUSION, + INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2); + } + else if (a == last) + return rfs_result 
(RFS_FUSION, -1, tmp, tmp2); + else + return rfs_result (RFS_FUSION, 1, tmp, tmp2); + } + if (sched_pressure != SCHED_PRESSURE_NONE) { /* Prefer insn whose scheduling results in the smallest register @@ -4007,8 +4069,8 @@ schedule_insn (rtx_insn *insn) gcc_assert (INSN_TICK (insn) >= MIN_TICK); if (INSN_TICK (insn) > clock_var) /* INSN has been prematurely moved from the queue to the ready list. - This is possible only if following flag is set. */ - gcc_assert (flag_sched_stalled_insns); + This is possible only if following flags are set. */ + gcc_assert (flag_sched_stalled_insns || sched_fusion); /* ??? Probably, if INSN is scheduled prematurely, we should leave INSN_TICK untouched. This is a machine-dependent issue, actually. */ @@ -5500,6 +5562,9 @@ max_issue (struct ready_list *ready, int privileged_n, state_t state, struct choice_entry *top; rtx_insn *insn; + if (sched_fusion) + return 0; + n_ready = ready->n_ready; gcc_assert (dfa_lookahead >= 1 && privileged_n >= 0 && privileged_n <= n_ready); @@ -5848,6 +5913,9 @@ prune_ready_list (state_t temp_state, bool first_cycle_insn_p, bool sched_group_found = false; int min_cost_group = 1; + if (sched_fusion) + return; + for (i = 0; i < ready.n_ready; i++) { rtx_insn *insn = ready_element (&ready, i); @@ -6059,7 +6127,7 @@ schedule_block (basic_block *target_bb, state_t init_state) rtx_insn *tail = PREV_INSN (next_tail); if ((current_sched_info->flags & DONT_BREAK_DEPENDENCIES) == 0 - && sched_pressure != SCHED_PRESSURE_MODEL) + && sched_pressure != SCHED_PRESSURE_MODEL && !sched_fusion) find_modifiable_mems (head, tail); /* We used to have code to avoid getting parameters moved from hard @@ -6455,7 +6523,7 @@ schedule_block (basic_block *target_bb, state_t init_state) { memcpy (temp_state, curr_state, dfa_state_size); cost = state_transition (curr_state, insn); - if (sched_pressure != SCHED_PRESSURE_WEIGHTED) + if (sched_pressure != SCHED_PRESSURE_WEIGHTED && !sched_fusion) gcc_assert (cost < 0); if (memcmp 
(temp_state, curr_state, dfa_state_size) != 0) cycle_issued_insns++; @@ -7288,7 +7356,7 @@ fix_tick_ready (rtx_insn *next) INSN_TICK (next) = tick; delay = tick - clock_var; - if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE) + if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE || sched_fusion) delay = QUEUE_READY; change_queue_index (next, delay); diff --git a/gcc/passes.def b/gcc/passes.def index ebd2b95..194e2a9 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -419,6 +419,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_stack_adjustments); NEXT_PASS (pass_jump2); NEXT_PASS (pass_duplicate_computed_gotos); + NEXT_PASS (pass_sched_fusion); NEXT_PASS (pass_peephole2); NEXT_PASS (pass_if_after_reload); NEXT_PASS (pass_regrename); diff --git a/gcc/sched-int.h b/gcc/sched-int.h index eaeabfa..2e156f3 100644 --- a/gcc/sched-int.h +++ b/gcc/sched-int.h @@ -805,6 +805,9 @@ struct _haifa_insn_data /* A priority for each insn. */ int priority; + /* The fusion priority for each insn. */ + int fusion_priority; + /* The minimum clock tick at which the insn becomes ready. This is used to note timing constraints for the insns in the pending list. */ int tick; @@ -903,6 +906,7 @@ extern vec h_i_d; /* Accessor macros for h_i_d. There are more in haifa-sched.c and sched-rgn.c. */ #define INSN_PRIORITY(INSN) (HID (INSN)->priority) +#define INSN_FUSION_PRIORITY(INSN) (HID (INSN)->fusion_priority) #define INSN_REG_PRESSURE(INSN) (HID (INSN)->reg_pressure) #define INSN_MAX_REG_PRESSURE(INSN) (HID (INSN)->max_reg_pressure) #define INSN_REG_USE_LIST(INSN) (HID (INSN)->reg_use_list) @@ -1620,6 +1624,10 @@ extern void sd_copy_back_deps (rtx_insn *, rtx_insn *, bool); extern void sd_delete_dep (sd_iterator_def); extern void sd_debug_lists (rtx, sd_list_types_def); +/* Macros and declarations for scheduling fusion. 
*/ +#define FUSION_MAX_PRIORITY (INT_MAX) +extern bool sched_fusion; + #endif /* INSN_SCHEDULING */ #endif /* GCC_SCHED_INT_H */ diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c index 7a0af10..2a1c18c 100644 --- a/gcc/sched-rgn.c +++ b/gcc/sched-rgn.c @@ -3658,6 +3658,17 @@ rest_of_handle_sched2 (void) return 0; } +static unsigned int +rest_of_handle_sched_fusion (void) +{ +#ifdef INSN_SCHEDULING + sched_fusion = true; + schedule_insns (); + sched_fusion = false; +#endif + return 0; +} + namespace { const pass_data pass_data_live_range_shrinkage = @@ -3800,3 +3811,55 @@ make_pass_sched2 (gcc::context *ctxt) { return new pass_sched2 (ctxt); } + +namespace { + +const pass_data pass_data_sched_fusion = +{ + RTL_PASS, /* type */ + "sched_fusion", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_SCHED_FUSION, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_sched_fusion : public rtl_opt_pass +{ +public: + pass_sched_fusion (gcc::context *ctxt) + : rtl_opt_pass (pass_data_sched_fusion, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *); + virtual unsigned int execute (function *) + { + return rest_of_handle_sched_fusion (); + } + +}; // class pass_sched_fusion + +bool +pass_sched_fusion::gate (function *) +{ +#ifdef INSN_SCHEDULING + /* Scheduling fusion relies on peephole2 to do real fusion work, + so only enable it if peephole2 is in effect.
*/ + return (optimize > 0 && flag_peephole2 + && flag_schedule_fusion && targetm.sched.fusion_priority != NULL); +#else + return 0; +#endif +} + +} // anon namespace + +rtl_opt_pass * +make_pass_sched_fusion (gcc::context *ctxt) +{ + return new pass_sched_fusion (ctxt); +} diff --git a/gcc/target.def b/gcc/target.def index 0154967..c329b2a 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1526,6 +1526,79 @@ parallelism required in output calculations chain.", int, (unsigned int opc, machine_mode mode), hook_int_uint_mode_1) +/* The following member value is a function that returns priority for + fusion of each instruction via pointer parameters. */ +DEFHOOK +(fusion_priority, +"This hook is called by scheduling fusion pass. It calculates fusion\n\ +priorities for each instruction passed in by parameter. The priorities\n\ +are returned via pointer parameters.\n\ +\n\ +@var{insn} is the instruction whose priorities need to be calculated.\n\ +@var{max_pri} is the maximum priority can be returned in any cases.\n\ +@var{fusion_pri} is the pointer parameter through which @var{insn}'s\n\ +fusion priority should be calculated and returned.\n\ +@var{pri} is the pointer parameter through which @var{insn}'s priority\n\ +should be calculated and returned.\n\ +\n\ +Same @var{fusion_pri} should be returned for instructions which should\n\ +be scheduled together. Different @var{pri} should be returned for\n\ +instructions with same @var{fusion_pri}. @var{fusion_pri} is the major\n\ +sort key, @var{pri} is the minor sort key. All instructions will be\n\ +scheduled according to the two priorities. All priorities calculated\n\ +should be between 0 (exclusive) and @var{max_pri} (inclusive). 
To avoid\n\ +false dependencies, @var{fusion_pri} of instructions which need to be\n\ +scheduled together should be smaller than @var{fusion_pri} of irrelevant\n\ +instructions.\n\ +\n\ +Given below example:\n\ +\n\ + ldr r10, [r1, 4]\n\ + add r4, r4, r10\n\ + ldr r15, [r2, 8]\n\ + sub r5, r5, r15\n\ + ldr r11, [r1, 0]\n\ + add r4, r4, r11\n\ + ldr r16, [r2, 12]\n\ + sub r5, r5, r16\n\ +\n\ +On targets like ARM/AArch64, the two pairs of consecutive loads should be\n\ +merged. Since peephole2 pass can't help in this case unless consecutive\n\ +loads are actually next to each other in instruction flow. That's where\n\ +this scheduling fusion pass works. This hook calculates priority for each\n\ +instruction based on its fusion type, like:\n\ +\n\ + ldr r10, [r1, 4] ; fusion_pri=99, pri=96 \n\ + add r4, r4, r10 ; fusion_pri=100, pri=100 \n\ + ldr r15, [r2, 8] ; fusion_pri=98, pri=92 \n\ + sub r5, r5, r15 ; fusion_pri=100, pri=100 \n\ + ldr r11, [r1, 0] ; fusion_pri=99, pri=100 \n\ + add r4, r4, r11 ; fusion_pri=100, pri=100 \n\ + ldr r16, [r2, 12] ; fusion_pri=98, pri=88 \n\ + sub r5, r5, r16 ; fusion_pri=100, pri=100 \n\ +\n\ +Scheduling fusion pass then sorts all ready to issue instructions according\n\ +to the priorities.
As a result, instructions of same fusion type will be\n\ +pushed together in instruction flow, like:\n\ +\n\ + ldr r11, [r1, 0]\n\ + ldr r10, [r1, 4]\n\ + ldr r15, [r2, 8]\n\ + ldr r16, [r2, 12]\n\ + add r4, r4, r10\n\ + sub r5, r5, r15\n\ + add r4, r4, r11\n\ + sub r5, r5, r16\n\ +\n\ +Now peephole2 pass can simply merge the two pairs of loads.\n\ +\n\ +Since scheduling fusion pass relies on peephole2 to do real fusion\n\ +work, it is only enabled by default when peephole2 is in effect.\n\ +\n\ +This is firstly introduced on ARM/AArch64 targets, please refer to\n\ +the hook implementation for how different fusion types are supported.", +void, (rtx_insn *insn, int max_pri, int *fusion_pri, int *pri), NULL) + HOOK_VECTOR_END (sched) /* Functions relating to OpenMP and Cilk Plus SIMD clones. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c61744b..b1e9bc3 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2014-11-14 Bin Cheng + + * gcc.target/arm/ldrd-strd-pair-1.c: New test. + * gcc.target/arm/vfp-1.c: Improve scanning string. 
+ 2014-11-13 Rong Xu PR debug/63581 diff --git a/gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c b/gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c new file mode 100644 index 0000000..7a0bff5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_prefer_ldrd_strd } */ +/* { dg-options "-O2 -mthumb" } */ + +struct +{ + int x; + int y; + char c; + int d; +}a; + +int foo(int x, int y) +{ + int c; + a.x = x; + c = a.x; + a.d = c; + a.y = y; + + return 0; +} +/* { dg-final { scan-assembler "strd\t" { target { arm_thumb2_ok } } } } */ diff --git a/gcc/testsuite/gcc.target/arm/vfp-1.c b/gcc/testsuite/gcc.target/arm/vfp-1.c index 8ceef2b..b6bb7be 100644 --- a/gcc/testsuite/gcc.target/arm/vfp-1.c +++ b/gcc/testsuite/gcc.target/arm/vfp-1.c @@ -126,7 +126,7 @@ void test_convert () { } void test_ldst (float f[], double d[]) { - /* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #1020\\\]" } } */ + /* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #-?\[0-9\]+\\\]" } } */ /* { dg-final { scan-assembler "vldr.32.+ \\\[r\[0-9\], #-1020\\\]" { target { arm32 && { ! 
arm_thumb2_ok } } } } } */ /* { dg-final { scan-assembler "add.+ r0, #1024" } } */ /* { dg-final { scan-assembler "vstr.32.+ \\\[r\[0-9\]\\\]\n" } } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index 95c84ad..5d8d0e7 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -247,6 +247,7 @@ DEFTIMEVAR (TV_IFCVT2 , "if-conversion 2") DEFTIMEVAR (TV_COMBINE_STACK_ADJUST , "combine stack adjustments") DEFTIMEVAR (TV_PEEPHOLE2 , "peephole 2") DEFTIMEVAR (TV_RENAME_REGISTERS , "rename registers") +DEFTIMEVAR (TV_SCHED_FUSION , "scheduling fusion") DEFTIMEVAR (TV_CPROP_REGISTERS , "hard reg cprop") DEFTIMEVAR (TV_SCHED2 , "scheduling 2") DEFTIMEVAR (TV_MACH_DEP , "machine dep reorg") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index b8f2801..a3e1e72 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -552,6 +552,7 @@ extern rtl_opt_pass *make_pass_branch_target_load_optimize1 (gcc::context extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context *ctxt); extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt); +extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt); extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_if_after_reload (gcc::context *ctxt); extern rtl_opt_pass *make_pass_regrename (gcc::context *ctxt); -- 2.7.4