+2007-02-09 Jan Hubicka <jh@suse.cz>
+
+ * Makefile.in (passes.o, ipa-inline.o): Add dependencies.
+ * cgraphbuild.c (build_cgraph_edges): Compute frequencies.
+ (rebuild_cgraph_edges): Likewise.
+ * cgraph.c (cgraph_create_edge): Add new argument frequency.
+ (dump_cgraph_node): Dump frequencies.
+ (cgraph_clone_edge): Add frequency scales.
+ (cgraph_clone_node): Add frequency.
+ * cgraph.h (cgraph_edge): Add frequency field.
+ (CGRAPH_FREQ_BASE, CGRAPH_FREQ_MAX): New constants.
+ (cgraph_create_edge, cgraph_clone_edge, cgraph_clone_node): Update.
+ * tree-pass.h (TODO_rebuild_frequencies): New constant.
+ * cgraphunit.c (verify_cgraph_node): Verify frequencies.
+ (cgraph_copy_node_for_versioning): Update call of cgraph_clone_edge.
+ (save_inline_function_body): Likewise.
+ * ipa-inline.c: Include rtl.h.
+ (cgraph_clone_inlined_nodes): Update call of cgraph_clone_node.
+ (cgraph_edge_badness): Use frequencies.
+ (cgraph_decide_recursive_inlining): Update cloning.
+ (cgraph_decide_inlining_of_small_functions): Dump frequency.
+ * predict.c (estimate_bb_frequencies): Export.
+ * predict.h (estimate_bb_frequencies): Declare.
+ * tree-inline.c (copy_bb): Watch overflows.
+ (expand_call_inline): Update call of cgraph_create_edge.
+ (optimize_inline_calls): Use TODO flags to update frequencies.
+ * passes.c: Include predict.h.
+ (init_optimization_passes): Move profile ahead.
+ (execute_function_todo): Handle TODO_rebuild_frequencies.
+
2007-02-09 Roger Sayle <roger@eyesopen.com>
* config/alpha/alpha.c (emit_insxl): Force the first operand of
langhooks.h insn-flags.h $(CFGLAYOUT_H) $(REAL_H) $(CFGLOOP_H) \
hosthooks.h $(CGRAPH_H) $(COVERAGE_H) tree-pass.h $(TREE_DUMP_H) \
$(GGC_H) $(INTEGRATE_H) $(CPPLIB_H) opts.h $(TREE_FLOW_H) $(TREE_INLINE_H) \
- gt-passes.h
+ gt-passes.h $(PREDICT_H)
main.o : main.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) toplev.h
ipa-inline.o : ipa-inline.c gt-ipa-inline.h $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
$(TREE_H) langhooks.h $(TREE_INLINE_H) $(FLAGS_H) $(CGRAPH_H) intl.h \
$(DIAGNOSTIC_H) $(FIBHEAP_H) $(PARAMS_H) $(TIMEVAR_H) tree-pass.h \
- $(COVERAGE_H) $(HASHTAB_H)
+ $(COVERAGE_H) $(HASHTAB_H) $(RTL_H)
ipa-utils.o : ipa-utils.c $(IPA_UTILS_H) $(CONFIG_H) $(SYSTEM_H) \
coretypes.h $(TM_H) $(TREE_H) $(TREE_FLOW_H) $(TREE_INLINE_H) langhooks.h \
pointer-set.h $(GGC_H) $(C_COMMON_H) $(TREE_GIMPLE_H) \
struct cgraph_edge *
cgraph_create_edge (struct cgraph_node *caller, struct cgraph_node *callee,
- tree call_stmt, gcov_type count, int nest)
+ tree call_stmt, gcov_type count, int freq, int nest)
{
struct cgraph_edge *edge = GGC_NEW (struct cgraph_edge);
#ifdef ENABLE_CHECKING
caller->callees = edge;
callee->callers = edge;
edge->count = count;
+ gcc_assert (count >= 0);
+ edge->frequency = freq;
+ gcc_assert (freq >= 0);
+ gcc_assert (freq <= CGRAPH_FREQ_MAX);
edge->loop_nest = nest;
if (caller->call_site_hash)
{
if (edge->count)
fprintf (f, "("HOST_WIDEST_INT_PRINT_DEC"x) ",
(HOST_WIDEST_INT)edge->count);
+ if (edge->frequency)
+ fprintf (f, "(%.2f per call) ",
+ edge->frequency / (double)CGRAPH_FREQ_BASE);
if (!edge->inline_failed)
fprintf(f, "(inlined) ");
}
if (edge->count)
fprintf (f, "("HOST_WIDEST_INT_PRINT_DEC"x) ",
(HOST_WIDEST_INT)edge->count);
+ if (edge->frequency)
+ fprintf (f, "(%.2f per call) ",
+ edge->frequency / (double)CGRAPH_FREQ_BASE);
if (edge->loop_nest)
fprintf (f, "(nested in %i loops) ", edge->loop_nest);
}
/* Create a clone of edge E in the node N; the new call is represented by CALL_STMT. */
struct cgraph_edge *
cgraph_clone_edge (struct cgraph_edge *e, struct cgraph_node *n,
- tree call_stmt, gcov_type count_scale, int loop_nest,
- bool update_original)
+ tree call_stmt, gcov_type count_scale, int freq_scale,
+ int loop_nest, bool update_original)
{
struct cgraph_edge *new;
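+ /* Both scale arguments are fixed-point fractions: COUNT_SCALE is relative
+ to REG_BR_PROB_BASE and FREQ_SCALE to CGRAPH_FREQ_BASE, hence the
+ divisions below. */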
+ gcov_type count = e->count * count_scale / REG_BR_PROB_BASE;
+ gcov_type freq = e->frequency * (gcov_type) freq_scale / CGRAPH_FREQ_BASE;
- new = cgraph_create_edge (n, e->callee, call_stmt,
- e->count * count_scale / REG_BR_PROB_BASE,
+ if (freq > CGRAPH_FREQ_MAX)
+ freq = CGRAPH_FREQ_MAX;
+ new = cgraph_create_edge (n, e->callee, call_stmt, count, freq,
e->loop_nest + loop_nest);
new->inline_failed = e->inline_failed;
function's profile to reflect the fact that part of execution is handled
by node. */
struct cgraph_node *
-cgraph_clone_node (struct cgraph_node *n, gcov_type count, int loop_nest,
- bool update_original)
+cgraph_clone_node (struct cgraph_node *n, gcov_type count, int freq,
+ int loop_nest, bool update_original)
{
struct cgraph_node *new = cgraph_create_node ();
}
for (e = n->callees;e; e=e->next_callee)
- cgraph_clone_edge (e, new, e->call_stmt, count_scale, loop_nest,
+ cgraph_clone_edge (e, new, e->call_stmt, count_scale, freq, loop_nest,
update_original);
new->next_clone = n->next_clone;
const char *inline_failed;
/* Expected number of executions: calculated in profile.c. */
gcov_type count;
+ /* Expected frequency of executions within the function.
+ When set to CGRAPH_FREQ_BASE, the edge is expected to be called once
+ per function call. The range is 0 to CGRAPH_FREQ_MAX. */
+ int frequency;
/* Depth of loop nest, 1 means no loop nest. */
int loop_nest;
};
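+/* Fixed-point scale for edge frequencies: an edge expected to execute on
+ average 2.5 times per call of its caller has frequency 2500. CGRAPH_FREQ_MAX
+ caps this at 100 executions per call. */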
+#define CGRAPH_FREQ_BASE 1000
+#define CGRAPH_FREQ_MAX 100000
+
typedef struct cgraph_edge *cgraph_edge_p;
DEF_VEC_P(cgraph_edge_p);
void cgraph_node_remove_callees (struct cgraph_node *node);
struct cgraph_edge *cgraph_create_edge (struct cgraph_node *,
struct cgraph_node *,
- tree, gcov_type, int);
+ tree, gcov_type, int, int);
struct cgraph_node *cgraph_node (tree);
struct cgraph_node *cgraph_node_for_asm (tree asmname);
struct cgraph_edge *cgraph_edge (struct cgraph_node *, tree);
const char * cgraph_node_name (struct cgraph_node *);
struct cgraph_edge * cgraph_clone_edge (struct cgraph_edge *,
struct cgraph_node *,
- tree, gcov_type, int, bool);
-struct cgraph_node * cgraph_clone_node (struct cgraph_node *, gcov_type,
+ tree, gcov_type, int, int, bool);
+struct cgraph_node * cgraph_clone_node (struct cgraph_node *, gcov_type, int,
int, bool);
void cgraph_redirect_edge_callee (struct cgraph_edge *, struct cgraph_node *);
struct pointer_set_t *visited_nodes = pointer_set_create ();
block_stmt_iterator bsi;
tree step;
+ int entry_freq = ENTRY_BLOCK_PTR->frequency;
+
+ if (!entry_freq)
+ entry_freq = 1;
/* Create the callgraph edges and record the nodes referenced by the function
body. */
if (call && (decl = get_callee_fndecl (call)))
{
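+	  /* Express the block frequency relative to the entry block, so the
+	  edge frequency measures expected executions per call of the function. */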
+ int freq = (!bb->frequency && !entry_freq ? CGRAPH_FREQ_BASE
+ : bb->frequency * CGRAPH_FREQ_BASE / entry_freq);
+ if (freq > CGRAPH_FREQ_MAX)
+ freq = CGRAPH_FREQ_MAX;
cgraph_create_edge (node, cgraph_node (decl), stmt,
- bb->count,
+ bb->count, freq,
bb->loop_depth);
walk_tree (&TREE_OPERAND (call, 1),
record_reference, node, visited_nodes);
basic_block bb;
struct cgraph_node *node = cgraph_node (current_function_decl);
block_stmt_iterator bsi;
+ int entry_freq = ENTRY_BLOCK_PTR->frequency;
+
+ if (!entry_freq)
+ entry_freq = 1;
cgraph_node_remove_callees (node);
tree decl;
if (call && (decl = get_callee_fndecl (call)))
- cgraph_create_edge (node, cgraph_node (decl), stmt,
- bb->count,
- bb->loop_depth);
+ {
+ int freq = (!bb->frequency && !entry_freq ? CGRAPH_FREQ_BASE
+ : bb->frequency * CGRAPH_FREQ_BASE / entry_freq);
+ if (freq > CGRAPH_FREQ_MAX)
+ freq = CGRAPH_FREQ_MAX;
+ cgraph_create_edge (node, cgraph_node (decl), stmt,
+ bb->count, freq, bb->loop_depth);
+ }
}
initialize_inline_failed (node);
gcc_assert (!node->global.inlined_to);
error ("caller edge count is negative");
error_found = true;
}
+ if (e->frequency < 0)
+ {
+ error ("caller edge frequency is negative");
+ error_found = true;
+ }
+ if (e->frequency > CGRAPH_FREQ_MAX)
+ {
+ error ("caller edge frequency is too large");
+ error_found = true;
+ }
if (!e->inline_failed)
{
if (node->global.inlined_to
also cloned. */
for (e = old_version->callees;e; e=e->next_callee)
{
- new_e = cgraph_clone_edge (e, new_version, e->call_stmt, 0, e->loop_nest, true);
+ new_e = cgraph_clone_edge (e, new_version, e->call_stmt, 0, e->frequency,
+ e->loop_nest, true);
new_e->count = e->count;
}
/* Fix recursive calls.
{
struct cgraph_edge *e;
- first_clone = cgraph_clone_node (node, node->count, 0, false);
+ first_clone = cgraph_clone_node (node, node->count, CGRAPH_FREQ_BASE, 0,
+ false);
first_clone->needed = 0;
first_clone->reachable = 1;
/* Recursively clone all bodies. */
#include "coverage.h"
#include "ggc.h"
#include "tree-flow.h"
+#include "rtl.h"
/* Modes the incremental inliner operates on:
else
{
struct cgraph_node *n;
- n = cgraph_clone_node (e->callee, e->count, e->loop_nest,
+ n = cgraph_clone_node (e->callee, e->count, e->frequency, e->loop_nest,
update_original);
cgraph_redirect_edge_callee (e, n);
}
smallest badness are inlined first. After each inlining is performed
the costs of all caller edges of nodes affected are recomputed so the
metrics may accurately depend on values such as number of inlinable callers
- of the function or function body size.
-
- With profiling we use number of executions of each edge to drive the cost.
- We also should distinguish hot and cold calls where the cold calls are
- inlined into only when code size is overall improved.
- */
+ of the function or function body size. */
static int
cgraph_edge_badness (struct cgraph_edge *edge)
{
+ int badness;
+ int growth =
+ cgraph_estimate_size_after_inlining (1, edge->caller, edge->callee);
+
+ growth -= edge->caller->global.insns;
+
+ /* Always prefer inlining saving code size. */
+ if (growth <= 0)
+ badness = INT_MIN - growth;
+
+ /* When profiling is available, base priorities on -(#calls / growth).
+ So we optimize for overall number of "executed" inlined calls. */
- if (max_count)
+ else if (max_count)
+ badness = ((int)((double)edge->count * INT_MIN / max_count)) / growth;
+
+ /* When a function-local profile is available, base priorities on
+ growth / frequency, so we optimize for overall frequency of inlined
+ calls. This is not too accurate: while the call might be frequent
+ within its function, the function itself may be called infrequently.
+
+ Another objective to optimize for is the number of different calls
+ inlined. We add the estimated growth after inlining all functions to
+ bias the priorities slightly in this direction (so that, among functions
+ of the same size, the less frequently called ones get priority). */
+ else if (flag_guess_branch_prob)
{
+ int div = edge->frequency * 100 / CGRAPH_FREQ_BASE;
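+ /* E.g. frequency 2500 (2.5 executions per call) yields div == 250, so a
+ positive badness is divided by 250 below. */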
int growth =
cgraph_estimate_size_after_inlining (1, edge->caller, edge->callee);
growth -= edge->caller->global.insns;
+ badness = growth * 256;
+
+ /* Decrease badness of frequently executed calls. */
+ /* Compress the range so we don't overflow. */
+ if (div > 256)
+ div = 256 + ceil_log2 (div) - 8;
+ if (div < 1)
+ div = 1;
+ if (badness > 0)
+ badness /= div;
+ badness += cgraph_estimate_growth (edge->callee);
+ }
+ /* When the function-local profile is not available or does not give
+ useful information (i.e. the frequency is zero), base the cost on
+ loop nest and overall size growth, so we optimize for overall number
+ of functions fully inlined in the program. */
+ else
+ {
+ int nest = MIN (edge->loop_nest, 8);
+ badness = cgraph_estimate_growth (edge->callee) * 256;
- /* Always prefer inlining saving code size. */
- if (growth <= 0)
- return INT_MIN - growth;
- return ((int)((double)edge->count * INT_MIN / max_count)) / growth;
+ /* Decrease badness if call is nested. */
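+ /* E.g. a call nested in three loops has its badness shifted down by
+ three bits, making it about 8x more attractive to inline. */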
+ if (badness > 0)
+ badness >>= nest;
+ else
+ badness <<= nest;
}
+ /* Make recursive inlining always happen after other inlining is done. */
+ if (cgraph_recursive_inlining_p (edge->caller, edge->callee, NULL))
+ return badness + 1;
else
- {
- int nest = MIN (edge->loop_nest, 8);
- int badness = cgraph_estimate_growth (edge->callee) * 256;
-
- /* Decrease badness if call is nested. */
- if (badness > 0)
- badness >>= nest;
- else
- badness <<= nest;
-
- /* Make recursive inlining happen always after other inlining is done. */
- if (cgraph_recursive_inlining_p (edge->caller, edge->callee, NULL))
- return badness + 1;
- else
- return badness;
- }
+ return badness;
}
/* Recompute heap nodes for each caller edge. */
cgraph_node_name (node));
/* We need original clone to copy around. */
- master_clone = cgraph_clone_node (node, node->count, 1, false);
+ master_clone = cgraph_clone_node (node, node->count, CGRAPH_FREQ_BASE, 1,
+ false);
master_clone->needed = true;
for (e = master_clone->callees; e; e = e->next_callee)
if (!e->inline_failed)
fprintf (dump_file,
" to be inlined into %s\n"
" Estimated growth after inlined into all callees is %+i insns.\n"
- " Estimated badness is %i.\n",
+ " Estimated badness is %i, frequency %.2f.\n",
cgraph_node_name (edge->caller),
cgraph_estimate_growth (edge->callee),
- cgraph_edge_badness (edge));
+ cgraph_edge_badness (edge),
+ edge->frequency / (double)CGRAPH_FREQ_BASE);
if (edge->count)
fprintf (dump_file," Called "HOST_WIDEST_INT_PRINT_DEC"x\n", edge->count);
}
#include "tree-flow.h"
#include "tree-pass.h"
#include "tree-dump.h"
+#include "predict.h"
#if defined (DWARF2_UNWIND_INFO) || defined (DWARF2_DEBUGGING_INFO)
#include "dwarf2out.h"
NEXT_PASS (pass_merge_phi);
NEXT_PASS (pass_dce);
NEXT_PASS (pass_tail_recursion);
+ NEXT_PASS (pass_profile);
NEXT_PASS (pass_release_ssa_names);
}
NEXT_PASS (pass_rebuild_cgraph_edges);
NEXT_PASS (pass_phiopt);
NEXT_PASS (pass_may_alias);
NEXT_PASS (pass_tail_recursion);
- NEXT_PASS (pass_profile);
NEXT_PASS (pass_ch);
NEXT_PASS (pass_stdarg);
NEXT_PASS (pass_lower_complex);
fflush (dump_file);
}
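+ /* Frequencies become stale after passes such as the inliner; rebuild them
+ here, either by re-estimating a guessed profile or by rescaling the
+ frequencies from the counters of a read profile. */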
+ if (flags & TODO_rebuild_frequencies)
+ {
+ if (profile_status == PROFILE_GUESSED)
+ {
+ loop_optimizer_init (0);
+ add_noreturn_fake_exit_edges ();
+ mark_irreducible_loops ();
+ connect_infinite_loops_to_exit ();
+ estimate_bb_frequencies ();
+ remove_fake_exit_edges ();
+ loop_optimizer_finalize ();
+ }
+ else if (profile_status == PROFILE_READ)
+ counts_to_freqs ();
+ else
+ gcc_unreachable ();
+ }
+
#if defined ENABLE_CHECKING
if (flags & TODO_verify_ssa)
verify_ssa (true);
static void compute_function_frequency (void);
static void choose_function_section (void);
static bool can_predict_insn_p (rtx);
-static void estimate_bb_frequencies (void);
/* Information we hold about each branch predictor.
Filled using information from predict.def. */
/* Estimate basic blocks frequency by given branch probabilities. */
-static void
+void
estimate_bb_frequencies (void)
{
basic_block bb;
extern void predict_insn_def (rtx, enum br_predictor, enum prediction);
extern int counts_to_freqs (void);
+extern void estimate_bb_frequencies (void);
#endif /* GCC_PREDICT_H */
--- /dev/null
+inline void foo() {}
+
+int main()
+{
+ foo();
+
+#pragma omp parallel for
+ for ( int i=0; i<1; ++i )
+ foo();
+
+ return 0;
+}
copy_basic_block = create_basic_block (NULL, (void *) 0,
(basic_block) bb->prev_bb->aux);
copy_basic_block->count = bb->count * count_scale / REG_BR_PROB_BASE;
- copy_basic_block->frequency = (bb->frequency
+
+ /* We are going to rebuild frequencies from scratch. These values matter
+ only for driving canonicalize_loop_headers. */
+ copy_basic_block->frequency = ((gcov_type)bb->frequency
* frequency_scale / REG_BR_PROB_BASE);
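+ /* The scaled frequency can exceed BB_FREQ_MAX, e.g. when inlining into a
+ hot loop; clamp it, since frequencies will be rebuilt anyway. */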
+ if (copy_basic_block->frequency > BB_FREQ_MAX)
+ copy_basic_block->frequency = BB_FREQ_MAX;
copy_bsi = bsi_start (copy_basic_block);
for (bsi = bsi_start (bb);
edge = cgraph_edge (id->src_node, orig_stmt);
if (edge)
cgraph_clone_edge (edge, id->dst_node, stmt,
- REG_BR_PROB_BASE, 1, true);
+ REG_BR_PROB_BASE, CGRAPH_FREQ_BASE, 1, true);
break;
case CB_CGE_MOVE_CLONES:
(incorrect node sharing is the most common reason for missing edges). */
gcc_assert (dest->needed || !flag_unit_at_a_time);
cgraph_create_edge (id->dst_node, dest, stmt,
- bb->count, bb->loop_depth)->inline_failed
+ bb->count, CGRAPH_FREQ_BASE,
+ bb->loop_depth)->inline_failed
= N_("originally indirect function call not considered for inlining");
+ if (dump_file)
+ fprintf (dump_file, "Created new direct edge to %s\n",
+ cgraph_node_name (dest));
goto egress;
}
gcc_assert (e->inline_failed);
}
#endif
- /* We need to rescale frequencies again to peak at REG_BR_PROB_BASE
- as inlining loops might increase the maximum. */
- if (ENTRY_BLOCK_PTR->count)
- counts_to_freqs ();
/* We are not going to maintain the cgraph edges up to date.
Kill it so it won't confuse us. */
throw and they don't care to proactively update local EH info. This is
done later in the fixup_cfg pass, which also executes the verification. */
return (TODO_update_ssa | TODO_cleanup_cfg
- | (gimple_in_ssa_p (cfun) ? TODO_remove_unused_locals : 0));
+ | (gimple_in_ssa_p (cfun) ? TODO_remove_unused_locals : 0)
+ | (profile_status != PROFILE_ABSENT ? TODO_rebuild_frequencies : 0));
}
/* FN is a function that has a complete body, and CLONE is a function whose
#define TODO_verify_loops (1 << 6)
#define TODO_dump_cgraph (1 << 7)
#define TODO_remove_functions (1 << 8)
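+/* Rebuild the basic block frequencies after the pass finishes
+ (see execute_function_todo). */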
+#define TODO_rebuild_frequencies (1 << 9)
/* To-do flags for calls to update_ssa. */
in blocks that have one or more edges with no incoming definition
for O_j. This would lead to uninitialized warnings for O_j's
symbol. */
-#define TODO_update_ssa (1 << 9)
+#define TODO_update_ssa (1 << 10)
/* Update the SSA form without inserting any new PHI nodes at all.
This is used by passes that have either inserted all the PHI nodes
themselves or passes that need only to patch use-def and def-def
chains for virtuals (e.g., DCE). */
-#define TODO_update_ssa_no_phi (1 << 10)
+#define TODO_update_ssa_no_phi (1 << 11)
/* Insert PHI nodes everywhere they are needed. No pruning of the
IDF is done. This is used by passes that need the PHI nodes for
may be doing something wrong. Inserting PHI nodes for an old name
where not all edges carry a new replacement may lead to silent
codegen errors or spurious uninitialized warnings. */
-#define TODO_update_ssa_full_phi (1 << 11)
+#define TODO_update_ssa_full_phi (1 << 12)
/* Passes that update the SSA form on their own may want to delegate
the updating of virtual names to the generic updater. Since FUD
to do. NOTE: If this flag is used, any OLD->NEW mappings for real
names are explicitly destroyed and only the symbols marked for
renaming are processed. */
-#define TODO_update_ssa_only_virtuals (1 << 12)
+#define TODO_update_ssa_only_virtuals (1 << 13)
/* Some passes leave unused local variables that can be removed from
cfun->unexpanded_var_list. This reduces the size of dump files and
the memory footprint for VAR_DECLs. */
-#define TODO_remove_unused_locals (1 << 13)
+#define TODO_remove_unused_locals (1 << 14)
/* Internally used for the first in a sequence of passes. It is set
for the passes that are handed to register_dump_files. */
-#define TODO_set_props (1 << 14)
+#define TODO_set_props (1 << 15)
/* Set by passes that may make SMT's that were previously never used
in statements, used. */
-#define TODO_update_smt_usage (1 << 15)
+#define TODO_update_smt_usage (1 << 16)
#define TODO_update_ssa_any \
(TODO_update_ssa \