From 8e7a23728f66d2da88b47e34224410457fdefbf5 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 10 Jan 2022 14:47:08 +0000 Subject: [PATCH] ira: Try to avoid propagating conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Suppose that: - an inner loop L contains an allocno A - L clobbers hard register R while A is live - A's parent allocno is AP Previously, propagate_allocno_info would propagate conflict sets up the loop tree, so that the conflict between A and R would become a conflict between AP and R (and so on for ancestors of AP). However, when IRA treats loops as separate allocation regions, it can decide on a loop-by-loop basis whether to allocate a register or spill to memory. Conflicts in inner loops therefore don't need to become hard conflicts in parent loops. Instead we can record that using the “conflicting” registers for the parent allocnos has a higher cost. In the example above, this higher cost is the sum of: - the cost of saving R on entry to L - the cost of keeping the pseudo register in memory throughout L - the cost of reloading R on exit from L This value is also a cap on the hard register cost that A can contribute to AP in general (not just for conflicts). Whatever allocation we pick for AP, there is always the option of spilling that register to memory throughout L, so the cost to A of allocating a register to AP can't be more than the cost of spilling A. To take an extreme example: if allocating a register R2 to A is more expensive than spilling A to memory, ALLOCNO_HARD_REG_COSTS (A)[R2] could be (say) 2 times greater than ALLOCNO_MEMORY_COST (A) or 100 times greater than ALLOCNO_MEMORY_COST (A). But this scale factor doesn't matter to AP. All that matters is that R2 is more expensive than memory for A, so that allocating R2 to AP should be costed as spilling A to memory (again assuming that A and AP are in different allocation regions). Propagating a factor of 100 would distort the register costs for AP. move_spill_restore tries to undo the propagation done by propagate_allocno_info, so we need some extra processing there. gcc/ PR rtl-optimization/98782 * ira-int.h (ira_allocno::might_conflict_with_parent_p): New field. (ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P): New macro. (ira_single_region_allocno_p): New function. (ira_total_conflict_hard_regs): Likewise. * ira-build.c (ira_create_allocno): Initialize ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P. (ira_propagate_hard_reg_costs): New function. (propagate_allocno_info): Use it. Try to avoid propagating hard register conflicts to parent allocnos if we can handle the conflicts by spilling instead. Limit the propagated register costs to the cost of spilling throughout the child loop. * ira-color.c (color_pass): Use ira_single_region_allocno_p to test whether a child and parent allocno can share the same register. (move_spill_restore): Adjust for the new behavior of propagate_allocno_info. gcc/testsuite/ * gcc.target/aarch64/reg-alloc-2.c: New test. --- gcc/ira-build.c | 55 +++++++++++++++++++++++--- gcc/ira-color.c | 53 ++++++++++++++++--------- gcc/ira-int.h | 37 +++++++++++++++++ gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c | 47 ++++++++++++++++++++++ 4 files changed, 169 insertions(+), 23 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c diff --git a/gcc/ira-build.c b/gcc/ira-build.c index 0547edf..875b4d8 100644 --- a/gcc/ira-build.c +++ b/gcc/ira-build.c @@ -497,6 +497,7 @@ ira_create_allocno (int regno, bool cap_p, bitmap_set_bit (loop_tree_node->all_allocnos, ALLOCNO_NUM (a)); ALLOCNO_NREFS (a) = 0; ALLOCNO_FREQ (a) = 0; + ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (a) = false; ALLOCNO_HARD_REGNO (a) = -1; ALLOCNO_CALL_FREQ (a) = 0; ALLOCNO_CALLS_CROSSED_NUM (a) = 0; @@ -1991,6 +1992,35 @@ propagate_modified_regnos (ira_loop_tree_node_t loop_tree_node) loop_tree_node->modified_regnos); } +/* Propagate ALLOCNO_HARD_REG_COSTS from A to PARENT_A. Use SPILL_COST + as the cost of spilling a register throughout A (which we have to do + for PARENT_A allocations that conflict with A). */ +static void +ira_propagate_hard_reg_costs (ira_allocno_t parent_a, ira_allocno_t a, + int spill_cost) +{ + HARD_REG_SET conflicts = ira_total_conflict_hard_regs (a); + conflicts &= ~ira_total_conflict_hard_regs (parent_a); + + auto costs = ALLOCNO_HARD_REG_COSTS (a); + if (!hard_reg_set_empty_p (conflicts)) + ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (a) = true; + else if (!costs) + return; + + auto aclass = ALLOCNO_CLASS (a); + ira_allocate_and_set_costs (&ALLOCNO_HARD_REG_COSTS (parent_a), + aclass, ALLOCNO_CLASS_COST (parent_a)); + auto parent_costs = ALLOCNO_HARD_REG_COSTS (parent_a); + for (int i = 0; i < ira_class_hard_regs_num[aclass]; ++i) + if (TEST_HARD_REG_BIT (conflicts, ira_class_hard_regs[aclass][i])) + parent_costs[i] += spill_cost; + else if (costs) + /* The cost to A of allocating this register to PARENT_A can't + be more than the cost of spilling the register throughout A. */ + parent_costs[i] += MIN (costs[i], spill_cost); +} + /* Propagate new info about allocno A (see comments about accumulated info in allocno definition) to the corresponding allocno on upper loop tree level. So allocnos on upper levels accumulate @@ -2019,12 +2049,27 @@ propagate_allocno_info (void) && bitmap_bit_p (ALLOCNO_LOOP_TREE_NODE (a)->border_allocnos, ALLOCNO_NUM (a))) { + /* Calculate the cost of storing to memory on entry to A's loop, + referencing as memory within A's loop, and restoring from + memory on exit from A's loop. */ + ira_loop_border_costs border_costs (a); + int spill_cost = INT_MAX; + if (ira_subloop_allocnos_can_differ_p (parent_a)) + spill_cost = (border_costs.spill_inside_loop_cost () + + ALLOCNO_MEMORY_COST (a)); + if (! ALLOCNO_BAD_SPILL_P (a)) ALLOCNO_BAD_SPILL_P (parent_a) = false; ALLOCNO_NREFS (parent_a) += ALLOCNO_NREFS (a); ALLOCNO_FREQ (parent_a) += ALLOCNO_FREQ (a); + + /* If A's allocation can differ from PARENT_A's, we can if necessary + spill PARENT_A on entry to A's loop and restore it afterwards. + Doing that has cost SPILL_COST. */ + if (!ira_subloop_allocnos_can_differ_p (parent_a)) + merge_hard_reg_conflicts (a, parent_a, true); + ALLOCNO_CALL_FREQ (parent_a) += ALLOCNO_CALL_FREQ (a); - merge_hard_reg_conflicts (a, parent_a, true); ALLOCNO_CALLS_CROSSED_NUM (parent_a) += ALLOCNO_CALLS_CROSSED_NUM (a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) @@ -2037,15 +2082,15 @@ propagate_allocno_info (void) += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); aclass = ALLOCNO_CLASS (a); ira_assert (aclass == ALLOCNO_CLASS (parent_a)); - ira_allocate_and_accumulate_costs - (&ALLOCNO_HARD_REG_COSTS (parent_a), aclass, - ALLOCNO_HARD_REG_COSTS (a)); + ira_propagate_hard_reg_costs (parent_a, a, spill_cost); ira_allocate_and_accumulate_costs (&ALLOCNO_CONFLICT_HARD_REG_COSTS (parent_a), aclass, ALLOCNO_CONFLICT_HARD_REG_COSTS (a)); + /* The cost to A of allocating a register to PARENT_A can't be + more than the cost of spilling the register throughout A. */ ALLOCNO_CLASS_COST (parent_a) - += ALLOCNO_CLASS_COST (a); + += MIN (ALLOCNO_CLASS_COST (a), spill_cost); ALLOCNO_MEMORY_COST (parent_a) += ALLOCNO_MEMORY_COST (a); } } diff --git a/gcc/ira-color.c b/gcc/ira-color.c index ae0f08a..4344ee6 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -3344,7 +3344,7 @@ color_pass (ira_loop_tree_node_t loop_tree_node) unsigned int j; bitmap_iterator bi; machine_mode mode; - enum reg_class rclass, aclass, pclass; + enum reg_class rclass, aclass; ira_allocno_t a, subloop_allocno; ira_loop_tree_node_t subloop_node; @@ -3389,10 +3389,9 @@ color_pass (ira_loop_tree_node_t loop_tree_node) /* Remove from processing in the next loop. */ bitmap_clear_bit (consideration_allocno_bitmap, j); rclass = ALLOCNO_CLASS (a); - pclass = ira_pressure_class_translate[rclass]; - if (flag_ira_region == IRA_REGION_MIXED - && (loop_tree_node->reg_pressure[pclass] - <= ira_class_hard_regs_num[pclass])) + subloop_allocno = ALLOCNO_CAP_MEMBER (a); + subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno); + if (ira_single_region_allocno_p (a, subloop_allocno)) { mode = ALLOCNO_MODE (a); hard_regno = ALLOCNO_HARD_REGNO (a); @@ -3402,8 +3401,6 @@ color_pass (ira_loop_tree_node_t loop_tree_node) ira_assert (index >= 0); } regno = ALLOCNO_REGNO (a); - subloop_allocno = ALLOCNO_CAP_MEMBER (a); - subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno); ira_assert (!ALLOCNO_ASSIGNED_P (subloop_allocno)); ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; ALLOCNO_ASSIGNED_P (subloop_allocno) = true; @@ -3426,7 +3423,6 @@ color_pass (ira_loop_tree_node_t loop_tree_node) ira_assert (ALLOCNO_CAP_MEMBER (a) == NULL); mode = ALLOCNO_MODE (a); rclass = ALLOCNO_CLASS (a); - pclass = ira_pressure_class_translate[rclass]; hard_regno = ALLOCNO_HARD_REGNO (a); /* Use hard register class here. ??? */ if (hard_regno >= 0) @@ -3443,11 +3439,11 @@ color_pass (ira_loop_tree_node_t loop_tree_node) ira_assert (ALLOCNO_CLASS (subloop_allocno) == rclass); ira_assert (bitmap_bit_p (subloop_node->all_allocnos, ALLOCNO_NUM (subloop_allocno))); - if ((flag_ira_region == IRA_REGION_MIXED - && (loop_tree_node->reg_pressure[pclass] - <= ira_class_hard_regs_num[pclass])) + if (ira_single_region_allocno_p (a, subloop_allocno) || !ira_subloop_allocnos_can_differ_p (a, hard_regno >= 0)) { + gcc_assert (!ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P + (subloop_allocno)); if (! ALLOCNO_ASSIGNED_P (subloop_allocno)) { ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; @@ -3583,14 +3579,35 @@ move_spill_restore (void) if (subloop_allocno == NULL) continue; ira_assert (rclass == ALLOCNO_CLASS (subloop_allocno)); - /* We have accumulated cost. To get the real cost of - allocno usage in the loop we should subtract costs of - the subloop allocnos. */ - cost -= (ALLOCNO_MEMORY_COST (subloop_allocno) - - (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL - ? ALLOCNO_CLASS_COST (subloop_allocno) - : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index])); ira_loop_border_costs border_costs (subloop_allocno); + + /* We have accumulated cost. To get the real cost of + allocno usage in the loop we should subtract the costs + added by propagate_allocno_info for the subloop allocnos. */ + int reg_cost + = (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL + ? ALLOCNO_CLASS_COST (subloop_allocno) + : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index]); + + int spill_cost + = (border_costs.spill_inside_loop_cost () + + ALLOCNO_MEMORY_COST (subloop_allocno)); + + /* If HARD_REGNO conflicts with SUBLOOP_A then + propagate_allocno_info will have propagated + the cost of spilling HARD_REGNO in SUBLOOP_NODE. + (ira_subloop_allocnos_can_differ_p must be true + in that case.) Otherwise, SPILL_COST acted as + a cap on the propagated register cost, in cases + where the allocations can differ. */ + auto conflicts = ira_total_conflict_hard_regs (subloop_allocno); + if (TEST_HARD_REG_BIT (conflicts, hard_regno)) + reg_cost = spill_cost; + else if (ira_subloop_allocnos_can_differ_p (a)) + reg_cost = MIN (reg_cost, spill_cost); + + cost -= ALLOCNO_MEMORY_COST (subloop_allocno) - reg_cost; + if ((hard_regno2 = ALLOCNO_HARD_REGNO (subloop_allocno)) < 0) /* The register was spilled in the subloop. If we spill it in the outer loop too then we'll no longer need to diff --git a/gcc/ira-int.h b/gcc/ira-int.h index c5b1a13..8b87498 100644 --- a/gcc/ira-int.h +++ b/gcc/ira-int.h @@ -314,6 +314,13 @@ struct ira_allocno vector where a bit with given index represents allocno with the same number. */ unsigned int conflict_vec_p : 1; + /* True if the parent loop has an allocno for the same register and + if the parent allocno's assignment might not be valid in this loop. + This means that we cannot merge this allocno and the parent allocno + together. + + This is only ever true for non-cap allocnos. */ + unsigned int might_conflict_with_parent_p : 1; /* Hard register assigned to given allocno. Negative value means that memory was allocated to the allocno. During the reload, spilled allocno has value equal to the corresponding stack slot @@ -423,6 +430,8 @@ struct ira_allocno #define ALLOCNO_CAP_MEMBER(A) ((A)->cap_member) #define ALLOCNO_NREFS(A) ((A)->nrefs) #define ALLOCNO_FREQ(A) ((A)->freq) +#define ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P(A) \ + ((A)->might_conflict_with_parent_p) #define ALLOCNO_HARD_REGNO(A) ((A)->hard_regno) #define ALLOCNO_CALL_FREQ(A) ((A)->call_freq) #define ALLOCNO_CALLS_CROSSED_NUM(A) ((A)->calls_crossed_num) @@ -1623,4 +1632,32 @@ ira_subloop_allocnos_can_differ_p (ira_allocno_t a, bool allocated_p = true) return true; } +/* Return true if we should treat A and SUBLOOP_A as belonging to a + single region. */ +inline bool +ira_single_region_allocno_p (ira_allocno_t a, ira_allocno_t subloop_a) +{ + if (flag_ira_region != IRA_REGION_MIXED) + return false; + + if (ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (subloop_a)) + return false; + + auto rclass = ALLOCNO_CLASS (a); + auto pclass = ira_pressure_class_translate[rclass]; + auto loop_used_regs = ALLOCNO_LOOP_TREE_NODE (a)->reg_pressure[pclass]; + return loop_used_regs <= ira_class_hard_regs_num[pclass]; +} + +/* Return the set of all hard registers that conflict with A. */ +inline HARD_REG_SET +ira_total_conflict_hard_regs (ira_allocno_t a) +{ + auto obj_0 = ALLOCNO_OBJECT (a, 0); + HARD_REG_SET conflicts = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj_0); + for (int i = 1; i < ALLOCNO_NUM_OBJECTS (a); i++) + conflicts |= OBJECT_TOTAL_CONFLICT_HARD_REGS (ALLOCNO_OBJECT (a, i)); + return conflicts; +} + #endif /* GCC_IRA_INT_H */ diff --git a/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c b/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c new file mode 100644 index 0000000..d4d260c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c @@ -0,0 +1,47 @@ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */ + +#define PROB 0.1 + +struct L +{ + int data; + volatile struct L *next; + volatile struct L *inner; +}; + +/* The thing we're testing here is that the !head->inner path of the outer loop + body has no stack accesses. It's possible that we'll need to update this + pattern for unrelated code changes. but the test should be XFAILed rather + than changed if any new stack accesses occur on the !head->inner path. */ +/* +** foo: +** ... +** ldr (w[0-9]+), \[(x[0-9]+)\] +** add (w[0-9]+), (?:\3, \1|\1, \3) +** ldr (x[0-9]+), \[\2, #?16\] +** str \3, \[\2\] +** ldr \2, \[\2, #?8\] +** cbn?z \4, .* +** ... +** ret +*/ +void +foo (volatile struct L *head, int inc) +{ + while (head) + { + inc = head->data + inc; + volatile struct L *inner = head->inner; + head->data = inc; + head = head->next; + if (__builtin_expect_with_probability (inner != 0, 0, PROB)) + for (int i = 0; i < 1000; ++i) + /* Leave x30 for i. */ + asm volatile ("// foo" ::: + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28"); + } +} -- 2.7.4