From ccfff6a89aab4ea2cd04208d8bfa9aa53c09bd71 Mon Sep 17 00:00:00 2001 From: jye2 Date: Thu, 14 Nov 2013 08:38:54 +0000 Subject: [PATCH] 2013-11-14 Julian Brown Joey Ye * config/arm/arm.c (arm_cortex_m_branch_cost): New. (arm_v7m_tune): New. (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune, arm_cortex_a15_tune, arm_cortex_a5_tune, arm_v6m_tune): Add comments for Sched adj cost. * config/arm/arm-cores.def (cortex-m4, cortex-m3): Use arm_v7m_tune. testsuite: 2013-11-14 Joey Ye * gcc.dg/tree-ssa/forwprop-28.c: Disable for cortex_m. * gcc.dg/tree-ssa/vrp47.c: Likewise. * gcc.dg/tree-ssa/vrp87.c: Likewise. * gcc.dg/tree-ssa/ssa-dom-thread-4.c: Ignore for cortex_m. * gcc.dg/tree-ssa/ssa-vrp-thread-1.c: Likewise. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@204778 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 13 ++++++ gcc/config/arm/arm-cores.def | 4 +- gcc/config/arm/arm.c | 56 ++++++++++++++++++++---- gcc/testsuite/ChangeLog | 8 ++++ gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c | 4 ++ gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/vrp47.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/vrp87.c | 4 ++ 9 files changed, 85 insertions(+), 16 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1a591cb..636cdba 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,16 @@ +2013-11-14 Julian Brown + Joey Ye + + * config/arm/arm.c (arm_cortex_m_branch_cost): New. + (arm_v7m_tune): New. + (arm_slowmul_tune, arm_fastmul_tune, + arm_strongarm_tune, arm_9e_tune, arm_v6t2_tune, + arm_cortex_tune, arm_cortex_a15_tune, + arm_cortex_a5_tune, arm_v6m_tune): Add comments + for Sched adj cost. + * config/arm/arm-cores.def (cortex-m4, cortex-m3): + Use arm_v7m_tune. 
+ 2013-11-14 Kirill Yukhin PR target/57491 diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def index 79e2e87..d664e73 100644 --- a/gcc/config/arm/arm-cores.def +++ b/gcc/config/arm/arm-cores.def @@ -134,8 +134,8 @@ ARM_CORE("cortex-r4", cortexr4, 7R, FL_LDSCHED, cortex) ARM_CORE("cortex-r4f", cortexr4f, 7R, FL_LDSCHED, cortex) ARM_CORE("cortex-r5", cortexr5, 7R, FL_LDSCHED | FL_ARM_DIV, cortex) ARM_CORE("cortex-r7", cortexr7, 7R, FL_LDSCHED | FL_ARM_DIV, cortex) -ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, cortex) -ARM_CORE("cortex-m3", cortexm3, 7M, FL_LDSCHED, cortex) +ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, v7m) +ARM_CORE("cortex-m3", cortexm3, 7M, FL_LDSCHED, v7m) ARM_CORE("cortex-m1", cortexm1, 6M, FL_LDSCHED, v6m) ARM_CORE("cortex-m0", cortexm0, 6M, FL_LDSCHED, v6m) ARM_CORE("cortex-m0plus", cortexm0plus, 6M, FL_LDSCHED, v6m) diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index d3734c3..64c6b49 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -266,6 +266,7 @@ static reg_class_t arm_preferred_rename_class (reg_class_t rclass); static unsigned int arm_autovectorize_vector_sizes (void); static int arm_default_branch_cost (bool, bool); static int arm_cortex_a5_branch_cost (bool, bool); +static int arm_cortex_m_branch_cost (bool, bool); static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode, const unsigned char *sel); @@ -1260,7 +1261,7 @@ const struct tune_params arm_slowmul_tune = { arm_slowmul_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. */ 3, /* Constant limit. */ 5, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1276,7 +1277,7 @@ const struct tune_params arm_fastmul_tune = { arm_fastmul_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 5, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1295,7 +1296,7 @@ const struct tune_params arm_strongarm_tune = { arm_fastmul_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. 
*/ 1, /* Constant limit. */ 3, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1327,7 +1328,7 @@ const struct tune_params arm_9e_tune = { arm_9e_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 5, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1343,7 +1344,7 @@ const struct tune_params arm_v6t2_tune = { arm_9e_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 5, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1360,7 +1361,7 @@ const struct tune_params arm_cortex_tune = { arm_9e_rtx_costs, &generic_extra_costs, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 5, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1392,7 +1393,7 @@ const struct tune_params arm_cortex_a15_tune = { arm_9e_rtx_costs, &cortexa15_extra_costs, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 2, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1411,7 +1412,7 @@ const struct tune_params arm_cortex_a5_tune = { arm_9e_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 1, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1439,13 +1440,36 @@ const struct tune_params arm_cortex_a9_tune = false /* Prefer Neon for 64-bits bitops. */ }; +/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single + cycle to execute each. An LDR from the constant pool also takes two cycles + to execute, but mildly increases pipelining opportunity (consecutive + loads/stores can be pipelined together, saving one cycle), and may also + improve icache utilisation. Hence we prefer the constant pool for such + processors. */ + +const struct tune_params arm_v7m_tune = +{ + arm_9e_rtx_costs, + &generic_extra_costs, + NULL, /* Sched adj cost. */ + 1, /* Constant limit. */ + 5, /* Max cond insns. */ + ARM_PREFETCH_NOT_BENEFICIAL, + true, /* Prefer constant pool. */ + arm_cortex_m_branch_cost, + false, /* Prefer LDRD/STRD. 
*/ + {false, false}, /* Prefer non short circuit. */ + &arm_default_vec_cost, /* Vectorizer costs. */ + false /* Prefer Neon for 64-bits bitops. */ +}; + /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus. */ const struct tune_params arm_v6m_tune = { arm_9e_rtx_costs, NULL, - NULL, + NULL, /* Sched adj cost. */ 1, /* Constant limit. */ 5, /* Max cond insns. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -11241,6 +11265,20 @@ arm_cortex_a5_branch_cost (bool speed_p, bool predictable_p) return speed_p ? 0 : arm_default_branch_cost (speed_p, predictable_p); } +/* Thumb-2 branches are relatively cheap on Cortex-M processors ("1 + P cycles" + on Cortex-M4, where P varies from 1 to 3 according to some criteria), since + sequences of non-executed instructions in IT blocks probably take the same + amount of time as executed instructions (and the IT instruction itself takes + space in icache). This function was experimentally determined to give good + results on a popular embedded benchmark. */ + +static int +arm_cortex_m_branch_cost (bool speed_p, bool predictable_p) +{ + return (TARGET_32BIT && speed_p) ? 1 + : arm_default_branch_cost (speed_p, predictable_p); +} + static bool fp_consts_inited = false; static REAL_VALUE_TYPE value_fp0; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ec8d248..6d8e43a 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,11 @@ +2013-11-14 Joey Ye + + * gcc.dg/tree-ssa/forwprop-28.c: Disable for cortex_m. + * gcc.dg/tree-ssa/vrp47.c: Likewise. + * gcc.dg/tree-ssa/vrp87.c: Likewise. + * gcc.dg/tree-ssa/ssa-dom-thread-4.c: Ignore for cortex_m. + * gcc.dg/tree-ssa/ssa-vrp-thread-1.c: Likewise. 
+ 2013-11-14 Adam Butcher PR c++/58533 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c index 06b406f..1a4bf4a 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-28.c @@ -1,5 +1,9 @@ /* { dg-do compile { target { ! "m68k*-*-* mmix*-*-* mep*-*-* bfin*-*-* v850*-*-* picochip*-*-* moxie*-*-* cris*-*-* m32c*-*-* fr30*-*-* mcore*-*-* powerpc*-*-* xtensa*-*-* arc*-*-*"} } } */ /* { dg-options "-O2 -fdump-tree-forwprop1" } */ +/* Skip on ARM Cortex-M, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false, + leading to two conditional jumps when evaluating an && condition. Forwprop1 + is not able to optimize this. */ +/* { dg-skip-if "" { arm_cortex_m } } */ extern char *frob (void); extern _Bool testit (void); diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c index fec3075..0e4797c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-4.c @@ -59,9 +59,9 @@ bitmap_ior_and_compl (bitmap dst, const_bitmap a, const_bitmap b, code we missed the edge when the first conditional is false (b_elt is zero, which means the second conditional is always zero. */ -/* ARM Cortex-M0 defined LOGICAL_OP_NON_SHORT_CIRCUIT to false, +/* ARM Cortex-M defined LOGICAL_OP_NON_SHORT_CIRCUIT to false, so skip below test. */ -/* { dg-final { scan-tree-dump-times "Threaded" 3 "dom1" { target { ! { { mips*-*-* avr-*-* arc*-*-* } || { arm_cortex_m && arm_thumb1 } } } } } } */ +/* { dg-final { scan-tree-dump-times "Threaded" 3 "dom1" { target { ! { { mips*-*-* avr-*-* arc*-*-* } || { arm_cortex_m } } } } } } */ /* MIPS defines LOGICAL_OP_NON_SHORT_CIRCUIT to 0, so we split both "a_elt || b_elt" and "b_elt && kill_elt" into two conditions each, rather than using "(var1 != 0) op (var2 != 0)". 
Also, as on other targets, diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c index 9d9473e..b498d8b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-vrp-thread-1.c @@ -26,6 +26,8 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent, oof (); } -/* { dg-final { scan-tree-dump-times "Threaded" 1 "vrp1" } } */ +/* ARM Cortex-M defined LOGICAL_OP_NON_SHORT_CIRCUIT to false, + so skip below test. */ +/* { dg-final { scan-tree-dump-times "Threaded" 1 "vrp1" { target { ! arm_cortex_m } } } } */ /* { dg-final { cleanup-tree-dump "vrp1" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp47.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp47.c index 74b520b..5a09fa0 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/vrp47.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp47.c @@ -6,10 +6,10 @@ /* { dg-do compile { target { ! "mips*-*-* arc*-*-* s390*-*-* avr-*-* mn10300-*-*" } } } */ /* { dg-options "-O2 -fdump-tree-vrp1 -fdump-tree-dom1 -fdump-tree-vrp2" } */ /* { dg-additional-options "-march=i586" { target { { i?86-*-* x86_64-*-* } && ia32 } } } */ -/* Skip on ARM Cortex-M0, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false, +/* Skip on ARM Cortex-M, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false, leading to two conditional jumps when evaluating an && condition. VRP is not able to optimize this. 
*/ -/* { dg-skip-if "" { arm_cortex_m && arm_thumb1} } */ +/* { dg-skip-if "" { arm_cortex_m } } */ int h(int x, int y) { diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp87.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp87.c index aa85191..9aff0a6 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/vrp87.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp87.c @@ -2,6 +2,10 @@ /* { dg-options "-O2 -fdump-tree-vrp2-details -fdump-tree-cddce2-details" } */ /* { dg-additional-options "-mbranch-cost=2" { target avr-*-* } } */ +/* Skip on ARM Cortex-M, where LOGICAL_OP_NON_SHORT_CIRCUIT is set to false, + leading to two conditional jumps when evaluating an && condition. VRP is + not able to optimize this. */ +/* { dg-skip-if "" { arm_cortex_m } } */ struct bitmap_head_def; typedef struct bitmap_head_def *bitmap; -- 2.7.4