From: Kyrylo Tkachov Date: Thu, 18 Mar 2021 08:57:01 +0000 (+0000) Subject: aarch64: Improve generic SVE tuning defaults X-Git-Tag: upstream/12.2.0~9241 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8f0c9d53ef3a9b8ba2579b53596cc2b7f5d8bf69;p=platform%2Fupstream%2Fgcc.git aarch64: Improve generic SVE tuning defaults This patch adds the recently-added tweak to split some SVE VL-based scalar operations [1] to the generic tuning used for SVE, as enabled by adding +sve to the -march flag, for example -march=armv8.2-a+sve. The recommendation for best performance on a particular CPU remains unchanged: use the -mcpu option for that CPU, where possible. -mcpu=native makes this straightforward for native compilation. The tweak to split out SVE VL-based scalar operations is a consistent win for the Neoverse V1 CPU and should be neutral for the Fujitsu A64FX. A run of SPEC2017 on A64FX with this tweak on didn't show any non-noise differences. It is also expected to be neutral on SVE2 implementations. Therefore, the patch enables the tweak for generic +sve tuning e.g. -march=armv8.2-a+sve. No SVE2 CPUs are expected to benefit from it, therefore the tweak is disabled for generic tuning when +sve2 is in -march e.g. -march=armv8.2-a+sve2. The implementation of this approach requires a bit of custom logic in aarch64_override_options_internal to handle these kinds of architecture-dependent decisions, but we do believe the user-facing principle here is important to implement. In general, for the generic target we're using a decision framework that looks like: * If all cores that are known to benefit from an optimization are of architecture X, and all other cores that implement X or above are not impacted, or have a very slight impact, we will consider it for generic tuning for architecture X. * We will not enable that optimisation for generic tuning for architecture X+1 if no known cores of architecture X+1 or above will benefit. 
This framework allows us to improve generic tuning for CPUs of generation X while avoiding accumulating tweaks for future CPUs of generation X+1, X+2... that do not need them, and thus avoid even the slight negative effects of these optimisations if the user is willing to tell us the desired architecture accurately. X above can mean either annual architecture updates (Armv8.2-a, Armv8.3-a etc) or optional architecture extensions (like SVE, SVE2). [1] http://gcc.gnu.org/g:a65b9ad863c5fc0aea12db58557f4d286a1974d7 gcc/ChangeLog: * config/aarch64/aarch64.c (aarch64_adjust_generic_arch_tuning): Define. (aarch64_override_options_internal): Use it. (generic_tunings): Add AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS to tune_flags. gcc/testsuite/ChangeLog: * g++.target/aarch64/sve/aarch64-sve.exp: Add -moverride=tune=none to sve_flags. * g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp: Likewise. * g++.target/aarch64/sve/acle/aarch64-sve-acle.exp: Likewise. * gcc.target/aarch64/sve/aarch64-sve.exp: Likewise. * gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp: Likewise. * gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp: Likewise. --- diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 7838d99..db69e69 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1035,7 +1035,10 @@ static const struct tune_params generic_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits + Neoverse V1. It does not have a noticeable effect on A64FX and should + have at most a very minor effect on SVE2 cores. */ + (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. 
*/ &generic_prefetch_tune }; @@ -14485,6 +14488,19 @@ aarch64_parse_override_string (const char* input_string, free (string_root); } +/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that + are best for a generic target with the currently-enabled architecture + extensions. */ +static void +aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune) +{ + /* Neoverse V1 is the only core that is known to benefit from + AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no + point enabling it for SVE2 and above. */ + if (TARGET_SVE2) + current_tune.extra_tuning_flags + &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS; +} static void aarch64_override_options_after_change_1 (struct gcc_options *opts) @@ -14555,6 +14571,8 @@ aarch64_override_options_internal (struct gcc_options *opts) we may later overwrite. */ aarch64_tune_params = *(selected_tune->tune); aarch64_architecture_version = selected_arch->architecture_version; + if (selected_tune->tune == &generic_tunings) + aarch64_adjust_generic_arch_tuning (aarch64_tune_params); if (opts->x_aarch64_override_tune_string) aarch64_parse_override_string (opts->x_aarch64_override_tune_string, diff --git a/gcc/testsuite/g++.target/aarch64/sve/aarch64-sve.exp b/gcc/testsuite/g++.target/aarch64/sve/aarch64-sve.exp index 4bbe2f5..d4761f2 100644 --- a/gcc/testsuite/g++.target/aarch64/sve/aarch64-sve.exp +++ b/gcc/testsuite/g++.target/aarch64/sve/aarch64-sve.exp @@ -38,6 +38,10 @@ if { [check_effective_target_aarch64_sve] } { set sve_flags "-march=armv8.2-a+sve" } +# Turn off any codegen tweaks by default that may affect expected assembly. +# Tests relying on those should turn them on explicitly. +set sve_flags "$sve_flags -moverride=tune=none" + # Main loop. 
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] $sve_flags "" diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp index 0734268..84ae95e 100644 --- a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp @@ -37,6 +37,10 @@ if { [check_effective_target_aarch64_sve] } { set sve_flags "-march=armv8.2-a+sve" } +# Turn off any codegen tweaks by default that may affect expected assembly. +# Tests relying on those should turn them on explicitly. +set sve_flags "$sve_flags -moverride=tune=none" + global gcc_runtest_parallelize_limit_minor if { [info exists gcc_runtest_parallelize_limit_minor] } { set old_limit_minor $gcc_runtest_parallelize_limit_minor diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp index cb9de75..8d3d8b4 100644 --- a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp @@ -44,6 +44,10 @@ if { [check_effective_target_aarch64_sve] } { set sve_flags "-march=armv8.2-a+sve" } +# Turn off any codegen tweaks by default that may affect expected assembly. +# Tests relying on those should turn them on explicitly. +set sve_flags "$sve_flags -moverride=tune=none" + # Main loop. set gcc_subdir [string replace $subdir 0 2 gcc] set files [glob -nocomplain \ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/aarch64-sve.exp b/gcc/testsuite/gcc.target/aarch64/sve/aarch64-sve.exp index 622fc92..1d3f566 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/aarch64-sve.exp +++ b/gcc/testsuite/gcc.target/aarch64/sve/aarch64-sve.exp @@ -44,6 +44,10 @@ if { [check_effective_target_aarch64_sve] } { set sve_flags "-march=armv8.2-a+sve" } +# Turn off any codegen tweaks by default that may affect expected assembly. 
+# Tests relying on those should turn them on explicitly. +set sve_flags "$sve_flags -moverride=tune=none" + # Most of the code-quality tests are written for LP64. Just do the # correctness tests for ILP32. if { [check_effective_target_ilp32] } { diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp index 6146e65..fcd07aa 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp @@ -37,6 +37,10 @@ if { [check_effective_target_aarch64_sve] } { set sve_flags "-march=armv8.2-a+sve" } +# Turn off any codegen tweaks by default that may affect expected assembly. +# Tests relying on those should turn them on explicitly. +set sve_flags "$sve_flags -moverride=tune=none" + global gcc_runtest_parallelize_limit_minor if { [info exists gcc_runtest_parallelize_limit_minor] } { set old_limit_minor $gcc_runtest_parallelize_limit_minor diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp index a33b65a..2f36f1c 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp @@ -44,6 +44,10 @@ if { [check_effective_target_aarch64_sve] } { set sve_flags "-march=armv8.2-a+sve" } +# Turn off any codegen tweaks by default that may affect expected assembly. +# Tests relying on those should turn them on explicitly. +set sve_flags "$sve_flags -moverride=tune=none" + # Main loop. set files [glob -nocomplain \ "$srcdir/$subdir/general/*.c" \