From 43fe811b73d8f585a4ae837d4a9d4c0f5d46b779 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Mon, 13 May 2013 13:44:32 +0530 Subject: [PATCH] Use HP_TIMING for benchmarks if available HP_TIMING uses native timestamping instructions if available, thus greatly reducing the overhead of recording start and end times for function calls. For architectures that don't have HP_TIMING available, we fall back to the clock_gettime bits. One may also override this by invoking the benchmark as follows: make USE_CLOCK_GETTIME=1 bench and get the benchmark results using clock_gettime. One has to do `make bench-clean` to ensure that the benchmark programs are rebuilt. --- ChangeLog | 7 +++++ benchtests/Makefile | 8 ++++- benchtests/bench-skeleton.c | 35 +++++++++------------- benchtests/bench-timing.h | 72 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 22 deletions(-) create mode 100644 benchtests/bench-timing.h diff --git a/ChangeLog b/ChangeLog index 8a12bf9..ea678c8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 2013-05-13 Siddhesh Poyarekar + * benchtests/Makefile (CPPFLAGS-nonlib): Add + -DUSE_CLOCK_GETTIME if USE_CLOCK_GETTIME is defined. + (bench-deps): Add bench-timing.h. + * benchtests/bench-skeleton.c: Include bench-timing.h. + (main): Use TIMING_* macros instead of clock_gettime. + * benchtests/bench-timing.h: New file. + [BZ #14582] * sysdeps/ieee754/s_lib_version.c (_LIB_VERSION_INTERNAL): Renamed from _LIB_VERSION. diff --git a/benchtests/Makefile b/benchtests/Makefile index 19e1be6..8618390 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -86,13 +86,19 @@ endif CPPFLAGS-nonlib = -DDURATION=$(BENCH_DURATION) +# Use clock_gettime to measure performance of functions. The default is to use +# HP_TIMING if it is available. +ifdef USE_CLOCK_GETTIME +CPPFLAGS-nonlib += -DUSE_CLOCK_GETTIME +endif + # This makes sure CPPFLAGS-nonlib and CFLAGS-nonlib are passed # for all these modules. 
cpp-srcs-left := $(binaries-bench:=.c) lib := nonlib include $(patsubst %,$(..)cppflags-iterator.mk,$(cpp-srcs-left)) -bench-deps := bench-skeleton.c Makefile +bench-deps := bench-skeleton.c bench-timing.h Makefile run-bench = $(test-wrapper-env) \ GCONV_PATH=$(common-objpfx)iconvdata LC_ALL=C \ diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c index 404900b..4e3a507 100644 --- a/benchtests/bench-skeleton.c +++ b/benchtests/bench-skeleton.c @@ -21,6 +21,7 @@ #include #include #include +#include "bench-timing.h" volatile unsigned int dontoptimize = 0; @@ -45,21 +46,16 @@ int main (int argc, char **argv) { unsigned long i, k; - struct timespec start, end, runtime; + struct timespec runtime; + timing_t start, end; startup(); memset (&runtime, 0, sizeof (runtime)); - memset (&start, 0, sizeof (start)); - memset (&end, 0, sizeof (end)); - clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); + unsigned long iters; - /* Measure 1000 times the resolution of the clock. So for a 1ns resolution - clock, we measure 1000 iterations of the function call at a time. - Measurements close to the minimum clock resolution won't make much sense, - but it's better than having nothing at all. 
*/ - unsigned long iters = 1000 * start.tv_nsec; + TIMING_INIT (iters); for (int v = 0; v < NUM_VARIANTS; v++) { @@ -68,19 +64,18 @@ main (int argc, char **argv) runtime.tv_sec += DURATION; double d_total_i = 0; - uint64_t total = 0, max = 0, min = 0x7fffffffffffffff; + timing_t total = 0, max = 0, min = 0x7fffffffffffffff; while (1) { for (i = 0; i < NUM_SAMPLES (v); i++) { - clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &start); + uint64_t cur; + TIMING_NOW (start); for (k = 0; k < iters; k++) BENCH_FUNC (v, i); - clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &end); + TIMING_NOW (end); - uint64_t cur = (end.tv_nsec - start.tv_nsec - + ((end.tv_sec - start.tv_sec) - * (uint64_t) 1000000000)); + TIMING_DIFF (cur, start, end); if (cur > max) max = cur; @@ -88,7 +83,7 @@ main (int argc, char **argv) if (cur < min) min = cur; - total += cur; + TIMING_ACCUM (total, cur); d_total_i += iters; } @@ -104,13 +99,11 @@ main (int argc, char **argv) double d_iters; done: - d_total_s = total * 1e-9; + d_total_s = total; d_iters = iters; - printf ("%s: ITERS:%g: TOTAL:%gs, MAX:%gns, MIN:%gns, %g iter/s\n", - VARIANT (v), - d_total_i, d_total_s, max / d_iters, min / d_iters, - d_total_i / d_total_s); + TIMING_PRINT_STATS (VARIANT (v), d_total_s, d_iters, d_total_i, max, + min); } return 0; diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h new file mode 100644 index 0000000..264d4b8 --- /dev/null +++ b/benchtests/bench-timing.h @@ -0,0 +1,72 @@ +/* Define timing macros. + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +#if HP_TIMING_AVAIL && !defined USE_CLOCK_GETTIME +# define GL(x) _##x +# define GLRO(x) _##x +hp_timing_t _dl_hp_timing_overhead; +typedef hp_timing_t timing_t; + +# define TIMING_INIT(iters) \ +({ \ + HP_TIMING_DIFF_INIT(); \ + (iters) = 1000; \ +}) + +# define TIMING_NOW(var) HP_TIMING_NOW (var) +# define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end)) +# define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff)) + +# define TIMING_PRINT_STATS(func, d_total_s, d_iters, d_total_i, max, min) \ + printf ("%s: ITERS:%g: TOTAL:%gMcy, MAX:%gcy, MIN:%gcy, %g calls/Mcy\n", \ + (func), (d_total_i), (d_total_s) * 1e-6, (max) / (d_iters), \ + (min) / (d_iters), 1e6 * (d_total_i) / (d_total_s)); + +#else +typedef uint64_t timing_t; + +/* Measure 1000 times the resolution of the clock. So for a 1ns + resolution clock, we measure 1000 iterations of the function call at a + time. Measurements close to the minimum clock resolution won't make + much sense, but it's better than having nothing at all. 
*/ +# define TIMING_INIT(iters) \ +({ \ + struct timespec start; \ + clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); \ + (iters) = 1000 * start.tv_nsec; \ +}) + +# define TIMING_NOW(var) \ +({ \ + struct timespec tv; \ + clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &tv); \ + (var) = (uint64_t) (tv.tv_nsec + (uint64_t) 1000000000 * tv.tv_sec); \ +}) + +# define TIMING_DIFF(diff, start, end) (diff) = (end) - (start) +# define TIMING_ACCUM(sum, diff) (sum) += (diff) + +# define TIMING_PRINT_STATS(func, d_total_s, d_iters, d_total_i, max, min) \ + printf ("%s: ITERS:%g: TOTAL:%gs, MAX:%gns, MIN:%gns, %g iter/s\n", \ + (func), (d_total_i), (d_total_s) * 1e-9, (max) / (d_iters), \ + (min) / (d_iters), 1e9 * (d_total_i) / (d_total_s)) + +#endif -- 2.7.4