From 2237828857f064da554114c8027743418cad1a18 Mon Sep 17 00:00:00 2001 From: "Frank Ch. Eigler" Date: Thu, 29 Oct 2020 14:25:18 -0400 Subject: [PATCH] PR26775: make grooming progress visible & interruptible On very large servers, it's desirable to be able to interrupt a rescan or groom cycle. SIGUSR[12] now do that. (Unfortunately, this is not practically testable in the testsuite, since these cycles are so fast on that small dataset.) We also expose more internal progress counts about the grooming pass, so the administrator can assess the possible need to interrupt. Signed-off-by: Frank Ch. Eigler --- debuginfod/ChangeLog | 11 +++++++++++ debuginfod/debuginfod.cxx | 28 +++++++++++++++++++++++++--- doc/ChangeLog | 6 ++++++ doc/debuginfod.8 | 6 ++++-- tests/ChangeLog | 6 ++++++ tests/run-debuginfod-find.sh | 7 ++++--- 6 files changed, 56 insertions(+), 8 deletions(-) diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog index 9eb9c81..5af4dc6 100644 --- a/debuginfod/ChangeLog +++ b/debuginfod/ChangeLog @@ -1,5 +1,16 @@ 2020-10-29 Frank Ch. Eigler + PR26775 + * debuginfod.cxx (forced_*_count): Make these global. + (runq::clear): New function. + (thread_main_scanner): Check for pending SIGUSR2; interrupt. + (scan_source_paths): Check for pending SIGUSR2; interrupt. + (groom): Report prometheus stats before groom also. Check for + pending SIGUSR1; interrupt. Increment thread_work_total for + each file scanned, not the entire cycle. + +2020-10-29 Frank Ch. Eigler + PR26810 * debuginfod.cxx (handle_buildid_*_match): Throw exceptions for more lower level libc errors. diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx index 9da65d8..8448a50 100644 --- a/debuginfod/debuginfod.cxx +++ b/debuginfod/debuginfod.cxx @@ -388,7 +388,9 @@ static string db_path; static sqlite3 *db; // single connection, serialized across all our threads! 
static unsigned verbose; static volatile sig_atomic_t interrupted = 0; +static volatile sig_atomic_t forced_rescan_count = 0; static volatile sig_atomic_t sigusr1 = 0; +static volatile sig_atomic_t forced_groom_count = 0; static volatile sig_atomic_t sigusr2 = 0; static unsigned http_port = 8002; static unsigned rescan_s = 300; @@ -607,6 +609,14 @@ public: cv.notify_all(); } + // clear the workqueue, when scanning is interrupted with USR2 + void clear() { + unique_lock lock(mtx); + q.clear(); + set_metric("thread_work_pending","role","scan", q.size()); + cv.notify_all(); // maybe wake up waiting idlers + } + // block this scanner thread until there is work to do and no active bool wait_front (Payload& p) { @@ -2741,6 +2751,12 @@ thread_main_scanner (void* arg) } inc_metric("thread_work_total", "role","scan"); + + if (sigusr2 != forced_groom_count) // stop early if groom triggered + { + scanq.clear(); + break; + } } add_metric("thread_busy", "role", "scan", -1); @@ -2784,6 +2800,9 @@ scan_source_paths() { if (interrupted) break; + if (sigusr2 != forced_groom_count) // stop early if groom triggered + break; + fts_scanned ++; if (verbose > 2) @@ -2842,7 +2861,6 @@ thread_main_fts_source_paths (void* arg) { (void) arg; // ignore; we operate on global data - sig_atomic_t forced_rescan_count = 0; set_metric("thread_tid", "role","traverse", tid()); add_metric("thread_count", "role", "traverse", 1); @@ -2923,6 +2941,8 @@ void groom() struct timeval tv_start, tv_end; gettimeofday (&tv_start, NULL); + database_stats_report(); + // scan for files that have disappeared sqlite_ps files (db, "check old files", "select s.mtime, s.file, f.name from " BUILDIDS "_file_mtime_scanned s, " BUILDIDS "_files f " @@ -2951,6 +2971,10 @@ void groom() files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done(); files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done(); } + + inc_metric("thread_work_total", "role", "groom"); + if (sigusr1 != forced_rescan_count) // stop early 
if scan triggered + break; } files.reset(); @@ -2987,7 +3011,6 @@ void groom() static void* thread_main_groom (void* /*arg*/) { - sig_atomic_t forced_groom_count = 0; set_metric("thread_tid", "role", "groom", tid()); add_metric("thread_count", "role", "groom", 1); @@ -3016,7 +3039,6 @@ thread_main_groom (void* /*arg*/) set_metric("thread_busy", "role", "groom", 1); groom (); last_groom = time(NULL); // NB: now was before grooming - inc_metric("thread_work_total", "role", "groom"); set_metric("thread_busy", "role", "groom", 0); } catch (const sqlite_exception& e) diff --git a/doc/ChangeLog b/doc/ChangeLog index bde52c1..8c33f17 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,9 @@ +2020-10-29 Frank Ch. Eigler + + PR26775 + * debuginfod.8: Document that SIGUSR1 interrupts the groom + cycle, and SIGUSR2 interrupts rescan. + 2020-10-25 Mark Wielaard * debuginfod_find_debuginfo.3 (ECONNREFUSED): Document that this diff --git a/doc/debuginfod.8 b/doc/debuginfod.8 index a645cee..152e368 100644 --- a/doc/debuginfod.8 +++ b/doc/debuginfod.8 @@ -155,7 +155,8 @@ before doing it again. A rescan for unchanged files is fast (because the index also stores the file mtimes). A time of zero is acceptable, and means that only one initial scan should performed. The default rescan time is 300 seconds. Receiving a SIGUSR1 signal triggers a new -scan, independent of the rescan time (including if it was zero). +scan, independent of the rescan time (including if it was zero), +interrupting a groom pass (if any). .TP .B "\-g SECONDS" "\-\-groom\-time=SECONDS" @@ -167,7 +168,8 @@ it can deindex obsolete files. See also the \fIDATA MANAGEMENT\fP section. The default groom time is 86400 seconds (1 day). A time of zero is acceptable, and means that only one initial groom should be performed. Receiving a SIGUSR2 signal triggers a new grooming pass, -independent of the groom time (including if it was zero). 
+independent of the groom time (including if it was zero), interrupting +a rescan pass (if any). .TP .B "\-G" diff --git a/tests/ChangeLog b/tests/ChangeLog index 6ea75d2..012e305 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,5 +1,11 @@ 2020-10-29 Frank Ch. Eigler + PR26775 + * run-debuginfod-find.sh: Modify test for different + thread_work_total semantics for grooming. + +2020-10-29 Frank Ch. Eigler + PR26810 * run-debuginfod-find.sh: Add tests for successful archive fetches across renamed RPMs, even without grooming. diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh index 52def36..3af04c0 100755 --- a/tests/run-debuginfod-find.sh +++ b/tests/run-debuginfod-find.sh @@ -345,8 +345,9 @@ RPM_BUILDID=d44d42cbd7d915bc938c81333a21e355a6022fb7 # in rhel6/ subdir, for a l rm -r R/debuginfod-rpms/rhel6/* kill -USR2 $PID1 # groom cycle # Expect 3 rpms to be deleted by the groom -# 1 groom already took place at/soon-after startup, so -USR2 makes 2 -wait_ready $PORT1 'thread_work_total{role="groom"}' 2 +# 1 groom cycle already took place at/soon-after startup, so -USR2 makes 2 +# ... times the # of files checked in each cycle +wait_ready $PORT1 'thread_work_total{role="groom"}' 51 wait_ready $PORT1 'groom{statistic="file d/e"}' 3 rm -rf $DEBUGINFOD_CACHE_PATH # clean it from previous tests @@ -363,7 +364,7 @@ testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable $BUILDID2 # run a groom cycle to force server to drop its fdcache kill -USR2 $PID1 # groom cycle -wait_ready $PORT1 'thread_work_total{role="groom"}' 3 +wait_ready $PORT1 'thread_work_total{role="groom"}' 98 # 3 complete cycles # move it around a couple of times to make it likely to hit a nonexistent entry during iteration mv R/debuginfod-rpms/rhel7 R/debuginfod-rpms/rhel7renamed kill -USR1 $PID1 # scan cycle -- 2.7.4