journal-upload: Update watchdog while in curl_easy_perform
author: Klearchos Chaloulos <klearchos.chaloulos@nokia.com>
Tue, 5 Apr 2016 10:47:04 +0000 (13:47 +0300)
committer: Klearchos Chaloulos <klearchos.chaloulos@nokia.com>
Tue, 5 Apr 2016 14:37:00 +0000 (17:37 +0300)
It was observed that a combination of high log throughput, low I/O speed on the journal-remote side, and many nodes uploading simultaneously caused the journal-upload process to dump core because of watchdog starvation. The cause is that journal-upload remains inside curl_easy_perform() when it cannot upload fast enough to reach the end of the journal: currently, journal-upload returns from curl_easy_perform() only when the end of the journal is reached, so the watchdog is never updated in the meantime. Therefore a check is added in journal_input_callback(), which updates the watchdog if the time elapsed since the start of the upload (or since the last watchdog update) is greater than WATCHDOG_USEC/2.

src/journal-remote/journal-upload-journal.c
src/journal-remote/journal-upload.c
src/journal-remote/journal-upload.h

index e61b6bc..ac6eb58 100644 (file)
@@ -25,6 +25,7 @@
 #include "log.h"
 #include "utf8.h"
 #include "util.h"
+#include "sd-daemon.h"
 
 /**
  * Write up to size bytes to buf. Return negative on error, and number of
@@ -242,6 +243,28 @@ static ssize_t write_entry(char *buf, size_t size, Uploader *u) {
         assert_not_reached("WTF?");
 }
 
+/**
+ * Send a watchdog keep-alive to the service manager if more than half of the
+ * watchdog interval has elapsed since the reference timestamp.
+ *
+ * Called from journal_input_callback(), i.e. while curl_easy_perform() is
+ * still running, so that a long upload does not starve the watchdog.
+ *
+ * When u->reset_reference_timestamp is set, the reference timestamp is
+ * (re)initialized to the current monotonic time and the flag is cleared;
+ * no keep-alive is sent on that call. Note that the reference timestamp is
+ * a function-local static, so it is shared by every Uploader.
+ */
+static inline void check_update_watchdog(Uploader *u) {
+        usec_t watchdog_usec;
+        static usec_t before;
+        usec_t after;
+        usec_t elapsed_time;
+
+        /* sd_watchdog_enabled() returns 0 when no watchdog is configured and
+         * only fills in watchdog_usec on a positive return. Bail out on 0 as
+         * well as on errors, otherwise watchdog_usec would be read
+         * uninitialized below. */
+        if (sd_watchdog_enabled(false, &watchdog_usec) <= 0)
+                return;
+        if (u->reset_reference_timestamp) {
+                before = now(CLOCK_MONOTONIC);
+                u->reset_reference_timestamp = false;
+        } else {
+                after = now(CLOCK_MONOTONIC);
+                elapsed_time = usec_sub(after, before);
+                if (elapsed_time > watchdog_usec / 2) {
+                        log_debug("Update watchdog timer");
+                        sd_notify(false, "WATCHDOG=1");
+                        /* Restart the measurement on the next callback. */
+                        u->reset_reference_timestamp = true;
+                }
+        }
+}
+
 static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void *userp) {
         Uploader *u = userp;
         int r;
@@ -252,6 +275,8 @@ static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void
         assert(u);
         assert(nmemb <= SSIZE_MAX / size);
 
+        check_update_watchdog(u);
+
         j = u->journal;
 
         while (j && filled < size * nmemb) {
index 6e1c3bb..f2e9117 100644 (file)
@@ -494,6 +494,7 @@ static int perform_upload(Uploader *u) {
 
         assert(u);
 
+        u->reset_reference_timestamp = true;
         code = curl_easy_perform(u->easy);
         if (code) {
                 if (u->error[0])
index b8cd04d..a31735b 100644 (file)
@@ -48,6 +48,7 @@ typedef struct Uploader {
 
         size_t entries_sent;
         char *last_cursor, *current_cursor;
+        bool reset_reference_timestamp;
 } Uploader;
 
 #define JOURNAL_UPLOAD_POLL_TIMEOUT (10 * USEC_PER_SEC)