launcher: Add support for running tests inside rr
authorThibault Saunier <tsaunier@igalia.com>
Tue, 10 Mar 2020 14:52:35 +0000 (11:52 -0300)
committerThibault Saunier <tsaunier@igalia.com>
Tue, 26 May 2020 21:19:36 +0000 (17:19 -0400)
Allowing us to easily run the tests forever and then replay the
failures!

https://rr-project.org/

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-devtools/-/merge_requests/197>

validate/launcher/apps/gstcheck.py
validate/launcher/baseclasses.py
validate/launcher/main.py
validate/launcher/utils.py

index d4f2f33..723dbc0 100644 (file)
@@ -290,7 +290,7 @@ class GstCheckTestsManager(MesonTestsManager):
         else:
             child_env['CK_TIMEOUT_MULTIPLIER'] = str(self.options.timeout_factor)
 
-        if self.options.gdb:
+        if self.options.gdb or self.options.rr:
             child_env['CK_FORK'] = "no"
 
         if self.options.gst_check_leak_trace_testnames:
index bea38bd..4b47bc4 100644 (file)
@@ -63,6 +63,7 @@ from .utils import mkdir, Result, Colors, printc, DEFAULT_TIMEOUT, GST_SECOND, \
 # The factor by which we increase the hard timeout when running inside
 # Valgrind
 GDB_TIMEOUT_FACTOR = VALGRIND_TIMEOUT_FACTOR = 20
+RR_TIMEOUT_FACTOR = 2
 TIMEOUT_FACTOR = float(os.environ.get("TIMEOUT_FACTOR", 1))
 # The error reported by valgrind when detecting errors
 VALGRIND_ERROR_CODE = 20
@@ -127,6 +128,7 @@ class Test(Loggable):
         self.workdir = workdir
         self.allow_flakiness = False
         self.html_log = None
+        self.rr_logdir = None
 
         self.clean()
 
@@ -237,6 +239,11 @@ class Test(Loggable):
                 self.out.write('\n\n## %s:\n\n```\n%s\n```\n' % (
                     os.path.basename(logfile), self.get_extra_log_content(logfile))
                 )
+
+            if self.rr_logdir:
+                self.out.write('\n\n## rr trace:\n\n```\nrr replay %s/latest-trace\n```\n' % (
+                    self.rr_logdir))
+
             self.out.flush()
             self.out.close()
 
@@ -293,6 +300,9 @@ class Test(Loggable):
 
     def add_stack_trace_to_logfile(self):
         self.debug("Adding stack trace")
+        if self.options.rr:
+            return
+
         trace_gatherer = BackTraceGenerator.get_default()
         stack_trace = trace_gatherer.get_trace(self)
 
@@ -364,7 +374,9 @@ class Test(Loggable):
             return
 
         self.debug("%s returncode: %s", self, self.process.returncode)
-        if self.process.returncode == 0:
+        if self.options.rr and self.process.returncode == -signal.SIGPIPE:
+            self.set_result(Result.SKIPPED, "SIGPIPE received under `rr`, known issue.")
+        elif self.process.returncode == 0:
             self.set_result(Result.PASSED)
         elif self.process.returncode in EXITING_SIGNALS:
             self.add_stack_trace_to_logfile()
@@ -443,7 +455,19 @@ class Test(Loggable):
         return os.environ.copy()
 
     def kill_subprocess(self):
-        utils.kill_subprocess(self, self.process, DEFAULT_TIMEOUT)
+        subprocs_id = None
+        if self.options.rr and self.process and self.process.returncode is None:
+            cmd = ["ps", "-o", "pid", "--ppid", str(self.process.pid), "--noheaders"]
+            try:
+                subprocs_id = [int(pid.strip('\n')) for
+                    pid in subprocess.check_output(cmd).decode().split(' ') if pid]
+            except FileNotFoundError:
+                self.error("Ps not found, will probably not be able to get rr "
+                    "working properly after we kill the process")
+            except subprocess.CalledProcessError as e:
+                self.error("Couldn't get rr subprocess pid: %s" % (e))
+
+        utils.kill_subprocess(self, self.process, DEFAULT_TIMEOUT, subprocs_id)
 
     def run_external_checks(self):
         pass
@@ -496,6 +520,20 @@ class Test(Loggable):
         args += ["--args"] + command
         return args
 
+    def use_rr(self, command, subenv):
+        command = ["rr", 'record', '-h'] + command
+
+        self.timeout *= RR_TIMEOUT_FACTOR
+        self.rr_logdir = os.path.join(self.options.logsdir, self.classname.replace(".", os.sep), 'rr-logs')
+        subenv['_RR_TRACE_DIR'] = self.rr_logdir
+        try:
+            shutil.rmtree(self.rr_logdir, ignore_errors=False, onerror=None)
+        except FileNotFoundError:
+            pass
+        self.add_env_variable('_RR_TRACE_DIR', self.rr_logdir)
+
+        return command
+
     def use_valgrind(self, command, subenv):
         vglogsfile = os.path.splitext(self.logfile)[0] + '.valgrind'
         self.extra_logfiles.add(vglogsfile)
@@ -607,6 +645,9 @@ class Test(Loggable):
         if self.options.valgrind:
             self.command = self.use_valgrind(self.command, self.proc_env)
 
+        if self.options.rr:
+            self.command = self.use_rr(self.command, self.proc_env)
+
         if not self.options.redirect_logs:
             self.out.write("# `%s`\n\n"
                            "## Command\n\n``` bash\n%s\n```\n\n" % (
@@ -762,6 +803,7 @@ class GstValidateTest(Test):
     """ A class representing a particular test. """
     HARD_TIMEOUT_FACTOR = 5
     fault_sig_regex = re.compile("<Caught SIGNAL: .*>")
+    needs_gst_inspect = set()
 
     def __init__(self, application_name, classname,
                  options, reporter, duration=0,
@@ -1009,9 +1051,13 @@ class GstValidateTest(Test):
         if self.result in [Result.FAILED, Result.PASSED, Result.SKIPPED]:
             return
 
+
         self.debug("%s returncode: %s", self, self.process.returncode)
 
         expected_issues = copy.deepcopy(self.expected_issues)
+        if self.options.rr:
+            # signal.SIGPPIPE is 13 but it sometimes isn't present in python for some reason.
+            expected_issues.append({"returncode": -13, "sometimes": True})
         self.criticals, not_found_expected_issues, expected_returncode = self.check_reported_issues(expected_issues)
         expected_timeout = None
         expected_signal = None
@@ -2029,7 +2075,7 @@ class _TestsLauncher(Loggable):
                 current_test_num += 1
                 res = test.test_end(retry_on_failure=retry_on_failures)
                 to_report = True
-                if res != Result.PASSED:
+                if res not in [Result.PASSED, Result.SKIPPED]:
                     if self.options.forever or self.options.fatal_error:
                         self.print_result(current_test_num - 1, test, retry_on_failure=retry_on_failures)
                         self.reporter.after_test(test)
index 1fdec98..d804254 100644 (file)
@@ -18,6 +18,7 @@
 # Boston, MA 02110-1301, USA.
 import os
 import sys
+import shutil
 from . import utils
 import urllib.parse
 from . import loggable
@@ -202,6 +203,7 @@ class LauncherConfig(Loggable):
         self.valgrind = False
         self.gdb = False
         self.no_display = False
+        self.rr = False
         self.xunit_file = None
         self.main_dir = utils.DEFAULT_MAIN_DIR
         self.output_dir = None
@@ -332,6 +334,12 @@ class LauncherConfig(Loggable):
                        Colors.FAIL)
                 return False
 
+        if self.rr:
+            if not shutil.which('rr'):
+                printc("Want to use rr, but not available on the system",
+                       Colors.FAIL)
+                return False
+
         if self.html:
             try:
                 import commonmark
@@ -478,6 +486,9 @@ class LauncherConfig(Loggable):
         parser.add_argument("-vg", "--valgrind", dest="valgrind",
                             action="store_true",
                             help="Run the tests inside Valgrind")
+        parser.add_argument("-rr", "--rr", dest="rr",
+                            action="store_true",
+                            help="Run the tests inside rr record")
         parser.add_argument("--gdb", dest="gdb",
                             action="store_true",
                             help="Run the tests inside gdb (implies"
index 1d89ee4..45c0951 100644 (file)
@@ -594,7 +594,7 @@ def check_bugs_resolution(bugs_definitions):
     return res
 
 
-def kill_subprocess(owner, process, timeout):
+def kill_subprocess(owner, process, timeout, subprocess_ids=None):
     if process is None:
         return
 
@@ -611,11 +611,16 @@ def kill_subprocess(owner, process, timeout):
                 subprocess.call(
                     ['taskkill', '/F', '/T', '/PID', str(process.pid)])
             else:
-                process.send_signal(killsig)
+                if subprocess_ids:
+                    for subprocess_id in subprocess_ids:
+                        os.kill(subprocess_id, killsig)
+                else:
+                    process.send_signal(killsig)
             time.sleep(waittime)
             waittime *= 2
         except OSError:
             pass
+
         if not is_windows() and time.time() - stime > timeout / 4:
             killsig = signal.SIGKILL
         if time.time() - stime > timeout: