ci/lava: Add a simple Structural Logger into submitter
authorGuilherme Gallo <guilherme.gallo@collabora.com>
Thu, 6 Apr 2023 04:31:04 +0000 (01:31 -0300)
committerMarge Bot <emma+marge@anholt.net>
Wed, 19 Apr 2023 14:36:37 +0000 (14:36 +0000)
Refactor some pieces of the submitter to improve the clarity of its
functions, and create a simple dictionary with data aggregated from the
submitter execution, which will be dumped to a file when the script
exits.
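
For illustration only (not part of this patch), a minimal sketch of the
aggregation idea; the key layout follows the diff below, but the
dump_structured_log helper and the dump-on-exit hook are assumptions:

    import json
    from collections import defaultdict

    # Aggregated data from the whole submitter run; list-valued keys
    # (e.g. "dut_jobs") collect one entry per job attempt.
    STRUCTURAL_LOG = defaultdict(list)

    def dump_structured_log(path):
        # Hypothetical helper: write the aggregated data out when the
        # script exits.
        with open(path, "w") as log_file:
            json.dump(STRUCTURAL_LOG, log_file, indent=4, default=str)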

Also add support for the AutoSaveDict-based structured logger, which
will come in a follow-up commit.
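
A rough idea of what an AutoSaveDict could look like (hypothetical
sketch only; the real implementation lands in the follow-up commit): a
dict that rewrites the target file on every top-level assignment, so
the on-disk log stays current even if the submitter dies mid-run:

    import json
    from pathlib import Path

    class AutoSaveDict(dict):
        def __init__(self, file_path: Path, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Kept as an instance attribute, not a dict key, so it is
            # never serialized into the log itself.
            self.__file_path = file_path

        def __setitem__(self, key, value):
            super().__setitem__(key, value)
            # Persist the whole structure after each write.
            self.__file_path.write_text(
                json.dumps(self, indent=4, default=str)
            )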

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22500>

.gitlab-ci/lava/lava-submit.sh
.gitlab-ci/lava/lava_job_submitter.py
.gitlab-ci/tests/conftest.py
.gitlab-ci/tests/test_lava_job_submitter.py

index 5e1ec0c..5f58705 100755 (executable)
@@ -64,4 +64,5 @@ PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \
        --visibility-group "${VISIBILITY_GROUP}" \
        --lava-tags "${LAVA_TAGS}" \
        --mesa-job-name "$CI_JOB_NAME" \
+       --structured-log-file "results/lava_job_detail.json" \
        >> results/lava.log
index 0372fb4..0ba515a 100755 (executable)
 
 
 import contextlib
+import json
 import pathlib
 import sys
 import time
+from collections import defaultdict
 from dataclasses import dataclass, fields
 from datetime import datetime, timedelta
 from io import StringIO
@@ -43,6 +45,10 @@ from lava.utils import (
 )
 from lavacli.utils import flow_yaml as lava_yaml
 
+# Initialize structural logging with a defaultdict; it can be swapped out for a
+# more sophisticated dict-like data abstraction later.
+STRUCTURAL_LOG = defaultdict(list)
+
 # Timeout in seconds to decide if the device from the dispatched LAVA job has
 # hung or not due to the lack of new log output.
 DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC",  5*60))
@@ -68,7 +74,11 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
     getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
 )
 
-def find_exception_from_metadata(metadata, job_id):
+def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
+    """
+    Investigate infrastructure errors from the job metadata.
+    If an error is found, raise it as a MesaCIException.
+    """
     if "result" not in metadata or metadata["result"] != "fail":
         return
     if "error_type" in metadata:
@@ -90,23 +100,22 @@ def find_exception_from_metadata(metadata, job_id):
         raise MesaCIException(
             f"LAVA job {job_id} failed validation (possible download error). Retry."
         )
-    return metadata
 
 
-def find_lava_error(job) -> None:
-    # Look for infrastructure errors and retry if we see them.
+def raise_lava_error(job) -> None:
+    # Look for infrastructure errors and raise them so the job can be retried.
     results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
     results = lava_yaml.load(results_yaml)
     for res in results:
         metadata = res["metadata"]
-        find_exception_from_metadata(metadata, job.job_id)
+        raise_exception_from_metadata(metadata, job.job_id)
 
     # If we reach this far, it means that the job ended without hwci script
     # result and no LAVA infrastructure problem was found
     job.status = "fail"
 
 
-def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
+def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
     with GitlabSection(
         "job_data",
         "LAVA job info",
@@ -233,7 +242,8 @@ def follow_job_execution(job, log_follower):
     # If this does not happen, it probably means a LAVA infrastructure error
     # happened.
     if job.status not in ["pass", "fail"]:
-        find_lava_error(job)
+        raise_lava_error(job)
+
 
 def structural_log_phases(job, log_follower):
     phases: dict[str, Any] = {
@@ -244,6 +254,7 @@ def structural_log_phases(job, log_follower):
     }
     job.log["dut_job_phases"] = phases
 
+
 def print_job_final_status(job):
     if job.status == "running":
         job.status = "hung"
@@ -256,13 +267,19 @@ def print_job_final_status(job):
     )
 
     job.refresh_log()
-    job.log["status"] = job.status
-    show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
+    show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
 
 
-def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAVAJob]:
+def execute_job_with_retries(
+    proxy, job_definition, retry_count, jobs_log
+) -> Optional[LAVAJob]:
     for attempt_no in range(1, retry_count + 2):
-        job = LAVAJob(proxy, job_definition)
+        # Need to get the logger value from its object to enable autosave
+        # features
+        jobs_log.append({})
+        job_log = jobs_log[-1]
+        job = LAVAJob(proxy, job_definition, job_log)
+        STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no
 
         try:
             submit_job(job)
@@ -280,6 +297,7 @@ def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAV
             )
 
         finally:
+            job_log["finished_time"] = datetime.now().isoformat()
             print_job_final_status(job)
 
 
@@ -287,7 +305,7 @@ def retriable_follow_job(proxy, job_definition) -> LAVAJob:
     number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
 
     if finished_job := execute_job_with_retries(
-        proxy, job_definition, number_of_retries
+        proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"]
     ):
         return finished_job
 
@@ -335,12 +353,16 @@ class LAVAJobSubmitter(PathResolver):
     validate_only: bool = False  # Whether to only validate the job, not execute it
     visibility_group: str = None  # Only affects LAVA farm maintainers
     job_rootfs_overlay_url: str = None
+    structured_log_file: pathlib.Path = None  # Log file path with structured LAVA log
 
     def __post_init__(self) -> None:
         super().__post_init__()
         # Remove mesa job names with spaces, which breaks the lava-test-case command
         self.mesa_job_name = self.mesa_job_name.split(" ")[0]
 
+        if self.structured_log_file:
+            self.setup_structured_logger()
+
     def dump(self, job_definition):
         if self.dump_yaml:
             with GitlabSection(
@@ -377,10 +399,32 @@ class LAVAJobSubmitter(PathResolver):
         if self.validate_only:
             return
 
-        finished_job = retriable_follow_job(proxy, job_definition)
+        try:
+            finished_job = retriable_follow_job(proxy, job_definition)
+        except Exception as exception:
+            STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception)
+            raise exception
         exit_code = 0 if finished_job.status == "pass" else 1
+        STRUCTURAL_LOG["job_combined_status"] = finished_job.status
         sys.exit(exit_code)
 
+    def setup_structured_logger(self):
+        try:
+            global STRUCTURAL_LOG
+            STRUCTURAL_LOG = StructuredLogger(
+                self.structured_log_file, truncate=True
+            ).data
+        except NameError as e:
+            print(
+                f"Could not import StructuredLogger library: {e}. "
+                "Falling back to DummyLogger"
+            )
+
+        STRUCTURAL_LOG["fixed_tags"] = self.lava_tags
+        STRUCTURAL_LOG["dut_job_type"] = self.device_type
+        STRUCTURAL_LOG["job_combined_fail_reason"] = None
+        STRUCTURAL_LOG["job_combined_status"] = "not_submitted"
+
 
 if __name__ == "__main__":
     # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
index be71c94..ba021cc 100644 (file)
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -32,7 +33,7 @@ RESULT_GET_TESTJOB_RESULTS = [{"metadata": {"result": "test"}}]
 
 
 @pytest.fixture
-def mock_proxy():
+def mock_proxy(frozen_time):
     def create_proxy_mock(
         job_results=RESULT_GET_TESTJOB_RESULTS,
         testsuite_results=[generate_testsuite_result()],
@@ -51,6 +52,20 @@ def mock_proxy():
         proxy_logs_mock = proxy_mock.scheduler.jobs.logs
         proxy_logs_mock.return_value = jobs_logs_response()
 
+        proxy_job_state = proxy_mock.scheduler.job_state
+        proxy_job_state.return_value = {"job_state": "Running"}
+        proxy_job_state.side_effect = frozen_time.tick(1)
+
+        proxy_show_mock = proxy_mock.scheduler.jobs.show
+        proxy_show_mock.return_value = defaultdict(
+            str,
+            {
+                "device_type": "test_device",
+                "device": "test_device-cbg-1",
+                "state": "created",
+            },
+        )
+
         for key, value in kwargs.items():
             setattr(proxy_logs_mock, key, value)
 
index d481aac..7ef2255 100644 (file)
@@ -9,6 +9,7 @@ import xmlrpc.client
 from contextlib import nullcontext as does_not_raise
 from datetime import datetime
 from itertools import chain, repeat
+from pathlib import Path
 
 import pytest
 from lava.exceptions import MesaCIException, MesaCIRetryError
@@ -16,6 +17,7 @@ from lava.lava_job_submitter import (
     DEVICE_HANGING_TIMEOUT_SEC,
     NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
     LAVAJob,
+    LAVAJobSubmitter,
     bootstrap_log_follower,
     follow_job_execution,
     retriable_follow_job,
@@ -299,7 +301,7 @@ def test_parse_job_result_from_log(message, expectation, mock_proxy):
 @pytest.mark.slow(
     reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml"
 )
-def test_full_yaml_log(mock_proxy, frozen_time):
+def test_full_yaml_log(mock_proxy, frozen_time, tmp_path):
     import random
 
     from lavacli.utils import flow_yaml as lava_yaml
@@ -350,7 +352,10 @@ def test_full_yaml_log(mock_proxy, frozen_time):
     def reset_logs(*args):
         proxy.scheduler.jobs.logs.side_effect = load_lines()
 
+    tmp_file = Path(tmp_path) / "log.json"
+    LAVAJobSubmitter(structured_log_file=tmp_file)
     proxy.scheduler.jobs.submit = reset_logs
     with pytest.raises(MesaCIRetryError):
         time_travel_to_test_time()
         retriable_follow_job(proxy, "")
+        print(tmp_file.read_text())