From 11a97b644cbc2f8906f6493b65135374858331a4 Mon Sep 17 00:00:00 2001 From: Guilherme Gallo Date: Tue, 18 Apr 2023 00:18:12 -0300 Subject: [PATCH] ci/lava: Refactor LAVAJobSubmitter and add tests Some refactoring was needed to make LAVAJobSubmitter class testable via pytest. Signed-off-by: Guilherme Gallo Part-of: --- .gitlab-ci/lava/exceptions.py | 3 +- .gitlab-ci/lava/lava_job_submitter.py | 127 ++++++++++++++++++++-------- .gitlab-ci/lava/utils/lava_job.py | 21 +++-- .gitlab-ci/tests/test_lava_job_submitter.py | 87 ++++++++++++++++++- 4 files changed, 192 insertions(+), 46 deletions(-) diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py index 3c9a63e..f877b02 100644 --- a/.gitlab-ci/lava/exceptions.py +++ b/.gitlab-ci/lava/exceptions.py @@ -12,9 +12,10 @@ class MesaCITimeoutError(MesaCIException): class MesaCIRetryError(MesaCIException): - def __init__(self, *args, retry_count: int) -> None: + def __init__(self, *args, retry_count: int, last_job: None) -> None: super().__init__(*args) self.retry_count = retry_count + self.last_job = last_job class MesaCIParseException(MesaCIException): diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index 39d07de..064219d 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -18,7 +18,7 @@ from collections import defaultdict from dataclasses import dataclass, fields from datetime import datetime, timedelta from io import StringIO -from os import getenv +from os import environ, getenv, path from typing import Any, Optional import fire @@ -282,6 +282,7 @@ def print_job_final_status(job): def execute_job_with_retries( proxy, job_definition, retry_count, jobs_log ) -> Optional[LAVAJob]: + last_failed_job = None for attempt_no in range(1, retry_count + 2): # Need to get the logger value from its object to enable autosave # features, if AutoSaveDict is enabled from StructuredLogging module @@ -290,41 +291,52 @@ def execute_job_with_retries( job = LAVAJob(proxy, job_definition, job_log) STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no try: + job_log["submitter_start_time"] = datetime.now().isoformat() submit_job(job) wait_for_job_get_started(job) log_follower: LogFollower = bootstrap_log_follower() follow_job_execution(job, log_follower) return job + except (MesaCIException, KeyboardInterrupt) as exception: job.handle_exception(exception) + + finally: + print_job_final_status(job) + # If LAVA takes too long to post process the job, the submitter + # gives up and proceeds. + job_log["submitter_end_time"] = datetime.now().isoformat() + last_failed_job = job print_log( f"{CONSOLE_LOG['BOLD']}" f"Finished executing LAVA job in the attempt #{attempt_no}" f"{CONSOLE_LOG['RESET']}" ) - finally: - job_log["finished_time"] = datetime.now().isoformat() - print_job_final_status(job) + + return last_failed_job def retriable_follow_job(proxy, job_definition) -> LAVAJob: number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION - if finished_job := execute_job_with_retries( + last_attempted_job = execute_job_with_retries( proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"] - ): - return finished_job - - # Job failed in all attempts - raise MesaCIRetryError( - f"{CONSOLE_LOG['BOLD']}" - f"{CONSOLE_LOG['FG_RED']}" - "Job failed after it exceeded the number of " - f"{number_of_retries} retries." - f"{CONSOLE_LOG['RESET']}", - retry_count=number_of_retries, ) + if last_attempted_job.exception is not None: + # Infra failed in all attempts + raise MesaCIRetryError( + f"{CONSOLE_LOG['BOLD']}" + f"{CONSOLE_LOG['FG_RED']}" + "Job failed after it exceeded the number of " + f"{number_of_retries} retries." + f"{CONSOLE_LOG['RESET']}", + retry_count=number_of_retries, + last_job=last_attempted_job, + ) + + return last_attempted_job + @dataclass class PathResolver: @@ -371,20 +383,9 @@ class LAVAJobSubmitter(PathResolver): return self.__structured_log_context = StructuredLoggerWrapper(self).logger_context() + self.proxy = setup_lava_proxy() - def dump(self, job_definition): - if self.dump_yaml: - with GitlabSection( - "yaml_dump", - "LAVA job definition (YAML)", - type=LogSectionType.LAVA_BOOT, - start_collapsed=True, - ): - print(hide_sensitive_data(job_definition)) - - def submit(self): - proxy = setup_lava_proxy() - + def __prepare_submission(self) -> str: # Overwrite the timeout for the testcases with the value offered by the # user. The testcase running time should be at least 4 times greater than # the other sections (boot and setup), so we can safely ignore them. @@ -398,27 +399,81 @@ class LAVAJobSubmitter(PathResolver): lava_yaml.dump(generate_lava_yaml_payload(self), job_definition_stream) job_definition = job_definition_stream.getvalue() - self.dump(job_definition) + if self.dump_yaml: + self.dump_job_definition(job_definition) - job = LAVAJob(proxy, job_definition) - if errors := job.validate(): + validation_job = LAVAJob(self.proxy, job_definition) + if errors := validation_job.validate(): fatal_err(f"Error in LAVA job definition: {errors}") print_log("LAVA job definition validated successfully") + return job_definition + + @classmethod + def is_under_ci(cls): + ci_envvar: str = getenv("CI", "false") + return ci_envvar.lower() == "true" + + def dump_job_definition(self, job_definition) -> None: + with GitlabSection( + "yaml_dump", + "LAVA job definition (YAML)", + type=LogSectionType.LAVA_BOOT, + start_collapsed=True, + ): + print(hide_sensitive_data(job_definition)) + + def submit(self) -> None: + """ + Prepares and submits the LAVA job. + If `validate_only` is True, it validates the job without submitting it. + If the job finishes with a non-pass status or encounters an exception, + the program exits with a non-zero return code. + """ + job_definition: str = self.__prepare_submission() + if self.validate_only: return with self.__structured_log_context: + last_attempt_job = None try: - finished_job = retriable_follow_job(proxy, job_definition) - exit_code = 0 if finished_job.status == "pass" else 1 - STRUCTURAL_LOG["job_combined_status"] = job.status - sys.exit(exit_code) + last_attempt_job = retriable_follow_job(self.proxy, job_definition) + + except MesaCIRetryError as retry_exception: + last_attempt_job = retry_exception.last_job except Exception as exception: STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception) raise exception + finally: + self.finish_script(last_attempt_job) + + def print_log_artifact_url(self): + base_url = "https://$CI_PROJECT_ROOT_NAMESPACE.pages.freedesktop.org/" + artifacts_path = "-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/" + relative_log_path = self.structured_log_file.relative_to(pathlib.Path.cwd()) + full_path = f"{base_url}{artifacts_path}{relative_log_path}" + artifact_url = path.expandvars(full_path) + + print_log(f"Structural Logging data available at: {artifact_url}") + + def finish_script(self, last_attempt_job): + if self.is_under_ci() and self.structured_log_file: + self.print_log_artifact_url() + + if not last_attempt_job: + # No job was run, something bad happened + STRUCTURAL_LOG["job_combined_status"] = "script_crash" + current_exception = str(sys.exc_info()[0]) + STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception + raise SystemExit(1) + + STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status + + if last_attempt_job.status != "pass": + raise SystemExit(1) class StructuredLoggerWrapper: def __init__(self, submitter: LAVAJobSubmitter) -> None: diff --git a/.gitlab-ci/lava/utils/lava_job.py b/.gitlab-ci/lava/utils/lava_job.py index 04d5300..b69f8b9 100644 --- a/.gitlab-ci/lava/utils/lava_job.py +++ b/.gitlab-ci/lava/utils/lava_job.py @@ -34,6 +34,7 @@ class LAVAJob: self._is_finished = False self.log: dict[str, Any] = log self.status = "not_submitted" + self.__exception: Optional[str] = None def heartbeat(self) -> None: self.last_log_time: datetime = datetime.now() @@ -61,6 +62,15 @@ class LAVAJob: def is_finished(self) -> bool: return self._is_finished + @property + def exception(self) -> str: + return self.__exception + + @exception.setter + def exception(self, exception: Exception) -> None: + self.__exception = repr(exception) + self.log["dut_job_fail_reason"] = self.__exception + def validate(self) -> Optional[dict]: """Returns a dict with errors, if the validation fails. @@ -158,6 +168,10 @@ class LAVAJob: def handle_exception(self, exception: Exception): print_log(exception) + self.cancel() + self.exception = exception + + # Give more accurate status depending on exception if isinstance(exception, MesaCIKnownIssueException): self.status = "canceled" elif isinstance(exception, MesaCITimeoutError): @@ -165,11 +179,8 @@ class LAVAJob: elif isinstance(exception, MesaCIException): self.status = "failed" elif isinstance(exception, KeyboardInterrupt): - self.status = "canceled_by_user" + self.status = "interrupted" print_log("LAVA job submitter was interrupted. Cancelling the job.") - raise exception + raise else: self.status = "job_submitter_error" - - self.cancel() - self.log["dut_job_fail_reason"] = str(exception) diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py index 7ef2255..f8208a3 100644 --- a/.gitlab-ci/tests/test_lava_job_submitter.py +++ b/.gitlab-ci/tests/test_lava_job_submitter.py @@ -5,11 +5,13 @@ # # SPDX-License-Identifier: MIT +import os import xmlrpc.client from contextlib import nullcontext as does_not_raise from datetime import datetime from itertools import chain, repeat from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from lava.exceptions import MesaCIException, MesaCIRetryError @@ -50,6 +52,32 @@ def mock_proxy_waiting_time(mock_proxy): return update_mock_proxy +@pytest.fixture(params=[{"CI": "true"}, {"CI": "false"}], ids=["Under CI", "Local run"]) +def ci_environment(request): + with patch.dict(os.environ, request.param): + yield + + +@pytest.fixture +def lava_job_submitter( + ci_environment, + tmp_path, + mock_proxy, +): + os.chdir(tmp_path) + tmp_file = Path(tmp_path) / "log.json" + + with patch("lava.lava_job_submitter.setup_lava_proxy") as mock_setup_lava_proxy: + mock_setup_lava_proxy.return_value = mock_proxy() + yield LAVAJobSubmitter( + boot_method="test_boot", + ci_project_dir="test_dir", + device_type="test_device", + job_timeout_min=1, + structured_log_file=tmp_file, + ) + + @pytest.mark.parametrize("exception", [RuntimeError, SystemError, KeyError]) def test_submit_and_follow_respects_exceptions(mock_sleep, mock_proxy, exception): with pytest.raises(MesaCIException): @@ -301,7 +329,7 @@ def test_parse_job_result_from_log(message, expectation, mock_proxy): @pytest.mark.slow( reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml" ) -def test_full_yaml_log(mock_proxy, frozen_time, tmp_path): +def test_full_yaml_log(mock_proxy, frozen_time, lava_job_submitter): import random from lavacli.utils import flow_yaml as lava_yaml @@ -352,10 +380,61 @@ def test_full_yaml_log(mock_proxy, frozen_time, tmp_path): def reset_logs(*args): proxy.scheduler.jobs.logs.side_effect = load_lines() - tmp_file = Path(tmp_path) / "log.json" - LAVAJobSubmitter(structured_log_file=tmp_file) proxy.scheduler.jobs.submit = reset_logs with pytest.raises(MesaCIRetryError): time_travel_to_test_time() + lava_job_submitter.submit() retriable_follow_job(proxy, "") - print(tmp_file.read_text()) + print(lava_job_submitter.structured_log_file.read_text()) + + +@pytest.mark.parametrize( + "validate_only,finished_job_status,expected_combined_status,expected_exit_code", + [ + (True, "pass", None, None), + (False, "pass", "pass", 0), + (False, "fail", "fail", 1), + ], + ids=[ + "validate_only_no_job_submission", + "successful_job_submission", + "failed_job_submission", + ], +) +def test_job_combined_status( + lava_job_submitter, + validate_only, + finished_job_status, + expected_combined_status, + expected_exit_code, +): + lava_job_submitter.validate_only = validate_only + + with patch( + "lava.lava_job_submitter.retriable_follow_job" + ) as mock_retriable_follow_job, patch( + "lava.lava_job_submitter.LAVAJobSubmitter._LAVAJobSubmitter__prepare_submission" + ) as mock_prepare_submission, patch( + "sys.exit" + ): + from lava.lava_job_submitter import STRUCTURAL_LOG + + mock_retriable_follow_job.return_value = MagicMock(status=finished_job_status) + + mock_job_definition = MagicMock(spec=str) + mock_prepare_submission.return_value = mock_job_definition + original_status: str = STRUCTURAL_LOG.get("job_combined_status") + + if validate_only: + lava_job_submitter.submit() + mock_retriable_follow_job.assert_not_called() + assert STRUCTURAL_LOG.get("job_combined_status") == original_status + return + + try: + lava_job_submitter.submit() + + except SystemExit as e: + assert e.code == expected_exit_code + + assert STRUCTURAL_LOG["job_combined_status"] == expected_combined_status -- 2.7.4