From 8626a52637f31aed08c62cf33727820cc68342c6 Mon Sep 17 00:00:00 2001 From: Guilherme Gallo Date: Fri, 5 May 2023 08:57:32 -0300 Subject: [PATCH] ci/lava: Add bridge function for job definition Use the supported job definition depending on some Mesa CI job characteristics. The strategy here is to use LAVA with a containerized SSH session to follow the job output, avoiding dumping data to the UART, which proves to be error-prone on some devices. Signed-off-by: Guilherme Gallo Part-of: --- .gitlab-ci/lava/lava_job_submitter.py | 6 +- .gitlab-ci/lava/utils/__init__.py | 2 +- .gitlab-ci/lava/utils/lava_job_definition.py | 254 ++++++++++----------------- .gitlab-ci/lava/utils/ssh_job_definition.py | 100 +---------- .gitlab-ci/lava/utils/uart_job_definition.py | 172 ++++++++++++++++++ 5 files changed, 271 insertions(+), 263 deletions(-) create mode 100644 .gitlab-ci/lava/utils/uart_job_definition.py diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index 7227a7c..5eb624e 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -37,7 +37,7 @@ from lava.utils import ( LogSectionType, call_proxy, fatal_err, - generate_lava_yaml_payload, + generate_lava_job_definition, hide_sensitive_data, print_log, setup_lava_proxy, @@ -399,9 +399,7 @@ class LAVAJobSubmitter(PathResolver): minutes=self.job_timeout_min ) - job_definition_stream = StringIO() - lava_yaml.dump(generate_lava_yaml_payload(self), job_definition_stream) - job_definition = job_definition_stream.getvalue() + job_definition = generate_lava_job_definition(self) if self.dump_yaml: self.dump_job_definition(job_definition) diff --git a/.gitlab-ci/lava/utils/__init__.py b/.gitlab-ci/lava/utils/__init__.py index a02767c..349d2b3 100644 --- a/.gitlab-ci/lava/utils/__init__.py +++ b/.gitlab-ci/lava/utils/__init__.py @@ -1,7 +1,7 @@ from .console_format import CONSOLE_LOG from .gitlab_section import GitlabSection from .lava_job import 
LAVAJob -from .lava_job_definition import generate_lava_yaml_payload +from .lava_job_definition import generate_lava_job_definition from .lava_proxy import call_proxy, setup_lava_proxy from .log_follower import ( LogFollower, diff --git a/.gitlab-ci/lava/utils/lava_job_definition.py b/.gitlab-ci/lava/utils/lava_job_definition.py index cb1b74b..b05961d 100644 --- a/.gitlab-ci/lava/utils/lava_job_definition.py +++ b/.gitlab-ci/lava/utils/lava_job_definition.py @@ -1,7 +1,16 @@ -# How many attempts should be made when a timeout happen during LAVA device boot. +from io import StringIO +from typing import TYPE_CHECKING, Any + +import re +from lava.utils.lava_farm import LavaFarm, get_lava_farm +from ruamel.yaml.scalarstring import LiteralScalarString +from ruamel.yaml import YAML from os import getenv -from typing import Any +if TYPE_CHECKING: + from lava.lava_job_submitter import LAVAJobSubmitter + +# How many attempts should be made when a timeout happen during LAVA device boot. NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3)) # Supports any integers in [0, 100]. @@ -10,8 +19,58 @@ NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3 JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75)) -def generate_lava_yaml_payload(args) -> dict[str, Any]: - # General metadata and permissions, plus also inexplicably kernel arguments +def generate_lava_yaml_payload(job_submitter: "LAVAJobSubmitter") -> dict[str, Any]: + """ + Bridge function to use the supported job definition depending on some Mesa + CI job characteristics. + + The strategy here, is to use LAVA with a containerized SSH session to follow + the job output, escaping from dumping data to the UART, which proves to be + error prone in some devices. 
+ """ + from lava.utils.ssh_job_definition import ( + generate_lava_yaml_payload as ssh_lava_yaml, + ) + from lava.utils.uart_job_definition import ( + generate_lava_yaml_payload as uart_lava_yaml, + ) + + # Only Collabora's farm supports to run docker container as a LAVA actions, + # which is required to follow the job in a SSH section + current_farm = get_lava_farm() + + # SSH job definition still needs to add support for fastboot. + job_uses_fastboot: bool = job_submitter.boot_method == "fastboot" + + if current_farm == LavaFarm.COLLABORA and not job_uses_fastboot: + return ssh_lava_yaml(job_submitter) + + return uart_lava_yaml(job_submitter) + + +def generate_lava_job_definition(job_submitter: "LAVAJobSubmitter") -> str: + job_stream = StringIO() + yaml = YAML() + yaml.width = 4096 + yaml.dump(generate_lava_yaml_payload(job_submitter), job_stream) + return job_stream.getvalue() + + +def to_yaml_block(steps_array: list[str], escape_vars=[]) -> LiteralScalarString: + def escape_envvar(match): + return "\\" + match.group(0) + + filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")] + final_str = "\n".join(filtered_array) + + for escape_var in escape_vars: + # Find env vars and add '\\' before them + final_str = re.sub(rf"\${escape_var}*", escape_envvar, final_str) + return LiteralScalarString(final_str) + + +def generate_metadata(args) -> dict[str, Any]: + # General metadata and permissions values = { "job_name": f"mesa: {args.pipeline_info}", "device_type": args.device_type, @@ -25,7 +84,7 @@ def generate_lava_yaml_payload(args) -> dict[str, Any]: "actions": { "depthcharge-retry": { # Could take between 1 and 1.5 min in slower boots - "minutes": 2 + "minutes": 4 }, "depthcharge-start": { # Should take less than 1 min. 
@@ -34,7 +93,7 @@ def generate_lava_yaml_payload(args) -> dict[str, Any]: "depthcharge-action": { # This timeout englobes the entire depthcharge timing, # including retries - "minutes": 2 + "minutes": 5 * NUMBER_OF_ATTEMPTS_LAVA_BOOT, }, }, @@ -44,176 +103,39 @@ def generate_lava_yaml_payload(args) -> dict[str, Any]: if args.lava_tags: values["tags"] = args.lava_tags.split(",") - # URLs to our kernel rootfs to boot from, both generated by the base - # container build - - nfsrootfs = { - "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst", - "compression": "zstd", - } - - fastboot_deploy_nfs = { - "timeout": {"minutes": 10}, - "to": "nfs", - "nfsrootfs": nfsrootfs, - } - - fastboot_deploy_prepare = { - "timeout": {"minutes": 5}, - "to": "downloads", - "os": "oe", - "images": { - "kernel": { - "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", - }, - }, - "postprocess": { - "docker": { - "image": "registry.gitlab.collabora.com/lava/health-check-docker", - "steps": [ - 'gzip Image', - "cat Image.gz " + args.dtb_filename + ".dtb > Image.gz+dtb", - "mkbootimg --kernel Image.gz+dtb" + - ' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"' + - " --pagesize 4096 --base 0x80000000 -o boot.img", - ], - }, - } - } - if args.kernel_image_type: - fastboot_deploy_prepare["images"]["kernel"]["type"] = args.kernel_image_type - if args.dtb_filename: - fastboot_deploy_prepare["images"]["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"} - - tftp_deploy = { - "timeout": {"minutes": 5}, - "to": "tftp", - "os": "oe", - "kernel": { - "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", - }, - "nfsrootfs": nfsrootfs, - } - if args.kernel_image_type: - tftp_deploy["kernel"]["type"] = args.kernel_image_type - if args.dtb_filename: - tftp_deploy["dtb"] = {"url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb"} - - fastboot_deploy = { - "timeout": {"minutes": 2}, - "to": "fastboot", - "docker": { - 
"image": "registry.gitlab.collabora.com/lava/health-check-docker", - }, - "images": { - "boot": {"url": "downloads://boot.img"}, - }, - } - - fastboot_boot = { - "timeout": {"minutes": 2}, - "docker": {"image": "registry.gitlab.collabora.com/lava/health-check-docker"}, - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "method": args.boot_method, - "prompts": ["lava-shell:"], - "commands": ["set_active a"] - } - - tftp_boot = { - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "method": args.boot_method, - "prompts": ["lava-shell:"], - "commands": "nfs" - } - - # skeleton test definition: only declaring each job as a single 'test' - # since LAVA's test parsing is not useful to us - run_steps = [] - test = { - "timeout": {"minutes": args.job_timeout_min}, - "failure_retry": 1, - "definitions": [ - { - "name": "mesa", - "from": "inline", - "lava-signal": "kmsg", - "path": "inline/mesa.yaml", - "repository": { - "metadata": { - "name": "mesa", - "description": "Mesa test plan", - "os": ["oe"], - "scope": ["functional"], - "format": "Lava-Test Test Definition 1.0", - }, - "run": {"steps": run_steps}, - }, - } - ], - } + return values - # job execution script: - # - inline .gitlab-ci/common/init-stage1.sh - # - fetch and unpack per-pipeline build artifacts from build job - # - fetch and unpack per-job environment from lava-submit.sh - # - exec .gitlab-ci/common/init-stage2.sh - with open(args.first_stage_init, "r") as init_sh: - run_steps += [ - x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip() - ] - # We cannot distribute the Adreno 660 shader firmware inside rootfs, - # since the license isn't bundled inside the repository - if args.device_type == "sm8350-hdk": - run_steps.append( - "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " + - "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn " + - "-o \"/lib/firmware/qcom/sm8350/a660_zap.mbn\"" - ) - run_steps.append( - "curl -L --retry 4 -f --retry-all-errors --retry-delay 
60 " - f"{args.job_rootfs_overlay_url} | tar -xz -C /", - ) +def artifact_download_steps(args): + """ + This function is responsible for setting up the SSH server in the DUT and to + export the first boot environment to a file. + """ + # Putting JWT pre-processing and mesa download, within init-stage1.sh file, + # as we do with non-SSH version. + download_steps = [ + "set -ex", + "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " + f"{args.job_rootfs_overlay_url} | tar -xz -C /", + f"mkdir -p {args.ci_project_dir}", + f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | " + f"tar --zstd -x -C {args.ci_project_dir}", + ] + # If the JWT file is provided, we will use it to authenticate with the cloud + # storage provider and will hide it from the job output in Gitlab. if args.jwt_file: with open(args.jwt_file) as jwt_file: - run_steps += [ + download_steps += [ "set +x", - f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME', + f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME', "set -x", f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh', ] else: - run_steps += [ - "echo Could not find jwt file, disabling MINIO requests...", + download_steps += [ + "echo Could not find jwt file, disabling S3 requests...", "sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh", ] - run_steps += [ - f"mkdir -p {args.ci_project_dir}", - f"curl {args.build_url} | tar --zstd -x -C {args.ci_project_dir}", - # Sleep a bit to give time for bash to dump shell xtrace messages into - # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some - # devices like a618. 
- "sleep 1", - # Putting CI_JOB name as the testcase name, it may help LAVA farm - # maintainers with monitoring - f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh", - ] - - if args.boot_method == "fastboot": - values["actions"] = [ - {"deploy": fastboot_deploy_nfs}, - {"deploy": fastboot_deploy_prepare}, - {"deploy": fastboot_deploy}, - {"boot": fastboot_boot}, - {"test": test}, - ] - else: # tftp - values["actions"] = [ - {"deploy": tftp_deploy}, - {"boot": tftp_boot}, - {"test": test}, - ] - - return values + return download_steps diff --git a/.gitlab-ci/lava/utils/ssh_job_definition.py b/.gitlab-ci/lava/utils/ssh_job_definition.py index c873f23..7b82772 100644 --- a/.gitlab-ci/lava/utils/ssh_job_definition.py +++ b/.gitlab-ci/lava/utils/ssh_job_definition.py @@ -27,20 +27,15 @@ Therefore, we have divided the job definition into four parts: script after sourcing "dut-env-vars.sh" again for the second SSH test case. """ -import re -from os import getenv from pathlib import Path from typing import Any -from ruamel.yaml.scalarstring import LiteralScalarString - -# How many attempts should be made when a timeout happen during LAVA device boot. -NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 3)) - -# Supports any integers in [0, 100]. -# The scheduler considers the job priority when ordering the queue -# to consider which job should run next. -JOB_PRIORITY = int(getenv("LAVA_JOB_PRIORITY", 75)) +from .lava_job_definition import ( + artifact_download_steps, + to_yaml_block, + generate_metadata, + NUMBER_OF_ATTEMPTS_LAVA_BOOT, +) # Very early SSH server setup. Uses /dut_ready file to flag it is done. 
SSH_SERVER_COMMANDS = { @@ -83,55 +78,6 @@ lava_ssh_test_case() { ] -def to_yaml_block(steps_array: list[str], escape_vars=[]) -> LiteralScalarString: - def escape_envvar(match): - return "\\" + match.group(0) - - filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")] - final_str = "\n".join(filtered_array) - - for escape_var in escape_vars: - # Find env vars and add '\\' before them - final_str = re.sub(rf"\${escape_var}*", escape_envvar, final_str) - return LiteralScalarString(final_str) - - -def artifact_download_steps(args): - """ - This function is responsible for setting up the SSH server in the DUT and to - export the first boot environment to a file. - """ - # Putting JWT pre-processing and mesa download, within init-stage1.sh file, - # as we do with non-SSH version. - download_steps = [ - "set -ex", - "source /dut-env-vars.sh", - "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " - f"{args.job_rootfs_overlay_url} | tar -xz -C /", - f"mkdir -p {args.ci_project_dir}", - f"curl -L --retry 4 -f --retry-all-errors --retry-delay 60 {args.build_url} | " - f"tar --zstd -x -C {args.ci_project_dir}", - ] - - # If the JWT file is provided, we will use it to authenticate with the cloud - # storage provider and will hide it from the job output in Gitlab. - if args.jwt_file: - with open(args.jwt_file) as jwt_file: - download_steps += [ - "set +x", - f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME', - "set -x", - f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh', - ] - else: - download_steps += [ - "echo Could not find jwt file, disabling S3 requests...", - "sed -i '/MINIO_RESULTS_UPLOAD/d' /set-job-env-vars.sh", - ] - - return download_steps - - def generate_dut_test(args): # Commands executed on DUT. 
# Trying to execute the minimal number of commands, because the console data is @@ -200,6 +146,7 @@ def generate_docker_test(args): to_yaml_block( ( "lava_ssh_test_case 'artifact_download' 'bash --' << EOF", + "source /dut-env-vars.sh", *artifact_download_steps(args), "EOF", ) @@ -216,38 +163,7 @@ def generate_docker_test(args): def generate_lava_yaml_payload(args) -> dict[str, Any]: - # General metadata and permissions - values = { - "job_name": f"mesa: {args.pipeline_info}", - "device_type": args.device_type, - "visibility": {"group": [args.visibility_group]}, - "priority": JOB_PRIORITY, - "context": { - "extra_nfsroot_args": " init=/init rootwait usbcore.quirks=0bda:8153:k" - }, - "timeouts": { - "job": {"minutes": args.job_timeout_min}, - "actions": { - "depthcharge-retry": { - # Could take between 1 and 1.5 min in slower boots - "minutes": 4 - }, - "depthcharge-start": { - # Should take less than 1 min. - "minutes": 1, - }, - "depthcharge-action": { - # This timeout englobes the entire depthcharge timing, - # including retries - "minutes": 5 - * NUMBER_OF_ATTEMPTS_LAVA_BOOT, - }, - }, - }, - } - - if args.lava_tags: - values["tags"] = args.lava_tags.split(",") + values = generate_metadata(args) # URLs to our kernel rootfs to boot from, both generated by the base # container build diff --git a/.gitlab-ci/lava/utils/uart_job_definition.py b/.gitlab-ci/lava/utils/uart_job_definition.py new file mode 100644 index 0000000..36e2017 --- /dev/null +++ b/.gitlab-ci/lava/utils/uart_job_definition.py @@ -0,0 +1,172 @@ +from typing import Any +from .lava_job_definition import ( + generate_metadata, + NUMBER_OF_ATTEMPTS_LAVA_BOOT, + artifact_download_steps, +) + + +def generate_lava_yaml_payload(args) -> dict[str, Any]: + values = generate_metadata(args) + + # URLs to our kernel rootfs to boot from, both generated by the base + # container build + + nfsrootfs = { + "url": f"{args.rootfs_url_prefix}/lava-rootfs.tar.zst", + "compression": "zstd", + } + + fastboot_deploy_nfs 
= { + "timeout": {"minutes": 10}, + "to": "nfs", + "nfsrootfs": nfsrootfs, + } + + fastboot_deploy_prepare = { + "timeout": {"minutes": 5}, + "to": "downloads", + "os": "oe", + "images": { + "kernel": { + "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", + }, + }, + "postprocess": { + "docker": { + "image": "registry.gitlab.collabora.com/lava/health-check-docker", + "steps": [ + "gzip Image", + f"cat Image.gz {args.dtb_filename}.dtb > Image.gz+dtb", + "mkbootimg --kernel Image.gz+dtb" + + ' --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard rootwait ip=dhcp init=/init"' + + " --pagesize 4096 --base 0x80000000 -o boot.img", + ], + } + }, + } + if args.kernel_image_type: + fastboot_deploy_prepare["images"]["kernel"]["type"] = args.kernel_image_type + if args.dtb_filename: + fastboot_deploy_prepare["images"]["dtb"] = { + "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb" + } + + tftp_deploy = { + "timeout": {"minutes": 5}, + "to": "tftp", + "os": "oe", + "kernel": { + "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", + }, + "nfsrootfs": nfsrootfs, + } + if args.kernel_image_type: + tftp_deploy["kernel"]["type"] = args.kernel_image_type + if args.dtb_filename: + tftp_deploy["dtb"] = { + "url": f"{args.kernel_url_prefix}/{args.dtb_filename}.dtb" + } + + fastboot_deploy = { + "timeout": {"minutes": 2}, + "to": "fastboot", + "docker": { + "image": "registry.gitlab.collabora.com/lava/health-check-docker", + }, + "images": { + "boot": {"url": "downloads://boot.img"}, + }, + } + + fastboot_boot = { + "timeout": {"minutes": 2}, + "docker": {"image": "registry.gitlab.collabora.com/lava/health-check-docker"}, + "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, + "method": args.boot_method, + "prompts": ["lava-shell:"], + "commands": ["set_active a"], + } + + tftp_boot = { + "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, + "method": args.boot_method, + "prompts": ["lava-shell:"], + "commands": "nfs", + } + + # skeleton test 
definition: only declaring each job as a single 'test' + # since LAVA's test parsing is not useful to us + run_steps = [] + test = { + "timeout": {"minutes": args.job_timeout_min}, + "failure_retry": 1, + "definitions": [ + { + "name": "mesa", + "from": "inline", + "lava-signal": "kmsg", + "path": "inline/mesa.yaml", + "repository": { + "metadata": { + "name": "mesa", + "description": "Mesa test plan", + "os": ["oe"], + "scope": ["functional"], + "format": "Lava-Test Test Definition 1.0", + }, + "run": {"steps": run_steps}, + }, + } + ], + } + + # job execution script: + # - inline .gitlab-ci/common/init-stage1.sh + # - fetch and unpack per-pipeline build artifacts from build job + # - fetch and unpack per-job environment from lava-submit.sh + # - exec .gitlab-ci/common/init-stage2.sh + + with open(args.first_stage_init, "r") as init_sh: + run_steps += [ + x.rstrip() for x in init_sh if not x.startswith("#") and x.rstrip() + ] + # We cannot distribute the Adreno 660 shader firmware inside rootfs, + # since the license isn't bundled inside the repository + if args.device_type == "sm8350-hdk": + run_steps.append( + "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " + + "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn " + + '-o "/lib/firmware/qcom/sm8350/a660_zap.mbn"' + ) + + run_steps += artifact_download_steps(args) + + run_steps += [ + f"mkdir -p {args.ci_project_dir}", + f"curl {args.build_url} | tar --zstd -x -C {args.ci_project_dir}", + # Sleep a bit to give time for bash to dump shell xtrace messages into + # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some + # devices like a618. 
+ "sleep 1", + # Putting CI_JOB name as the testcase name, it may help LAVA farm + # maintainers with monitoring + f"lava-test-case 'mesa-ci_{args.mesa_job_name}' --shell /init-stage2.sh", + ] + + if args.boot_method == "fastboot": + values["actions"] = [ + {"deploy": fastboot_deploy_nfs}, + {"deploy": fastboot_deploy_prepare}, + {"deploy": fastboot_deploy}, + {"boot": fastboot_boot}, + {"test": test}, + ] + else: # tftp + values["actions"] = [ + {"deploy": tftp_deploy}, + {"boot": tftp_boot}, + {"test": test}, + ] + + return values -- 2.7.4