[igt-dev] [PATCH v2 11/12] ci: Add files from Mesa required to submit jobs to LAVA
Tomeu Vizoso
tomeu.vizoso at collabora.com
Thu Mar 10 07:37:59 UTC 2022
Mostly scripts to submit jobs to LAVA and to set up the device after boot.
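The flow, roughly:

    lava-submit.sh
      -> generate-env.sh > set-job-env-vars.sh   (runner: collect the job environment)
      -> lava_job_submitter.py                   (render the job YAML, submit, follow the log)
    on the DUT: init-stage1.sh -> init-stage2.sh -> $HWCI_TEST_SCRIPT
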
Signed-off-by: Tomeu Vizoso <tomeu.vizoso at collabora.com>
---
 ci/ci-common/capture-devcoredump.sh |  14 +
 ci/ci-common/generate-env.sh        | 112 ++++++++
 ci/ci-common/init-stage1.sh         |  22 ++
 ci/ci-common/init-stage2.sh         |  78 ++++++
 ci/lava/lava-submit.sh              |  38 +++
 ci/lava/lava_job_submitter.py       | 380 ++++++++++++++++++++++++++++
 6 files changed, 644 insertions(+)
create mode 100755 ci/ci-common/capture-devcoredump.sh
create mode 100755 ci/ci-common/generate-env.sh
create mode 100755 ci/ci-common/init-stage1.sh
create mode 100755 ci/ci-common/init-stage2.sh
create mode 100755 ci/lava/lava-submit.sh
create mode 100755 ci/lava/lava_job_submitter.py
diff --git a/ci/ci-common/capture-devcoredump.sh b/ci/ci-common/capture-devcoredump.sh
new file mode 100755
index 000000000000..ae370538eaeb
--- /dev/null
+++ b/ci/ci-common/capture-devcoredump.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
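+# Devcoredumps disappear five minutes after they are created, so poll for
+# them; writing to the data node afterwards tells the kernel to release it.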
+while true; do
+  devcds=$(find /sys/devices/virtual/devcoredump/ -name data 2>/dev/null)
+  for i in $devcds; do
+    echo "Found a devcoredump at $i."
+    if cp "$i" /results/first.devcore; then
+      echo 1 > "$i"
+      echo "Saved to the job artifacts at /first.devcore"
+      exit 0
+    fi
+  done
+  sleep 10
+done
diff --git a/ci/ci-common/generate-env.sh b/ci/ci-common/generate-env.sh
new file mode 100755
index 000000000000..dc2b7febefb8
--- /dev/null
+++ b/ci/ci-common/generate-env.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
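+# Emit an "export" line for each of the variables below that is set, with
+# the value shell-quoted via ${var@Q}. lava-submit.sh redirects the output
+# to set-job-env-vars.sh, which init-stage2.sh sources on the device.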
+for var in \
+    ASAN_OPTIONS \
+    BASE_SYSTEM_FORK_HOST_PREFIX \
+    BASE_SYSTEM_MAINLINE_HOST_PREFIX \
+    CI_COMMIT_BRANCH \
+    CI_COMMIT_REF_NAME \
+    CI_COMMIT_TITLE \
+    CI_JOB_ID \
+    CI_JOB_JWT_FILE \
+    CI_JOB_NAME \
+    CI_JOB_URL \
+    CI_MERGE_REQUEST_SOURCE_BRANCH_NAME \
+    CI_MERGE_REQUEST_TITLE \
+    CI_NODE_INDEX \
+    CI_NODE_TOTAL \
+    CI_PAGES_DOMAIN \
+    CI_PIPELINE_ID \
+    CI_PIPELINE_URL \
+    CI_PROJECT_DIR \
+    CI_PROJECT_NAME \
+    CI_PROJECT_PATH \
+    CI_PROJECT_ROOT_NAMESPACE \
+    CI_RUNNER_DESCRIPTION \
+    CI_SERVER_URL \
+    CROSVM_GALLIUM_DRIVER \
+    CROSVM_GPU_ARGS \
+    CROSVM_TEST_SCRIPT \
+    DEQP_BIN_DIR \
+    DEQP_CASELIST_FILTER \
+    DEQP_CASELIST_INV_FILTER \
+    DEQP_CONFIG \
+    DEQP_EXPECTED_RENDERER \
+    DEQP_FRACTION \
+    DEQP_HEIGHT \
+    DEQP_RESULTS_DIR \
+    DEQP_RUNNER_OPTIONS \
+    DEQP_SUITE \
+    DEQP_TEMP_DIR \
+    DEQP_VARIANT \
+    DEQP_VER \
+    DEQP_WIDTH \
+    DEVICE_NAME \
+    DRIVER_NAME \
+    EGL_PLATFORM \
+    ETNA_MESA_DEBUG \
+    FDO_CI_CONCURRENT \
+    FDO_UPSTREAM_REPO \
+    FD_MESA_DEBUG \
+    FLAKES_CHANNEL \
+    GALLIUM_DRIVER \
+    GALLIVM_PERF \
+    GPU_VERSION \
+    GTEST \
+    GTEST_FAILS \
+    GTEST_FRACTION \
+    GTEST_RESULTS_DIR \
+    GTEST_RUNNER_OPTIONS \
+    GTEST_SKIPS \
+    HWCI_FREQ_MAX \
+    HWCI_KERNEL_MODULES \
+    HWCI_START_XORG \
+    HWCI_TEST_SCRIPT \
+    IGT_FORCE_DRIVER \
+    IR3_SHADER_DEBUG \
+    JOB_ARTIFACTS_BASE \
+    JOB_RESULTS_PATH \
+    JOB_ROOTFS_OVERLAY_PATH \
+    LD_LIBRARY_PATH \
+    LP_NUM_THREADS \
+    MESA_BASE_TAG \
+    MESA_BUILD_PATH \
+    MESA_DEBUG \
+    MESA_GLES_VERSION_OVERRIDE \
+    MESA_GLSL_VERSION_OVERRIDE \
+    MESA_GL_VERSION_OVERRIDE \
+    MESA_IMAGE \
+    MESA_IMAGE_PATH \
+    MESA_IMAGE_TAG \
+    MESA_TEMPLATES_COMMIT \
+    MESA_VK_IGNORE_CONFORMANCE_WARNING \
+    MINIO_HOST \
+    NIR_DEBUG \
+    PAN_I_WANT_A_BROKEN_VULKAN_DRIVER \
+    PAN_MESA_DEBUG \
+    PIGLIT_FRACTION \
+    PIGLIT_NO_WINDOW \
+    PIGLIT_OPTIONS \
+    PIGLIT_PLATFORM \
+    PIGLIT_PROFILES \
+    PIGLIT_REPLAY_ARTIFACTS_BASE_URL \
+    PIGLIT_REPLAY_DESCRIPTION_FILE \
+    PIGLIT_REPLAY_DEVICE_NAME \
+    PIGLIT_REPLAY_EXTRA_ARGS \
+    PIGLIT_REPLAY_REFERENCE_IMAGES_BASE \
+    PIGLIT_REPLAY_SUBCOMMAND \
+    PIGLIT_RESULTS \
+    PIGLIT_TESTS \
+    PIPELINE_ARTIFACTS_BASE \
+    SKQP_ASSETS_DIR \
+    SKQP_BACKENDS \
+    TU_DEBUG \
+    VIRGL_HOST_API \
+    VK_CPU \
+    VK_DRIVER \
+    VK_ICD_FILENAMES \
+    ; do
+  if [ -n "${!var+x}" ]; then
+    echo "export $var=${!var@Q}"
+  fi
+done
diff --git a/ci/ci-common/init-stage1.sh b/ci/ci-common/init-stage1.sh
new file mode 100755
index 000000000000..648c37a2f903
--- /dev/null
+++ b/ci/ci-common/init-stage1.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# Very early init, used to make sure devices and network are set up and
+# reachable.
+
+set -ex
+
+cd /
+
+mount -t proc none /proc
+mount -t sysfs none /sys
+mount -t devtmpfs none /dev || echo possibly already mounted
+mkdir -p /dev/pts
+mount -t devpts devpts /dev/pts
+mount -t tmpfs tmpfs /tmp
+
+echo "nameserver 8.8.8.8" > /etc/resolv.conf
+[ -z "$NFS_SERVER_IP" ] || echo "$NFS_SERVER_IP caching-proxy" >> /etc/hosts
+
+# Set the time so we can validate certificates before we fetch anything;
+# however, as not all DUTs have network access, make this non-fatal.
+for i in 1 2 3; do sntp -sS pool.ntp.org && break || sleep 2; done || true
diff --git a/ci/ci-common/init-stage2.sh b/ci/ci-common/init-stage2.sh
new file mode 100755
index 000000000000..aff36635e59b
--- /dev/null
+++ b/ci/ci-common/init-stage2.sh
@@ -0,0 +1,78 @@
+#!/bin/sh
+
+# Second-stage init, used to set up devices and our job environment before
+# running tests.
+
+. /set-job-env-vars.sh
+
+set -ex
+
+# Set up any devices required by the jobs
+[ -z "$HWCI_KERNEL_MODULES" ] || (echo -n $HWCI_KERNEL_MODULES | xargs -d, -n1 /usr/sbin/modprobe)
+
+# Fix prefix confusion: the build installs to $CI_PROJECT_DIR, but we expect
+# it in /install
+ln -sf $CI_PROJECT_DIR/build /install
+export LD_LIBRARY_PATH=/install/lib
+export LIBGL_DRIVERS_PATH=/install/lib/dri
+
+# Store Mesa's disk cache under /tmp, rather than sending it out over NFS.
+export XDG_CACHE_HOME=/tmp
+
+# Make sure Python can find all our imports
+export PYTHONPATH=$(python3 -c "import sys;print(\":\".join(sys.path))")
+
+if [ "$HWCI_FREQ_MAX" = "true" ]; then
+ # Ensure initialization of the DRM device (needed by MSM)
+ head -0 /dev/dri/renderD128
+
+ # Disable GPU frequency scaling
+ DEVFREQ_GOVERNOR=`find /sys/devices -name governor | grep gpu || true`
+ test -z "$DEVFREQ_GOVERNOR" || echo performance > $DEVFREQ_GOVERNOR || true
+
+ # Disable CPU frequency scaling
+ echo performance | tee -a /sys/devices/system/cpu/cpufreq/policy*/scaling_governor || true
+
+ # Disable GPU runtime power management
+ GPU_AUTOSUSPEND=`find /sys/devices -name autosuspend_delay_ms | grep gpu | head -1`
+ test -z "$GPU_AUTOSUSPEND" || echo -1 > $GPU_AUTOSUSPEND || true
+fi
+
+# Start a little daemon to capture the first devcoredump we encounter. (They
+# expire after 5 minutes, so we poll for them).
+./capture-devcoredump.sh &
+
+# If we want Xorg running for the test, start it before HWCI_TEST_SCRIPT.
+# We have to go through xinit (anything else races against Xorg's startup
+# unless -displayfd is used), but xinit eats its client's exit code, so X
+# gets a dummy client and we poll for a marker file instead.
+if [ -n "$HWCI_START_XORG" ]; then
+  echo "touch /xorg-started; sleep 100000" > /xorg-script
+  env \
+    xinit /bin/sh /xorg-script -- /usr/bin/Xorg -noreset -s 0 -dpms -logfile /Xorg.0.log &
+
+  # Wait for Xorg to be ready for connections.
+  for i in 1 2 3 4 5; do
+    if [ -e /xorg-started ]; then
+      break
+    fi
+    sleep 5
+  done
+  export DISPLAY=:0
+fi
+
+RESULT=fail
+if sh $HWCI_TEST_SCRIPT; then
+  RESULT=pass
+  rm -rf results/trace/$PIGLIT_REPLAY_DEVICE_NAME
+fi
+
+# upload artifacts
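+# (The minio_results URL arrives on the kernel command line: the LAVA job
+# generated by lava_job_submitter.py passes it via extra_nfsroot_args.)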
+MINIO=$(tr ' ' '\n' </proc/cmdline | grep minio_results | cut -d '=' -f 2 || true)
+if [ -n "$MINIO" ]; then
+  tar -czf results.tar.gz results/
+  ci-fairy minio login --token-file "${CI_JOB_JWT_FILE}"
+  ci-fairy minio cp results.tar.gz minio://"$MINIO"/results.tar.gz
+fi
+
+echo "hwci: test: $RESULT"
diff --git a/ci/lava/lava-submit.sh b/ci/lava/lava-submit.sh
new file mode 100755
index 000000000000..fac1c42bc296
--- /dev/null
+++ b/ci/lava/lava-submit.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+set -x
+
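+# Pack the init scripts and the generated job environment into a rootfs
+# overlay for the device to unpack over /, upload it, then generate and
+# submit the LAVA job, mirroring its log into this job's output.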
+SCRIPTS_DIR=$(dirname "$0")/..
+
+rm -rf results
+mkdir -p results/job-rootfs-overlay/
+
+cp $SCRIPTS_DIR/ci-common/capture-devcoredump.sh results/job-rootfs-overlay/
+cp $SCRIPTS_DIR/ci-common/init-*.sh results/job-rootfs-overlay/
+$SCRIPTS_DIR/ci-common/generate-env.sh > results/job-rootfs-overlay/set-job-env-vars.sh
+
+tar zcf job-rootfs-overlay.tar.gz -C results/job-rootfs-overlay/ .
+ci-fairy minio login --token-file "${CI_JOB_JWT_FILE}"
+ci-fairy minio cp job-rootfs-overlay.tar.gz "minio://${JOB_ROOTFS_OVERLAY_PATH}"
+
+touch results/lava.log
+tail -f results/lava.log &
+$SCRIPTS_DIR/lava/lava_job_submitter.py \
+    --dump-yaml \
+    --pipeline-info "$CI_JOB_NAME: $CI_PIPELINE_URL on $CI_COMMIT_REF_NAME ${CI_NODE_INDEX}/${CI_NODE_TOTAL}" \
+    --base-system-url-prefix "https://${BASE_SYSTEM_MAINLINE_HOST_PATH}" \
+    --build-url "${FDO_HTTP_CACHE_URI:-}https://${BUILD_PATH}" \
+    --job-rootfs-overlay-url "${FDO_HTTP_CACHE_URI:-}https://${JOB_ROOTFS_OVERLAY_PATH}" \
+    --job-artifacts-base ${JOB_ARTIFACTS_BASE} \
+    --job-timeout ${JOB_TIMEOUT:-30} \
+    --first-stage-init $SCRIPTS_DIR/ci-common/init-stage1.sh \
+    --ci-project-dir ${CI_PROJECT_DIR} \
+    --device-type ${DEVICE_TYPE} \
+    --dtb ${DTB} \
+    --jwt-file "${CI_JOB_JWT_FILE}" \
+    --kernel-image-name ${KERNEL_IMAGE_NAME} \
+    --kernel-image-type "${KERNEL_IMAGE_TYPE}" \
+    --boot-method ${BOOT_METHOD} \
+    --visibility-group ${VISIBILITY_GROUP} \
+    --lava-tags "${LAVA_TAGS}" >> results/lava.log
diff --git a/ci/lava/lava_job_submitter.py b/ci/lava/lava_job_submitter.py
new file mode 100755
index 000000000000..5ea8eddf088d
--- /dev/null
+++ b/ci/lava/lava_job_submitter.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020, 2021 Collabora Limited
+# Author: Gustavo Padovan <gustavo.padovan at collabora.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Send a job to LAVA, track it and collect log back"""
+
+import argparse
+import pathlib
+import sys
+import time
+import traceback
+import urllib.parse
+import xmlrpc.client
+
+from datetime import datetime, timedelta
+from os import getenv
+
+import lavacli
+import yaml
+from lavacli.utils import loader
+
+# Timeout in seconds after which the device running the dispatched LAVA job
+# is considered hung if it has produced no new log output.
+DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
+
+# How many seconds the script should wait between polling iterations that
+# check whether the dispatched LAVA job is running or still waiting in the
+# job queue.
+WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
+
+# How many seconds to wait between LAVA RPC calls that fetch log output.
+LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
+
+# How many retries should be made when a timeout happens.
+NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
+
+
+def print_log(msg):
+    print("{}: {}".format(datetime.now(), msg))
+
+def fatal_err(msg):
+    print_log(msg)
+    sys.exit(1)
+
+
+def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
+    return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
+
+
+def generate_lava_yaml(args):
+    # General metadata and permissions, plus (inexplicably) kernel arguments
+    values = {
+        'job_name': args.pipeline_info,
+        'device_type': args.device_type,
+        'visibility': { 'group': [ args.visibility_group ] },
+        'priority': 75,
+        'context': {
+            'extra_nfsroot_args': ' init=/init rootwait minio_results={}'.format(args.job_artifacts_base)
+        },
+        'timeouts': {
+            'job': {
+                'minutes': args.job_timeout
+            }
+        },
+    }
+
+    if args.lava_tags:
+        values['tags'] = args.lava_tags.split(',')
+
+    # URLs of our kernel and rootfs to boot from, both generated by the base
+    # container build
+    deploy = {
+        'timeout': { 'minutes': 10 },
+        'to': 'tftp',
+        'os': 'oe',
+        'kernel': {
+            'url': '{}/{}'.format(args.base_system_url_prefix, args.kernel_image_name),
+        },
+        'nfsrootfs': {
+            'url': '{}/lava-rootfs.tgz'.format(args.base_system_url_prefix),
+            'compression': 'gz',
+        }
+    }
+    if args.kernel_image_type:
+        deploy['kernel']['type'] = args.kernel_image_type
+    if args.dtb:
+        deploy['dtb'] = {
+            'url': '{}/{}.dtb'.format(args.base_system_url_prefix, args.dtb)
+        }
+
+    # always boot over NFS
+    boot = {
+        'timeout': { 'minutes': 25 },
+        'method': args.boot_method,
+        'commands': 'nfs',
+        'prompts': ['lava-shell:'],
+    }
+
+    # skeleton test definition: only declaring each job as a single 'test'
+    # since LAVA's test parsing is not useful to us
+    test = {
+        'timeout': { 'minutes': args.job_timeout },
+        'failure_retry': 1,
+        'definitions': [ {
+            'name': 'test',
+            'from': 'inline',
+            'path': 'inline/test.yaml',
+            'repository': {
+                'metadata': {
+                    'name': 'test',
+                    'description': 'Test plan',
+                    'os': [ 'oe' ],
+                    'scope': [ 'functional' ],
+                    'format': 'Lava-Test Test Definition 1.0',
+                },
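+                # This matches the final "hwci: test: <result>" line that
+                # init-stage2.sh prints at the end of the run.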
+                'parse': {
+                    'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
+                },
+                'run': {
+                },
+            },
+        } ],
+    }
+
+    # job execution script:
+    #   - inline the first-stage init script (init-stage1.sh)
+    #   - fetch and unpack per-pipeline build artifacts from the build job
+    #   - fetch and unpack the per-job environment from lava-submit.sh
+    #   - exec /init-stage2.sh
+    init_lines = []
+
+    with open(args.first_stage_init, 'r') as init_sh:
+        init_lines += [ x.rstrip() for x in init_sh if not x.startswith('#') and x.rstrip() ]
+
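+    # Pass the JWT to the DUT at its expected path; the HIDEME marker lets
+    # hide_sensitive_data() strip this line from --dump-yaml output.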
+    with open(args.jwt_file) as jwt_file:
+        init_lines += [
+            "set +x",
+            f'echo -n "{jwt_file.read()}" > "{args.jwt_file}" # HIDEME',
+            "set -x",
+        ]
+
+    init_lines += [
+        'mkdir -p {}'.format(args.ci_project_dir),
+        'wget -S --progress=dot:giga -O- {} | tar -xz -C {}'.format(args.build_url, args.ci_project_dir),
+        'wget -S --progress=dot:giga -O- {} | tar -xz -C /'.format(args.job_rootfs_overlay_url),
+        f'echo "export CI_JOB_JWT_FILE={args.jwt_file}" >> /set-job-env-vars.sh',
+        'exec /init-stage2.sh',
+    ]
+    test['definitions'][0]['repository']['run']['steps'] = init_lines
+
+    values['actions'] = [
+        { 'deploy': deploy },
+        { 'boot': boot },
+        { 'test': test },
+    ]
+
+    return yaml.dump(values, width=10000000)
+
+
+def setup_lava_proxy():
+    config = lavacli.load_config("default")
+    uri, usr, tok = (config.get(key) for key in ("uri", "username", "token"))
+    uri_obj = urllib.parse.urlparse(uri)
+    uri_str = "{}://{}:{}@{}{}".format(uri_obj.scheme, usr, tok, uri_obj.netloc, uri_obj.path)
+    transport = lavacli.RequestsTransport(
+        uri_obj.scheme,
+        config.get("proxy"),
+        config.get("timeout", 120.0),
+        config.get("verify_ssl_cert", True),
+    )
+    proxy = xmlrpc.client.ServerProxy(
+        uri_str, allow_none=True, transport=transport)
+
+    print_log("Proxy for {} created.".format(config['uri']))
+
+    return proxy
+
+
+def _call_proxy(fn, *args):
+    retries = 60
+    for n in range(1, retries + 1):
+        try:
+            return fn(*args)
+        except xmlrpc.client.ProtocolError as err:
+            if n == retries:
+                traceback.print_exc()
+                fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
+            else:
+                time.sleep(15)
+        except xmlrpc.client.Fault as err:
+            traceback.print_exc()
+            fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
+
+
+def get_job_results(proxy, job_id, test_suite, test_case):
+    # Look for infrastructure errors and retry if we see them.
+    results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
+    results = yaml.load(results_yaml, Loader=loader(False))
+    for res in results:
+        metadata = res["metadata"]
+        if "result" not in metadata or metadata["result"] != "fail":
+            continue
+        if 'error_type' in metadata and metadata['error_type'] == "Infrastructure":
+            print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
+            return False
+        if 'case' in metadata and metadata['case'] == "validate":
+            print_log("LAVA job {} failed validation (possible download error). Retry.".format(job_id))
+            return False
+
+    results_yaml = _call_proxy(proxy.results.get_testcase_results_yaml, job_id, test_suite, test_case)
+    results = yaml.load(results_yaml, Loader=loader(False))
+    if not results:
+        fatal_err("LAVA: no result for test_suite '{}', test_case '{}'".format(test_suite, test_case))
+
+    print_log("LAVA: result for test_suite '{}', test_case '{}': {}".format(test_suite, test_case, results[0]['result']))
+    if results[0]['result'] != 'pass':
+        fatal_err("FAIL")
+
+    return True
+
+def wait_until_job_is_started(proxy, job_id):
+    print_log(f"Waiting for job {job_id} to start.")
+    current_state = "Submitted"
+    waiting_states = ["Submitted", "Scheduling", "Scheduled"]
+    while current_state in waiting_states:
+        job_state = _call_proxy(proxy.scheduler.job_state, job_id)
+        current_state = job_state["job_state"]
+
+        time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
+    print_log(f"Job {job_id} started.")
+
+def follow_job_execution(proxy, job_id):
+    line_count = 0
+    finished = False
+    last_time_logs = datetime.now()
+    while not finished:
+        (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
+        if logs := yaml.load(str(data), Loader=loader(False)):
+            # Reset the timeout
+            last_time_logs = datetime.now()
+            for line in logs:
+                print("{} {}".format(line["dt"], line["msg"]))
+
+            line_count += len(logs)
+
+        else:
+            time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
+            if datetime.now() - last_time_logs > time_limit:
+                print_log("LAVA job {} is not advancing (machine hung?). Retry.".format(job_id))
+                return False
+
+            # `proxy.scheduler.jobs.logs` does not block, even when there is
+            # no new log to fetch, so sleep between calls to avoid DoSing the
+            # LAVA dispatcher.
+            time.sleep(LOG_POLLING_TIME_SEC)
+
+    return True
+
+def show_job_data(proxy, job_id):
+    show = _call_proxy(proxy.scheduler.jobs.show, job_id)
+    for field, value in show.items():
+        print("{}\t: {}".format(field, value))
+
+
+def validate_job(proxy, job_file):
+    try:
+        return _call_proxy(proxy.scheduler.jobs.validate, job_file, True)
+    except:
+        return False
+
+def submit_job(proxy, job_file):
+    return _call_proxy(proxy.scheduler.jobs.submit, job_file)
+
+
+def retriable_follow_job(proxy, yaml_file):
+    retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
+
+    while retry_count >= 0:
+        job_id = submit_job(proxy, yaml_file)
+
+        print_log("LAVA job id: {}".format(job_id))
+
+        wait_until_job_is_started(proxy, job_id)
+
+        if not follow_job_execution(proxy, job_id):
+            print_log(f"Job {job_id} has timed out. Cancelling it.")
+            # Cancel the job as it is considered unreachable by GitLab CI.
+            proxy.scheduler.jobs.cancel(job_id)
+
+            retry_count -= 1
+            continue
+
+        show_job_data(proxy, job_id)
+
+        if get_job_results(proxy, job_id, "0_test", "test"):
+            break
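+    # Note: this "else" belongs to the while loop; it runs only when the
+    # retries are exhausted without reaching the "break" above.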
+    else:
+        # The script attempted all the retries. The job seemed to fail.
+        return False
+
+    return True
+
+
+def main(args):
+    proxy = setup_lava_proxy()
+
+    yaml_file = generate_lava_yaml(args)
+
+    if args.dump_yaml:
+        print(hide_sensitive_data(yaml_file))
+
+    if args.validate_only:
+        ret = validate_job(proxy, yaml_file)
+        if not ret:
+            fatal_err("Error in LAVA job definition")
+        print("LAVA job definition validated successfully")
+        return
+
+    if not retriable_follow_job(proxy, yaml_file):
+        fatal_err(
+            "Job failed after it exceeded the number of "
+            f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
+        )
+
+
+def create_parser():
+    parser = argparse.ArgumentParser("LAVA job submitter")
+
+    parser.add_argument("--pipeline-info")
+    parser.add_argument("--base-system-url-prefix")
+    parser.add_argument("--build-url")
+    parser.add_argument("--job-rootfs-overlay-url")
+    parser.add_argument("--job-artifacts-base")
+    parser.add_argument("--job-timeout", type=int)
+    parser.add_argument("--first-stage-init")
+    parser.add_argument("--ci-project-dir")
+    parser.add_argument("--device-type")
+    parser.add_argument("--dtb", nargs='?', default="")
+    parser.add_argument("--kernel-image-name")
+    parser.add_argument("--kernel-image-type", nargs='?', default="")
+    parser.add_argument("--boot-method")
+    parser.add_argument("--lava-tags", nargs='?', default="")
+    parser.add_argument("--jwt-file", type=pathlib.Path)
+    parser.add_argument("--validate-only", action='store_true')
+    parser.add_argument("--dump-yaml", action='store_true')
+    parser.add_argument("--visibility-group")
+
+    return parser
+
+if __name__ == "__main__":
+    # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
+    # GitLab runner -> GitLab primary -> user, safe to say we don't need any
+    # more buffering
+    sys.stdout.reconfigure(line_buffering=True)
+    sys.stderr.reconfigure(line_buffering=True)
+
+    parser = create_parser()
+
+    parser.set_defaults(func=main)
+    args = parser.parse_args()
+    args.func(args)
--
2.31.1