[PATCH i-g-t] [RFC] Introduce SR-IOV VM-level testing tool
Adam Miszczak
adam.miszczak@linux.intel.com
Thu May 23 07:51:56 UTC 2024
VM Test Bench (VMTB) is a tool for testing virtualization (SR-IOV) supported by the xe/i915 driver.
It allows enabling and provisioning VFs (Virtual Functions) and facilitates manipulation of VMs (Virtual Machines) running virtual GPUs.
This includes starting and accessing KVM/QEMU VMs, running workloads or shell commands (Guest/Host), handling power states, saving and restoring VF state, etc.
Currently the following SR-IOV VM test scenarios are covered:
- basic VF/VM setup with IGT workload submission
- VF provisioning with various vGPU profiles
- VF save/restore (VM cold migration)
- VF scheduling
- VM power states
- VF FLR
- VM crash (guest kernel panic)
- GuC FW versioning
There's still refactoring ongoing for a few tests, but any feedback would be greatly appreciated.
Signed-off-by: Adam Miszczak <adam.miszczak@linux.intel.com>
---
tools/vmtb/LICENSE.txt | 20 +
tools/vmtb/MANIFEST.in | 3 +
tools/vmtb/README.md | 80 ++
tools/vmtb/bench/__init__.py | 46 +
tools/vmtb/bench/exceptions.py | 38 +
tools/vmtb/bench/executors/__init__.py | 0
.../bench/executors/executor_interface.py | 24 +
tools/vmtb/bench/executors/gem_wsim.py | 71 ++
tools/vmtb/bench/executors/igt.py | 127 +++
tools/vmtb/bench/executors/shell.py | 31 +
tools/vmtb/bench/helpers/__init__.py | 0
tools/vmtb/bench/helpers/helpers.py | 248 +++++
tools/vmtb/bench/machines/__init__.py | 0
tools/vmtb/bench/machines/host.py | 820 +++++++++++++++
.../vmtb/bench/machines/machine_interface.py | 70 ++
tools/vmtb/bench/machines/pci.py | 99 ++
tools/vmtb/bench/machines/vgpu_profile.py | 197 ++++
tools/vmtb/bench/machines/virtual/__init__.py | 0
.../machines/virtual/backends/__init__.py | 0
.../virtual/backends/backend_interface.py | 42 +
.../machines/virtual/backends/guestagent.py | 101 ++
.../machines/virtual/backends/qmp_monitor.py | 163 +++
tools/vmtb/bench/machines/virtual/vm.py | 595 +++++++++++
tools/vmtb/dev-requirements.txt | 14 +
tools/vmtb/pyproject.toml | 26 +
tools/vmtb/requirements.txt | 2 +
tools/vmtb/tests/__init__.py | 1 +
tools/vmtb/tests/conftest.py | 65 ++
tools/vmtb/tests/pytest.ini | 6 +
tools/vmtb/tests/test_executors.py | 109 ++
tools/vmtb/tests/test_igt_executors.py | 24 +
tools/vmtb/tests/test_timer.py | 23 +
tools/vmtb/tests/test_vm.py | 89 ++
tools/vmtb/vmm_flows/__init__.py | 0
tools/vmtb/vmm_flows/conftest.py | 296 ++++++
.../vmm_flows/resources/guc/guc_versions.txt | 4 +
.../resources/vgpu_profile/ADL_int.csv | 14 +
.../resources/vgpu_profile/ADL_vfs.csv | 14 +
.../resources/vgpu_profile/ATSM150_int.csv | 14 +
.../resources/vgpu_profile/ATSM150_vfs.csv | 14 +
.../resources/vgpu_profile/ATSM75_int.csv | 9 +
.../resources/vgpu_profile/ATSM75_vfs.csv | 9 +
.../resources/vgpu_profile/PVC2_int.csv | 8 +
.../resources/vgpu_profile/PVC2_vfs.csv | 8 +
tools/vmtb/vmm_flows/test_basic.py | 175 ++++
tools/vmtb/vmm_flows/test_flr_vm.py | 162 +++
tools/vmtb/vmm_flows/test_guc_versioning.py | 157 +++
tools/vmtb/vmm_flows/test_migration.py | 955 ++++++++++++++++++
tools/vmtb/vmm_flows/test_provisioning.py | 555 ++++++++++
tools/vmtb/vmm_flows/test_scheduling.py | 123 +++
tools/vmtb/vmm_flows/test_vm_panic.py | 84 ++
.../vmtb/vmm_flows/test_vm_states_control.py | 140 +++
52 files changed, 5875 insertions(+)
create mode 100644 tools/vmtb/LICENSE.txt
create mode 100644 tools/vmtb/MANIFEST.in
create mode 100644 tools/vmtb/README.md
create mode 100644 tools/vmtb/bench/__init__.py
create mode 100644 tools/vmtb/bench/exceptions.py
create mode 100644 tools/vmtb/bench/executors/__init__.py
create mode 100644 tools/vmtb/bench/executors/executor_interface.py
create mode 100644 tools/vmtb/bench/executors/gem_wsim.py
create mode 100644 tools/vmtb/bench/executors/igt.py
create mode 100644 tools/vmtb/bench/executors/shell.py
create mode 100644 tools/vmtb/bench/helpers/__init__.py
create mode 100644 tools/vmtb/bench/helpers/helpers.py
create mode 100644 tools/vmtb/bench/machines/__init__.py
create mode 100644 tools/vmtb/bench/machines/host.py
create mode 100644 tools/vmtb/bench/machines/machine_interface.py
create mode 100644 tools/vmtb/bench/machines/pci.py
create mode 100644 tools/vmtb/bench/machines/vgpu_profile.py
create mode 100644 tools/vmtb/bench/machines/virtual/__init__.py
create mode 100644 tools/vmtb/bench/machines/virtual/backends/__init__.py
create mode 100644 tools/vmtb/bench/machines/virtual/backends/backend_interface.py
create mode 100644 tools/vmtb/bench/machines/virtual/backends/guestagent.py
create mode 100644 tools/vmtb/bench/machines/virtual/backends/qmp_monitor.py
create mode 100644 tools/vmtb/bench/machines/virtual/vm.py
create mode 100644 tools/vmtb/dev-requirements.txt
create mode 100644 tools/vmtb/pyproject.toml
create mode 100644 tools/vmtb/requirements.txt
create mode 100644 tools/vmtb/tests/__init__.py
create mode 100644 tools/vmtb/tests/conftest.py
create mode 100644 tools/vmtb/tests/pytest.ini
create mode 100644 tools/vmtb/tests/test_executors.py
create mode 100644 tools/vmtb/tests/test_igt_executors.py
create mode 100644 tools/vmtb/tests/test_timer.py
create mode 100644 tools/vmtb/tests/test_vm.py
create mode 100644 tools/vmtb/vmm_flows/__init__.py
create mode 100644 tools/vmtb/vmm_flows/conftest.py
create mode 100644 tools/vmtb/vmm_flows/resources/guc/guc_versions.txt
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_int.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_vfs.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_int.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_vfs.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_int.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_vfs.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_int.csv
create mode 100755 tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_vfs.csv
create mode 100644 tools/vmtb/vmm_flows/test_basic.py
create mode 100644 tools/vmtb/vmm_flows/test_flr_vm.py
create mode 100644 tools/vmtb/vmm_flows/test_guc_versioning.py
create mode 100644 tools/vmtb/vmm_flows/test_migration.py
create mode 100644 tools/vmtb/vmm_flows/test_provisioning.py
create mode 100644 tools/vmtb/vmm_flows/test_scheduling.py
create mode 100644 tools/vmtb/vmm_flows/test_vm_panic.py
create mode 100644 tools/vmtb/vmm_flows/test_vm_states_control.py
diff --git a/tools/vmtb/LICENSE.txt b/tools/vmtb/LICENSE.txt
new file mode 100644
index 000000000..a1c498458
--- /dev/null
+++ b/tools/vmtb/LICENSE.txt
@@ -0,0 +1,20 @@
+Copyright © 2024 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
diff --git a/tools/vmtb/MANIFEST.in b/tools/vmtb/MANIFEST.in
new file mode 100644
index 000000000..a51ce38c2
--- /dev/null
+++ b/tools/vmtb/MANIFEST.in
@@ -0,0 +1,3 @@
+include tests/pytest.ini
+include vmm_flows/resources/guc/*
+include vmm_flows/resources/vgpu_profile/*
diff --git a/tools/vmtb/README.md b/tools/vmtb/README.md
new file mode 100644
index 000000000..9a353c673
--- /dev/null
+++ b/tools/vmtb/README.md
@@ -0,0 +1,80 @@
+VM Test Bench
+=============
+
+Description
+-----------
+VM Test Bench (VMTB) is a tool for testing virtualization (SR-IOV) supported by the xe/i915 driver.
+It allows enabling and provisioning VFs (Virtual Functions) and facilitates manipulation of VMs (Virtual Machines) running virtual GPUs.
+This includes starting and accessing KVM/QEMU VMs, running workloads or shell commands (Guest/Host), handling power states, saving and restoring VF state, etc.
+
+Requirements
+------------
+VMTB is implemented in Python and uses the pytest testing framework.
+
+Host OS is expected to provide:
+- xe/i915 PF driver with SR-IOV support
+- VFIO driver (VF save/restore requires a vendor-specific driver variant)
+- QEMU (VF save/restore requires QEMU 8.0+)
+- IGT binaries
+- Python 3.8+ with pytest installed
+- VM Test Bench tool deployed
+
+Guest OS is expected to contain:
+- xe/i915 VF driver
+- QEMU Guest-Agent service for operating on Guest OS
+- IGT binaries to execute workloads on the VM
+
+The usual VMTB testing environment is based on Ubuntu 22.04 installed on both Host and Guest, but execution on other distros should also be possible.
+
+Building
+--------
+
+The VMTB source distribution package can be built with:
+
+    make build
+
+or:
+
+    python -m build
+
+Both run the Python `build` frontend in an isolated virtual environment (`venv`).
+
+The output tarball is created in the `dist/` subdirectory and should be copied and extracted on the host device under test.
+
+Running tests
+-------------
+Tests implemented by VM Test Bench are called VMM Flows and are located in the `vmm_flows/` directory.
+Test files are prefixed with `test_` and encapsulate related validation scenarios.
+Each test file can contain multiple test classes (`TestXYZ`) or functions (`test_xyz`) that can be executed independently.
+
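+For illustration, a minimal VMM Flows test function could look as follows (a sketch only:
+the `setup_vms` fixture name is hypothetical; actual fixtures live in `vmm_flows/conftest.py`):
+
+    from bench.executors.igt import IgtType
+    from bench.helpers.helpers import igt_run_check
+
+    def test_example(setup_vms):
+        vm = setup_vms[0]  # first booted VM provided by the (hypothetical) fixture
+        assert igt_run_check(vm, IgtType.EXEC_STORE)
+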
+Run the VMM Flows tests in the following way (as root):
+
+    $ pytest-3 -v ./vmtb-1.0.0/vmm_flows/<test_file_name>.py::<test_class_or_function_name> --vm-image=/home/gta/<guest_os.img>
+
+For example, the simplest 1xVF/VM test scenario can be executed as:
+
+    $ sudo pytest-3 -v ./vmtb-1.0.0/vmm_flows/test_basic.py::TestVmSetup::test_vm_boot[A1-1VM] --vm-image=/home/gta/guest_os.img
+
+(if the `pytest-3` command cannot be found, try just `pytest`)
+
+The test class/function name can be omitted to execute all tests in a file.
+The file name can also be omitted, in which case all tests in the `vmm_flows` directory will be executed.
+
+The test log (including VM dmesg) is available in the `logfile.log` output file.
+Test results are presented as standard pytest output on the terminal.
+The VM (Guest OS) can be accessed manually over VNC at [host_IP]:5900 (the port is incremented for consecutive VMs).
+
+Structure
+---------
+VMTB is divided into the following components:
+
+#### `bench/`
+Contains the 'core' part of the tool, including the Host and VirtualMachine abstractions, means to execute workloads (or other tasks), various helper functions, etc.
+VMTB uses QMP (QEMU Machine Protocol) to communicate with and operate VMs, and QGA (QEMU Guest Agent) to interact with the Guest OS.
+
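+For example, a Guest shell command can be executed and checked as follows (a sketch; `vm`
+stands for a booted `VirtualMachine` instance):
+
+    from bench.helpers.helpers import cmd_run_check
+
+    assert cmd_run_check(vm, 'uname -r')  # runs in the Guest OS via the QEMU Guest Agent
+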
+#### `vmm_flows/`
+Contains the actual functional VM-level tests (`test_*.py`) as well as setup and tear-down fixtures (`conftest.py`).
+New test files/scenarios shall be placed in this location.
+
+#### `tests/`
+Contains (near) unit tests for the tool/bench itself.
diff --git a/tools/vmtb/bench/__init__.py b/tools/vmtb/bench/__init__.py
new file mode 100644
index 000000000..ba55a7a02
--- /dev/null
+++ b/tools/vmtb/bench/__init__.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import logging.config
+
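+# Root logger at DEBUG: detailed records go to the rotating 'logfile.log',
+# while the console shows only WARNING and above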
+LOG_CONFIG = {
+ "version": 1,
+ "formatters": {
+ "detailed": {
+ "format": "%(asctime)s - %(name)s - %(levelname)s — %(funcName)s:%(lineno)d — %(message)s"
+ },
+ "simple": {"format": "%(levelname)s - %(message)s"},
+ },
+ "handlers": {
+ "console": {
+ "class": "logging.StreamHandler",
+ "formatter": "detailed",
+ "level": "WARNING",
+ "stream": "ext://sys.stdout",
+ },
+ "file": {
+ "backupCount": 5,
+ "class": "logging.handlers.RotatingFileHandler",
+ "filename": "logfile.log",
+ "formatter": "detailed",
+ "maxBytes": 5242880,
+ },
+ },
+ "root": {
+ "handlers": ["console", "file"],
+ "level": "DEBUG"
+ }
+}
+
+logging.config.dictConfig(LOG_CONFIG)
+
+logger = logging.getLogger(__name__)
+
+logger.info('############################################')
+logger.info('# Welcome to VM Test Bench #')
+logger.info('# Logging configuration completed! #')
+logger.info('# Ready to run some tests #')
+logger.info('############################################')
diff --git a/tools/vmtb/bench/exceptions.py b/tools/vmtb/bench/exceptions.py
new file mode 100644
index 000000000..fe552ca11
--- /dev/null
+++ b/tools/vmtb/bench/exceptions.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+class BenchError(Exception):
+ pass
+
+
+# Host errors:
+class HostError(BenchError):
+ pass
+
+
+# Guest errors:
+class GuestError(BenchError):
+ pass
+
+
+class GuestAgentError(GuestError):
+ pass
+
+
+class AlarmTimeoutError(GuestError):
+ pass
+
+
+# Generic errors:
+class GemWsimError(BenchError):
+ pass
+
+
+class VgpuProfileError(BenchError):
+ pass
+
+
+class NotAvailableError(BenchError):
+ pass
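+
+
+# Example usage (sketch) - callers typically catch the common base class:
+#
+# try:
+#     host.execute_wait(pid)
+# except BenchError as err:
+#     logger.error('Bench operation failed: %s', err)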
diff --git a/tools/vmtb/bench/executors/__init__.py b/tools/vmtb/bench/executors/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/vmtb/bench/executors/executor_interface.py b/tools/vmtb/bench/executors/executor_interface.py
new file mode 100644
index 000000000..936e2c721
--- /dev/null
+++ b/tools/vmtb/bench/executors/executor_interface.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import abc
+import signal
+
+from bench.machines.machine_interface import ProcessResult
+
+
+class ExecutorInterface(metaclass=abc.ABCMeta):
+
+ @abc.abstractmethod
+ def status(self) -> ProcessResult:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def wait(self) -> ProcessResult:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def sendsig(self, sig: signal.Signals) -> None:
+ raise NotImplementedError
diff --git a/tools/vmtb/bench/executors/gem_wsim.py b/tools/vmtb/bench/executors/gem_wsim.py
new file mode 100644
index 000000000..15c18868a
--- /dev/null
+++ b/tools/vmtb/bench/executors/gem_wsim.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import re
+import typing
+
+from bench import exceptions
+from bench.executors.shell import ShellExecutor
+from bench.machines.machine_interface import MachineInterface, DEFAULT_TIMEOUT
+
+logger = logging.getLogger(__name__)
+
+class GemWsimResult(typing.NamedTuple):
+ elapsed_sec: float
+ workloads_per_sec: float
+
+# Basic workloads
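+# gem_wsim steps are comma-separated 'ctx.engine.duration_us.dependency.wait' fields
+# (see IGT benchmarks/wsim/README); an 'X.<ctx>.0' step disables preemption for the
+# given context, as used by the non-preemptable variant below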
+ONE_CYCLE_DURATION_MS = 10
+PREEMPT_10MS_WORKLOAD = (f'1.DEFAULT.{int(ONE_CYCLE_DURATION_MS * 1000 / 2)}.0.0'
+ f',2.DEFAULT.{int(ONE_CYCLE_DURATION_MS * 1000 / 2)}.-1.1')
+NON_PREEMPT_10MS_WORKLOAD = f'X.1.0,X.2.0,{PREEMPT_10MS_WORKLOAD}'
+
+class GemWsim(ShellExecutor):
+ def __init__(self, machine: MachineInterface, num_clients: int = 1, num_repeats: int = 1,
+ workload: str = PREEMPT_10MS_WORKLOAD, timeout: int = DEFAULT_TIMEOUT) -> None:
+ super().__init__(
+ machine,
+ f'/usr/local/libexec/igt-gpu-tools/benchmarks/gem_wsim -w {workload} -c {num_clients} -r {num_repeats}',
+ timeout)
+ self.machine_id = str(machine)
+
+ def __str__(self) -> str:
+ return f'gem_wsim({self.machine_id}:{self.pid})'
+
+ def is_running(self) -> bool:
+ return not self.status().exited
+
+ def wait_results(self) -> GemWsimResult:
+ proc_result = self.wait()
+ if proc_result.exit_code == 0:
+ logger.info('%s: %s', self, proc_result.stdout)
+ # Try to parse output, e.g.: 19.449s elapsed (102.836 workloads/s)
+ pattern = r'(?P<elapsed>\d+(\.\d*)?|\.\d+)s elapsed \((?P<wps>\d+(\.\d*)?|\.\d+) workloads/s\)'
+ match = re.search(pattern, proc_result.stdout, re.MULTILINE)
+ if match:
+ return GemWsimResult(float(match.group('elapsed')), float(match.group('wps')))
+ raise exceptions.GemWsimError(f'{self}: exit_code: {proc_result.exit_code}'
+ f' stdout: {proc_result.stdout} stderr: {proc_result.stderr}')
+
+
+def gem_wsim_parallel_exec_and_check(vms: typing.List[MachineInterface], workload: str, iterations: int,
+ expected: typing.Optional[GemWsimResult] = None) -> GemWsimResult:
+ # launch on each VM in parallel
+ wsim_procs = [GemWsim(vm, 1, iterations, workload) for vm in vms]
+ for i, wsim in enumerate(wsim_procs):
+ assert wsim.is_running(), f'GemWsim failed to start on VM{i}'
+
+ results = [wsim.wait_results() for wsim in wsim_procs]
+ if expected is not None:
+ assert results[0].elapsed_sec > expected.elapsed_sec * 0.9
+ assert results[0].workloads_per_sec > expected.workloads_per_sec * 0.9
+ for r in results[1:]:
+ # check wps ratio ~1.0 with 10% tolerance
+ assert 0.9 < r.workloads_per_sec / results[0].workloads_per_sec < 1.1
+ # check elapsed ratio ~1.0 with 10% tolerance
+ assert 0.9 < r.elapsed_sec / results[0].elapsed_sec < 1.1
+ # return the first result; all others are asserted to be ~the same
+ return results[0]
diff --git a/tools/vmtb/bench/executors/igt.py b/tools/vmtb/bench/executors/igt.py
new file mode 100644
index 000000000..1ded2e6bd
--- /dev/null
+++ b/tools/vmtb/bench/executors/igt.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import json
+import logging
+import posixpath
+import signal
+import typing
+import enum
+
+from bench.executors.executor_interface import ExecutorInterface
+from bench.machines.machine_interface import MachineInterface, ProcessResult, DriverModule, DEFAULT_TIMEOUT
+from bench.executors.shell import ShellExecutor
+
+logger = logging.getLogger(__name__)
+
+
+class IgtConfiguration(typing.NamedTuple):
+ test_dir: str = '/usr/local/libexec/igt-gpu-tools/'
+ tool_dir: str = '/usr/local/bin/'
+ lib_dir: str = '/usr/local/lib/x86_64-linux-gnu'
+ result_dir: str = '/usr/local/results'
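+ # igt_runner flags: piglit-style dmesg filtering, abort on kernel taint,
+ # overwrite of any previous results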
+ options: str = '--piglit-style-dmesg --dmesg-warn-level=4 --abort-on-monitored-error=taint --overwrite'
+
+
+class IgtType(enum.Enum):
+ EXEC_BASIC = 1
+ EXEC_STORE = 2
+ SPIN_BATCH = 3
+
+
+# Mappings of driver specific (i915/xe) IGT instances:
+# {IGT type: (i915 IGT name, xe IGT name)}
+igt_tests: typing.Dict[IgtType, typing.Tuple[str, str]] = {
+ IgtType.EXEC_BASIC: ('igt@gem_exec_basic@basic', 'igt@xe_exec_basic@once-basic'),
+ IgtType.EXEC_STORE: ('igt@gem_exec_store@dword', 'igt@xe_exec_store@basic-store'),
+ IgtType.SPIN_BATCH: ('igt@gem_spin_batch@legacy', 'igt@xe_spin_batch@spin-basic')
+ }
+
+
+class IgtExecutor(ExecutorInterface):
+ def __init__(self, target: MachineInterface,
+ test: typing.Union[str, IgtType],
+ timeout: int = DEFAULT_TIMEOUT,
+ igt_config: IgtConfiguration = IgtConfiguration()) -> None:
+ self.igt_config = igt_config
+ # TODO ld_library_path not used now, need a way to pass this to guest
+ #ld_library_path = f'LD_LIBRARY_PATH={igt_config.lib_dir}'
+ runner = posixpath.join(igt_config.tool_dir, 'igt_runner')
+ testlist = '/tmp/igt_executor.testlist'
+ command = f'{runner} {igt_config.options} ' \
+ f'--test-list {testlist} {igt_config.test_dir} {igt_config.result_dir}'
+ self.results: typing.Dict[str, typing.Any] = {}
+ self.target: MachineInterface = target
+ self.igt: str = test if isinstance(test, str) else self.select_igt_variant(target.get_drm_driver(), test)
+ self.target.write_file_content(testlist, self.igt)
+ self.timeout: int = timeout
+
+ logger.info("[%s] Execute IGT test: %s", target, self.igt)
+ self.pid: int = self.target.execute(command)
+
+ # Executor interface implementation
+ def status(self) -> ProcessResult:
+ return self.target.execute_status(self.pid)
+
+ def wait(self) -> ProcessResult:
+ return self.target.execute_wait(self.pid, self.timeout)
+
+ def sendsig(self, sig: signal.Signals) -> None:
+ self.target.execute_signal(self.pid, sig)
+
+ def terminate(self) -> None:
+ self.sendsig(signal.SIGTERM)
+
+ def kill(self) -> None:
+ self.sendsig(signal.SIGKILL)
+
+ # IGT specific methods
+ def get_results_log(self) -> typing.Dict:
+ # Results are cached
+ if self.results:
+ return self.results
+ path = posixpath.join(self.igt_config.result_dir, 'results.json')
+ result = self.target.read_file_content(path)
+ self.results = json.loads(result)
+ return self.results
+
+ def did_pass(self) -> bool:
+ results = self.get_results_log()
+ totals = results.get('totals')
+ if not totals:
+ return False
+ aggregate = totals.get('root')
+ if not aggregate:
+ return False
+
+ pass_case = 0
+ fail_case = 0
+ for key in aggregate:
+ if key in ['pass', 'warn', 'dmesg-warn']:
+ pass_case = pass_case + aggregate[key]
+ continue
+ fail_case = fail_case + aggregate[key]
+
+ logger.debug('Full IGT test results:\n%s', json.dumps(results, indent=4))
+
+ if fail_case > 0:
+ logger.error('Test failed!')
+ return False
+
+ return True
+
+ def select_igt_variant(self, driver: DriverModule, igt_type: IgtType) -> str:
+ # Select IGT variant dedicated for a given drm driver: xe or i915
+ igt = igt_tests[igt_type]
+ return igt[1] if driver is DriverModule.XE else igt[0]
+
+
+def igt_list_subtests(target: MachineInterface, test_name: str,
+ igt_config: IgtConfiguration = IgtConfiguration()) -> typing.List[str]:
+ command = f'{igt_config.test_dir}{test_name} --list-subtests'
+ proc_result = ShellExecutor(target, command).wait()
+ if proc_result.exit_code == 0:
+ return proc_result.stdout.split("\n")
+ return []
diff --git a/tools/vmtb/bench/executors/shell.py b/tools/vmtb/bench/executors/shell.py
new file mode 100644
index 000000000..f666e0b15
--- /dev/null
+++ b/tools/vmtb/bench/executors/shell.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import signal
+
+from bench.executors.executor_interface import ExecutorInterface
+from bench.machines.machine_interface import MachineInterface, ProcessResult, DEFAULT_TIMEOUT
+
+
+class ShellExecutor(ExecutorInterface):
+ def __init__(self, target: MachineInterface, command: str, timeout: int = DEFAULT_TIMEOUT) -> None:
+ self.target = target
+ self.timeout = timeout
+ self.pid = self.target.execute(command)
+
+ def status(self) -> ProcessResult:
+ return self.target.execute_status(self.pid)
+
+ def wait(self) -> ProcessResult:
+ return self.target.execute_wait(self.pid, self.timeout)
+
+ def sendsig(self, sig: signal.Signals) -> None:
+ self.target.execute_signal(self.pid, sig)
+
+ def terminate(self) -> None:
+ self.sendsig(signal.SIGTERM)
+
+ def kill(self) -> None:
+ self.sendsig(signal.SIGKILL)
diff --git a/tools/vmtb/bench/helpers/__init__.py b/tools/vmtb/bench/helpers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/vmtb/bench/helpers/helpers.py b/tools/vmtb/bench/helpers/helpers.py
new file mode 100644
index 000000000..3d87c0a38
--- /dev/null
+++ b/tools/vmtb/bench/helpers/helpers.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import posixpath
+import subprocess
+import typing
+import re
+import shutil
+from os import listdir
+from os.path import isfile, join
+
+from typing import List
+from bench import exceptions
+from bench.executors.igt import IgtExecutor
+from bench.executors.shell import ShellExecutor
+from bench.machines.machine_interface import MachineInterface
+from bench.machines.virtual.vm import VirtualMachine
+from bench.machines import pci
+from bench.machines.host import SriovHost, DriverModule
+
+logger = logging.getLogger(__name__)
+
+
+def driver_check(machine: MachineInterface, card: int = 0) -> bool:
+ drm_driver = machine.get_drm_driver()
+ if not machine.dir_exists(f'/sys/module/{drm_driver}/drivers/pci:{drm_driver}/'):
+ logger.error('%s module not loaded on card %s', drm_driver, card)
+ return False
+
+ if drm_driver is DriverModule.I915:
+ # 'wedged' debugfs entry is not available for xe (yet?)
+ wedged_debugfs = posixpath.join('/sys/kernel/debug/dri/', str(card), 'i915_wedged')
+ out = machine.read_file_content(wedged_debugfs)
+ logger.debug('Wedge value %s', out)
+ if int(out) == 0:
+ return True
+
+ logger.error('i915 is wedged')
+ return False
+
+ return True
+
+
+def igt_check(igt_test: IgtExecutor) -> bool:
+ ''' Helper/wrapper for wait and check for igt test '''
+ igt_out = igt_test.wait()
+ if igt_out.exit_code == 0 and igt_test.did_pass():
+ return True
+ logger.error('IGT failed with %s', igt_out)
+ return False
+
+
+def igt_run_check(machine: MachineInterface, test: str) -> bool:
+ ''' Helper/wrapper for quick run and check for igt test '''
+ igt_test = IgtExecutor(machine, test)
+ return igt_check(igt_test)
+
+
+def cmd_check(cmd: ShellExecutor) -> bool:
+ ''' Helper/wrapper for wait and check for shell command '''
+ cmd_out = cmd.wait()
+ if cmd_out.exit_code == 0:
+ return True
+ logger.error('%s failed with %s', cmd, cmd_out)
+ return False
+
+
+def cmd_run_check(machine: MachineInterface, cmd: str) -> bool:
+ ''' Helper/wrapper for quick run and check for shell command '''
+ cmd_run = ShellExecutor(machine, cmd)
+ return cmd_check(cmd_run)
+
+
+def modprobe_driver(machine: MachineInterface, parameters: str = '', options: str = '') -> ShellExecutor:
+ """Load driver (modprobe [driver_module]) and return ShellExecutor instance (do not check a result)."""
+ drm_driver = machine.get_drm_driver()
+ modprobe_cmd = ShellExecutor(machine, f'modprobe {drm_driver} {options} {parameters}')
+ return modprobe_cmd
+
+
+def modprobe_driver_check(machine: MachineInterface, cmd: ShellExecutor) -> bool:
+ """Check result of a driver load (modprobe) based on a given ShellExecutor instance."""
+ modprobe_success = cmd_check(cmd)
+ if modprobe_success:
+ return driver_check(machine)
+
+ logger.error('Modprobe failed')
+ return False
+
+
+def modprobe_driver_run_check(machine: MachineInterface, parameters: str = '', options: str = '') -> bool:
+ """Load (modprobe) a driver and check a result (waits until operation ends)."""
+ modprobe_cmd = modprobe_driver(machine, parameters, options)
+ modprobe_success = modprobe_driver_check(machine, modprobe_cmd)
+ if modprobe_success:
+ return driver_check(machine)
+
+ logger.error('Modprobe failed')
+ return False
+
+
+def is_driver_loaded(machine: MachineInterface, driver_name: str) -> bool:
+ if machine.dir_exists(posixpath.join('/sys/bus/pci/drivers/', driver_name)):
+ return True
+
+ return False
+
+
+def load_host_drivers(host: SriovHost) -> None:
+ """Load (modprobe) required host drivers (DRM and VFIO)."""
+ drm_driver = host.get_drm_driver()
+ if not is_driver_loaded(host, drm_driver):
+ logger.info('%s driver is not loaded - probe module', drm_driver)
+ drv_probe_pid = modprobe_driver(host).pid
+ assert host.execute_wait(drv_probe_pid).exit_code == 0
+
+ host.set_autoprobe(0)
+
+ vfio_driver = host.get_vfio_driver()
+ if not is_driver_loaded(host, vfio_driver):
+ logger.info('%s driver is not loaded - probe module', vfio_driver)
+ vfio_probe_pid = host.execute(f'modprobe {vfio_driver}')
+ assert host.execute_wait(vfio_probe_pid).exit_code == 0
+
+
+def get_devices_bound_to_driver(driver_name: str) -> typing.List[str]:
+ ''' Helper to get all devices' BDFs bound to the given driver '''
+ out = subprocess.check_output(['ls', f'/sys/bus/pci/drivers/{driver_name}'], universal_newlines=True)
+ pattern = r'([0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.\d{1})'
+ matches = re.findall(pattern, out, re.MULTILINE)
+
+ return matches
+
+
+def device_unbind(device_bdf: str) -> None:
+ path = posixpath.join('/sys/bus/pci/devices/', f'{device_bdf}/driver/unbind')
+ logger.debug('About to write %s to %s', device_bdf, path)
+
+ try:
+ with open(path, 'w', encoding='utf-8') as file:
+ file.write(device_bdf)
+ except Exception as exc:
+ logger.error('Unable to unbind, Error: %s', exc)
+
+
+def unload_host_drivers(host: SriovHost) -> None:
+ drm_driver = host.get_drm_driver()
+ vfio_driver = host.get_vfio_driver()
+ logger.debug("Cleanup: unload drivers\n")
+ rmmod_pid = host.execute(f'modprobe -rf {vfio_driver}')
+ assert host.execute_wait(rmmod_pid).exit_code == 0
+
+ for device_bdf in get_devices_bound_to_driver(drm_driver):
+ logger.debug("Unbind %s from device %s", drm_driver, device_bdf)
+ device_unbind(device_bdf)
+
+ rmmod_pid = host.execute(f'modprobe -rf {drm_driver}')
+ assert host.execute_wait(rmmod_pid).exit_code == 0
+ logger.debug("Host %s successfully removed", drm_driver)
+
+
+def cold_migrate_vm(vm_source: VirtualMachine, vm_destination: VirtualMachine) -> bool:
+ ''' Helper for VM cold migration using snapshots '''
+ if not vm_source.is_running() or vm_destination.is_running():
+ logger.error('Invalid initial VM state for migration')
+ return False
+
+ try:
+ vm_source.pause()
+ vm_source.save_state()
+ vm_source.quit()
+
+ vm_destination.set_migration_source(vm_source.image)
+ vm_destination.poweron()
+ vm_destination.load_state()
+ vm_destination.resume()
+ except Exception as exc:
+ logger.error('Error during VM migration: %s', exc)
+ return False
+
+ return True
+
+
+def duplicate_vm_image(src_img: str) -> str:
+ ''' Helper to duplicate source VM qcow2 image for destination VM re-use '''
+ dst_img: str = 'dst_' + posixpath.basename(src_img)
+ try:
+ shutil.copyfile(src_img, dst_img)
+ except Exception as exc:
+ raise exceptions.HostError(f'Error during VM image copy: {exc}') from exc
+
+ logger.debug("Duplicated source image (%s) for destination VM usage (%s)", src_img, dst_img)
+
+ return dst_img
+
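+# Typical cold migration flow (sketch; VirtualMachine construction arguments are elided):
+# dst_img = duplicate_vm_image(vm_source.image)
+# vm_destination = VirtualMachine(..., dst_img)  # hypothetical instantiation
+# assert cold_migrate_vm(vm_source, vm_destination)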
+
+class GucVersion:
+ def __init__(self, major: int, minor: int, patch: int):
+ self.major = major
+ self.minor = minor
+ self.patch = patch
+
+ def __str__(self) -> str:
+ return f'{self.major}.{self.minor}.{self.patch}'
+
+ def __repr__(self) -> str:
+ return f'{self.major}.{self.minor}.{self.patch}'
+
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, GucVersion):
+ return NotImplemented
+ return (self.major, self.minor, self.patch) == (other.major, other.minor, other.patch)
+
+
+def list_guc_binaries(host: SriovHost) -> List[GucVersion]:
+ ''' Helper that returns a list of GuC binary versions found for the given device prefix '''
+ if host.gpu_name in (pci.GpuDevice.ATSM150, pci.GpuDevice.ATSM75):
+ device_prefix = 'dg2_guc_'
+ elif host.gpu_name is pci.GpuDevice.PVC:
+ device_prefix = 'pvc_guc_'
+ elif host.gpu_name is pci.GpuDevice.ADLP:
+ device_prefix = 'adlp_guc_'
+ else:
+ raise exceptions.HostError(f'GPU Device unknown: {host.gpu_name}')
+
+ firmware_path = '/usr/lib/firmware/i915/'
+ firmware_dir_contents = [f for f in listdir(firmware_path) if isfile(join(firmware_path, f))]
+ guc_vers_numbers = []
+ guc_binaries_versions = []
+ version_format = r'\d+\.\d+\.\d+'
+
+ for entry in firmware_dir_contents:
+ if entry.startswith(device_prefix):
+ found_version = re.search(version_format, entry)
+ if found_version:
+ guc_vers_numbers.append(found_version.group())
+
+ guc_vers_numbers.sort(key=lambda version: [int(i) for i in version.split('.')])
+
+ for ver in guc_vers_numbers:
+ version_ints = [int(i) for i in ver.split('.')]
+ guc_binaries_versions.append(GucVersion(version_ints[0], version_ints[1], version_ints[2]))
+
+ return guc_binaries_versions
diff --git a/tools/vmtb/bench/machines/__init__.py b/tools/vmtb/bench/machines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/vmtb/bench/machines/host.py b/tools/vmtb/bench/machines/host.py
new file mode 100644
index 000000000..234b2220c
--- /dev/null
+++ b/tools/vmtb/bench/machines/host.py
@@ -0,0 +1,820 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import errno
+import fcntl
+import functools
+import logging
+import os
+import posixpath
+import re
+import shlex
+import signal
+import subprocess
+import typing
+import enum
+
+from pathlib import Path
+
+from bench import exceptions
+from bench.machines.machine_interface import MachineInterface, ProcessResult, SuspendMode, DriverModule, DEFAULT_TIMEOUT
+from bench.machines import pci
+from bench.machines.vgpu_profile import VgpuProfile, VgpuProfileClass, VgpuProfileCsvReader
+
+logger = logging.getLogger(__name__)
+
+HOST_DMESG_FILE = Path("/tmp/vm-test-bench-host_dmesg.log.tmp")
+VGPU_CSV_DIR = Path(Path.cwd(), "vmm_flows/resources/vgpu_profile")
+
+
+class HostDecorators():
+ ''' https://www.kernel.org/doc/Documentation/ABI/testing/dev-kmsg '''
+ @staticmethod
+ def read_messages(fd: int) -> typing.List[str]:
+ buf_size = 4096
+ kmsgs = []
+ while True:
+ try:
+ kmsg = os.read(fd, buf_size)
+ kmsgs.append(kmsg.decode())
+ except OSError as exc:
+ if exc.errno == errno.EAGAIN:
+ break
+
+ if exc.errno == errno.EPIPE:
+ pass
+ else:
+ raise
+ return kmsgs
+
+ @staticmethod
+ def parse_messages(kmsgs: typing.List[str]) -> None:
+ for msg in kmsgs:
+ header, human = msg.split(';', 1)
+ # Unused for now: seq, time, other
+ fac, _, _, _ = header.split(',', 3)
+ level = int(fac) & 0x7
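+ # Syslog severity 0-4 (KERN_EMERG..KERN_WARNING) is treated as an error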
+ if level <= 4:
+ logger.error('Found message: %s with error level %s', human.strip(), level)
+ raise exceptions.HostError(f'Error in dmesg: {human.strip()}')
+
+ logger.debug('Found message: %s with error level %s', human.strip(), level)
+
+ @classmethod
+ def parse_kmsg(cls, func: typing.Callable) -> typing.Callable:
+ @functools.wraps(func)
+ def parse_wrapper(*args: typing.Any, **kwargs: typing.Optional[typing.Any]) -> typing.Any:
+ with open('/dev/kmsg', 'r', encoding='utf-8') as f, \
+ open(HOST_DMESG_FILE, 'a', encoding='utf-8') as dmesg_file:
+
+ fd = f.fileno()
+ os.lseek(fd, 0, os.SEEK_END)
+ flags = fcntl.fcntl(fd, fcntl.F_GETFL)
+ fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
+
+ # Execute actual function
+ result = func(*args, **kwargs)
+
+ kmsgs = cls.read_messages(fd)
+ dmesg_file.writelines(kmsgs)
+ cls.parse_messages(kmsgs)
+
+ return result
+ return parse_wrapper
+
+
+class Host(MachineInterface):
+ def __init__(self) -> None:
+ self.running_procs: typing.Dict[int, subprocess.Popen] = {}
+
+ self.host_bdf, self.host_pci_id = pci.get_pci_info()
+ self.gpu_name = pci.get_gpu_name(self.host_pci_id)
+ self.sysfs_prefix_path = posixpath.join('/sys/bus/pci/devices/', self.host_bdf)
+ self.drm_driver, self.vfio_driver = self.select_driver_module()
+
+ if HOST_DMESG_FILE.exists():
+ HOST_DMESG_FILE.unlink()
+ HOST_DMESG_FILE.touch()
+
+ logger.debug('Found GPU Device: %s - PCI ID: %s - BDF: %s',
+ self.gpu_name, self.host_pci_id, self.host_bdf)
+
+ def __str__(self) -> str:
+ return f'Host_{self.host_bdf}'
+
+ # MachineInterface implementation
+ @HostDecorators.parse_kmsg
+ def execute(self, command: str) -> int:
+ cmd_arr = shlex.split(command)
+ # We don't want to kill the process created here (like 'with' would do) so disable the following linter issue:
+ # R1732: consider-using-with (Consider using 'with' for resource-allocating operations)
+ # pylint: disable=R1732
+ # TODO: but maybe 'subprocess.run' function would fit instead of Popen constructor?
+ process = subprocess.Popen(cmd_arr,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=True)
+
+ self.running_procs[process.pid] = process
+ logger.debug('Running %s on host with pid %s', command, process.pid)
+ return process.pid
+
+ @HostDecorators.parse_kmsg
+ def execute_status(self, pid: int) -> ProcessResult:
+ proc = self.running_procs.get(pid, None)
+ if not proc:
+ raise exceptions.HostError('No such process')
+
+ exit_code: typing.Optional[int] = proc.poll()
+ logger.debug('PID %s -> exit code %s', pid, exit_code)
+ if exit_code is None:
+ return ProcessResult(False, exit_code, '', '')
+
+ out, err = proc.communicate()
+ return ProcessResult(True, exit_code, out, err)
+
+ @HostDecorators.parse_kmsg
+ def execute_wait(self, pid: int, timeout: int = DEFAULT_TIMEOUT) -> ProcessResult:
+ proc = self.running_procs.get(pid, None)
+ if not proc:
+ raise exceptions.HostError(f'No process with pid {pid}')
+
+ out = ''
+ err = ''
+ try:
+ out, err = proc.communicate(timeout)
+ except subprocess.TimeoutExpired as exc:
+ logger.warning('Timeout (%ss) expired for pid %s', exc.timeout, pid)
+ raise
+
+ return ProcessResult(True, proc.poll(), out, err)
+
+ @HostDecorators.parse_kmsg
+ def execute_signal(self, pid: int, sig: signal.Signals) -> None:
+ proc = self.running_procs.get(pid, None)
+ if not proc:
+ raise exceptions.HostError(f'No process with pid {pid}')
+
+ proc.send_signal(sig)
+
+ def read_file_content(self, path: str) -> str:
+ with open(path, encoding='utf-8') as f:
+ content = f.read()
+ return content
+
+ def write_file_content(self, path: str, content: str) -> int:
+ with open(path, 'w', encoding='utf-8') as f:
+ return f.write(content)
+
+ def dir_exists(self, path: str) -> bool:
+ return os.path.exists(path)
+
+ def suspend(self, mode: SuspendMode = SuspendMode.ACPI_S3) -> None:
+ wakeup_delay = 10 # wakeup timer in seconds
+ logger.debug("Host suspend-resume via rtcwake (mode: %s, wakeup delay: %ss)", mode, wakeup_delay)
+
+ suspend_pid = self.execute(f'rtcwake -s {wakeup_delay} -m {mode}')
+ suspend_result: ProcessResult = self.execute_wait(suspend_pid)
+ if suspend_result.exit_code != 0:
+ raise exceptions.HostError(f'Suspend failed. Error: {suspend_result.stderr}')
+
+ def query_supported_drivers(self) -> typing.List[typing.Tuple[DriverModule, str]]:
+ # Check host for supported DRM drivers (i915 / xe) and VFIO
+ # Fall back to the regular vfio-pci in case a vendor/driver-specific variant is not available
+ available_drivers: typing.List[typing.Tuple[DriverModule, str]] = []
+
+ for drm_driver in DriverModule:
+ modinfo_pid = self.execute(f'modinfo -F filename {drm_driver}')
+ modinfo_result: ProcessResult = self.execute_wait(modinfo_pid)
+ if modinfo_result.exit_code == 0:
+ modinfo_pid = self.execute(f'modinfo -F filename {drm_driver}-vfio-pci')
+ modinfo_result = self.execute_wait(modinfo_pid)
+ vfio_driver = f'{drm_driver}-vfio-pci' if modinfo_result.exit_code == 0 else 'vfio-pci'
+
+ available_drivers.append((drm_driver, vfio_driver))
+
+ logger.debug("Host - found DRM/VFIO driver module(s): %s", available_drivers)
+ return available_drivers
+
+ def select_driver_module(self) -> typing.Tuple[DriverModule, str]:
+ # Xe is preferred in case both i915 and xe drivers are supported by the kernel
+ available_drivers = self.query_supported_drivers()
+ for drm, vfio in available_drivers:
+ if drm is DriverModule.XE:
+ return (DriverModule.XE, vfio)
+
+ return available_drivers[0]
+
+ def get_drm_driver(self) -> DriverModule:
+ return self.drm_driver
+
+ def get_vfio_driver(self) -> str:
+ return self.vfio_driver
+
+ def get_card_index(self) -> int:
+ drm_dir = posixpath.join(self.sysfs_prefix_path, "drm")
+
+ for filename in os.listdir(drm_dir):
+ if filename.startswith("card"):
+ index_match = re.search(r'card(?P<card_index>\d+)', filename)
+ if index_match:
+ return int(index_match.group('card_index'))
+
+ raise exceptions.HostError('Could not determine card index')
+
+ def get_debugfs_path(self) -> str:
+ return posixpath.join('/sys/kernel/debug/dri/', str(self.get_card_index()))
+
+class SriovHost(Host):
+ def __init__(self) -> None:
+ super().__init__()
+ # Initialized by query_vgpu_profiles() from vGPU profiles CSV files
+ self.supported_vgpu_profiles: typing.List[VgpuProfile] = []
+ # vGPU profile currently applied
+ self.vgpu_profile_id: str = ''
+ # Device prefix for the vGPU ProfileID and CSV file names
+ self._vgpu_device_prefix: str = ''
+
+ @HostDecorators.parse_kmsg
+ def __write_sysfs(self, name: str, value: str) -> None:
+ path = posixpath.join(self.sysfs_prefix_path, name)
+ logger.debug('About to write %s to %s', value, path)
+ try:
+ with open(path, 'w', encoding='utf-8') as file:
+ file.write(value)
+ except Exception as exc:
+ logger.error('Unable to write %s', path)
+ raise exceptions.HostError(f'Could not write to {path}. Error: {exc}') from exc
+
+ @HostDecorators.parse_kmsg
+ def __read_sysfs(self, name: str) -> str:
+ path = posixpath.join(self.sysfs_prefix_path, name)
+ try:
+ with open(path, 'r', encoding='utf-8') as file:
+ ret = file.read()
+ except Exception as exc:
+ logger.error('Unable to read %s', path)
+ raise exceptions.HostError(f'Could not read from {path}. Error: {exc}') from exc
+
+ logger.debug('Value in %s: %s', name, ret)
+ return ret
+
+ def get_iov_path(self) -> str:
+ # SRIOV provisioning base paths:
+ # i915: /sys/bus/pci/devices/[BDF]/drm/card[card_index]/prelim_iov/
+ # xe: /sys/kernel/debug/dri/[card_index]/
+ if self.drm_driver is DriverModule.I915:
+ iov_path = posixpath.join(self.sysfs_prefix_path, f'drm/card{str(self.get_card_index())}', 'prelim_iov')
+ elif self.drm_driver is DriverModule.XE:
+ # posixpath.join(self.sysfs_prefix_path, 'sriov')
+ iov_path = self.get_debugfs_path()
+ else:
+ raise exceptions.HostError(f'Unsupported host DRM driver: {self.drm_driver}')
+ return iov_path
+
+ def set_autoprobe(self, val: int) -> None:
+ self.__write_sysfs('sriov_drivers_autoprobe', str(val))
+ ret = self.__read_sysfs('sriov_drivers_autoprobe')
+ if int(ret) != val:
+ logger.error('Autoprobe value mismatch: wanted %s, got %s', val, ret)
+ raise exceptions.HostError(f'Autoprobe value mismatch: wanted {val}, got {ret}')
+
+ def get_total_vfs(self) -> int:
+ return int(self.__read_sysfs('sriov_totalvfs'))
+
+ def get_current_vfs(self) -> int:
+ return int(self.__read_sysfs('sriov_numvfs'))
+
+ def get_num_gts(self) -> int:
+ gt_num = 0
+ if self.drm_driver is DriverModule.I915:
+ path = posixpath.join(f'{self.get_iov_path()}/pf/gt')
+ elif self.drm_driver is DriverModule.XE:
+ path = posixpath.join(f'{self.get_debugfs_path()}/gt')
+ else:
+ raise exceptions.HostError(f'Unsupported host DRM driver: {self.drm_driver}')
+ if posixpath.lexists(path):
+ gt_num = 1
+ else:
+ while posixpath.lexists(posixpath.join(f'{path}{gt_num}')):
+ gt_num += 1
+
+ return gt_num
+
+ def has_lmem(self) -> bool:
+ if self.drm_driver is DriverModule.I915:
+ path = posixpath.join(f'{self.sysfs_prefix_path}/drm/card{self.get_card_index()}/lmem_total_bytes')
+ elif self.drm_driver is DriverModule.XE:
+ path = self.helper_create_sysfs_path(0, 0, "", "lmem_quota")
+ else:
+ raise exceptions.HostError(f'Unsupported host DRM driver: {self.drm_driver}')
+
+ return posixpath.lexists(path)
+
+ def create_vf(self, num: int) -> int:
+ self.numvf = num
+ self.clear_vf()
+
+ self.__write_sysfs('sriov_numvfs', str(num))
+ ret = self.__read_sysfs('sriov_numvfs')
+ return int(ret)
+
+ def clear_vf(self) -> int:
+ self.__write_sysfs('sriov_numvfs', '0')
+ ret = self.__read_sysfs('sriov_numvfs')
+ if int(ret) != 0:
+ raise exceptions.HostError('VFs not cleared after 0 write')
+ return int(ret)
+
+ # reset_provisioning - resets the provisioning config for the requested number of VFs.
+ # The function calls the sysfs control interface to clear VF provisioning settings
+ # and restores the auto provisioning mode.
+ # @num_vfs: number of VFs whose provisioning should be cleared
+ def reset_provisioning(self, num_vfs: int) -> None:
+ for gt_num in range(self.get_num_gts()):
+ if self.drm_driver is DriverModule.I915:
+ if self.get_pf_sched_priority(gt_num) != self.SchedulingPriority.LOW:
+ self.set_pf_sched_priority(gt_num, self.SchedulingPriority.LOW)
+ self.set_pf_policy_sched_if_idle(gt_num, 0)
+ self.set_pf_policy_engine_reset(gt_num, 0)
+ self.set_exec_quantum_ms(0, gt_num, 0)
+ self.set_preempt_timeout_us(0, gt_num, 0)
+ if self.drm_driver is DriverModule.I915:
+ self.set_doorbells_quota(0, gt_num, 0)
+ # PF contexts cannot be set from sysfs
+
+ if not self.get_pf_auto_provisioning():
+ for vf_num in range(1, num_vfs + 1):
+ self.set_vf_control(vf_num, self.VfControl.clear)
+
+ self.set_pf_auto_provisioning(True)
+
+ # drop_all_caches - calls the debugfs interface of the drm/i915 GEM driver:
+ # /sys/kernel/debug/dri/[card_index]/i915_gem_drop_caches
+ # to drop or evict all classes of gem buffer objects (bitmask 7Fh).
+ def drop_all_caches(self) -> None:
+ if self.drm_driver is DriverModule.I915:
+ path = posixpath.join(f'{self.get_debugfs_path()}/i915_gem_drop_caches')
+ drop_all_bitmask: int = 0x7F # Set all drop flags
+ self.write_file_content(path, str(drop_all_bitmask))
+
+ def bind(self, bdf: str) -> None:
+ self.__write_sysfs(posixpath.join('driver', 'bind'), bdf)
+
+ def unbind(self, bdf: str) -> None:
+ self.__write_sysfs(posixpath.join('driver', 'unbind'), bdf)
+
+ @HostDecorators.parse_kmsg
+ def get_vf_bdf(self, vf_num: int) -> str:
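+ # Resolve the VF BDF from the PF 'virtfnN' symlink, then bind the VF to the
+ # selected VFIO driver via 'driver_override' and 'drivers_probe'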
+ vf_path = os.readlink(posixpath.join('/sys/bus/pci/devices/', self.host_bdf, f'virtfn{vf_num - 1}'))
+ pass_bdf = os.path.basename(vf_path)
+ override_path = posixpath.join('/sys/bus/pci/devices/', pass_bdf, 'driver_override')
+ with open(override_path, 'w', encoding='utf-8') as file:
+ file.write(self.vfio_driver)
+
+ with open('/sys/bus/pci/drivers_probe', 'w', encoding='utf-8') as file:
+ file.write(pass_bdf)
+
+ logger.debug('VF%s BDF to pass: %s', vf_num, pass_bdf)
+ return pass_bdf
+
+ def get_vfs_bdf(self, *args: int) -> typing.List[str]:
+ vf_list = list(set(args))
+ bdf_list = [self.get_vf_bdf(vf) for vf in vf_list]
+ return bdf_list
+
+ # helper_create_vgpu_cvs_path - creates paths to the vGPU profile definition files
+ # @csv_dir: directory containing the definition CSV files
+ # Returns: tuple with _vfs.csv and _int.csv paths for the detected platform
+ def helper_create_vgpu_cvs_path(self, csv_dir: str) -> typing.Tuple[str, str]:
+ if self.gpu_name == pci.GpuDevice.ATSM150:
+ self._vgpu_device_prefix = 'ATSM150_'
+ elif self.gpu_name == pci.GpuDevice.ATSM75:
+ self._vgpu_device_prefix = 'ATSM75_'
+ elif self.gpu_name == pci.GpuDevice.PVC:
+ self._vgpu_device_prefix = 'PVC2_'
+ elif self.gpu_name == pci.GpuDevice.ADLP:
+ self._vgpu_device_prefix = 'ADL_'
+ else:
+ raise exceptions.HostError(f'Unknown GPU device: {self.gpu_name}')
+
+ csv_vfs_file_path = posixpath.join(csv_dir, self._vgpu_device_prefix + 'vfs.csv')
+ csv_int_file_path = posixpath.join(csv_dir, self._vgpu_device_prefix + 'int.csv')
+
+ if not posixpath.lexists(csv_vfs_file_path) or not posixpath.lexists(csv_int_file_path):
+ raise exceptions.HostError(f'vGPU profiles CSV files not found in {csv_dir}')
+
+ return (csv_vfs_file_path, csv_int_file_path)
+
+ # query_vgpu_profiles - gets all vGPU profiles supported on a device
+ # Returns: list of vGPU profile definitions
+ def query_vgpu_profiles(self) -> typing.List[VgpuProfile]:
+ csv_reader = VgpuProfileCsvReader(*self.helper_create_vgpu_cvs_path(str(VGPU_CSV_DIR)))
+ self.supported_vgpu_profiles = csv_reader.vgpu_profiles
+ return self.supported_vgpu_profiles
+
+ # get_vgpu_profile_by_vgpu_profile_id - gets the vGPU profile with a given Profile ID
+ # @vgpu_profile_id: string defined as 'vGPUProfileInfo ProfileID' in CSVs
+ # Returns: vGPU profile definition
+ def get_vgpu_profile_by_vgpu_profile_id(self, vgpu_profile_id: str) -> VgpuProfile:
+ if not self.supported_vgpu_profiles:
+ self.query_vgpu_profiles()
+
+ for profile in self.supported_vgpu_profiles:
+ if profile.profileId == vgpu_profile_id:
+ return profile
+
+ raise exceptions.HostError(f'vGPU profile {vgpu_profile_id} not found!')
+
+ # get_vgpu_profile_by_id - gets the vGPU profile with a given Profile ID
+ # @profile_id: string defined as 'vGPUProfileInfo ProfileID' in CSVs,
+ # without the platform prefix
+ # Returns: vGPU profile definition
+ def get_vgpu_profile_by_id(self, profile_id: str) -> VgpuProfile:
+ if not self.supported_vgpu_profiles:
+ self.query_vgpu_profiles()
+
+ return self.get_vgpu_profile_by_vgpu_profile_id(self._vgpu_device_prefix + profile_id)
+
+ def get_vgpu_profile_by_class(self, requested_class: VgpuProfileClass, requested_num_vfs: int) -> VgpuProfile:
+ """Find vGPU profile matching requested platform independent class and number of VFs.
+
+ For VgpuProfileClass.AUTO - empty profile config is returned that lets DRM driver auto provisioning.
+ In case exact match cannot be found, try to fit similar profile with up to 2 more VFs, for example:
+ - if requested VDI profile with 3 VFs is not available, return close config XYZ_V4 with 4 VFs.
+ - if requested profile with neither 9 VFs, nor with 10 or 11 VFs is available - throw 'not found' exeception.
+ """
+ logger.debug("Get vGPU profile - %s with %sxVF", requested_class, requested_num_vfs)
+
+ if requested_class is VgpuProfileClass.AUTO:
+ auto_profile: VgpuProfile = VgpuProfile()
+ auto_profile.profileId = f'ANY_A{requested_num_vfs}'
+ return auto_profile
+
+ if not self.supported_vgpu_profiles:
+ self.query_vgpu_profiles()
+
+ for profile in self.supported_vgpu_profiles:
+ current_class, current_num_vfs = profile.get_class_num_vfs()
+
+ if current_class is requested_class:
+ if current_num_vfs == requested_num_vfs:
+ return profile # Exact match
+
+ if requested_num_vfs < current_num_vfs <= requested_num_vfs+2:
+ logger.debug('Unable to find accurate vGPU profile but have similar: %s', profile.profileId)
+ return profile # Approximate match
+
+ raise exceptions.VgpuProfileError(f'vGPU profile {requested_class}{requested_num_vfs} not found!')
+
+ # set_vgpu_profile - sets vGPU profile
+ # @profile: definition of vGPU profile to set
+ def set_vgpu_profile(self, profile: VgpuProfile) -> None:
+ logger.info('Set vGPU profile: %s', profile.profileId)
+ self.vgpu_profile_id = profile.profileId
+ num_vfs = profile.get_num_vfs()
+ num_gts = self.get_num_gts() # Number of tiles (GTs)
+ gt_nums = [0] if num_gts == 1 else [0, 1] # Tile (GT) numbers/indexes
+
+ for gt_num in gt_nums:
+ self.set_pf_policy_sched_if_idle(gt_num, int(profile.scheduleIfIdle))
+ self.set_pf_policy_engine_reset(gt_num, int(profile.resetAfterVfSwitch))
+
+ # XXX: PF contexts are currently assigned by the driver and cannot be reprovisioned from sysfs
+ # self.set_contexts_quota(0, gt_num, profile.pfContexts)
+ self.set_doorbells_quota(0, gt_num, profile.pfDoorbells)
+ self.set_exec_quantum_ms(0, gt_num, profile.pfExecutionQuanta)
+ self.set_preempt_timeout_us(0, gt_num, profile.pfPreemptionTimeout)
+
+ for vf_num in range(1, num_vfs + 1):
+ if num_gts > 1 and num_vfs > 1:
+ # Multi-tile device Mode 2|3 - odd VFs on GT0, even on GT1
+ gt_nums = [0] if vf_num % 2 else [1]
+
+ for gt_num in gt_nums:
+ self.set_lmem_quota(vf_num, gt_num, profile.vfLmem)
+ self.set_contexts_quota(vf_num, gt_num, profile.vfContexts)
+ self.set_doorbells_quota(vf_num, gt_num, profile.vfDoorbells)
+ self.set_ggtt_quota(vf_num, gt_num, profile.vfGgtt)
+ self.set_exec_quantum_ms(vf_num, gt_num, profile.vfExecutionQuanta)
+ self.set_preempt_timeout_us(vf_num, gt_num, profile.vfPreemptionTimeout)
+
+ # helper_create_sysfs_path - create sysfs path to given parameter
+ # @vf_num: VF number (1-based) or 0 for PF
+ # @gt_num: GT instance number
+ # @subdir: subdirectory for attribute or empty string if not exists
+ # @attr: iov parameter name
+ # Returns: iov sysfs path to @attr
+ def helper_create_sysfs_path(self, vf_num: int, gt_num: int, subdir: str, attr: str) -> str:
+ if self.drm_driver is DriverModule.XE:
+ vf_gt_part = f'gt{gt_num}/pf' if vf_num == 0 else f'gt{gt_num}/vf{vf_num}'
+ else:
+ gt_part = f'gt{gt_num}' if posixpath.lexists(
+ posixpath.join(self.get_iov_path(), f'pf/gt{gt_num}')) else 'gt'
+ vf_gt_part = f'pf/{gt_part}' if vf_num == 0 else f'vf{vf_num}/{gt_part}'
+
+ return posixpath.join(self.get_iov_path(), vf_gt_part, subdir, attr)
+
+ # helper_get_debugfs_resources - reads [attribute]_available from debugfs:
+ # /sys/kernel/debug/dri/[card_index]/@gt_num/iov/@attr_available
+ # @gt_num: GT instance number
+ # @attr: iov parameter name
+ # Returns: total and available size for @attr
+ def helper_get_debugfs_resources(self, gt_num: int, attr: str) -> typing.Tuple[int, int]:
+ path = posixpath.join(f'{self.get_debugfs_path()}/gt{gt_num}/iov/{attr}_available')
+ total = available = 0
+
+ out = self.read_file_content(path)
+ for line in out.splitlines():
+ param, value = line.split(':')
+ value = value.lstrip().split('\t')[0]
+
+ if param == 'total':
+ total = int(value)
+ elif param == 'avail':
+ available = int(value)
+
+ return (total, available)
+
+ # SRIOV sysfs: PF auto_provisioning
+ # Sysfs location:
+ # i915: [SRIOV sysfs base path]/pf/auto_provisioning
+ # xe: [SRIOV sysfs base path]/auto_provisioning
+ # Allows controlling the VF auto-provisioning feature.
+ # To re-enable, manual provisioning must be cleared first.
+ def get_pf_auto_provisioning(self) -> bool:
+ # attribute not exposed by Xe (yet?), currently always on
+ if self.drm_driver is DriverModule.XE:
+ return True
+
+ path = self.get_iov_path()
+ if self.drm_driver is DriverModule.I915:
+ path = posixpath.join(path, 'pf')
+
+ path = posixpath.join(path, 'auto_provisioning')
+ ret = self.__read_sysfs(path)
+ return bool(int(ret))
+
+ def set_pf_auto_provisioning(self, val: bool) -> None:
+ # not exposed by Xe (yet?)
+ if self.drm_driver is DriverModule.XE:
+ return
+
+ path = self.get_iov_path()
+ if self.drm_driver is DriverModule.I915:
+ path = posixpath.join(path, 'pf')
+
+ path = posixpath.join(path, 'auto_provisioning')
+ self.__write_sysfs(path, str(int(val)))
+
+ # SRIOV sysfs: PF available resources
+ # Sysfs location: prelim_iov/pf/gtM/available
+ # DEPRECATED functions - *_max_quota and *_free will be removed from i915 sysfs
+ # use debugfs counterparts if needed (get_debugfs_ggtt|lmem|contexts|doorbells)
+ def get_pf_ggtt_max_quota(self, gt_num: int) -> int:
+ if self.drm_driver is DriverModule.XE:
+ raise exceptions.NotAvailableError('PF ggtt_max_quota not available on xe')
+
+ path = self.helper_create_sysfs_path(0, gt_num, "available", "ggtt_max_quota")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_pf_lmem_max_quota(self, gt_num: int) -> int:
+ if self.drm_driver is DriverModule.XE:
+ raise exceptions.NotAvailableError('PF lmem_max_quota not available on xe')
+
+ path = self.helper_create_sysfs_path(0, gt_num, "available", "lmem_max_quota")
+ ret = self.__read_sysfs(path) if self.has_lmem() else 0
+ return int(ret)
+
+ def get_pf_contexts_max_quota(self, gt_num: int) -> int:
+ if self.drm_driver is DriverModule.XE:
+ raise exceptions.NotAvailableError('PF contexts_max_quota not available on xe')
+
+ path = self.helper_create_sysfs_path(0, gt_num, "available", "contexts_max_quota")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_pf_doorbells_max_quota(self, gt_num: int) -> int:
+ if self.drm_driver is DriverModule.XE:
+ raise exceptions.NotAvailableError('PF doorbells_max_quota not available on xe')
+
+ path = self.helper_create_sysfs_path(0, gt_num, "available", "doorbells_max_quota")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ # SRIOV sysfs: PF spare resources
+ # Sysfs location:
+ # i915: [SRIOV sysfs base path]/pf/gtM/xxx_spare
+ # xe: [SRIOV debugfs base path]/pf/gtM/xxx_quota
+ def set_pf_ggtt_spare(self, gt_num: int, val: int) -> None:
+ attr = "ggtt_quota" if self.drm_driver is DriverModule.XE else "ggtt_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ self.__write_sysfs(path, str(val))
+
+ def set_pf_lmem_spare(self, gt_num: int, val: int) -> None:
+ attr = "lmem_quota" if self.drm_driver is DriverModule.XE else "lmem_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ self.__write_sysfs(path, str(val))
+
+ def set_pf_contexts_spare(self, gt_num: int, val: int) -> None:
+ attr = "contexts_quota" if self.drm_driver is DriverModule.XE else "contexts_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ self.__write_sysfs(path, str(val))
+
+ def set_pf_doorbells_spare(self, gt_num: int, val: int) -> None:
+ attr = "doorbells_quota" if self.drm_driver is DriverModule.XE else "doorbells_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ self.__write_sysfs(path, str(val))
+
+ def get_pf_ggtt_spare(self, gt_num: int) -> int:
+ attr = "ggtt_quota" if self.drm_driver is DriverModule.XE else "ggtt_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_pf_lmem_spare(self, gt_num: int) -> int:
+ attr = "lmem_quota" if self.drm_driver is DriverModule.XE else "lmem_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_pf_contexts_spare(self, gt_num: int) -> int:
+ attr = "contexts_quota" if self.drm_driver is DriverModule.XE else "contexts_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_pf_doorbells_spare(self, gt_num: int) -> int:
+ attr = "doorbells_quota" if self.drm_driver is DriverModule.XE else "doorbells_spare"
+ path = self.helper_create_sysfs_path(0, gt_num, "", attr)
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ # SRIOV sysfs: PF policies
+ # Sysfs location: [SRIOV sysfs base path]/pf/gtM/policies
+ def set_pf_policy_engine_reset(self, gt_num: int, val: int) -> None:
+ # not exposed by Xe (yet?)
+ if self.drm_driver is DriverModule.XE:
+ return
+
+ path = self.helper_create_sysfs_path(0, gt_num, "policies", "engine_reset")
+ self.__write_sysfs(path, str(val))
+
+ # In order to set strict scheduling policy, PF scheduling priority needs to be default
+ def set_pf_policy_sched_if_idle(self, gt_num: int, val: int) -> None:
+ # not exposed by Xe (yet?)
+ if self.drm_driver is DriverModule.XE:
+ return
+
+ path = self.helper_create_sysfs_path(0, gt_num, "policies", "sched_if_idle")
+ self.__write_sysfs(path, str(val))
+
+ def get_pf_policy_engine_reset(self, gt_num: int) -> int:
+ # not exposed by Xe (yet?)
+ if self.drm_driver is DriverModule.XE:
+ return 0
+
+ path = self.helper_create_sysfs_path(0, gt_num, "policies", "engine_reset")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_pf_policy_sched_if_idle(self, gt_num: int) -> int:
+ # not exposed by Xe (yet?)
+ if self.drm_driver is DriverModule.XE:
+ return 0
+
+ path = self.helper_create_sysfs_path(0, gt_num, "policies", "sched_if_idle")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ # SRIOV sysfs: VF id
+ def get_vf_id(self, vf_num: int) -> int:
+ if self.drm_driver is DriverModule.XE:
+ raise exceptions.NotAvailableError('VF id attribute not available on xe')
+
+        path = posixpath.join(self.get_iov_path(), f'vf{vf_num}', 'id')
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ # SRIOV sysfs: controls state of the running VF (WO)
+ # Sysfs location: prelim_iov/vfN/control
+    # Allows the PF admin to pause, resume or stop handling
+    # submission requests from a given VF and to clear provisioning.
+ # control: "pause|resume|stop|clear"
+ class VfControl(str, enum.Enum):
+ pause = 'pause'
+ resume = 'resume'
+ stop = 'stop'
+ clear = 'clear'
+
+ def set_vf_control(self, vf_num: int, val: VfControl) -> None:
+        path = posixpath.join(self.get_iov_path(), f'vf{vf_num}', 'control')
+ self.__write_sysfs(path, val)
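+    # Illustrative (hypothetical) flow - pause a running VF and resume it:
+    #   host.set_vf_control(vf_num=1, val=Host.VfControl.pause)
+    #   ... save/restore or other PF-side handling ...
+    #   host.set_vf_control(vf_num=1, val=Host.VfControl.resume)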
+
+ # SRIOV sysfs: setters and getters for PF specific provisioning parameters
+ # Sysfs location: [SRIOV sysfs base path]/pf/gtM/
+ # @gt_num: GT instance number
+ class SchedulingPriority(enum.Enum):
+ LOW = 0
+ NORMAL = 1
+ HIGH = 2
+
+ # In order to set scheduling priority, strict scheduling policy needs to be default
+ def set_pf_sched_priority(self, gt_num: int, val: SchedulingPriority) -> None:
+ path = self.helper_create_sysfs_path(0, gt_num, "", "sched_priority")
+ self.__write_sysfs(path, str(val.value))
+
+ def get_pf_sched_priority(self, gt_num: int) -> SchedulingPriority:
+ path = self.helper_create_sysfs_path(0, gt_num, "", "sched_priority")
+ ret = self.__read_sysfs(path)
+ return self.SchedulingPriority(int(ret))
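+    # Illustrative ordering sketch based on the constraints noted above
+    # (that LOW is the default priority is an assumption here):
+    #   host.set_pf_sched_priority(0, Host.SchedulingPriority.LOW)  # back to default
+    #   host.set_pf_policy_sched_if_idle(0, 1)                      # enable strict scheduling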
+
+    # SRIOV sysfs: setters and getters for VFs and PF provisioning parameters
+ # Sysfs location: [SRIOV sysfs base path]/[pf|vfN]/gtM/
+ # @vf_num: VF number (1-based) or 0 for PF
+ # @gt_num: GT instance number
+ def set_ggtt_quota(self, vf_num: int, gt_num: int, val: int) -> None:
+ if vf_num == 0 and self.drm_driver is DriverModule.I915:
+ raise exceptions.NotAvailableError('PF ggtt_quota not available')
+
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "ggtt_quota")
+ self.__write_sysfs(path, str(val))
+
+ def set_lmem_quota(self, vf_num: int, gt_num: int, val: int) -> None:
+ if vf_num == 0 and self.drm_driver is DriverModule.I915:
+ raise exceptions.NotAvailableError('PF lmem_quota not available')
+
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "lmem_quota")
+ if self.has_lmem():
+ self.__write_sysfs(path, str(val))
+
+ def set_contexts_quota(self, vf_num: int, gt_num: int, val: int) -> None:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "contexts_quota")
+ self.__write_sysfs(path, str(val))
+
+ def set_doorbells_quota(self, vf_num: int, gt_num: int, val: int) -> None:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "doorbells_quota")
+ self.__write_sysfs(path, str(val))
+
+ def set_exec_quantum_ms(self, vf_num: int, gt_num: int, val: int) -> None:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "exec_quantum_ms")
+ self.__write_sysfs(path, str(val))
+
+ def set_preempt_timeout_us(self, vf_num: int, gt_num: int, val: int) -> None:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "preempt_timeout_us")
+ self.__write_sysfs(path, str(val))
+
+ def get_ggtt_quota(self, vf_num: int, gt_num: int) -> int:
+ if vf_num == 0 and self.drm_driver is DriverModule.I915:
+ raise exceptions.NotAvailableError('PF ggtt_quota not available')
+
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "ggtt_quota")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_lmem_quota(self, vf_num: int, gt_num: int) -> int:
+ if vf_num == 0 and self.drm_driver is DriverModule.I915:
+ raise exceptions.NotAvailableError('PF lmem_quota not available')
+
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "lmem_quota")
+ ret = self.__read_sysfs(path) if self.has_lmem() else 0
+ return int(ret)
+
+ def get_contexts_quota(self, vf_num: int, gt_num: int) -> int:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "contexts_quota")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_doorbells_quota(self, vf_num: int, gt_num: int) -> int:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "doorbells_quota")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_exec_quantum_ms(self, vf_num: int, gt_num: int) -> int:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "exec_quantum_ms")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ def get_preempt_timeout_us(self, vf_num: int, gt_num: int) -> int:
+ path = self.helper_create_sysfs_path(vf_num, gt_num, "", "preempt_timeout_us")
+ ret = self.__read_sysfs(path)
+ return int(ret)
+
+ # SRIOV debugfs: read resource availability
+ # Debugfs location: /sys/kernel/debug/dri/0/gtM/iov/
+ # @gt_num: GT instance number
+ # Returns: total and available size for a resource
+ def get_debugfs_ggtt(self, gt_num: int) -> typing.Tuple[int, int]:
+ return self.helper_get_debugfs_resources(gt_num, "ggtt")
+
+ # Placeholders for debugfs nodes that are not yet published.
+ # Implement in a similar way to 'ggtt' when present.
+ def get_debugfs_lmem(self, gt_num: int) -> typing.Tuple[int, int]:
+ raise NotImplementedError(f'Debugfs lmem_available not present yet (gt{gt_num})')
+
+ def get_debugfs_contexts(self, gt_num: int) -> typing.Tuple[int, int]:
+ raise NotImplementedError(f'Debugfs contexts_available not present yet (gt{gt_num})')
+
+ def get_debugfs_doorbells(self, gt_num: int) -> typing.Tuple[int, int]:
+ raise NotImplementedError(f'Debugfs doorbells_available not present yet (gt{gt_num})')
diff --git a/tools/vmtb/bench/machines/machine_interface.py b/tools/vmtb/bench/machines/machine_interface.py
new file mode 100644
index 000000000..be3aa5e64
--- /dev/null
+++ b/tools/vmtb/bench/machines/machine_interface.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import abc
+import enum
+import signal
+import typing
+
+# TODO: Consider moving CONSTANT definitions to a separate file constants.py
+# XXX: Timeout increased from 10 to 20 min to handle long VM migration time on devices with LMEM
+DEFAULT_TIMEOUT: int = 1200 # Default machine execution wait timeout in seconds
+
+
+class ProcessResult(typing.NamedTuple):
+ exited: bool = False
+ exit_code: typing.Optional[int] = None
+ stdout: str = ''
+ stderr: str = ''
+
+
+class SuspendMode(str, enum.Enum):
+ ACPI_S3 = 'mem' # Suspend to RAM aka sleep
+ ACPI_S4 = 'disk' # Suspend to disk aka hibernation
+
+
+class DriverModule(str, enum.Enum):
+ I915 = 'i915'
+ XE = 'xe'
+
+
+class MachineInterface(metaclass=abc.ABCMeta):
+
+ @abc.abstractmethod
+ def execute(self, command: str) -> int:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def execute_status(self, pid: int) -> ProcessResult:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def execute_wait(self, pid: int, timeout: int) -> ProcessResult:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def execute_signal(self, pid: int, sig: signal.Signals) -> None:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def read_file_content(self, path: str) -> str:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def write_file_content(self, path: str, content: str) -> int:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def dir_exists(self, path: str) -> bool:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def suspend(self, mode: SuspendMode) -> None:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def get_drm_driver(self) -> DriverModule:
+ raise NotImplementedError
diff --git a/tools/vmtb/bench/machines/pci.py b/tools/vmtb/bench/machines/pci.py
new file mode 100644
index 000000000..ce4740cc3
--- /dev/null
+++ b/tools/vmtb/bench/machines/pci.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import subprocess
+import typing
+import enum
+import re
+
+from bench import exceptions
+
+logger = logging.getLogger(__name__)
+
+
+class GpuDevice(str, enum.Enum):
+ ATSM150 = 'Arctic Sound M150 (ATS-M1)'
+ ATSM75 = 'Arctic Sound M75 (ATS-M3)'
+ PVC = 'Ponte Vecchio (PVC)'
+ ADLP = 'Alder Lake P (ADL-P)'
+ Unknown = 'Unknown'
+
+ def __str__(self) -> str:
+ return str.__str__(self)
+
+
+def get_pci_info() -> typing.Tuple[str, str]:
+ """Return PCI BDF and Device ID of Intel (8086) Display Controller (03xx)"""
+ out = subprocess.check_output(['lspci', '-nm'], universal_newlines=True)
+ pattern = r'(?P<bdf>.*\.0) .*03[08]0.*8086.* "(?P<devid>[0-9a-fA-F]{4})"( -r.*)?( "[0-9a-fA-F]{0,4}"){2}.*'
+ match = re.search(pattern, out, re.MULTILINE)
+
+ if match:
+ return (f'0000:{match.group("bdf")}', match.group("devid"))
+
+ logger.error('Intel GPU Device was not found')
+ logger.debug('PCI Devices present (lspci -nm):\n%s', out)
+ raise exceptions.HostError('Intel GPU Device was not found')
+
+
+def get_gpu_name(pci_id: str) -> GpuDevice:
+ """Return GPU device name associated with a given PCI Device ID"""
+ return pci_ids.get(pci_id.upper(), GpuDevice.Unknown)
+
+
+# PCI Device IDs: ATS-M150 (M1)
+_atsm150_pci_ids = {
+ '56C0': GpuDevice.ATSM150,
+ '56C2': GpuDevice.ATSM150
+}
+
+
+# PCI Device IDs: ATS-M75 (M3)
+_atsm75_pci_ids = {
+ '56C1': GpuDevice.ATSM75
+}
+
+
+# PCI Device IDs: PVC
+_pvc_pci_ids = {
+ '0BD0': GpuDevice.PVC,
+ '0BD1': GpuDevice.PVC,
+ '0BD2': GpuDevice.PVC,
+ '0BD5': GpuDevice.PVC,
+ '0BD6': GpuDevice.PVC,
+ '0BD7': GpuDevice.PVC,
+ '0BD8': GpuDevice.PVC,
+ '0BD9': GpuDevice.PVC,
+ '0BDA': GpuDevice.PVC,
+ '0BDB': GpuDevice.PVC
+}
+
+
+# PCI Device IDs: ADL-P
+_adlp_pci_ids = {
+ '46A0': GpuDevice.ADLP,
+ '46A1': GpuDevice.ADLP,
+ '46A2': GpuDevice.ADLP,
+ '46A3': GpuDevice.ADLP,
+ '46A6': GpuDevice.ADLP,
+ '46A8': GpuDevice.ADLP,
+ '46AA': GpuDevice.ADLP,
+ '462A': GpuDevice.ADLP,
+ '4626': GpuDevice.ADLP,
+ '4628': GpuDevice.ADLP,
+ '46B0': GpuDevice.ADLP,
+ '46B1': GpuDevice.ADLP,
+ '46B2': GpuDevice.ADLP,
+ '46B3': GpuDevice.ADLP,
+ '46C0': GpuDevice.ADLP,
+ '46C1': GpuDevice.ADLP,
+ '46C2': GpuDevice.ADLP,
+ '46C3': GpuDevice.ADLP
+}
+
+
+# All PCI Device IDs to GPU Device Names mapping
+pci_ids: typing.Dict[str, GpuDevice] = {**_atsm150_pci_ids, **_atsm75_pci_ids, **_pvc_pci_ids, **_adlp_pci_ids}
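+
+# Illustrative usage (BDF and Device ID values are hypothetical):
+#   bdf, devid = get_pci_info()   # e.g. ('0000:4d:00.0', '56c0')
+#   gpu = get_gpu_name(devid)     # -> GpuDevice.ATSM150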
diff --git a/tools/vmtb/bench/machines/vgpu_profile.py b/tools/vmtb/bench/machines/vgpu_profile.py
new file mode 100644
index 000000000..03fbaf79c
--- /dev/null
+++ b/tools/vmtb/bench/machines/vgpu_profile.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import csv
+import logging
+import posixpath
+import re
+
+from enum import Enum
+from typing import Optional, List, Dict, Tuple
+from bench import exceptions
+
+logger = logging.getLogger(__name__)
+
+
+class VgpuProfileClass(str, Enum):
+ """Represent usage classes of vGPU profiles.
+
+ The following types are supported:
+ - Class A: Auto provisioning (DRM allocates resources fairly)
+ - Class M: Multipurpose VF profiles that support a mix of compute and media
+ but not specifically fps-targeted 3D experiences
+    - Class C: Compute and media focused VFs without any 3D support
+ - Class V: VDI (Virtual Desktop Infrastructure) or remote graphics delivery VFs
+ - Class L: IDV (Intelligent Desktop Virtualization) or locally displayed VFs
+ - Class R: Remote Desktop Session Host
+ """
+ AUTO = 'A'
+ MULTIPURPOSE = 'M'
+ COMPUTE = 'C'
+ VDI = 'V'
+ IDV = 'L'
+ RDSH = 'R'
+
+
+class VgpuProfile:
+ def __init__(self) -> None:
+ # [Platform]_vfs.csv file:
+ self.profileId: str = ''
+ self.description: str = ''
+ self.schedulerMode: str = ''
+ self.pfExecutionQuanta: int = 0
+ self.pfPreemptionTimeout: int = 0
+ self.vfExecutionQuanta: int = 0
+ self.vfPreemptionTimeout: int = 0
+ self.scheduleIfIdle: bool = False
+
+ # [Platform]_int.csv file:
+ self.resetAfterVfSwitch: bool = False
+ self.provisioningMode: int = 0
+ self.pfLmem: int = 0
+ self.pfContexts: int = 0
+ self.pfDoorbells: int = 0
+ self.pfGgtt: int = 0
+ self.vfLmem: int = 0
+ self.vfContexts: int = 0
+ self.vfDoorbells: int = 0
+ self.vfGgtt: int = 0
+
+ def get_class_num_vfs(self) -> Tuple[VgpuProfileClass, int]:
+ """Return pair of vGPU profile class and number of VFs from profileID string
+ e.g. ATSM150_V16 -> (VgpuProfileClass.VDI, 16).
+ """
+        pattern = r'(?P<profile_class>[MCVLRA])(?P<num_vfs>\d{1,2})$'
+ match = re.search(pattern, self.profileId)
+
+ if match:
+ return (VgpuProfileClass(match.group('profile_class')), int(match.group('num_vfs')))
+
+ raise exceptions.VgpuProfileError(f'Invalid syntax of a vGPU profileId: {self.profileId}')
+
+ def get_class(self) -> VgpuProfileClass:
+ """Return vGPU profile class (Multipurpose/Compute/VDI etc.) from profileID string
+ e.g. ATSM150_M4 -> Multipurpose.
+ """
+ return self.get_class_num_vfs()[0]
+
+ def get_num_vfs(self) -> int:
+ """Return number of VFs supported for a given vGPU profile from profileID string
+        e.g. ATSM150_M4 -> 4. Returns 0 if the profileId is uninitialized or unknown.
+ """
+ try:
+ return self.get_class_num_vfs()[1]
+ except exceptions.VgpuProfileError:
+ logger.warning("Unable to determine number of VFs for a vGPU profile - return 0")
+ return 0
+
+ def print_parameters(self) -> None:
+ logger.info(
+ "\nvGPU Profile ID: %s\n"
+ "Description = %s\n"
+ "Provisioning Mode = %s\n"
+ "Scheduler Mode = %s\n"
+ "Schedule If Idle = %s\n"
+ "Reset After Vf Switch = %s\n"
+ "PF:\n"
+ "\tExecution Quanta = %s ms\n"
+ "\tPreemption Timeout = %s us\n"
+ "\tLMEM = %s B\n"
+ "\tContexts = %s\n"
+ "\tDoorbells = %s\n"
+ "\tGGTT = %s B\n"
+ "VF:\n"
+ "\tExecution Quanta = %s ms\n"
+ "\tPreemption Timeout = %s us\n"
+ "\tLMEM = %s B\n"
+ "\tContexts = %s\n"
+ "\tDoorbells = %s\n"
+ "\tGGTT = %s B",
+ self.profileId, self.description, self.provisioningMode,
+ self.schedulerMode, self.scheduleIfIdle, self.resetAfterVfSwitch,
+ self.pfExecutionQuanta, self.pfPreemptionTimeout,
+ self.pfLmem, self.pfContexts, self.pfDoorbells, self.pfGgtt,
+ self.vfExecutionQuanta, self.vfPreemptionTimeout,
+ self.vfLmem, self.vfContexts, self.vfDoorbells, self.vfGgtt
+ )
+
+
+class VgpuProfileCsvReader:
+ def __init__(self, vgpu_vfs_path: str, vgpu_int_path: str) -> None:
+ # vGPU profiles definitions are split into two CSV files
+ vfs_data = self.read_csv_file(vgpu_vfs_path)
+ int_data = self.read_csv_file(vgpu_int_path)
+
+ # List containing all profiles defined in CSV files
+ self._vgpu_profiles: List[VgpuProfile] = self.parse_csv_files(vfs_data, int_data)
+
+ @property
+ def vgpu_profiles(self) -> List[VgpuProfile]:
+ return self._vgpu_profiles
+
+ @vgpu_profiles.setter
+ def vgpu_profiles(self, value: List[VgpuProfile]) -> None:
+ self._vgpu_profiles = value
+
+ def read_csv_file(self, vgpu_csv_file: str) -> List[Dict[Optional[str], Optional[str]]]:
+ vgpu_dict_list = []
+
+ if not posixpath.exists(vgpu_csv_file):
+ raise exceptions.VgpuProfileError(f'CSV file not found: {vgpu_csv_file}')
+
+ # CSV files encoding - unicode with BOM (byte order mark): utf-8-sig
+ with open(vgpu_csv_file, mode='r', encoding='utf-8-sig') as csv_file:
+ csv_reader = csv.DictReader(csv_file)
+
+            # Accept only the two known CSV flavors, identified by file name
+            if 'vfs' not in vgpu_csv_file and 'int' not in vgpu_csv_file:
+                raise exceptions.VgpuProfileError(f'Invalid CSV file: {vgpu_csv_file}')
+
+            for row in csv_reader:
+                vgpu_dict_list.append(row)
+
+ return vgpu_dict_list
+
+ def parse_csv_files(self, vfs_list: List[Dict], int_list: List[Dict]) -> List[VgpuProfile]:
+ all_profiles: List[VgpuProfile] = []
+ if len(vfs_list) != len(int_list):
+            raise exceptions.VgpuProfileError('CSV files: different number of lines')
+
+ for vfs_row, int_row in zip(vfs_list, int_list):
+ profile: VgpuProfile = VgpuProfile()
+
+ profile.profileId = vfs_row['vGPUProfileInfo ProfileID']
+ tmp_int_profileId = int_row['vGPUProfileInfo ProfileID']
+ if profile.profileId != tmp_int_profileId:
+ raise exceptions.VgpuProfileError(
+ f'CSV files: ProfileIDs not matching - {profile.profileId} vs {tmp_int_profileId}')
+
+ # [Platform]_vfs.csv file attributes:
+ profile.description = vfs_row['vGPUProfileInfo Description']
+ profile.schedulerMode = vfs_row['vGPUScheduler vGPUSchedulerMode']
+ profile.pfExecutionQuanta = int(vfs_row['vGPUScheduler PFExecutionQuanta(msec)'])
+ profile.pfPreemptionTimeout = int(vfs_row['vGPUScheduler PFPreemptionTimeout(usec)'])
+ profile.vfExecutionQuanta = int(vfs_row['vGPUScheduler VFExecutionQuanta(msec)'])
+ profile.vfPreemptionTimeout = int(vfs_row['vGPUScheduler VFPreemptionTimeout(usec)'])
+ profile.scheduleIfIdle = bool(vfs_row['vGPUScheduler ScheduleIfIdle'] == 'T')
+
+ # [Platform]_int.csv file attributes:
+ profile.resetAfterVfSwitch = bool(int_row['vGPUScheduler ResetAfterVfSwitch'] == 'T')
+ profile.provisioningMode = int(int_row['General TileProvisioningMode'])
+ pf_lmem: str = int_row['PFResources Lmem(B/tile)']
+ profile.pfLmem = int(pf_lmem) if pf_lmem.isnumeric() else 0
+ profile.pfContexts = int(int_row['PFResources Contexts(perTile)'])
+ profile.pfDoorbells = int(int_row['PFResources Doorbells(perTile)'])
+ profile.pfGgtt = int(int_row['PFResources GGTTSize(B/tile)'])
+ vf_lmem: str = int_row['VFResources Lmem(B/tile)']
+ profile.vfLmem = int(vf_lmem) if vf_lmem.isnumeric() else 0
+ profile.vfContexts = int(int_row['VFResources Contexts(perTile)'])
+ profile.vfDoorbells = int(int_row['VFResources Doorbells(perTile)'])
+ profile.vfGgtt = int(int_row['VFResources GGTTSize(B/tile)'])
+
+ all_profiles.append(profile)
+
+ return all_profiles
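+
+# Illustrative usage (file names are assumptions - the actual CSVs live under
+# vmm_flows/resources/vgpu_profile/):
+#   reader = VgpuProfileCsvReader('ATSM150_vfs.csv', 'ATSM150_int.csv')
+#   for profile in reader.vgpu_profiles:
+#       profile.print_parameters()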
diff --git a/tools/vmtb/bench/machines/virtual/__init__.py b/tools/vmtb/bench/machines/virtual/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/vmtb/bench/machines/virtual/backends/__init__.py b/tools/vmtb/bench/machines/virtual/backends/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/vmtb/bench/machines/virtual/backends/backend_interface.py b/tools/vmtb/bench/machines/virtual/backends/backend_interface.py
new file mode 100644
index 000000000..ecad293ef
--- /dev/null
+++ b/tools/vmtb/bench/machines/virtual/backends/backend_interface.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import abc
+import typing
+
+
+class BackendInterface(metaclass=abc.ABCMeta):
+
+ @abc.abstractmethod
+ def sync(self, idnum: int) -> typing.Optional[typing.Dict]:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def ping(self) -> typing.Optional[typing.Dict]:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def execute(self, command: str, args: typing.List[str]) -> typing.Optional[typing.Dict]:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def execute_status(self, pid: int) -> typing.Optional[typing.Dict]:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def suspend_disk(self) -> None:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def suspend_ram(self) -> None:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def reboot(self) -> None:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def poweroff(self) -> None:
+ raise NotImplementedError
diff --git a/tools/vmtb/bench/machines/virtual/backends/guestagent.py b/tools/vmtb/bench/machines/virtual/backends/guestagent.py
new file mode 100644
index 000000000..9cfad5da6
--- /dev/null
+++ b/tools/vmtb/bench/machines/virtual/backends/guestagent.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import json
+import logging
+import socket
+import typing
+
+from bench import exceptions
+from bench.machines.virtual.backends.backend_interface import BackendInterface
+
+logger = logging.getLogger(__name__)
+
+
+class GuestAgentBackend(BackendInterface):
+ def __init__(self, socket_path: str, socket_timeout: int) -> None:
+ self.sockpath = socket_path
+ self.timeout = socket_timeout
+ self.sock: socket.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ self.sock.connect(self.sockpath)
+ self.sockf: typing.TextIO = self.sock.makefile(mode='rw', errors='strict')
+
+ def __send(self, command: str, arguments: typing.Optional[typing.Dict] = None) -> typing.Dict:
+ if arguments is None:
+ arguments = {}
+
+ data = {'execute': command, 'arguments': arguments}
+ json.dump(data, self.sockf)
+ self.sockf.flush()
+ try:
+            out: str = self.sockf.readline()
+ except socket.timeout as soc_to_exc:
+ logger.error('Socket readline timeout on command %s', command)
+ self.sock.close()
+ self.sockf.close()
+ raise exceptions.GuestAgentError(f'Socket timed out on {command}') from soc_to_exc
+        if not out:
+            logger.error('Command %s, args %s returned no output', command, arguments)
+            raise exceptions.GuestAgentError(f'Command {command} did not return output')
+ # Only logging errors for now
+ ret: typing.Dict = json.loads(out)
+        if 'error' in ret:
+ logger.error('Command: %s got error %s', command, ret)
+
+ return ret
+
+ def sync(self, idnum: int) -> typing.Dict:
+ return self.__send('guest-sync', {'id': idnum})
+
+ def ping(self) -> typing.Optional[typing.Dict]:
+ return self.__send('guest-ping')
+
+ def execute(self, command: str, args: typing.Optional[typing.List[str]] = None) -> typing.Dict:
+ if args is None:
+ args = []
+ arguments = {'path': command, 'arg': args, 'capture-output': True}
+ return self.__send('guest-exec', arguments)
+
+ def execute_status(self, pid: int) -> typing.Dict:
+ return self.__send('guest-exec-status', {'pid': pid})
+
+    # TODO: add a qmp-query mechanism for all power state changes
+ def suspend_disk(self) -> None:
+ # self.__send('guest-suspend-disk')
+ raise NotImplementedError
+
+ def suspend_ram(self) -> None:
+ self.ping()
+        # guest-suspend-ram does not return anything, so __send is not used
+ data = {'execute': 'guest-suspend-ram'}
+ json.dump(data, self.sockf)
+ self.sockf.flush()
+
+ def reboot(self) -> None:
+ self.ping()
+        # guest-shutdown does not return anything, so __send is not used
+ data = {'execute': 'guest-shutdown', 'arguments': {'mode': 'reboot'}}
+ json.dump(data, self.sockf)
+ self.sockf.flush()
+
+ def poweroff(self) -> None:
+ self.ping()
+        # guest-shutdown does not return anything, so __send is not used
+ data = {'execute': 'guest-shutdown', 'arguments': {'mode': 'powerdown'}}
+ json.dump(data, self.sockf)
+ self.sockf.flush()
+ # self.sockf.readline()
+
+ def guest_file_open(self, path: str, mode: str) -> typing.Dict:
+ return self.__send('guest-file-open', {'path': path, 'mode': mode})
+
+ def guest_file_close(self, handle: int) -> typing.Dict:
+ return self.__send('guest-file-close', {'handle': handle})
+
+ def guest_file_write(self, handle: int, content: str) -> typing.Dict:
+ return self.__send('guest-file-write', {'handle': handle, 'buf-b64': content})
+
+ def guest_file_read(self, handle: int) -> typing.Dict:
+ return self.__send('guest-file-read', {'handle': handle})
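+
+# Example QGA exchange produced by execute() (values are illustrative):
+#   -> {"execute": "guest-exec",
+#       "arguments": {"path": "ls", "arg": ["-l"], "capture-output": true}}
+#   <- {"return": {"pid": 1234}}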
diff --git a/tools/vmtb/bench/machines/virtual/backends/qmp_monitor.py b/tools/vmtb/bench/machines/virtual/backends/qmp_monitor.py
new file mode 100644
index 000000000..d28147d67
--- /dev/null
+++ b/tools/vmtb/bench/machines/virtual/backends/qmp_monitor.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import json
+import logging
+import queue
+import socket
+import threading
+import time
+import typing
+
+logger = logging.getLogger(__name__)
+
+
+class QmpMonitor():
+ def __init__(self, socket_path: str, socket_timeout: int) -> None:
+ self.sockpath = socket_path
+ self.timeout = socket_timeout
+ self.sock: socket.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ self.sock.connect(self.sockpath)
+ self.sockf: typing.TextIO = self.sock.makefile(mode='rw', errors='strict')
+ self.qmp_queue: queue.Queue = queue.Queue()
+ self.monitor_thread: threading.Thread = threading.Thread(target=self.__queue_qmp_output,
+ args=(self.sockf, self.qmp_queue),
+ daemon=True)
+ self.monitor_thread.start()
+        # Capabilities must be enabled before using QMP
+ self.__enable_qmp_capabilities()
+
+ def __enable_qmp_capabilities(self) -> None:
+ json.dump({'execute': 'qmp_capabilities'}, self.sockf)
+ self.sockf.flush()
+
+ def __queue_qmp_output(self, out: typing.TextIO, q: queue.Queue) -> None:
+ for line in iter(out.readline, ''):
+ logger.debug('[QMP RSP] <- %s', line)
+ qmp_msg = json.loads(line)
+ q.put(qmp_msg)
+
+ @property
+ def monitor_queue(self) -> queue.Queue:
+ return self.qmp_queue
+
+ def query_status(self) -> str:
+ json.dump({'execute': 'query-status'}, self.sockf)
+ self.sockf.flush()
+
+ ret: typing.Dict = {}
+ while 'status' not in ret:
+ qmp_msg = self.qmp_queue.get()
+ if 'return' in qmp_msg:
+ ret = qmp_msg.get('return')
+
+ status: str = ret['status']
+ logger.debug('Machine status: %s', status)
+ return status
+
+ def query_jobs(self, requested_type: str) -> typing.Tuple[str, str]:
+ json.dump({'execute': 'query-jobs'}, self.sockf)
+ self.sockf.flush()
+
+ job_type: str = ''
+ job_status: str = ''
+ job_error: str = ''
+ ret: typing.Dict = {}
+
+ qmp_msg = self.qmp_queue.get()
+ # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
+ if 'return' in qmp_msg:
+ ret = qmp_msg.get('return')
+ for param in ret:
+ job_type = param.get('type')
+ job_status = param.get('status')
+ job_error = param.get('error')
+
+ if job_type == requested_type:
+ break
+
+ return (job_status, job_error)
+
+ def get_qmp_event(self) -> str:
+ qmp_msg = self.qmp_queue.get()
+ # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
+ event: str = qmp_msg.get('event', '')
+ return event
+
+ def get_qmp_event_job(self) -> str:
+ qmp_msg = self.qmp_queue.get()
+ # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
+
+ status: str = ''
+ if qmp_msg.get('event') == 'JOB_STATUS_CHANGE':
+ status = qmp_msg.get('data', {}).get('status', '')
+
+ return status
+
+ def system_reset(self) -> None:
+ json.dump({'execute': 'system_reset'}, self.sockf)
+ self.sockf.flush()
+
+ def system_wakeup(self) -> None:
+ json.dump({'execute': 'system_wakeup'}, self.sockf)
+ self.sockf.flush()
+
+ def stop(self) -> None:
+ json.dump({'execute': 'stop'}, self.sockf)
+ self.sockf.flush()
+
+ def cont(self) -> None:
+ json.dump({'execute': 'cont'}, self.sockf)
+ self.sockf.flush()
+
+ def quit(self) -> None:
+ json.dump({'execute': 'quit'}, self.sockf)
+ self.sockf.flush()
+
+ def __query_snapshot(self) -> typing.Tuple[str, str]:
+ json.dump({'execute': 'query-named-block-nodes'}, self.sockf)
+ self.sockf.flush()
+
+ node_name: str = ''
+ snapshot_tag: str = ''
+ ret: typing.Dict = {}
+
+ qmp_msg = self.qmp_queue.get()
+ # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
+ if 'return' in qmp_msg:
+ ret = qmp_msg.get('return')
+ for block in ret:
+ if block.get('drv') == 'qcow2':
+ node_name = block.get('node-name')
+ # Get the most recent state snapshot from the snapshots list:
+ snapshots = block.get('image').get('snapshots')
+ if snapshots:
+ snapshot_tag = snapshots[-1].get('name')
+ break
+
+ return (node_name, snapshot_tag)
+
+ def save_snapshot(self) -> None:
+ job_id: str = f'savevm_{time.time()}'
+ snapshot_tag = f'vm_state_{time.time()}'
+ node_name, _ = self.__query_snapshot()
+ logger.debug('[QMP snapshot-save] snapshot_tag: %s, block device node: %s', snapshot_tag, node_name)
+
+ # Note: command 'snapshot-save' is supported since QEMU 6.0
+ json.dump({'execute': 'snapshot-save',
+ 'arguments': {'job-id': job_id, 'tag': snapshot_tag, 'vmstate': node_name, 'devices': [node_name]}},
+ self.sockf)
+ self.sockf.flush()
+
+ def load_snapshot(self) -> None:
+ job_id: str = f'loadvm_{time.time()}'
+ node_name, snapshot_tag = self.__query_snapshot()
+ logger.debug('[QMP snapshot-load] snapshot_tag: %s, block device node: %s', snapshot_tag, node_name)
+
+ # Note: command 'snapshot-load' is supported since QEMU 6.0
+ json.dump({'execute': 'snapshot-load',
+ 'arguments': {'job-id': job_id, 'tag': snapshot_tag, 'vmstate': node_name, 'devices': [node_name]}},
+ self.sockf)
+ self.sockf.flush()
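+
+# Example QMP exchange as consumed by query_status() (response is illustrative):
+#   -> {"execute": "query-status"}
+#   <- {"return": {"status": "running", "singlestep": false, "running": true}}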
diff --git a/tools/vmtb/bench/machines/virtual/vm.py b/tools/vmtb/bench/machines/virtual/vm.py
new file mode 100644
index 000000000..ab1576a76
--- /dev/null
+++ b/tools/vmtb/bench/machines/virtual/vm.py
@@ -0,0 +1,595 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import base64
+import logging
+import os
+import posixpath
+import shlex
+import signal
+import subprocess
+import threading
+import time
+import typing
+
+from types import FrameType
+from bench import exceptions
+from bench.machines.machine_interface import MachineInterface, ProcessResult, SuspendMode, DriverModule, DEFAULT_TIMEOUT
+from bench.machines.virtual.backends.guestagent import GuestAgentBackend
+from bench.machines.virtual.backends.qmp_monitor import QmpMonitor
+
+logger = logging.getLogger(__name__)
+
+
+class VirtualMachine(MachineInterface):
+ class Decorators():
+ @staticmethod
+ def alarm_handler(sig: signal.Signals, tb: FrameType) -> typing.Any:
+            raise exceptions.AlarmTimeoutError('Alarm timeout occurred')
+
+ @classmethod
+ def timeout_signal(cls, func: typing.Callable) -> typing.Callable:
+ def timeout_wrapper(*args: typing.Any, **kwargs: typing.Optional[typing.Any]) -> typing.Any:
+ timeout: int = DEFAULT_TIMEOUT
+ if len(args) > 2:
+ timeout = args[2] # Argument position in execute_wait(self, pid, timeout)
+ elif kwargs.get('timeout') is not None:
+ if isinstance(kwargs['timeout'], int):
+ timeout = kwargs['timeout']
+
+ # mypy: silence the following problem in signal.signal() call:
+ # error: Argument 2 to "signal" has incompatible type "Callable[[Signals, FrameType], Any]";
+ # expected "Union[Callable[[int, Optional[FrameType]], Any], int, Handlers, None]" [arg-type]
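+                # Note: signal.signal()/signal.alarm() may only be used from the
+                # main thread, so this decorator must not run in worker threads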
+ signal.signal(signal.SIGALRM, cls.alarm_handler) # type: ignore[arg-type]
+ signal.alarm(timeout)
+ try:
+ proc_ret = func(*args, **kwargs)
+ except exceptions.AlarmTimeoutError:
+ logger.warning('Timeout (%ss) on %s', timeout, func.__name__)
+ raise
+ finally:
+ signal.alarm(0) # Cancel alarm
+
+ return proc_ret
+
+ return timeout_wrapper
+
+ def __init__(self, backing_image: str, vm_number: int) -> None:
+ # TODO: make properties private and publish accessors (@property)
+ self.vf_bdf: typing.Optional[str] = None
+ self.process: typing.Optional[subprocess.Popen] = None
+ self.vmnum: int = vm_number
+ self.card_num: int = 0
+ self.sysfs_prefix_path = posixpath.join('/sys/class/drm/', f'card{str(self.card_num)}')
+        self.guestagent_sockpath = posixpath.join('/tmp', f'qga{self.vmnum}.sock')
+ self.qmp_sockpath = posixpath.join('/tmp', f'mon{self.vmnum}.sock')
+ self.drm_driver: typing.Optional[DriverModule] = None
+
+ if not posixpath.exists(backing_image):
+ logger.error('No image for VM%s', self.vmnum)
+ raise exceptions.GuestError(f'No image for VM{self.vmnum}')
+ self.image: str = self.__create_qemu_image(backing_image)
+ self.migrate_source_image: typing.Optional[str] = None
+ self.migrate_destination_vm: bool = False
+
+ # Resources provisioned to the VF/VM:
+ self._lmem_size: typing.Optional[int] = None
+ self._ggtt_size: typing.Optional[int] = None
+ self._contexts: typing.Optional[int] = None
+ self._doorbells: typing.Optional[int] = None
+
+ # GT number and tile is relevant mainly for multi-tile devices
+ # List of all GTs used by a given VF:
+ # - for single-tile: only root [0]
+ # - for multi-tile Mode 2/3: either root [0] or remote [1]
+ # - for multi-tile Mode 1: spans on both tiles [0, 1]
+ self._gt_nums: typing.List[int] = []
+ self._tile_mask: typing.Optional[int] = None
+
+ def __str__(self) -> str:
+ return f'VM{self.vmnum}_{self.vf_bdf}'
+
+ def __del__(self) -> None:
+ if not self.is_running():
+ return
+
+ # printing and not logging because loggers have some issues
+ # in late deinitialization
+ print(f'VM{self.vmnum} was not powered off')
+ if not self.process:
+ return
+ self.process.terminate()
+ # self.__close_qemu_output()
+        # Let's wait to make sure that QEMU has shut down
+ try:
+ self.process.communicate(timeout=30)
+ except subprocess.TimeoutExpired:
+ print('QEMU did not terminate, killing it')
+ self.process.kill()
+
+ def __create_qemu_image(self, backing_file: str) -> str:
+ output_image = f'./vm{self.vmnum}_{time.time()}_image.qcow2'
+ try:
+ subprocess.check_output(['qemu-img', 'create',
+ '-F', 'raw',
+ '-f', 'qcow2',
+ '-b', f'{backing_file}', f'{output_image}'],
+ universal_newlines=True)
+ except subprocess.CalledProcessError as exc:
+ logger.error('Creating qcow2 image file for VM%s failed with %s', self.vmnum, exc)
+ raise exceptions.GuestError('Error creating qcow2 image') from exc
+
+ return output_image
+
+ # def __open_qemu_output(self) -> None:
+ # self.qemu_stdout = open(f'./qemu_vm{self.vmnum}_stdout.log', 'w')
+ # self.qemu_stderr = open(f'./qemu_vm{self.vmnum}_stderr.log', 'w')
+
+ def __log_qemu_output(self, out: typing.TextIO) -> None:
+ stdoutlog = logging.getLogger(f'VM{self.vmnum}_kmsg')
+ for line in iter(out.readline, ''):
+ stdoutlog.info(line.strip())
+
+ # def __close_qemu_output(self) -> None:
+ # self.qemu_stderr.close()
+ # self.qemu_stdout.close()
+
+ def __sockets_exists(self) -> bool:
+        return os.path.exists(self.guestagent_sockpath) and os.path.exists(self.qmp_sockpath)
+
+ def __get_popen_command(self) -> typing.List[str]:
+ # self.__open_qemu_output()
+ command = ['qemu-system-x86_64',
+ '-vnc', f':{self.vmnum}',
+ '-serial', 'stdio',
+ '-m', '4096',
+ '-drive', f'file={self.image if not self.migrate_destination_vm else self.migrate_source_image}',
+                   '-chardev', f'socket,path={self.guestagent_sockpath},server=on,wait=off,id=qga{self.vmnum}',
+ '-device', 'virtio-serial',
+ '-device', f'virtserialport,chardev=qga{self.vmnum},name=org.qemu.guest_agent.0',
+ '-chardev', f'socket,id=mon{self.vmnum},path=/tmp/mon{self.vmnum}.sock,server=on,wait=off',
+ '-mon', f'chardev=mon{self.vmnum},mode=control']
+
+ if self.vf_bdf:
+ command.extend(['-enable-kvm', '-cpu', 'host'])
+ command.extend(['-device', f'vfio-pci,host={self.vf_bdf},'
+ # vfio-pci x-enable-migration=true param is currently needed for migration
+ # TODO: review later if still required when qemu/vfio-pci evolves
+ 'x-enable-migration=true'])
+
+ if self.migrate_destination_vm:
+ # If VM is migration destination - run in stopped/prelaunch state (explicit resume required)
+ command.extend(['-S'])
+
+ logger.debug('QEMU command: %s', ' '.join(command))
+ return command
+
+ def __get_key(self, base: typing.Dict, path: typing.List[str]) -> typing.Any:
+ cur = base
+ for key in path:
+ if cur is None or key not in cur:
+ raise ValueError(f'The key {path} does not exist, aborting!')
+ cur = cur[key]
+ return cur
+
+ @property
+ def get_vm_num(self) -> int:
+ return self.vmnum
+
+ def assign_vf(self, vf_bdf: str) -> None:
+ self.vf_bdf = vf_bdf
+
+ def set_migration_source(self, src_image: str) -> None:
+ self.migrate_source_image = src_image
+ self.migrate_destination_vm = True
+
+ @property
+ def lmem_size(self) -> typing.Optional[int]:
+ if self._lmem_size is None:
+ self.helper_get_debugfs_selfconfig()
+
+ return self._lmem_size
+
+ @property
+ def ggtt_size(self) -> typing.Optional[int]:
+ if self._ggtt_size is None:
+ self.helper_get_debugfs_selfconfig()
+
+ return self._ggtt_size
+
+ @property
+ def contexts(self) -> typing.Optional[int]:
+ if self._contexts is None:
+ self.helper_get_debugfs_selfconfig()
+
+ return self._contexts
+
+ @property
+ def doorbells(self) -> typing.Optional[int]:
+ if self._doorbells is None:
+ self.helper_get_debugfs_selfconfig()
+
+ return self._doorbells
+
+ @property
+ def tile_mask(self) -> typing.Optional[int]:
+ if self._tile_mask is None:
+ self.helper_get_debugfs_selfconfig()
+
+ return self._tile_mask
+
+ @property
+ def gt_nums(self) -> typing.List[int]:
+ self._gt_nums = self.get_gt_num_from_sysfs()
+ if not self._gt_nums:
+ logger.warning("VM sysfs: missing GT index")
+ self._gt_nums = [0]
+
+ return self._gt_nums
+
+ def get_gt_num_from_sysfs(self) -> typing.List[int]:
+        # Get GT numbers of the VF passed to a VM, based on existing sysfs paths
+ vm_gt_num = []
+ if self.dir_exists(posixpath.join(self.sysfs_prefix_path, 'gt/gt0')):
+ vm_gt_num.append(0)
+ if self.dir_exists(posixpath.join(self.sysfs_prefix_path, 'gt/gt1')):
+ vm_gt_num.append(1)
+
+ return vm_gt_num
+
+ def query_available_drivers(self) -> typing.List[DriverModule]:
+ # Check guest for supported DRM drivers (i915 / xe)
+ available_drivers: typing.List[DriverModule] = []
+
+ for drm_driver in DriverModule:
+            # Use .value - plain f-string formatting of a str-Enum member is not
+            # stable across Python versions
+            modinfo_pid = self.execute(f'modinfo -F filename {drm_driver.value}')
+ modinfo_result: ProcessResult = self.execute_wait(modinfo_pid)
+ if modinfo_result.exit_code == 0:
+ available_drivers.append(drm_driver)
+
+ logger.debug("VirtualMachine - found DRM driver module(s): %s", available_drivers)
+ return available_drivers
+
+    def select_driver_module(self) -> DriverModule:
+        available_drivers = self.query_available_drivers()
+        if not available_drivers:
+            raise exceptions.GuestError('No supported DRM driver module found on guest')
+        # Xe is preferred when both i915 and xe drivers are supported by the kernel
+        return DriverModule.XE if DriverModule.XE in available_drivers else available_drivers[0]
+
+ def get_drm_driver(self) -> DriverModule:
+ if self.drm_driver is None:
+ self.drm_driver = self.select_driver_module()
+
+ return self.drm_driver
+
+ @Decorators.timeout_signal
+ def poweron(self) -> None:
+ logger.debug('Powering on VM%s', self.vmnum)
+ if self.is_running():
+ logger.warning('VM%s already running', self.vmnum)
+ return
+
+ command = self.__get_popen_command()
+ # We don't want to kill the process created here (like 'with' would do) so disable the following linter issue:
+ # R1732: consider-using-with (Consider using 'with' for resource-allocating operations)
+ # pylint: disable=R1732
+ # TODO: but maybe 'subprocess.run' function would fit instead of Popen constructor?
+ self.process = subprocess.Popen(
+ args=command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ # 'stdout': self.qemu_stdout,
+ # 'stderr': self.qemu_stderr,
+ universal_newlines=True)
+
+ qemu_stdout_log_thread = threading.Thread(
+ target=self.__log_qemu_output, args=(
+ self.process.stdout,), daemon=True)
+ qemu_stdout_log_thread.start()
+
+ qemu_stderr_log_thread = threading.Thread(
+ target=self.__log_qemu_output, args=(
+ self.process.stderr,), daemon=True)
+ qemu_stderr_log_thread.start()
+
+ if not self.is_running():
+ logger.error('VM%s did not boot', self.vmnum)
+ raise exceptions.GuestError(f'VM{self.vmnum} did not start')
+
+ try:
+ while not self.__sockets_exists():
+ logger.info('waiting for socket')
+ time.sleep(1)
+            # Pass a five-minute timeout for every command
+            self.ga = GuestAgentBackend(self.guestagent_sockpath, 300)
+ self.qm = QmpMonitor(self.qmp_sockpath, 300)
+ vm_status = self.qm.query_status()
+
+ if not self.migrate_destination_vm and vm_status != 'running':
+ self.process.terminate()
+ logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
+ raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+ except Exception as exc:
+ logger.error('Error while booting VM%s: %s', self.vmnum, exc)
+ self.process.terminate()
+ raise exceptions.GuestError(f'VM{self.vmnum} crashed with {exc}') from exc
+
+ def is_running(self) -> bool:
+ if self.process is None:
+ return False
+
+ return_code = self.process.poll()
+ if return_code is None:
+ return True
+
+ # self.__close_qemu_output()
+ return False
+
+ @Decorators.timeout_signal
+ def poweroff(self) -> None:
+ logger.debug('Powering off VM%s', self.vmnum)
+ assert self.process
+ if not self.is_running():
+ logger.warning('VM%s not running', self.vmnum)
+ return
+
+ try:
+ self.ga.poweroff()
+ # Wait for shutdown event
+ event: str = self.qm.get_qmp_event()
+ while event != 'SHUTDOWN':
+ event = self.qm.get_qmp_event()
+ except exceptions.AlarmTimeoutError:
+            logger.warning('VM%s hung on poweroff. Initiating forced termination', self.vmnum)
+ self.process.terminate()
+ finally:
+            # Wait to make sure that QEMU has shut down
+ self.process.communicate()
+ # self.__close_qemu_output()
+
+ if self.__sockets_exists():
+ # Remove leftovers and notify about unclear qemu shutdown
+            os.remove(self.guestagent_sockpath)
+ os.remove(self.qmp_sockpath)
+ raise exceptions.GuestError(f'VM{self.vmnum} was not gracefully powered off - sockets exist')
+
+ def reboot(self) -> None:
+ logger.debug('Rebooting VM%s', self.vmnum)
+ self.qm.system_reset()
+ event: str = self.qm.get_qmp_event()
+ while event != 'RESET':
+ event = self.qm.get_qmp_event()
+
+ def pause(self) -> None:
+ logger.debug('Pausing VM%s', self.vmnum)
+ self.qm.stop()
+ vm_status = self.qm.query_status()
+ if vm_status != 'paused':
+ if self.process:
+ self.process.terminate()
+ logger.error('VM%s status not "paused", instead: %s', self.vmnum, vm_status)
+ raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+ def resume(self) -> None:
+ logger.debug('Resuming VM%s', self.vmnum)
+ self.qm.cont()
+ vm_status = self.qm.query_status()
+ if vm_status != 'running':
+ if self.process:
+ self.process.terminate()
+ logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
+ raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+ def quit(self) -> None:
+ logger.debug('Quitting VM%s', self.vmnum)
+ self.qm.quit()
+ event: str = self.qm.get_qmp_event()
+ while event != 'SHUTDOWN':
+ event = self.qm.get_qmp_event()
+
+ def _enable_suspend(self) -> None:
+ if self.link_exists('/etc/systemd/system/suspend.target'):
+ logger.debug('Enable (unmask) systemd suspend/sleep')
+ self.execute('systemctl unmask suspend.target sleep.target')
+
+ def suspend(self, mode: SuspendMode = SuspendMode.ACPI_S3) -> None:
+ logger.debug('Suspending VM%s (mode: %s)', self.vmnum, mode)
+ self._enable_suspend()
+ if mode == SuspendMode.ACPI_S3:
+ self.ga.suspend_ram()
+ elif mode == SuspendMode.ACPI_S4:
+ # self.ga.suspend_disk()
+ raise exceptions.GuestError('Guest S4 support not implemented')
+ else:
+ raise exceptions.GuestError('Unknown suspend mode')
+
+ event: str = self.qm.get_qmp_event()
+ while event != 'SUSPEND':
+ event = self.qm.get_qmp_event()
+
+ vm_status = self.qm.query_status()
+ if vm_status != 'suspended':
+ if self.process:
+ self.process.terminate()
+ logger.error('VM%s status not "suspended", instead: %s', self.vmnum, vm_status)
+ raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+ def wakeup(self) -> None:
+ logger.debug('Waking up VM%s', self.vmnum)
+ self.qm.system_wakeup()
+
+ event: str = self.qm.get_qmp_event()
+ while event != 'WAKEUP':
+ event = self.qm.get_qmp_event()
+
+ vm_status = self.qm.query_status()
+ if vm_status != 'running':
+ if self.process:
+ self.process.terminate()
+ logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
+ raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+ # {"execute": "guest-exec", "arguments":{"path": "/some/path", "arg": [], "capture-output": true}}
+ # {"error": {"class": "GenericError", "desc": "Guest... "}}
+ def execute(self, command: str) -> int:
+ arr_cmd = shlex.split(command)
+ execout: typing.Dict = self.ga.execute(arr_cmd[0], arr_cmd[1:])
+ ret = execout.get('return')
+ if ret:
+ pid: int = ret.get('pid')
+ logger.debug('Running %s on VM%s with pid %s', command, self.vmnum, pid)
+ return pid
+
+ logger.error('Command %s did not return pid', command)
+ raise exceptions.GuestError(f'No pid returned: {execout}')
+
+ # {'error': {'class': 'GenericError', 'desc': "Invalid parameter 'pid'"}}
+ def execute_status(self, pid: int) -> ProcessResult:
+ out = self.ga.execute_status(pid)
+ status = out.get('return')
+ if not status:
+            raise exceptions.GuestError(f'No output from guest agent: {out}')
+
+ b64stdout = status.get('out-data', '')
+ stdout = base64.b64decode(b64stdout).decode('utf-8')
+
+ b64stderr = status.get('err-data', '')
+ stderr = base64.b64decode(b64stderr).decode('utf-8')
+
+ return ProcessResult(status.get('exited'), status.get('exitcode', None), stdout, stderr)
+
+ @Decorators.timeout_signal
+ def execute_wait(self, pid: int, timeout: int = DEFAULT_TIMEOUT) -> ProcessResult:
+ exec_status = ProcessResult(False, -1, '', '')
+ while not exec_status.exited:
+ exec_status = self.execute_status(pid)
+ time.sleep(1)
+
+ return exec_status
+
+ def execute_signal(self, pid: int, sig: signal.Signals) -> None:
+ signum = int(sig)
+ killpid = self.execute(f'kill -{signum} {pid}')
+ self.execute_wait(killpid)
+
+ def read_file_content(self, path: str) -> str:
+ out = self.ga.guest_file_open(path, 'r')
+ handle = out.get('return')
+ if not handle:
+ raise exceptions.GuestError('Could not open file on guest')
+
+ try:
+ eof: bool = False
+ file_content: typing.List[str] = []
+ while not eof:
+ ret = self.ga.guest_file_read(handle)
+ eof = self.__get_key(ret, ['return', 'eof'])
+ b64buf: str = self.__get_key(ret, ['return', 'buf-b64'])
+ file_content.append(base64.b64decode(b64buf).decode('utf-8'))
+ finally:
+ self.ga.guest_file_close(handle)
+
+ return ''.join(file_content)
+
+ def write_file_content(self, path: str, content: str) -> int:
+ out: typing.Dict = self.ga.guest_file_open(path, 'w')
+ handle = out.get('return')
+ if not handle:
+ raise exceptions.GuestError('Could not open file on guest')
+
+ b64buf: bytes = base64.b64encode(content.encode())
+
+ try:
+ ret = self.ga.guest_file_write(handle, b64buf.decode('utf-8'))
+ count: int = self.__get_key(ret, ['return', 'count'])
+ finally:
+ self.ga.guest_file_close(handle)
+
+ return count
+
+ def dir_exists(self, path: str) -> bool:
+ pid = self.execute(f'/bin/sh -c "[ -d {path} ]"')
+ status = self.execute_wait(pid)
+ if status.exit_code:
+ return False
+ return True
+
+ def link_exists(self, path: str) -> bool:
+ pid = self.execute(f'/bin/sh -c "[ -h {path} ]"')
+ status = self.execute_wait(pid)
+ if status.exit_code:
+ return False
+ return True
+
+ @Decorators.timeout_signal
+ def save_state(self) -> None:
+ logger.debug('Saving VM%s state (snapshot)', self.vmnum)
+ self.qm.save_snapshot()
+
+ job_status: str = self.qm.get_qmp_event_job()
+ while job_status != 'concluded':
+ job_status = self.qm.get_qmp_event_job()
+
+ job_status, job_error = self.qm.query_jobs('snapshot-save')
+ if job_status == 'concluded' and job_error is not None:
+ raise exceptions.GuestError(f'VM{self.vmnum} state save error: {job_error}')
+
+ logger.debug('VM%s state save finished successfully', self.vmnum)
+
+ @Decorators.timeout_signal
+ def load_state(self) -> None:
+ logger.debug('Loading VM state (snapshot)')
+ self.qm.load_snapshot()
+
+ job_status: str = self.qm.get_qmp_event_job()
+ while job_status != 'concluded':
+ job_status = self.qm.get_qmp_event_job()
+
+ job_status, job_error = self.qm.query_jobs('snapshot-load')
+ if job_status == 'concluded' and job_error is not None:
+ raise exceptions.GuestError(f'VM{self.vmnum} state load error: {job_error}')
+
+ logger.debug('VM state load finished successfully')
+
+    # helper_convert_units_to_bytes - convert a size with a unit suffix to bytes
+    # @size_str: multiple-byte unit size with suffix (B/K/M/G)
+    # Returns: size in bytes
+    # TODO: consider moving this function to a new utils module and using a
+    # regex to handle more formats, e.g. both 'M' and 'MB'
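+    # Worked examples (illustrative): '64K' -> 65536, '512M' -> 536870912,
+    # '4G' -> 4294967296; strings without a recognized suffix return 0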
+ def helper_convert_units_to_bytes(self, size_str: str) -> int:
+ size_str = size_str.upper()
+ size_int = 0
+
+ if size_str.endswith('B'):
+ size_int = int(size_str[0:-1])
+ elif size_str.endswith('K'):
+ size_int = int(size_str[0:-1]) * 1024
+ elif size_str.endswith('M'):
+ size_int = int(size_str[0:-1]) * 1024**2
+ elif size_str.endswith('G'):
+ size_int = int(size_str[0:-1]) * 1024**3
+
+ return size_int
+
+ # helper_get_debugfs_selfconfig - read resources allocated to VF from debugfs:
+    # /sys/kernel/debug/dri/@card/gt@gt_num/iov/self_config
+ # @card: card number
+ # @gt_num: GT instance number
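+    # Example self_config content this parser expects (values are illustrative):
+    #   GGTT size: 512M
+    #   LMEM size: 4G
+    #   contexts: 1024
+    #   doorbells: 16
+    #   tile mask: 0x1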
+ def helper_get_debugfs_selfconfig(self, card: int = 0, gt_num: int = 0) -> None:
+        path = f'/sys/kernel/debug/dri/{card}/gt{gt_num}/iov/self_config'
+ out = self.read_file_content(path)
+
+ for line in out.splitlines():
+ param, value = line.split(':')
+
+ if param == 'GGTT size':
+ self._ggtt_size = self.helper_convert_units_to_bytes(value)
+ elif param == 'LMEM size':
+ self._lmem_size = self.helper_convert_units_to_bytes(value)
+ elif param == 'contexts':
+ self._contexts = int(value)
+ elif param == 'doorbells':
+ self._doorbells = int(value)
+ elif param == 'tile mask':
+ self._tile_mask = int(value, base=16)
diff --git a/tools/vmtb/dev-requirements.txt b/tools/vmtb/dev-requirements.txt
new file mode 100644
index 000000000..d41e3fd83
--- /dev/null
+++ b/tools/vmtb/dev-requirements.txt
@@ -0,0 +1,14 @@
+# Testing
+pytest
+
+# Code checking
+mypy
+pylint
+
+# Code formatting
+autopep8
+isort
+
+# Building
+build
+packaging
diff --git a/tools/vmtb/pyproject.toml b/tools/vmtb/pyproject.toml
new file mode 100644
index 000000000..930558298
--- /dev/null
+++ b/tools/vmtb/pyproject.toml
@@ -0,0 +1,26 @@
+[build-system]
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "vmtb"
+version = "1.0.0"
+description = "SR-IOV VM-level test tool"
+readme = "README.md"
+license = {file="LICENSE.txt"}
+requires-python = ">=3.8"
+
+authors = [
+ {name = "Intel Corporation"}
+]
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: MIT License",
+]
+dependencies = [
+ "pytest",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["*"]
diff --git a/tools/vmtb/requirements.txt b/tools/vmtb/requirements.txt
new file mode 100644
index 000000000..5d80ceeab
--- /dev/null
+++ b/tools/vmtb/requirements.txt
@@ -0,0 +1,2 @@
+# Used for running tests
+pytest
diff --git a/tools/vmtb/tests/__init__.py b/tools/vmtb/tests/__init__.py
new file mode 100644
index 000000000..e5a0d9b48
--- /dev/null
+++ b/tools/vmtb/tests/__init__.py
@@ -0,0 +1 @@
+#!/usr/bin/env python3
diff --git a/tools/vmtb/tests/conftest.py b/tools/vmtb/tests/conftest.py
new file mode 100644
index 000000000..9a4d625d5
--- /dev/null
+++ b/tools/vmtb/tests/conftest.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import os
+import posixpath
+from unittest.mock import patch
+
+import pytest
+
+from bench.machines.host import Host
+from bench.machines.virtual.vm import VirtualMachine
+
+
+def pytest_addoption(parser):
+ parser.addoption('--vm-image',
+ action='store',
+ help='OS image to boot on VM')
+
+
+@pytest.fixture(scope='session', name='get_os_image')
+def fixture_get_os_image(request):
+ os_image: str = request.config.getoption('--vm-image')
+ if not os_image:
+ os_image = os.environ.get('VM_IMAGE_PATH', '')
+
+ print(f'Path to OS image: "{os_image}"')
+ assert posixpath.exists(os_image)
+ return os_image
+
+
+@pytest.fixture(scope='session', name='setup_vm')
+def fixture_setup_vm(get_os_image):
+ os_image = get_os_image
+ return VirtualMachine(os_image, 0), VirtualMachine(os_image, 1)
+
+
+@pytest.fixture(scope='function')
+def get_vm(setup_vm):
+ vm, _ = setup_vm
+ vm.poweron()
+
+ yield vm
+
+ vm.poweroff()
+
+
+@pytest.fixture(scope='function')
+def get_vms(setup_vm):
+ vm1, vm2 = setup_vm
+ vm1.poweron()
+ vm2.poweron()
+
+ yield vm1, vm2
+
+ vm1.poweroff()
+ vm2.poweroff()
+
+
+@pytest.fixture(scope='session')
+def get_host():
+    # Mock HW dependent get_pci_info() to return ATS info
+ with patch('bench.machines.pci.get_pci_info', return_value=('0000:8c:00.0', '020A')):
+ yield Host()
diff --git a/tools/vmtb/tests/pytest.ini b/tools/vmtb/tests/pytest.ini
new file mode 100644
index 000000000..5989ddd17
--- /dev/null
+++ b/tools/vmtb/tests/pytest.ini
@@ -0,0 +1,6 @@
+[pytest]
+markers =
+ slow: marks tests as slow (deselect with '-m "not slow"')
+ smoke: suite run by CI
+ vm: only VM tests
+ host: only host tests
diff --git a/tools/vmtb/tests/test_executors.py b/tools/vmtb/tests/test_executors.py
new file mode 100644
index 000000000..621b51c13
--- /dev/null
+++ b/tools/vmtb/tests/test_executors.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import pytest
+
+from bench.executors.shell import ShellExecutor
+
+
+@pytest.mark.host
+@pytest.mark.smoke
+def test_host_simple_exec(get_host):
+ host_echo = ShellExecutor(get_host, 'echo foo')
+ # There is a OS delay here
+ # time.sleep(1)
+ status = host_echo.wait()
+ assert status.exited
+ assert status.exit_code == 0
+ assert status.stdout == 'foo\n'
+ assert not status.stderr
+
+
+@pytest.mark.host
+def test_host_wait_exec(get_host):
+ h_watch = ShellExecutor(get_host, 'sleep 5')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = h_watch.status()
+ assert not status.exited
+ status = h_watch.wait()
+ assert status.exited
+
+
+@pytest.mark.host
+def test_host_terminate_exec(get_host):
+ h_watch = ShellExecutor(get_host, 'sleep 3600')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = h_watch.status()
+ assert not status.exited
+ h_watch.terminate()
+ # time.sleep(1)
+ status = h_watch.wait()
+ assert status.exited
+ assert status.exit_code == -15
+
+
+@pytest.mark.host
+def test_host_kill_exec(get_host):
+ h_watch = ShellExecutor(get_host, 'sleep 3600')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = h_watch.status()
+ assert not status.exited
+ h_watch.kill()
+ # time.sleep(1)
+ status = h_watch.wait()
+ assert status.exited
+ assert status.exit_code == -9
+
+
+@pytest.mark.vm
+def test_vm_simple_exec(get_vm):
+ vm_echo = ShellExecutor(get_vm, 'echo foo')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = vm_echo.wait()
+ assert status.exited
+ assert status.exit_code == 0
+ assert status.stdout == 'foo\n'
+ assert not status.stderr
+
+
+@pytest.mark.vm
+def test_vm_wait_exec(get_vm):
+ vm_sleep = ShellExecutor(get_vm, 'sleep 15')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = vm_sleep.status()
+ assert not status.exited
+ status = vm_sleep.wait()
+ assert status.exited
+
+
+@pytest.mark.vm
+def test_vm_terminate_exec(get_vm):
+ vm_watch = ShellExecutor(get_vm, 'sleep 3600')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = vm_watch.status()
+ assert not status.exited
+ vm_watch.terminate()
+ # time.sleep(5)
+ status = vm_watch.wait()
+ assert status.exited
+
+
+@pytest.mark.vm
+def test_vm_kill_exec(get_vm):
+ vm_watch = ShellExecutor(get_vm, 'sleep 3600')
+    # There is an OS delay here
+ # time.sleep(1)
+ status = vm_watch.status()
+ assert not status.exited
+ vm_watch.kill()
+ # time.sleep(5)
+ status = vm_watch.wait()
+ assert status.exited
diff --git a/tools/vmtb/tests/test_igt_executors.py b/tools/vmtb/tests/test_igt_executors.py
new file mode 100644
index 000000000..d2d8cec75
--- /dev/null
+++ b/tools/vmtb/tests/test_igt_executors.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import pytest
+
+from bench.executors.igt import IgtConfiguration, IgtExecutor, IgtType
+
+
+@pytest.mark.vm
+def test_wait_exec(get_vm):
+ igt_config = IgtConfiguration(
+ test_dir='/usr/local/libexec/igt-gpu-tools/',
+ tool_dir='/usr/local/bin/',
+ lib_dir='/usr/local/lib/x86_64-linux-gnu',
+ result_dir='/usr/local/results',
+ options='-d --piglit-style-dmesg --dmesg-warn-level=4 --abort-on-monitored-error=taint')
+
+ vm_sleep = IgtExecutor(get_vm, IgtType.EXEC_BASIC, igt_config=igt_config)
+ status = vm_sleep.status()
+ assert not status.exited
+ status = vm_sleep.wait()
+ assert status.exited
diff --git a/tools/vmtb/tests/test_timer.py b/tools/vmtb/tests/test_timer.py
new file mode 100644
index 000000000..a7c32d1d8
--- /dev/null
+++ b/tools/vmtb/tests/test_timer.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import time
+
+import pytest
+
+from bench import exceptions
+from bench.executors.shell import ShellExecutor
+
+
+@pytest.mark.vm
+@pytest.mark.slow
+def test_wait_exec(get_vm):
+ vm_sleep = ShellExecutor(get_vm, 'sleep 1500')
+    # There is an OS delay here
+ time.sleep(1)
+ status = vm_sleep.status()
+ assert not status.exited
+ with pytest.raises(exceptions.AlarmTimeoutError):
+ status = vm_sleep.wait()
diff --git a/tools/vmtb/tests/test_vm.py b/tools/vmtb/tests/test_vm.py
new file mode 100644
index 000000000..81e11946d
--- /dev/null
+++ b/tools/vmtb/tests/test_vm.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import time
+
+import pytest
+
+
+@pytest.mark.vm
+@pytest.mark.smoke
+def test_vm_poweroff(get_vm):
+ vm = get_vm
+ # breakpoint()
+ vm.poweron()
+ assert vm.is_running()
+ vm.poweroff()
+ time.sleep(5)
+ assert not vm.is_running()
+
+
+@pytest.mark.vm
+@pytest.mark.smoke
+def test_vm_echo(get_vm):
+ vm = get_vm
+ pid = vm.execute('echo foo')
+ (exited, ec, out, err) = vm.execute_wait(pid)
+ assert exited
+ assert ec == 0
+ assert out == 'foo\n'
+ assert not err
+
+
+@pytest.mark.vm
+@pytest.mark.smoke
+def test_vm_no_cmd(get_vm):
+ with pytest.raises(Exception):
+ get_vm.execute('someunexistingcommand')
+
+
+@pytest.mark.vm
+@pytest.mark.smoke
+def test_vm_cmd_err(get_vm):
+ vm = get_vm
+ pid = vm.execute('ls /someunexistingdir')
+ (exited, ec, out, err) = vm.execute_wait(pid)
+ assert exited
+ assert ec != 0
+ assert not out
+ assert 'No such file or directory' in err
+
+
+@pytest.mark.vm
+@pytest.mark.smoke
+def test_write_read_file(get_vm):
+ vm = get_vm
+ txt = '''Nor is it divided, since it is all alike;
+ and it is not any more there, which would keep it from holding together,
+ nor any worse, but it is all replete with What Is.
+ Therefore it is all continuous: for What Is draws to What Is.'''
+
+ count = vm.write_file_content('/home/gta/poem.txt', txt)
+ assert count == len(txt)
+ ret = vm.read_file_content('/home/gta/poem.txt')
+ assert ret == txt
+
+
+@pytest.mark.vm
+@pytest.mark.smoke
+def test_two_vm_echo(get_vms):
+ vm1, vm2 = get_vms
+ pid1 = vm1.execute('echo foo')
+ assert pid1
+
+ pid2 = vm2.execute('echo bar')
+ assert pid2
+
+ (exited, ec, out, err) = vm1.execute_wait(pid1)
+ assert exited
+ assert ec == 0
+ assert out == 'foo\n'
+ assert not err
+
+ (exited, ec, out, err) = vm2.execute_wait(pid2)
+ assert exited
+ assert ec == 0
+ assert out == 'bar\n'
+ assert not err
diff --git a/tools/vmtb/vmm_flows/__init__.py b/tools/vmtb/vmm_flows/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/vmtb/vmm_flows/conftest.py b/tools/vmtb/vmm_flows/conftest.py
new file mode 100644
index 000000000..ed5461d7f
--- /dev/null
+++ b/tools/vmtb/vmm_flows/conftest.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import json
+import re
+import logging
+import typing
+from pathlib import Path
+import pytest
+
+from bench import exceptions
+from bench.machines.machine_interface import DriverModule
+from bench.machines.host import SriovHost, HOST_DMESG_FILE
+from bench.machines.virtual.vm import VirtualMachine
+from bench.machines.vgpu_profile import VgpuProfile, VgpuProfileClass
+from bench.helpers.helpers import (load_host_drivers, unload_host_drivers,
+ modprobe_driver, modprobe_driver_check, driver_check)
+
+
+logger = logging.getLogger(__name__)
+
+
+def pytest_addoption(parser):
+ parser.addoption('--vm-image',
+ action='store',
+ help='OS image to boot on VM',
+ required=True)
+ parser.addoption('--vm-modparams',
+ action='store',
+ default='',
+ help='DRM driver parameters to use for VM')
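+
+# Example invocation (illustrative sketch; the image path and module parameter
+# below are placeholders, not values shipped with VMTB):
+#   pytest -v vmm_flows/ --vm-image=/home/gta/guest_os.img --vm-modparams='enable_guc=3'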
+
+
+class VmmTestingConfig(typing.NamedTuple):
+ """Structure represents test configuration used by a setup fixture.
+
+ Available settings:
+ - vgpu_profile: profile to apply, empty represents auto provisioning
+    - num_vms: number of VMs to create (the value can differ from the number of enabled VFs)
+ - auto_poweron_vm: assign VFs and power on VMs automatically in setup fixture
+ - auto_probe_vm_driver: probe guest DRM driver in setup fixture (VM must be powered on)
+ - unload_host_drivers_on_teardown: unload host DRM drivers in teardown fixture
+ - wa_reduce_vf_lmem: workaround to reduce VF LMEM (for save-restore/migration tests speed-up)
+ """
+ vgpu_profile: VgpuProfile
+ num_vms: int
+ auto_poweron_vm: bool = True
+ auto_probe_vm_driver: bool = True
+ unload_host_drivers_on_teardown: bool = False
+ # Temporary W/A: reduce size of LMEM assigned to VFs to speed up a VF state save-restore process
+ wa_reduce_vf_lmem: bool = False
+
+ def __str__(self) -> str:
+ if self.vgpu_profile.profileId:
+ config_id = self.vgpu_profile.profileId[-2:] if self.vgpu_profile.profileId[-3] == '_' \
+ else self.vgpu_profile.profileId[-3:]
+ else:
+ config_id = 'Auto'
+
+ return f'{config_id}-{self.num_vms}VM'
+
+ def __repr__(self) -> str:
+ return (f'\nVmmTestingConfig:'
+ f'\nvGPU ProfileID = {self.vgpu_profile.profileId} [{self.num_vms}VM]'
+ f'\nSetup flags:'
+ f'\n\tVM - auto power-on = {self.auto_poweron_vm}'
+ f'\n\tVM - auto DRM driver probe = {self.auto_probe_vm_driver}'
+ f'\n\tHost - unload drivers on teardown = {self.unload_host_drivers_on_teardown}'
+ f'\n\tW/A - reduce VF LMEM (improves migration time) = {self.wa_reduce_vf_lmem}')
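+
+# Minimal construction sketch ('V2' is an example profile ID; assumes the host
+# exposes it via get_vgpu_profile_by_id):
+#   profile = SriovHost().get_vgpu_profile_by_id('V2')
+#   config = VmmTestingConfig(vgpu_profile=profile, num_vms=2)
+#   str(config)   # -> 'V2-2VM', used as the parametrized test ID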
+
+
+class VmmTestingSetup:
+ def __init__(self, os_image, vm_modparams, host, testing_config):
+ self.vm_modparams = vm_modparams
+ self.host: SriovHost = host
+ self.testing_config: VmmTestingConfig = testing_config
+
+ self.vms: typing.List[VirtualMachine] = [
+ VirtualMachine(os_image, i) for i in range(self.testing_config.num_vms)]
+
+ @property
+ def get_host(self):
+ return self.host
+
+ @property
+ def get_vm(self):
+ return self.vms
+
+ @property
+ def get_vm_modprobe_params(self):
+ return self.vm_modparams
+
+ @property
+ def get_vgpu_profile(self):
+ return self.testing_config.vgpu_profile
+
+ def get_num_vms(self) -> int:
+ return len(self.vms)
+
+ def poweron_vms(self):
+ for vm in self.vms:
+ vm.poweron()
+
+ def poweroff_vms(self):
+ for vm in self.vms:
+ if vm.is_running():
+ try:
+ vm.poweroff()
+ except Exception as exc:
+ self.testing_config.unload_host_drivers_on_teardown = True
+ logger.warning("Error on VM%s poweroff (%s)", vm.vmnum, exc)
+
+ if self.testing_config.unload_host_drivers_on_teardown:
+            raise exceptions.GuestError('VM poweroff issue - cleanup on test teardown')
+
+ def teardown(self):
+ try:
+ self.poweroff_vms()
+ except Exception as exc:
+ logger.error("Error on test teardown (%s)", exc)
+ # TODO: perhaps even better: pytest.fail(f'Error on test teardown ({exc})')
+ finally:
+ num_vfs = self.get_host.get_current_vfs()
+ self.get_host.clear_vf()
+ self.get_host.reset_provisioning(num_vfs)
+
+ if self.get_host.drm_driver is DriverModule.I915:
+ # Drop caches to ensure the available LMEM size is stable
+ self.get_host.drop_all_caches()
+
+ if self.testing_config.unload_host_drivers_on_teardown:
+ unload_host_drivers(self.get_host)
+
+
+@pytest.fixture(scope='session', name='get_os_image')
+def fixture_get_os_image(request):
+ return request.config.getoption('--vm-image')
+
+
+@pytest.fixture(scope='session', name='get_vm_modparams')
+def fixture_get_vm_modparams(request):
+ return request.config.getoption('--vm-modparams')
+
+
+@pytest.fixture(scope='session', name='get_host')
+def fixture_get_host():
+ return SriovHost()
+
+
+@pytest.fixture(scope='class', name='setup_vms')
+def fixture_setup_vms(get_os_image, get_vm_modparams, get_host, request):
+ """Arrange VM environment for the VMM Flows test execution.
+
+ VM setup steps follow the configuration provided as VmmTestingConfig parameter, including:
+ host drivers probe (DRM and VFIO), provision and enable VFs, boot VMs and load guest DRM driver.
+ Tear-down phase covers test environment cleanup:
+    shutdown VMs, reset provisioning, disable VFs and optionally unload host drivers.
+
+ The fixture is designed for test parametrization, as the input to the following test class decorator:
+ @pytest.mark.parametrize('setup_vms', set_test_config(max_vms=N), ids=idfn_test_config, indirect=['setup_vms'])
+ where 'set_test_config' provides request parameter with a VmmTestingConfig (usually list of configs).
+ """
+ tc: VmmTestingConfig = request.param
+
+ host: SriovHost = get_host
+ vgpu_profile: VgpuProfile = tc.vgpu_profile
+ num_vfs = vgpu_profile.get_num_vfs()
+
+ ts: VmmTestingSetup = VmmTestingSetup(get_os_image, get_vm_modparams, host, tc)
+
+ logger.info('[Test setup: %s]', tc)
+ logger.debug(repr(tc))
+
+ load_host_drivers(host)
+ assert driver_check(host)
+
+ # XXX: VF migration on discrete devices (with LMEM) is currently very slow and time-outs in CI execution (20min).
+ # As a temporary workaround, reduce size of LMEM assigned to VFs to speed up a state save/load process.
+ if tc.wa_reduce_vf_lmem and host.has_lmem():
+ logger.debug("W/A: reduce VFs LMEM quota to accelerate state save/restore")
+ org_vgpu_profile_vfLmem = vgpu_profile.vfLmem
+ vgpu_profile.vfLmem = min(vgpu_profile.vfLmem // 2, 536870912) # Assign max 512 MB to VF
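+        # Worked example (vfLmem value from the ATSM150 V1 profile, illustrative):
+        # 13528727552 B // 2 = 6764363776 B, which min() caps to 536870912 B (512 MB).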
+
+ if vgpu_profile.get_class() is VgpuProfileClass.AUTO:
+ assert host.get_pf_auto_provisioning(), 'VFs auto-provisioning disabled!'
+ else:
+ host.set_vgpu_profile(vgpu_profile)
+
+ assert host.create_vf(num_vfs) == num_vfs
+
+ if tc.auto_poweron_vm:
+ bdf_list = [host.get_vf_bdf(vf) for vf in range(1, ts.get_num_vms() + 1)]
+ for vm, bdf in zip(ts.get_vm, bdf_list):
+ vm.assign_vf(bdf)
+
+ ts.poweron_vms()
+
+ if tc.auto_probe_vm_driver:
+ modprobe_cmds = [modprobe_driver(vm, ts.get_vm_modprobe_params) for vm in ts.get_vm]
+ for i, cmd in enumerate(modprobe_cmds):
+ assert modprobe_driver_check(ts.get_vm[i], cmd), f'modprobe failed on VM{i}'
+
+ logger.info('[Test execution: %s]', tc)
+ yield ts
+
+ logger.info('[Test teardown: %s]', tc)
+ # XXX: cleanup counterpart for VFs LMEM quota workaround - restore original value
+ if tc.wa_reduce_vf_lmem and host.has_lmem():
+ vgpu_profile.vfLmem = org_vgpu_profile_vfLmem
+
+ ts.teardown()
+
+
+@pytest.fixture(scope='function')
+def create_1host_1vm(get_os_image, get_vm_modparams, get_host):
+ ts: VmmTestingSetup = VmmTestingSetup(get_os_image, get_vm_modparams, get_host, VmmTestingConfig(VgpuProfile(), 1))
+
+ logger.info('[Test setup: %s]', ts.testing_config)
+ logger.debug(repr(ts.testing_config))
+ load_host_drivers(get_host)
+
+ logger.info('[Test execution: %s]', ts.testing_config)
+ yield ts
+
+ logger.info('[Test teardown: %s]', ts.testing_config)
+ ts.teardown()
+
+
+@pytest.fixture(scope='function')
+def create_1host_2vm(get_os_image, get_vm_modparams, get_host):
+ ts: VmmTestingSetup = VmmTestingSetup(get_os_image, get_vm_modparams, get_host, VmmTestingConfig(VgpuProfile(), 2))
+
+ logger.info('[Test setup: %s]', ts.testing_config)
+ logger.debug(repr(ts.testing_config))
+ load_host_drivers(get_host)
+
+ logger.info('[Test execution: %s]', ts.testing_config)
+ yield ts
+
+ logger.info('[Test teardown: %s]', ts.testing_config)
+ ts.teardown()
+
+
+def idfn_test_config(test_config: VmmTestingConfig):
+ """Provide test config ID in parametrized tests (e.g. test_something[V4-2VM].
+ Usage: @pytest.mark.parametrize([...], ids=idfn_test_config, [...])
+ """
+ return str(test_config)
+
+
+RESULTS_FILE = Path() / "results.json"
+results = {
+ "results_version": 10,
+ "name": "results",
+ "tests": {},
+}
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_report_teststatus(report):
+ yield
+ with open(HOST_DMESG_FILE, 'r+', encoding='utf-8') as dmesg_file:
+ dmesg = dmesg_file.read()
+ test_string = re.findall('[A-Za-z_.]*::.*', report.nodeid)[0]
+ results["name"] = f"vmtb_{test_string}"
+ test_name = f"vmtb@{test_string}"
+ if report.when == 'call':
+ out = report.capstdout
+ if report.passed:
+ result = "pass"
+ out = f"{test_name} passed"
+ elif report.failed:
+ result = "fail"
+ else:
+ result = "skip"
+ result = {"out": out, "result": result, "time": {"start": 0, "end": report.duration},
+ "err": report.longreprtext, "dmesg": dmesg}
+ results["tests"][test_name] = result
+ dmesg_file.truncate(0)
+ elif report.when == 'setup' and report.failed:
+ result = {"out": report.capstdout, "result": "crash", "time": {"start": 0, "end": report.duration},
+ "err": report.longreprtext, "dmesg": dmesg}
+ results["tests"][test_name] = result
+ dmesg_file.truncate(0)
+
+
+@pytest.hookimpl()
+def pytest_sessionfinish():
+ if RESULTS_FILE.exists():
+ RESULTS_FILE.unlink()
+ RESULTS_FILE.touch()
+    json_string = json.dumps(results, indent=2)
+    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
+        f.write(json_string)
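+
+# Illustrative shape of the generated results.json (field values are examples only):
+# {
+#   "results_version": 10,
+#   "name": "vmtb_test_basic.py::TestVmSetup::test_vm_boot[Auto-1VM]",
+#   "tests": {
+#     "vmtb@test_basic.py::TestVmSetup::test_vm_boot[Auto-1VM]": {
+#       "out": "...", "result": "pass",
+#       "time": {"start": 0, "end": 12.3}, "err": "", "dmesg": "..."
+#     }
+#   }
+# }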
diff --git a/tools/vmtb/vmm_flows/resources/guc/guc_versions.txt b/tools/vmtb/vmm_flows/resources/guc/guc_versions.txt
new file mode 100644
index 000000000..18b758b29
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/guc/guc_versions.txt
@@ -0,0 +1,4 @@
+70.19.2
+70.13.1
+70.9.1
+70.6.5
\ No newline at end of file
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_int.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_int.csv
new file mode 100755
index 000000000..1c38520f4
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_int.csv
@@ -0,0 +1,14 @@
+vGPUProfileInfo ProfileID,vGPUScheduler ResetAfterVfSwitch,General TileProvisioningMode,PFResources Lmem(B/tile),PFResources Contexts(perTile),PFResources Doorbells(perTile),PFResources GGTTSize(B/tile),VFResources Lmem(B/tile),VFResources Contexts(perTile),VFResources Doorbells(perTile),VFResources GGTTSize(B/tile),AdverseEvents GuCSamplingPeriod(msec),AdverseEvents GuCThresholdCATError,AdverseEvents G2PFNotificationCountCATError,AdverseEvents PFNotificationFreqCATError(msec),AdverseEvents GuCThresholdPageFault,AdverseEvents G2PFNotificationCountPageFault,AdverseEvents PFNotificationFreqPageFault(msec),AdverseEvents GuCThresholdH2GStorm,AdverseEvents G2PFNotificationCountH2GStorm,AdverseEvents PFNotificationFreqH2GStorm(msec),AdverseEvents GuCThresholdDbStorm,AdverseEvents G2PFNotificationCountDbStorm,AdverseEvents PFNotificationFreqDbStorm(msec),AdverseEvents GuCThresholdGTIrqStorm,AdverseEvents G2PFNotificationCountGTIrqStorm,AdverseEvents PFNotificationFreqGTIrqStorm(msec),AdverseEvents GuCThresholdEngineReset,AdverseEvents G2PFNotificationCountEngineReset,AdverseEvents PFNotificationFreqEngineReset(msec)
+ADL_V1,F,3,n/a,1024,32,67108864,n/a,1024,224,4110417920,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_V2,F,3,n/a,1024,32,67108864,n/a,1024,112,2055208960,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_V4,F,3,n/a,1024,32,67108864,n/a,1024,56,1027604480,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_V7,F,3,n/a,1024,32,67108864,n/a,1024,32,587202560,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_L1,F,3,n/a,1024,32,67108864,n/a,1024,224,4177526784,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_L2,F,3,n/a,1024,32,67108864,n/a,1024,112,2088763392,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_L4,F,3,n/a,1024,32,67108864,n/a,1024,56,1044381696,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_L7,F,3,n/a,1024,32,67108864,n/a,1024,32,587202560,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_M1,F,3,n/a,1024,32,67108864,n/a,1024,224,4177526784,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_M2,F,3,n/a,1024,32,67108864,n/a,1024,112,2088763392,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_M4,F,3,n/a,1024,32,67108864,n/a,1024,56,1044381696,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_M7,F,3,n/a,1024,32,67108864,n/a,1024,32,587202560,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ADL_D7,F,3,n/a,1024,32,67108864,n/a,1024,32,587202560,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_vfs.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_vfs.csv
new file mode 100755
index 000000000..f02888d5a
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/ADL_vfs.csv
@@ -0,0 +1,14 @@
+vGPUProfileInfo ProfileID,vGPUProfileInfo Description,vGPUScheduler vGPUSchedulerMode,vGPUScheduler PFExecutionQuanta(msec),vGPUScheduler PFPreemptionTimeout(usec),vGPUScheduler VFExecutionQuanta(msec),vGPUScheduler VFPreemptionTimeout(usec),vGPUScheduler ScheduleIfIdle
+ADL_V1,VDI | 1VF per pGPU | #VFs=1 | 30fps upto [1x4K 2xQHD 4xHD] @ H.264,TS-GPUTile,1,2000,32,64000,F,
+ADL_V2,VDI | NVF per pGPU | #VFs=2 | 30fps upto [1xQHD 2xHD] @ H.264,TS-GPUTile,1,2000,16,32000,F,
+ADL_V4,VDI | NVF per pGPU | #VFs=4 | 30fps upto [1xHD] @ H.264,TS-GPUTile,1,2000,8,16000,F,
+ADL_V7,VDI | NVF per pGPU | #VFs=7 | 30fps upto [1xHD] @ H.264,TS-GPUTile,1,2000,4,8000,F,
+ADL_L1,IDV Local Display | 1VF per pGPU | #VFs=1 | Local Display FPS 30 | VM 30fps upto ,TS-GPUTile,3,6000,30,60000,F,
+ADL_L2,IDV Local Display | NVF per pGPU | #VFs=2 | Local Display FPS 30 | VM 30fps upto ,TS-GPUTile,5,10000,14,28000,F,
+ADL_L4,IDV Local Display | NVF per pGPU | #VFs=4 | Local Display FPS 30 | VM 30fps upto,TS-GPUTile,13,26000,5,10000,F,
+ADL_L7,IDV Local Display | NVF per pGPU | #VFs=7 | Local Display FPS 30 | VM 30fps upto ,TS-GPUTile,19,38000,2,4000,F,
+ADL_M1,MULTI | 1VF per pGPU | #VFs=1 | Best Effort Virtual Display,TS-GPUTile,1,2000,64,128000,F,
+ADL_M2,MULTI | NVF per pGPU | #VFs=2 | Best Effort Virtual Display,TS-GPUTile,1,2000,32,64000,F,
+ADL_M4,MULTI | NVF per pGPU | #VFs=4 | Best Effort Virtual Display,TS-GPUTile,1,2000,16,32000,F,
+ADL_M7,MULTI | NVF per pGPU | #VFs=7 | Best Effort Virtual Display,TS-GPUTile,1,2000,8,16000,F,
+ADL_D7,Legacy Default | NVF per pGPU | #VFs=7 | Local Display | VM 30fps,TS-GPUTile,25,0,25,0,F
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_int.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_int.csv
new file mode 100755
index 000000000..0a54fb147
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_int.csv
@@ -0,0 +1,14 @@
+vGPUProfileInfo ProfileID,vGPUScheduler ResetAfterVfSwitch,General TileProvisioningMode,PFResources Lmem(B/tile),PFResources Contexts(perTile),PFResources Doorbells(perTile),PFResources GGTTSize(B/tile),VFResources Lmem(B/tile),VFResources Contexts(perTile),VFResources Doorbells(perTile),VFResources GGTTSize(B/tile),AdverseEvents GuCSamplingPeriod(msec),AdverseEvents GuCThresholdCATError,AdverseEvents G2PFNotificationCountCATError,AdverseEvents PFNotificationFreqCATError(msec),AdverseEvents GuCThresholdPageFault,AdverseEvents G2PFNotificationCountPageFault,AdverseEvents PFNotificationFreqPageFault(msec),AdverseEvents GuCThresholdH2GStorm,AdverseEvents G2PFNotificationCountH2GStorm,AdverseEvents PFNotificationFreqH2GStorm(msec),AdverseEvents GuCThresholdDbStorm,AdverseEvents G2PFNotificationCountDbStorm,AdverseEvents PFNotificationFreqDbStorm(msec),AdverseEvents GuCThresholdGTIrqStorm,AdverseEvents G2PFNotificationCountGTIrqStorm,AdverseEvents PFNotificationFreqGTIrqStorm(msec),AdverseEvents GuCThresholdEngineReset,AdverseEvents G2PFNotificationCountEngineReset,AdverseEvents PFNotificationFreqEngineReset(msec)
+ATSM150_R1,F,1,1073741824,1024,16,268435456,13528727552,1024,240,4026531840,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_V1,F,1,1073741824,1024,16,268435456,13528727552,1024,240,4026531840,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_V2,F,3,1073741824,1024,16,268435456,6763315200,1024,120,2013265920,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_V4,F,3,1073741824,1024,16,268435456,3380609024,1024,60,1006632960,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_V5,F,3,1073741824,1024,16,268435456,2705326080,1024,48,805306368,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_V8,F,3,1073741824,1024,16,268435456,1690304512,1024,30,503316480,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_V16,F,3,1073741824,1024,16,268435456,845152256,1024,15,251658240,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_M1,F,1,1073741824,1024,16,268435456,13528727552,1024,240,4026531840,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_M2,F,3,1073741824,1024,16,268435456,6763315200,1024,120,2013265920,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_M4,F,3,1073741824,1024,16,268435456,3380609024,1024,60,1006632960,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_M5,F,3,1073741824,1024,16,268435456,2705326080,1024,48,805306368,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_M8,F,3,1073741824,1024,16,268435456,1690304512,1024,30,503316480,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM150_M16,F,3,1073741824,1024,16,268435456,845152256,1024,15,251658240,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_vfs.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_vfs.csv
new file mode 100755
index 000000000..a8dd8c6c7
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM150_vfs.csv
@@ -0,0 +1,14 @@
+vGPUProfileInfo ProfileID,vGPUProfileInfo Description,vGPUScheduler vGPUSchedulerMode,vGPUScheduler PFExecutionQuanta(msec),vGPUScheduler PFPreemptionTimeout(usec),vGPUScheduler VFExecutionQuanta(msec),vGPUScheduler VFPreemptionTimeout(usec),vGPUScheduler ScheduleIfIdle
+ATSM150_R1,RDSH| 1VF per pGPU | #VFs=1 | 60 fps upto [1x5K 2x4K 4xQHD 8xHD] at H.264,TS-GPUTile,1,2000,32,64000,F
+ATSM150_V1,VDI | 1VF per pGPU | #VFs=1 | 60 fps upto [1x5K 2x4K 4xQHD 8xHD] at H.264,TS-GPUTile,1,2000,32,64000,F
+ATSM150_V2,VDI | NVF per pGPU | #VFs=2 | 30 fps upto [1x5K 2x4K 4xQHD 8xHD] at H.264,TS-GPUTile,1,2000,16,32000,F
+ATSM150_V4,VDI | NVF per pGPU | #VFs=4 | 30 fps upto [1x4K 2xQHD 4xHD] at H.264,TS-GPUTile,1,2000,8,16000,F
+ATSM150_V5,VDI | NVF per pGPU | #VFs=5 | 30 fps upto [2xQHD 4xHD] at H.264,TS-GPUTile,1,2000,6,12000,F
+ATSM150_V8,VDI | NVF per pGPU | #VFs=8 | 30 fps upto [1xQHD 2xHD] at H.265,TS-GPUTile,1,2000,4,8000,F
+ATSM150_V16,VDI | NVF per pGPU | #VFs=16 | 30 fps upto [1xHD] at H.264,TS-GPUTile,1,2000,2,4000,F
+ATSM150_M1,MULTI | 1VF per pGPU | #VFs=1 | Best Effort Virtual Display,TS-GPUTile,10,20000,64,128000,F
+ATSM150_M2,MULTI | NVF per pGPU | #VFs=2 | Best Effort Virtual Display,TS-GPUTile,10,20000,32,64000,F
+ATSM150_M4,MULTI | NVF per pGPU | #VFs=4 | Best Effort Virtual Display,TS-GPUTile,10,20000,16,32000,F
+ATSM150_M5,MULTI | NVF per pGPU | #VFs=5 | Best Effort Virtual Display,TS-GPUTile,10,20000,12,24000,F
+ATSM150_M8,MULTI | NVF per pGPU | #VFs=8 | Best Effort Virtual Display,TS-GPUTile,10,20000,8,16000,F
+ATSM150_M16,MULTI | NVF per pGPU | #VFs=16 | Best Effort Virtual Display,TS-GPUTile,10,20000,4,8000,F
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_int.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_int.csv
new file mode 100755
index 000000000..7ee8dc4ab
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_int.csv
@@ -0,0 +1,9 @@
+vGPUProfileInfo ProfileID,vGPUScheduler ResetAfterVfSwitch,General TileProvisioningMode,PFResources Lmem(B/tile),PFResources Contexts(perTile),PFResources Doorbells(perTile),PFResources GGTTSize(B/tile),VFResources Lmem(B/tile),VFResources Contexts(perTile),VFResources Doorbells(perTile),VFResources GGTTSize(B/tile),AdverseEvents GuCSamplingPeriod(msec),AdverseEvents GuCThresholdCATError,AdverseEvents G2PFNotificationCountCATError,AdverseEvents PFNotificationFreqCATError(msec),AdverseEvents GuCThresholdPageFault,AdverseEvents G2PFNotificationCountPageFault,AdverseEvents PFNotificationFreqPageFault(msec),AdverseEvents GuCThresholdH2GStorm,AdverseEvents G2PFNotificationCountH2GStorm,AdverseEvents PFNotificationFreqH2GStorm(msec),AdverseEvents GuCThresholdDbStorm,AdverseEvents G2PFNotificationCountDbStorm,AdverseEvents PFNotificationFreqDbStorm(msec),AdverseEvents GuCThresholdGTIrqStorm,AdverseEvents G2PFNotificationCountGTIrqStorm,AdverseEvents PFNotificationFreqGTIrqStorm(msec),AdverseEvents GuCThresholdEngineReset,AdverseEvents G2PFNotificationCountEngineReset,AdverseEvents PFNotificationFreqEngineReset(msec)
+ATSM75_R1,F,1,1073741824,1024,16,268435456,4401922048,1024,240,4026531840,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_V1,F,1,1073741824,1024,16,268435456,4401922048,1024,240,4026531840,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_V3,F,3,1073741824,1024,16,268435456,1465909248,1024,80,1342177280,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_V6,F,3,1073741824,1024,16,268435456,731906048,1024,40,671088640,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_M1,F,1,1073741824,1024,16,268435456,4401922048,1024,240,4026531840,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_M3,F,3,1073741824,1024,16,268435456,1465909248,1024,80,1342177280,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_M6,F,3,1073741824,1024,16,268435456,731906048,1024,40,671088640,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+ATSM75_M12,F,3,1073741824,1024,16,268435456,364904448,1024,20,335544320,0,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_vfs.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_vfs.csv
new file mode 100755
index 000000000..58ff41175
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/ATSM75_vfs.csv
@@ -0,0 +1,9 @@
+vGPUProfileInfo ProfileID,vGPUProfileInfo Description,vGPUScheduler vGPUSchedulerMode,vGPUScheduler PFExecutionQuanta(msec),vGPUScheduler PFPreemptionTimeout(usec),vGPUScheduler VFExecutionQuanta(msec),vGPUScheduler VFPreemptionTimeout(usec),vGPUScheduler ScheduleIfIdle
+ATSM75_R1,RDSH | 1VF per pGPU | #VFs=1 | 30fps upto [1x5K 2x4K 4xQHD 8xHD] @ H.264,TS-GPUTile,1,2000,32,64000,F
+ATSM75_V1,VDI | 1VF per pGPU | #VFs=1 | 30fps upto [1x5K 2x4K 4xQHD 8xHD] @ H.264,TS-GPUTile,1,2000,32,64000,F
+ATSM75_V3,VDI | NVF per pGPU | #VFs=3 | 30fps upto [1x4K 2xQHD 4xHD] @ H.264,TS-GPUTile,1,2000,11,22000,F
+ATSM75_V6,VDI | NVF per pGPU | #VFs=6 | 30fps upto [1xQHD2xHD] @ H.264,TS-GPUTile,1,2000,5,16000,F
+ATSM75_M1,MULTI | 1VF per pGPU | #VFs=1 | Best Effort Virtual Display,TS-GPUTile,10,20000,64,128000,F
+ATSM75_M3,MULTI | NVF per pGPU | #VFs=3 | Best Effort Virtual Display,TS-GPUTile,10,20000,22,44000,F
+ATSM75_M6,MULTI | NVF per pGPU | #VFs=6 | Best Effort Virtual Display,TS-GPUTile,10,20000,16,32000,F
+ATSM75_M12,MULTI | NVF per pGPU | #VFs=12 | Best Effort Virtual Display,TS-GPUTile,10,20000,8,16000,F
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_int.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_int.csv
new file mode 100755
index 000000000..74557116c
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_int.csv
@@ -0,0 +1,8 @@
+vGPUProfileInfo ProfileID,vGPUScheduler ResetAfterVfSwitch,General TileProvisioningMode,PFResources Lmem(B/tile),PFResources Contexts(perTile),PFResources Doorbells(perTile),PFResources GGTTSize(B/tile),VFResources Lmem(B/tile),VFResources Contexts(perTile),VFResources Doorbells(perTile),VFResources GGTTSize(B/tile),AdverseEvents GuCSamplingPeriod(msec),AdverseEvents GuCThresholdCATError,AdverseEvents G2PFNotificationCountCATError,AdverseEvents PFNotificationFreqCATError(msec),AdverseEvents GuCThresholdPageFault,AdverseEvents G2PFNotificationCountPageFault,AdverseEvents PFNotificationFreqPageFault(msec),AdverseEvents GuCThresholdH2GStorm,AdverseEvents G2PFNotificationCountH2GStorm,AdverseEvents PFNotificationFreqH2GStorm(msec),AdverseEvents GuCThresholdDbStorm,AdverseEvents G2PFNotificationCountDbStorm,AdverseEvents PFNotificationFreqDbStorm(msec),AdverseEvents GuCThresholdGTIrqStorm,AdverseEvents G2PFNotificationCountGTIrqStorm,AdverseEvents PFNotificationFreqGTIrqStorm(msec),AdverseEvents GuCThresholdEngineReset,AdverseEvents G2PFNotificationCountEngineReset,AdverseEvents PFNotificationFreqEngineReset(msec)
+PVC2_C1,F,1,4294967296,1024,16,41943040,64424509440,1024,240,4177526784,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+PVC2_C2,F,2,4294967296,1024,16,41943040,32212254720,1024,240,2126512128,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+PVC2_C4,F,3,4294967296,1024,16,41943040,16106127360,1024,120,1063256064,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+PVC2_C8,F,3,4294967296,1024,16,41943040,8053063680,1024,60,531628032,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+PVC2_C16,F,3,4294967296,1024,16,41943040,4026531840,1024,30,265814016,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+PVC2_C32,F,3,4294967296,1024,16,41943040,2013265920,1024,15,132907008,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
+PVC2_C62,F,3,4294967296,1024,16,41943040,1039104990,1024,7,68597165,2,0,3,10000,0,3,10000,0,3,100,0,3,100,0,3,100,0,3,100
diff --git a/tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_vfs.csv b/tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_vfs.csv
new file mode 100755
index 000000000..7384f4c5b
--- /dev/null
+++ b/tools/vmtb/vmm_flows/resources/vgpu_profile/PVC2_vfs.csv
@@ -0,0 +1,8 @@
+vGPUProfileInfo ProfileID,vGPUProfileInfo Description,vGPUScheduler vGPUSchedulerMode,vGPUScheduler PFExecutionQuanta(msec),vGPUScheduler PFPreemptionTimeout(usec),vGPUScheduler VFExecutionQuanta(msec),vGPUScheduler VFPreemptionTimeout(usec),vGPUScheduler ScheduleIfIdle
+PVC2_C1,COMPUTE| 1VF per pGPU | #VFs=1,TS-GPUTile,64,128000,64,128000,F
+PVC2_C2,COMPUTE| 1VF per Tile | #VFs=2,TS-GPUTile,64,128000,64,128000,F
+PVC2_C4,COMPUTE| 2VFs per Tile | #VFs=4,TS-GPUTile,64,128000,64,128000,F
+PVC2_C8,COMPUTE| 4VFs per Tile | #VFs=8,TS-GPUTile,64,128000,64,128000,F
+PVC2_C16,COMPUTE| 8VFs per Tile | #VFs=16,TS-GPUTile,8,16000,32,64000,T
+PVC2_C32,COMPUTE| 16VFs per Tile | #VFs=32,TS-GPUTile,4,8000,16,32000,T
+PVC2_C62,COMPUTE| 31VFs per Tile | #VFs=62,TS-GPUTile,2,4000,8,16000,T
diff --git a/tools/vmtb/vmm_flows/test_basic.py b/tools/vmtb/vmm_flows/test_basic.py
new file mode 100644
index 000000000..5e45aac04
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_basic.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import time
+from typing import List, Tuple
+
+import pytest
+
+from bench import exceptions
+from bench.executors.igt import IgtExecutor, IgtType
+from bench.executors.gem_wsim import (GemWsim, GemWsimResult, gem_wsim_parallel_exec_and_check,
+ PREEMPT_10MS_WORKLOAD, ONE_CYCLE_DURATION_MS)
+from bench.helpers.helpers import (driver_check, igt_check, igt_run_check, modprobe_driver_run_check)
+from bench.machines.host import SriovHost
+from bench.machines.vgpu_profile import VgpuProfileClass
+from bench.machines.pci import GpuDevice
+from vmm_flows.conftest import VmmTestingSetup, VmmTestingConfig, idfn_test_config
+
+logger = logging.getLogger(__name__)
+
+WL_ITERATIONS_10S = 1000
+WL_ITERATIONS_30S = 3000
+MS_IN_SEC = 1000
+DELAY_FOR_WORKLOAD_SEC = 2 # Wait for gem_wsim to start running [seconds]
+DELAY_FOR_RELOAD_SEC = 3 # Wait before reloading the driver [seconds]
+
+
+def set_test_config(test_variants: List[Tuple[VgpuProfileClass, int]],
+ max_vms: int = 2, vf_driver_load: bool = True) -> List[VmmTestingConfig]:
+ """Helper function to provide a parametrized test with a list of test configuration variants."""
+ logger.debug("Init test variants: %s", test_variants)
+ host = SriovHost()
+ test_configs: List[VmmTestingConfig] = []
+
+ for profile_config in test_variants:
+ try:
+ vgpu_profile = host.get_vgpu_profile_by_class(*profile_config)
+ test_configs.append(VmmTestingConfig(vgpu_profile,
+ min(vgpu_profile.get_num_vfs(), max_vms),
+ auto_probe_vm_driver=vf_driver_load))
+ except exceptions.VgpuProfileError as exc:
+ logger.warning("Test variant not supported: %s", exc)
+
+ return test_configs
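+
+# Usage sketch (assumes the host provides a matching vGPU profile):
+#   configs = set_test_config([(VgpuProfileClass.VDI, 2)], max_vms=2)
+# yields a single VmmTestingConfig for the 2-VF VDI profile, capped at 2 VMs;
+# unsupported variants are dropped with a warning instead of failing collection.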
+
+
+test_variants_1 = [(VgpuProfileClass.AUTO, 1), (VgpuProfileClass.AUTO, 2)]
+
+@pytest.mark.parametrize('setup_vms', set_test_config(test_variants_1), ids=idfn_test_config, indirect=['setup_vms'])
+class TestVmSetup:
+ """Verify basic virtualization setup:
+ - probe PF and VFIO drivers (host)
+ - enable and provision VFs (automatic or manual with vGPU profile)
+ - power on VMs with assigned VFs
+ - probe VF driver (guest)
+ - shutdown VMs, reset provisioning and disable VFs
+ """
+ def test_vm_boot(self, setup_vms):
+ logger.info("Test VM boot: power on VM and probe VF driver")
+ ts: VmmTestingSetup = setup_vms
+
+ for vm in ts.vms:
+ logger.info("[%s] Verify VF DRM driver is loaded in a guest OS", vm)
+ assert driver_check(vm)
+
+
+if SriovHost().gpu_name is GpuDevice.PVC:
+ test_variants_2 = [(VgpuProfileClass.AUTO, 2),
+ (VgpuProfileClass.COMPUTE, 1), (VgpuProfileClass.COMPUTE, 2)]
+else:
+ test_variants_2 = [(VgpuProfileClass.AUTO, 2),
+ (VgpuProfileClass.MULTIPURPOSE, 1), (VgpuProfileClass.MULTIPURPOSE, 2),
+ (VgpuProfileClass.VDI, 4)]
+
+@pytest.mark.parametrize('setup_vms', set_test_config(test_variants_2), ids=idfn_test_config, indirect=['setup_vms'])
+class TestVmWorkload:
+ """Verify basic IGT workload execution a VM(s):
+ - exec_store: basic store submissions on single/multiple VMs
+ - gem_wsim: workload simulator running in parallel on multiple VMs
+ """
+ def test_store(self, setup_vms):
+ logger.info("Test VM execution: exec_store")
+ ts: VmmTestingSetup = setup_vms
+        igt_workloads: List[IgtExecutor] = []
+
+        for vm in ts.vms:
+            logger.info("[%s] Execute basic WL", vm)
+            igt_workloads.append(IgtExecutor(vm, IgtType.EXEC_STORE))
+
+        for igt in igt_workloads:
+ logger.info("[%s] Verify result of basic WL", igt.target)
+ assert igt_check(igt)
+
+ logger.info("[%s] Verify result of basic WL", ts.host)
+        assert igt_run_check(ts.host, IgtType.EXEC_STORE)
+
+ def test_wsim(self, setup_vms):
+ logger.info("Test VM execution: gem_wsim")
+ ts: VmmTestingSetup = setup_vms
+
+ if ts.get_num_vms() < 2:
+ pytest.skip("Test scenario not supported for 1xVM setup ")
+
+ # Single workload takes 10ms GPU time, multiplied by 1000 iterations
+ # gives the expected 10s duration and 100 workloads/sec
+ expected = GemWsimResult(ONE_CYCLE_DURATION_MS * WL_ITERATIONS_10S * len(ts.vms) / MS_IN_SEC,
+ MS_IN_SEC/ONE_CYCLE_DURATION_MS / len(ts.vms))
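+        # Worked example (assuming ONE_CYCLE_DURATION_MS == 10): for 2 VMs,
+        # 10 ms * 1000 iterations * 2 / 1000 = 20 s expected elapsed time and
+        # 1000 / 10 / 2 = 50 expected workloads/sec per VM.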
+
+ # Check preemptable workload
+ result = gem_wsim_parallel_exec_and_check(ts.vms, PREEMPT_10MS_WORKLOAD, WL_ITERATIONS_10S, expected)
+ logger.info("Execute wsim parallel on VMs - results: %s", result)
+
+
+if SriovHost().gpu_name is GpuDevice.PVC:
+ test_variants_3 = [(VgpuProfileClass.AUTO, 2), (VgpuProfileClass.COMPUTE, 2), (VgpuProfileClass.COMPUTE, 4)]
+else:
+ test_variants_3 = [(VgpuProfileClass.AUTO, 2), (VgpuProfileClass.VDI, 2), (VgpuProfileClass.MULTIPURPOSE, 4)]
+
+@pytest.mark.parametrize('setup_vms', set_test_config(test_variants=test_variants_3, max_vms=4, vf_driver_load=False),
+                         ids=idfn_test_config, indirect=['setup_vms'])
+class TestVfDriverLoadRemove:
+ """Verify VF (guest) driver load or remove doesn't affect execution on the other VM:
+ - probe VF driver on the last VM while the first VM is running workload
+ - remove VF driver on the first VM while the last VM is running workload
+ - reload previosuly removed VF driver on the same VM
+ """
+ def test_load(self, setup_vms):
+ logger.info("Test VM driver load: VF driver probe while other VM executes workload")
+ ts: VmmTestingSetup = setup_vms
+
+ vm_first = ts.vms[0]
+ vm_last = ts.vms[-1]
+
+ logger.info("[%s] Load VF driver and run basic WL - first VM", vm_first)
+ assert modprobe_driver_run_check(vm_first, ts.get_vm_modprobe_params)
+
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
+ gem_wsim = GemWsim(vm_first, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+ assert gem_wsim.is_running()
+
+ logger.info("[%s] Load VF driver - last VM", vm_last)
+ assert modprobe_driver_run_check(vm_last, ts.get_vm_modprobe_params)
+
+ result = gem_wsim.wait_results()
+ assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
+
+ def test_reload(self, setup_vms):
+ logger.info("Test VM driver reload: VF driver remove is followed by probe while other VM executes workload")
+ ts: VmmTestingSetup = setup_vms
+
+ vm_first = ts.vms[0]
+ vm_last = ts.vms[-1]
+
+ logger.info("[%s] Run basic WL - last VM", vm_last)
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
+ gem_wsim = GemWsim(vm_last, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+ assert gem_wsim.is_running()
+
+ logger.info("[%s] Remove VF driver - first VM", vm_first)
+ rmmod_pid = vm_first.execute(f'modprobe -rf {vm_first.get_drm_driver()}')
+ assert vm_first.execute_wait(rmmod_pid).exit_code == 0
+
+ time.sleep(DELAY_FOR_RELOAD_SEC)
+
+ logger.info("[%s] Reload VF driver and run basic WL - first VM", vm_first)
+ assert modprobe_driver_run_check(vm_first, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm_first, IgtType.EXEC_STORE)
+
+ result = gem_wsim.wait_results()
+ assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
diff --git a/tools/vmtb/vmm_flows/test_flr_vm.py b/tools/vmtb/vmm_flows/test_flr_vm.py
new file mode 100644
index 000000000..4c7636825
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_flr_vm.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+from bench.executors.igt import IgtExecutor, IgtType
+from bench.executors.gem_wsim import GemWsim
+from bench.helpers.helpers import (driver_check, igt_check, igt_run_check,
+ modprobe_driver, modprobe_driver_check)
+from bench.machines.host import SriovHost
+from bench.machines.virtual.vm import VirtualMachine
+from vmm_flows.conftest import VmmTestingSetup
+
+def test_flr_last(create_1host_2vm):
+ """ Check FLR in MultiVM execution. Reset second VF."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm_first: VirtualMachine = ts.get_vm[0]
+ vm_last: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ total_vfs = host.get_total_vfs()
+ assert host.create_vf(total_vfs) == total_vfs
+ vf_first, vf_last = host.get_vfs_bdf(1, total_vfs)
+
+ vm_first.assign_vf(vf_first)
+ vm_last.assign_vf(vf_last)
+
+ ts.poweron_vms()
+
+ modprobe_first = modprobe_driver(vm_first, ts.get_vm_modprobe_params)
+ modprobe_last = modprobe_driver(vm_last, ts.get_vm_modprobe_params)
+
+ assert modprobe_driver_check(vm_first, modprobe_first)
+ assert modprobe_driver_check(vm_last, modprobe_last)
+
+ igt_vm_first = IgtExecutor(vm_first, IgtType.EXEC_BASIC)
+ igt_vm_last = IgtExecutor(vm_last, IgtType.EXEC_BASIC)
+ assert igt_check(igt_vm_first)
+ assert igt_check(igt_vm_last)
+
+ # get workloads/s during ~2s (default 10ms workload repeated 200 times) as reference
+ gem_wsim_vm_first = GemWsim(vm_first, 1, 200)
+ gem_wsim_result = gem_wsim_vm_first.wait_results()
+ assert gem_wsim_result.elapsed_sec > 1.0
+ expected_wps = gem_wsim_result.workloads_per_sec
+ # with 10ms workload duration we expect ~100 wps, ensure at least half of it
+ assert expected_wps > 50
+
+ # start ~40s workload
+ gem_wsim_vm_first = GemWsim(vm_first, 1, 4000)
+ assert gem_wsim_vm_first.is_running()
+
+ # initiate FLR on last VM
+    assert igt_run_check(vm_last, 'igt@device_reset@unbind-reset-rebind')
+
+ assert gem_wsim_vm_first.is_running()
+ gem_wsim_result = gem_wsim_vm_first.wait_results()
+ assert gem_wsim_result.elapsed_sec > 1.0
+ # check workloads/s did not drop during last VM FLR more than 10%
+ assert gem_wsim_result.workloads_per_sec > expected_wps * 0.9
+
+ # W/A wakeref: VFs must be disabled before starting run on PF to avoid stuck/timeout on DROP_IDLE
+ ts.poweroff_vms()
+ host.clear_vf()
+
+ assert igt_run_check(host, IgtType.EXEC_BASIC)
+
+def test_flr_first(create_1host_2vm):
+ """ Check FLR in MultiVM execution. Reset first VF."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm_first: VirtualMachine = ts.get_vm[0]
+ vm_last: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ total_vfs = host.get_total_vfs()
+ assert host.create_vf(total_vfs) == total_vfs
+ vf_first, vf_last = host.get_vfs_bdf(1, total_vfs)
+
+ vm_first.assign_vf(vf_first)
+ vm_last.assign_vf(vf_last)
+
+ ts.poweron_vms()
+
+ modprobe_first = modprobe_driver(vm_first, ts.get_vm_modprobe_params)
+ modprobe_last = modprobe_driver(vm_last, ts.get_vm_modprobe_params)
+
+ assert modprobe_driver_check(vm_first, modprobe_first)
+ assert modprobe_driver_check(vm_last, modprobe_last)
+
+ igt_vm_first = IgtExecutor(vm_first, IgtType.EXEC_BASIC)
+ igt_vm_last = IgtExecutor(vm_last, IgtType.EXEC_BASIC)
+ assert igt_check(igt_vm_first)
+ assert igt_check(igt_vm_last)
+
+ # get workloads/s during ~2s (default 10ms workload repeated 200 times) as reference
+ gem_wsim_vm_last = GemWsim(vm_last, 1, 200)
+ gem_wsim_result = gem_wsim_vm_last.wait_results()
+ assert gem_wsim_result.elapsed_sec > 1.0
+ expected_wps = gem_wsim_result.workloads_per_sec
+ # with 10ms workload duration we expect ~100 wps, ensure at least half of it
+ assert expected_wps > 50.0
+
+ # start ~40s workload
+ gem_wsim_vm_last = GemWsim(vm_last, 1, 4000)
+ assert gem_wsim_vm_last.is_running()
+
+ # initiate FLR on first VM
+    assert igt_run_check(vm_first, 'igt@device_reset@unbind-reset-rebind')
+
+ assert gem_wsim_vm_last.is_running()
+ gem_wsim_result = gem_wsim_vm_last.wait_results()
+ assert gem_wsim_result.elapsed_sec > 1.0
+ # check workloads/s did not drop during first VM FLR more than 10%
+ assert gem_wsim_result.workloads_per_sec > expected_wps * 0.9
+
+ # W/A wakeref: VFs must be disabled before starting run on PF to avoid stuck/timeout on DROP_IDLE
+ ts.poweroff_vms()
+ host.clear_vf()
+
+ assert igt_run_check(host, IgtType.EXEC_BASIC)
+
+
+def test_flr_both(create_1host_2vm):
+ """ Check FLR in MultiVM execution. Reset both VF."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm_first: VirtualMachine = ts.get_vm[0]
+ vm_last: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ total_vfs = host.get_total_vfs()
+ assert host.create_vf(total_vfs) == total_vfs
+ vf_first, vf_last = host.get_vfs_bdf(1, total_vfs)
+
+ vm_first.assign_vf(vf_first)
+ vm_last.assign_vf(vf_last)
+
+ ts.poweron_vms()
+
+ modprobe_first = modprobe_driver(vm_first, ts.get_vm_modprobe_params)
+ modprobe_last = modprobe_driver(vm_last, ts.get_vm_modprobe_params)
+
+ assert modprobe_driver_check(vm_first, modprobe_first)
+ assert modprobe_driver_check(vm_last, modprobe_last)
+
+ igt_vm_first = IgtExecutor(vm_first, IgtType.EXEC_STORE)
+ igt_vm_last = IgtExecutor(vm_last, IgtType.EXEC_STORE)
+ assert igt_check(igt_vm_first)
+ assert igt_check(igt_vm_last)
+
+    igt_vm_first = IgtExecutor(vm_first, 'igt@device_reset@unbind-reset-rebind')
+    igt_vm_last = IgtExecutor(vm_last, 'igt@device_reset@unbind-reset-rebind')
+ assert igt_check(igt_vm_first)
+ assert igt_check(igt_vm_last)
+
+ # W/A wakeref: VFs must be disabled before starting run on PF to avoid stuck/timeout on DROP_IDLE
+ ts.poweroff_vms()
+ host.clear_vf()
+
+ assert igt_run_check(host, IgtType.EXEC_BASIC)
diff --git a/tools/vmtb/vmm_flows/test_guc_versioning.py b/tools/vmtb/vmm_flows/test_guc_versioning.py
new file mode 100644
index 000000000..f98931cb7
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_guc_versioning.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import re
+from pathlib import Path
+from typing import List, Tuple, Union
+
+import pytest
+
+from bench import exceptions
+from bench.executors.igt import IgtType
+from bench.executors.shell import ShellExecutor
+from bench.helpers.helpers import (cmd_check, igt_run_check, modprobe_driver_run_check, unload_host_drivers, GucVersion)
+from bench.machines.host import SriovHost
+from bench.machines.vgpu_profile import VgpuProfileClass
+from bench.machines import pci
+from bench.machines.virtual.vm import VirtualMachine
+from vmm_flows.conftest import VmmTestingSetup, VmmTestingConfig, idfn_test_config
+
+logger = logging.getLogger(__name__)
+
+GUC_VER_FILE = Path(Path.cwd(), 'vmm_flows/resources/guc/guc_versions.txt')
+
+
+def helper_read_and_parse_guc_file() -> List[GucVersion]:
+ """Helper function to get list of GuC binary versions from text file to iterate over in test."""
+ guc_versions_list: List[GucVersion] = []
+
+ with open(GUC_VER_FILE, 'r', encoding='utf-8-sig') as file:
+ lines = file.readlines()
+
+ for line in lines:
+ line_parsed = line.rstrip().split('.')
+ guc_versions_list.append(GucVersion(int(line_parsed[0]), int(line_parsed[1]), int(line_parsed[2])))
+
+ return guc_versions_list
+
+
+def helper_get_firmware_version_from_str(pattern: str, source_string: str) -> GucVersion:
+ """Helper function to search for 3-digit version tag within a string."""
+ search_result = re.search(pattern, source_string)
+ if search_result is None:
+ raise exceptions.HostError(f'the following string pattern was not found: {pattern}')
+
+ version = [int(i) for i in re.findall(r'\d+', search_result.group())]
+
+ return GucVersion(version[0], version[1], version[2])
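+
+# Example (hypothetical input): for pattern r'found \d+\.\d+\.\d+' and a
+# source_string containing '... version found 70.13.1', the helper matches
+# 'found 70.13.1', extracts the digits and returns GucVersion(70, 13, 1).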
+
+
+def set_versioning_test_config(test_variants: List[Tuple[VgpuProfileClass, int]]) -> List[VmmTestingConfig]:
+ """Helper function to provide a parametrized test with a list of test configuration variants.
+    For the GuC versioning test, the VM shall not power on automatically (to allow a prior GuC FW override
+    via modparam) and the PF driver should be removed on test teardown to reset host configuration changes.
+ """
+ logger.debug("Init test variants: %s", test_variants)
+ host = SriovHost()
+ test_configs: List[VmmTestingConfig] = []
+
+ for profile_config in test_variants:
+ try:
+ vgpu_profile = host.get_vgpu_profile_by_class(*profile_config)
+            test_configs.append(VmmTestingConfig(vgpu_profile,
+                                                 min(vgpu_profile.get_num_vfs(), 1),
+                                                 auto_poweron_vm=False,
+                                                 unload_host_drivers_on_teardown=True))
+ except exceptions.VgpuProfileError as exc:
+ logger.warning("Test variant not supported: %s", exc)
+
+ return test_configs
+
+
+test_variants_1 = [(VgpuProfileClass.AUTO, 1)]
+
+@pytest.mark.parametrize('setup_vms', set_versioning_test_config(test_variants=test_variants_1),
+                         ids=idfn_test_config, indirect=['setup_vms'])
+def test_guc_versioning_pf_legacy(setup_vms):
+ """Verify that VF interface GuC version on VM will automatically fallback if legacy GuC firmware is present on PF.
+ Test will reload host driver multiple times, each time with different GuC firmware binary version, for each reload
+ if VF interface minor version has changed a VM is set up and GuC VF interface is checked from within VM against
+ what PF reports.
+ """
+ ts: VmmTestingSetup = setup_vms
+ host: SriovHost = ts.get_host
+ vm0: VirtualMachine = ts.get_vm[0]
+
+ if host.gpu_name in (pci.GpuDevice.ATSM150, pci.GpuDevice.ATSM75):
+ firmware_prefix = 'dg2_guc_'
+ elif host.gpu_name is pci.GpuDevice.PVC:
+ firmware_prefix = 'pvc_guc_'
+ elif host.gpu_name is pci.GpuDevice.ADLP:
+ firmware_prefix = 'adlp_guc_'
+ else:
+ raise exceptions.HostError(f'GPU Device unknown: {host.gpu_name}')
+
+ results_final: List[Tuple[GucVersion, GucVersion, Union[GucVersion, str], bool, bool]] = []
+ version_pf = GucVersion(0, 0, 0)
+ version_vf = GucVersion(0, 100, 0)
+ version_vm = GucVersion(0, 0, 0)
+ guc_check_list = helper_read_and_parse_guc_file()
+
+ for guc_ver in guc_check_list:
+ unload_host_drivers(host)
+
+ modprobe_driver_run_check(host, f'guc_firmware_path=i915/{firmware_prefix}{str(guc_ver)}.bin')
+
+ guc_info = host.read_file_content(f'{host.get_debugfs_path()}/gt0/uc/guc_info')
+ pf_pattern = r'found \d+\.\d+\.\d+'
+ vf_pattern = r'GuC Submission API Version: \d+\.\d+\.\d+'
+
+ version_pf = helper_get_firmware_version_from_str(pf_pattern, guc_info)
+ version_vf = helper_get_firmware_version_from_str(vf_pattern, guc_info)
+
+ logger.debug('Detected GuC version %s with VF interface %s', version_pf, version_vf)
+
+ # Skip testing for versions with VF interface 1.0
+ if version_vf.major == 1 and version_vf.minor == 0:
+ break
+
+ drm_driver = host.get_drm_driver()
+ load_vfio_pci = ShellExecutor(host, f'modprobe {drm_driver}-vfio-pci')
+ assert cmd_check(load_vfio_pci)
+
+ assert host.create_vf(1) == 1
+ vf1 = host.get_vf_bdf(1)
+ vm0.assign_vf(vf1)
+ vm0.poweron()
+
+ if modprobe_driver_run_check(vm0, ts.get_vm_modprobe_params):
+ logger.debug('Driver loaded')
+ guc_info = vm0.read_file_content(f'{host.get_debugfs_path()}/gt0/uc/guc_info')
+ version_vm = helper_get_firmware_version_from_str(vf_pattern, guc_info)
+ logger.debug('Detected VF interface %s on VM', version_vm)
+
+ workload = igt_run_check(vm0, IgtType.EXEC_STORE)
+ logger.debug("Workload on VM with VF interface %s passed: %s", version_vm, workload)
+
+ results_final.append((version_pf, version_vf, version_vm, True, workload))
+ else:
+ logger.debug('Could not load driver on VM when using GuC %s (VF interface %s)',
+ version_pf, version_vf)
+ results_final.append((version_pf, version_vf, 'driver not loaded', False, False))
+
+ vm0.poweroff()
+ host.clear_vf()
+
+ logger.debug("The list containing results is as follows:")
+ logger.debug("GuC version | VF interface version (supported by PF) | VF interface version (read from VM) |"
+ " modprobe result | WL result")
+ for result in results_final:
+ logger.debug(result)
+
+ results_driver_load = [x[3] for x in results_final]
+ results_workload = [x[4] for x in results_final]
+ assert (all(results_driver_load) and all(results_workload))
diff --git a/tools/vmtb/vmm_flows/test_migration.py b/tools/vmtb/vmm_flows/test_migration.py
new file mode 100644
index 000000000..152cf56a7
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_migration.py
@@ -0,0 +1,955 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import logging
+import enum
+import time
+import random
+import pytest
+
+from bench import exceptions
+from bench.executors.igt import IgtExecutor, IgtType
+from bench.executors.shell import ShellExecutor
+from bench.executors.gem_wsim import GemWsim, PREEMPT_10MS_WORKLOAD, ONE_CYCLE_DURATION_MS
+from bench.helpers.helpers import (load_host_drivers, driver_check,
+ modprobe_driver, modprobe_driver_check, modprobe_driver_run_check,
+ igt_check, igt_run_check, cmd_run_check, duplicate_vm_image)
+from bench.machines.host import SriovHost
+from bench.machines.virtual.vm import VirtualMachine
+from bench.machines.vgpu_profile import VgpuProfile
+from bench.machines import pci
+from vmm_flows.conftest import VmmTestingSetup, VmmTestingConfig
+# TODO: Move provisioning helper functions to a separate lib to facilitate usage from different tests
+from vmm_flows.test_provisioning import (helper_configure_max_available_resources, helper_provision_strategy,
+ helper_fetch_sriov_provisioning, helper_apply_sriov_provisioning,
+ SriovAvailableResources)
+
+logger = logging.getLogger(__name__)
+
+IGT_INIT_DELAY = 10
+MS_IN_SEC = 1000
+
+def test_vf_pause_run_resume(create_1host_1vm):
+ """VF pause blocks execution request until resumed."""
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+ assert driver_check(host)
+
+ vf_num = pause_vf_num = 1
+ assert host.create_vf(1) == 1
+ vf = host.get_vf_bdf(vf_num)
+ vm.assign_vf(vf)
+ vm.poweron()
+
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+
+ # Special handling of pausing VMs with infinite ExecQuanta - refer to SAS for details
+ logger.debug("Set VF1 EQ/PF before the pause")
+ host.set_exec_quantum_ms(pause_vf_num, 0, 1)
+ host.set_preempt_timeout_us(pause_vf_num, 0, 100)
+
+ logger.debug("Pause VF - suspend IGT execution on VM\n")
+ host.set_vf_control(1, host.VfControl.pause)
+
+ # TODO: Implement class for IgtWorkload (containing usual execution times for specific tests)
+ igt_max_exec_time = 35 # usual execution time for gem_spin_batch is 30-32s
+
+ logger.debug("Pause VF - submit IGT workload with timeout %ss\n", igt_max_exec_time)
+ igt_vm = IgtExecutor(vm, IgtType.SPIN_BATCH, timeout=igt_max_exec_time)
+ try:
+ # IGT workload execution suspended by VF pause should cause errors and fail on timeout
+ assert not igt_check(igt_vm)
+ except exceptions.AlarmTimeoutError:
+ logger.info("(Expected) IGT execution timeout in VF paused state - kill IGT process")
+ igt_vm.terminate()
+
+ logger.debug("Resume VF - continue IGT execution on VM\n")
+ host.set_vf_control(1, host.VfControl.resume)
+
+ logger.debug("Reset VF1 EQ/PF to the initial values (infinite) after resume")
+ host.set_exec_quantum_ms(pause_vf_num, 0, 0)
+ host.set_preempt_timeout_us(pause_vf_num, 0, 0)
+
+ # Check host and VM health status after pause-resume transition
+ assert driver_check(host)
+ assert driver_check(vm)
+
+ logger.debug("Retry IGT execution in VF normal (running) state")
+ igt_vm = IgtExecutor(vm, IgtType.SPIN_BATCH, timeout=igt_max_exec_time)
+ try:
+ assert igt_check(igt_vm)
+ logger.debug("IGT workload execution finished in the usual time (<%ss)", igt_max_exec_time)
+ except exceptions.AlarmTimeoutError:
+ logger.error("(Unexpected) IGT execution timeout in VF running state - kill IGT process")
+ igt_vm.terminate()
+ assert False
+
+
+def test_2vm_pause_resume(create_1host_2vm):
+ """
+ VM/VF pause-resume does not affect workload execution:
+ - 2xVFs running 2xVM instance
+ - both VFs auto-provisioned, running IGT workloads
+ - 1st VM/VF is paused and resumed (but VF state is not saved/loaded)
+ - 2nd VM/VF workload should not be interrupted
+ - IGT workloads shall finish successfully on both VMs
+ """
+
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm0: VirtualMachine = ts.get_vm[0]
+ vm1: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ assert host.create_vf(2) == 2
+ vf1, vf2 = host.get_vfs_bdf(1, 2)
+ vm0.assign_vf(vf1)
+ vm1.assign_vf(vf2)
+ ts.poweron_vms()
+
+ pause_vf_num = 1
+
+ assert modprobe_driver_run_check(vm0, ts.get_vm_modprobe_params)
+ assert modprobe_driver_run_check(vm1, ts.get_vm_modprobe_params)
+
+ logger.debug("Submit IGT WL (gem_wsim) on VM0")
+ iterations = 3000 # 3k iterations of 10ms WLs give 30s total expected time
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * iterations / MS_IN_SEC
+ gem_wsim_vm0 = GemWsim(vm0, 1, iterations, PREEMPT_10MS_WORKLOAD)
+
+ # Allow wsim WL to run some time
+ time.sleep(IGT_INIT_DELAY)
+ assert gem_wsim_vm0.is_running()
+
+ logger.debug("Submit IGT WL (gem_spin_batch) on VM1")
+ igt_vm1 = IgtExecutor(vm1, IgtType.SPIN_BATCH)
+
+ # Special handling of pausing VMs with infinite ExecQuanta - refer to SAS for details
+ logger.debug("Set VF1 EQ/PF before the pause")
+ host.set_exec_quantum_ms(pause_vf_num, 0, 1)
+ host.set_preempt_timeout_us(pause_vf_num, 0, 100)
+
+ logger.debug("Pause execution on VM0/VF1")
+ vm0.pause()
+
+ assert igt_check(igt_vm1)
+ logger.debug("VM1 IGT WL (not paused) finished successfully")
+
+ logger.debug("Resume execution on VM0/VF1")
+ vm0.resume()
+
+ logger.debug("Reset VF1 EQ/PF to the initial values (infinite) after resume")
+ host.set_exec_quantum_ms(pause_vf_num, 0, 0)
+ host.set_preempt_timeout_us(pause_vf_num, 0, 0)
+
+ result_vm0 = gem_wsim_vm0.wait_results()
+ assert expected_elapsed_sec * 0.8 < result_vm0.elapsed_sec < expected_elapsed_sec * 1.2
+ logger.debug("VM0 IGT WL (paused-resumed) finished successfully")
+
+ # Check host and VM health status after pause-resume transition
+ assert driver_check(host)
+ assert driver_check(vm0)
+ assert driver_check(vm1)
+
+
+def test_1vm_save_restore_no_driver(create_1host_1vm):
+ """
+ Save/restore single VM state with no guest driver loaded:
+ - 1xVFs running 1xVM instance (single VM acts as source and destination)
+ - platform provisioned with vGPU profile M1 (ATSM, ADLP) or C1 (PVC)
+ - VF state saved and then restored on the same VM instance
+ - driver probed on VM after the resume, IGT workload executed
+ """
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+ profile_id: str = 'C1' if host.gpu_name is pci.GpuDevice.PVC else 'M1'
+
+ assert driver_check(host)
+
+ logger.debug("Set vGPU profile - %s", profile_id)
+ vgpu_profile = ts.get_host.get_vgpu_profile_by_id(profile_id)
+
+    # XXX: VF migration on discrete devices (with LMEM) is currently very slow and times out in CI execution (20 min).
+    # As a temporary workaround, reduce the size of LMEM assigned to VFs to speed up the state save/load process.
+ if host.has_lmem():
+ logger.debug("W/A: reduce VFs LMEM quota to accelerate state save/restore")
+ vgpu_profile.vfLmem = 1073741824 # 1GB
+
+ host.set_vgpu_profile(vgpu_profile)
+
+ assert host.create_vf(1) == 1
+ vf1 = host.get_vf_bdf(1)
+ vm.assign_vf(vf1)
+ vm.poweron()
+
+    # Run an interactive, non-returning program (such as vim) to verify its state after migration
+ src_proc = ShellExecutor(vm, 'vim migrate.txt')
+ src_pid = src_proc.pid
+
+ # Pause VM and save snapshot
+ logger.debug("Pause execution and save VM state")
+ try:
+ vm.pause()
+ vm.save_state()
+ except exceptions.GuestError as exc:
+ logger.warning("Migration error: %s", exc)
+ vm.poweroff()
+ assert False
+
+ # Load previously saved snapshot and resume the same VM
+ logger.debug("Load state on the same VM instance")
+ vm.load_state()
+ vm.resume()
+
+    # Verify the program initiated on the source VM is still running after migration
+ migrated_proc = vm.execute_status(src_pid)
+ logger.debug("Migrated process: %s", migrated_proc)
+ assert migrated_proc.exited is False
+
+ logger.debug("Probe driver and execute workload on VM")
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ logger.debug("Check driver health on host and VM")
+ assert driver_check(host)
+ assert driver_check(vm)
+
+
+# TODO: reuse common 'setup_vms' from the conftest.py
+@pytest.fixture(scope='class', name='setup_vms')
+def fixture_setup_vms(get_os_image, get_vm_modparams, get_host, request):
+ """
+ Main setup fixture for parametrized tests - configures NxVMs.
+    Accepts an input tuple with the number of expected VMs and an optional WorkloadType to execute.
+ Fixture performs the following config:
+ - loads host DRM and VFIO driver if needed
+ - enables VFs for each requested VM and sets vGPU profile
+    - assigns VFs to all requested VMs and boots them
+ - probes guest DRM driver
+ - performs cleanup on test tear-down
+ """
+ num_vms, wl_type = request.param
+ num_vfs = num_vms
+
+ host: SriovHost = get_host
+ profile_id: str = f'C{num_vms}' if host.gpu_name is pci.GpuDevice.PVC else f'M{num_vms}'
+ vgpu_profile: VgpuProfile = host.get_vgpu_profile_by_id(profile_id)
+ ts: VmmTestingSetup = VmmTestingSetup(get_os_image, get_vm_modparams, host, VmmTestingConfig(vgpu_profile, num_vms))
+
+ logger.info('[Test setup - %sxVM]', num_vms)
+ load_host_drivers(host)
+ assert driver_check(host)
+
+    # XXX: VF migration on discrete devices (with LMEM) is currently very slow and times out in CI execution (20 min).
+    # As a temporary workaround, reduce the size of LMEM assigned to VFs to speed up the state save/load process.
+ if host.has_lmem():
+ logger.debug("W/A: reduce VFs LMEM quota to accelerate state save/restore")
+ org_vgpu_profile_vfLmem = vgpu_profile.vfLmem
+ vgpu_profile.vfLmem = min(vgpu_profile.vfLmem // 2, 536870912) # Assign max 512 MB to VF
+
+ host.set_vgpu_profile(vgpu_profile)
+ assert host.create_vf(num_vfs) == num_vfs
+
+ bdf_list = [host.get_vf_bdf(vf) for vf in range(1, ts.get_num_vms() + 1)]
+ for vm, bdf in zip(ts.get_vm, bdf_list):
+ vm.assign_vf(bdf)
+
+ ts.poweron_vms()
+
+ modprobe_cmds = [modprobe_driver(vm, ts.get_vm_modprobe_params) for vm in ts.get_vm]
+ for i, cmd in enumerate(modprobe_cmds):
+ assert modprobe_driver_check(ts.get_vm[i], cmd), f'modprobe failed on VM{i}'
+
+ logger.info('[Test execution]')
+ yield (ts, wl_type)
+
+ logger.info('[Test teardown]')
+ # XXX: cleanup counterpart for VFs LMEM quota workaround - restore original value
+ if host.has_lmem():
+ vgpu_profile.vfLmem = org_vgpu_profile_vfLmem
+
+ ts.teardown()
+
+
+def idfn_num_vms_wl(parameter):
+ """
+    Provides the number of VMs and, optionally, the executed workload in the name of parametrized tests, e.g.:
+    - test_something[1VM]
+    - test_something[2VM-WL:igt_test_name]
+    """
+    num_vfs, wl = parameter
+    return f'{num_vfs}VM-WL:{wl}' if wl else f'{num_vfs}VM'
+
+
+class WorkloadType(str, enum.Enum):
+ # Idle with multiple user contexts created
+ IDLE_USER_CTX = 'await-migration-mulctx-survive'
+ # Idle with default context
+ IDLE_DEF_CTX = 'await-migration-defctx-survive'
+ # Null batches with user contexts:
+ NULL_BATCH = 'await-migration-exec-nop-storm-survive-mulctx'
+ # Null batches with default context:
+ NULL_BATCH_DEF_CTX = 'await-migration-exec-nop-storm-survive-defctx'
+    # Short batches storing a value (aiming at 6ms execution time), synchronize submissions using fences:
+    STORE_DW_BATCH = 'await-migration-exec-store-storm-survive-mulctx'
+    # Short preemptable batches (aiming at 20ms execution time), synchronize submissions using fences:
+    PREEMPT_COUNT_DW_LOW_BATCH = 'await-migration-exec-count-low-storm-survive-wpreem-wfence-mulctx'
+    # Medium-length preemptable batches (aiming at 200ms execution time), synchronize submissions using fences:
+    PREEMPT_COUNT_DW_MEDIUM_BATCH = 'await-migration-exec-count-med-storm-survive-wpreem-wfence-mulctx'
+    # Long preemptable batches (aiming at 2s execution time), synchronize submissions using fences:
+    PREEMPT_COUNT_DW_HIGH_BATCH = 'await-migration-exec-count-hig-storm-survive-wpreem-wfence-mulctx'
+    # Long preemptable batches (aiming at 2s execution time), synchronize submissions using gem_wait(), without fences:
+    PREEMPT_COUNT_DW_HIGH_BATCH_GEM_WAIT = 'await-migration-exec-count-hig-storm-survive-wpreem-wgemwait-mulctx'
+    # Long preemptable batches (aiming at 2s execution time), do not synchronize submissions, only delay on the CPU side:
+    PREEMPT_COUNT_DW_HIGH_BATCH_GEM_WAIT_NO_SYNC = 'await-migration-exec-count-hig-storm-survive-wpreem-nosync-mulctx'
+    # Short non-preemptable batches (aiming at 20ms execution time), synchronize submissions using fences:
+    NONPREEMPT_COUNT_DW_LOW_BATCH = 'await-migration-exec-count-low-storm-survive-npreem-wfence-mulctx'
+    # Medium-length non-preemptable batches (aiming at 200ms execution time), synchronize submissions using fences:
+    NONPREEMPT_COUNT_DW_MEDIUM_BATCH = 'await-migration-exec-count-med-storm-survive-npreem-wfence-mulctx'
+    # Long non-preemptable batches (aiming at 2s execution time), synchronize submissions using fences:
+    NONPREEMPT_COUNT_DW_HIGH_BATCH = 'await-migration-exec-count-hig-storm-survive-npreem-wfence-mulctx'
+
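+    # Return the raw workload name so parametrized test IDs and IGT subtest names use the enum value directly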
+ def __str__(self) -> str:
+ return str.__str__(self)
+
+
+class BaseTestBusyMigration:
+ """
+    Base class for all busy migration tests (with a workload executed), providing save and restore subtests.
+    Supports parametrization with different numbers of VMs and various IGT workload types,
+    but is currently inherited by separate child test classes with a specific WL
+    to avoid executing bulk dynamic test variants within a single test task in GTAx.
+ """
+
+ @pytest.fixture(scope='class', name='run_source_workload')
+ def fixture_run_source_workload(self, setup_vms):
+ ts: VmmTestingSetup
+ wl: IgtExecutor
+ ts, wl = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as source
+
+ # Run IGT workload to check before and after a state checkpoint
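+        # The WorkloadType enum value is used verbatim as the gem_sriov_migration_qemu subtest name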
+        return IgtExecutor(vm_src, f'igt@gem_sriov_migration_qemu@{wl}')
+
+ @pytest.fixture(scope='function', name='setup_destination_vm')
+ def fixture_setup_destination_vm(self, setup_vms):
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as a source
+ vm_dst: VirtualMachine = ts.get_vm[-1] # Last VM as a destination
+ num_vms = ts.get_num_vms()
+
+ if num_vms == 1:
+ logger.debug("Single VM: the same source and destination VM instance")
+ assert vm_src == vm_dst
+ return vm_dst
+
+ logger.debug("Multiple VMs: reload destination VM with the source image (with state snapshot)")
+
+ if vm_src.is_running():
+            # QMP 'quit' is used for a paused VM (cannot be powered off via guest-agent)
+ vm_src.quit()
+
+ if vm_dst.is_running():
+ vm_dst.quit()
+ while vm_dst.is_running():
+ time.sleep(1) # VM usually doesn't terminate immediately
+
+ # Re-start destination VM with an image containing a state snapshot
+ vm_dst.set_migration_source(vm_src.image)
+ vm_dst.poweron()
+
+ return vm_dst
+
+ def test_save(self, setup_vms, run_source_workload):
+ logger.info("Test VM busy migration: state save")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as source
+
+ logger.debug("Execute throughout-migration workload on source VM")
+ migration_wl = run_source_workload
+ time.sleep(IGT_INIT_DELAY)
+
+ # Pause VM and save snapshot
+ logger.debug("Pause execution and save source VM state")
+ try:
+ vm_src.pause()
+ vm_src.save_state()
+ except exceptions.GuestError as exc:
+ logger.warning("State save error: %s", exc)
+ vm_src.quit()
+ assert False
+
+ logger.debug("Resume execution on source VM")
+ vm_src.resume()
+ logger.debug("Check result of throughout-migration workload on source VM")
+ assert igt_check(migration_wl)
+
+ if ts.get_num_vms() > 1:
+ logger.debug("Multiple VMs: shutdown source VM")
+ vm_src.poweroff()
+
+ def test_restore(self, setup_vms, setup_destination_vm, run_source_workload):
+ logger.info("Test VM busy migration: state restore")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_dst: VirtualMachine = setup_destination_vm
+ migration_igt: IgtExecutor = run_source_workload # Get an instance of the IGT WL started in a save test
+
+ # Patch the source IgtExecutor instance with the current VM and clear results cache
+ migration_igt.target = vm_dst
+ migration_igt.results.clear()
+
+ # Load the source state snapshot
+ logger.debug("Restore source state on the destination VM")
+ vm_dst.load_state()
+ vm_dst.resume()
+
+ # TODO: add sync to VM class
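+        # Guest-agent 'guest-sync' echoes the passed token back, confirming the QGA channel is alive after restore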
+ sync_value = random.randint(1, 0xFFFF)
+ assert vm_dst.ga.sync(sync_value)['return'] == sync_value
+
+ time.sleep(IGT_INIT_DELAY)
+ assert igt_check(migration_igt)
+
+ logger.debug("Check driver health on host and destination VM")
+ assert driver_check(ts.host)
+ assert driver_check(vm_dst)
+
+
+@pytest.mark.parametrize('setup_vms',
+ [(1, WorkloadType.NULL_BATCH),
+ (2, WorkloadType.NULL_BATCH)],
+ ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestBusyMigrationNop(BaseTestBusyMigration):
+ """
+ Save-restore VM state with VF busy executing NOP batches:
+ IGT workload initiated pre-migration starts firing empty submissions and
+ during the execution VM state is migrated (VM state snapshot is saved, then restored).
+ In the post-migration some additional null batches are submitted,
+ then IGT verifies GPU is finally idle.
+ Executed in the following VM number variants:
+ - single VF/VM: same VM acts as a source and destination.
+ - multiple VFs/VMs: the workload execution is initiated on the source VM,
+ then migrated and verified on the other, destination one.
+ """
+
+
+@pytest.mark.parametrize('setup_vms',
+ [(1, WorkloadType.STORE_DW_BATCH),
+ (2, WorkloadType.STORE_DW_BATCH)],
+ ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestBusyMigrationShort(BaseTestBusyMigration):
+ """
+ Save-restore VM state with VF busy executing short store batches:
+ IGT workload initiated pre-migration starts firing short submissions storing a value and
+ during the execution VM state is migrated (VM state snapshot is saved, then restored).
+ In the post-migration some additional store_dw batches are submitted,
+    then IGT verifies the value stored by each submission is as expected.
+ Executed in the following VM number variants:
+ - single VF/VM: same VM acts as a source and destination.
+ - multiple VFs/VMs: the workload execution is initiated on the source VM,
+ then migrated and verified on the other, destination one.
+ """
+
+
+@pytest.mark.parametrize('setup_vms',
+ [(1, WorkloadType.PREEMPT_COUNT_DW_MEDIUM_BATCH),
+ (2, WorkloadType.PREEMPT_COUNT_DW_MEDIUM_BATCH)],
+ ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestBusyMigrationLongPreemptable(BaseTestBusyMigration):
+ """
+ Save-restore VM state with VF busy executing quite long (200ms) but preemptable batches:
+ IGT workload initiated pre-migration starts firing relatively complex submissions and
+ during the execution VM state is migrated (VM state snapshot is saved, then restored).
+ In the post-migration some additional batches are submitted,
+    then IGT verifies the value stored by each submission is as expected.
+ Executed in the following VM number variants:
+ - single VF/VM: same VM acts as a source and destination.
+ - multiple VFs/VMs: the workload execution is initiated on the source VM,
+ then migrated and verified on the other, destination one.
+ """
+
+
+@pytest.mark.parametrize('setup_vms',
+ [(1, WorkloadType.IDLE_DEF_CTX),
+ (1, WorkloadType.IDLE_USER_CTX),
+ (2, WorkloadType.IDLE_DEF_CTX),
+ (2, WorkloadType.IDLE_USER_CTX)],
+ ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestIdleAppMigration(BaseTestBusyMigration):
+ """
+ Save-restore VM state with an idle VF but user application attached (user contexts created):
+ IGT workload initiated pre-migration does a single submission but is idle during a save-restore operation,
+ then resumes post-migration to do more submissions on previously created contexts.
+ Executed with two workloads:
+ - Default context used
+ - Multiple user contexts created (one per request)
+ and the following VM number variants:
+ - single VF/VM: same VM acts as a source and destination.
+ - multiple VFs/VMs: the workload execution is initiated on the source VM,
+ then migrated and verified on the other, destination one.
+ """
+
+
+@pytest.mark.parametrize('setup_vms', [(1, None), (2, None)], ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestIdleMigration:
+ """
+ Save-restore VM state with an idle VF and no user application attached:
+    An IGT workload is initiated and completed twice, pre- and post-migration, but is not executing during the save-restore operation.
+    Test setup:
+    - NxVFs running NxVM instances (the first (VM[0]) acts as a source and the last (VM[N-1]) as a destination)
+    - platform provisioned with the relevant vGPU profile M[N] (ATSM, ADLP) or C[N] (PVC)
+    - VF state is saved on the source VM and then restored on the destination VM instance
+      (in the single VF variant, source and destination are the same VM instance)
+ """
+
+ @pytest.fixture(scope='function', name='setup_destination_vm')
+ def fixture_setup_destination_vm(self, setup_vms):
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as a source
+ vm_dst: VirtualMachine = ts.get_vm[-1] # Last VM as a destination
+ num_vms = ts.get_num_vms()
+
+ if num_vms == 1:
+ logger.debug("Single VM: the same source and destination VM instance")
+ assert vm_src == vm_dst
+ return vm_dst
+
+ logger.debug("Multiple VMs: reload destination VM with the source image (with state snapshot)")
+
+ if vm_src.is_running():
+            # QMP 'quit' is used for a paused VM (cannot be powered off via guest-agent)
+ vm_src.quit()
+
+ if vm_dst.is_running():
+ vm_dst.quit()
+ while vm_dst.is_running():
+ time.sleep(1) # VM usually doesn't terminate immediately
+
+ # Re-start destination VM with an image containing a state snapshot
+ vm_dst.set_migration_source(vm_src.image)
+ vm_dst.poweron()
+
+ return vm_dst
+
+ def test_save(self, setup_vms):
+ logger.info("Test VM idle migration: state save")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as source
+
+        # Run an interactive, non-returning program (such as vim) to verify its state after migration
+ src_proc = ShellExecutor(vm_src, 'vim migrate.txt')
+ source_proc = vm_src.execute_status(src_proc.pid)
+ logger.debug("Source process: %s", source_proc)
+ assert source_proc.exited is False, 'Source process is not running'
+
+ logger.debug("Execute pre-migration workload on source VM")
+ assert igt_run_check(vm_src, IgtType.EXEC_STORE)
+
+ # Pause VM and save snapshot
+ logger.debug("Pause execution and save VM state")
+ try:
+ vm_src.pause()
+ vm_src.save_state()
+ except exceptions.GuestError as exc:
+ logger.warning("State save error: %s", exc)
+ vm_src.quit()
+ assert False
+
+ def test_restore(self, setup_vms, setup_destination_vm):
+ logger.info("Test VM idle migration: state restore")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_dst: VirtualMachine = setup_destination_vm
+
+ # Load the source state snapshot
+ logger.debug("Restore source state on the destination VM")
+ vm_dst.load_state()
+ vm_dst.resume()
+
+        # Verify the program initiated on the source VM is still running after migration
+ pgrep_dst = ShellExecutor(vm_dst, 'pgrep -f "vim migrate.txt"')
+ pgrep_dst_result = vm_dst.execute_wait(pgrep_dst.pid)
+ assert pgrep_dst_result.exit_code == 0, 'Source process (vim) not found'
+ restored_proc = vm_dst.execute_status(int(pgrep_dst_result.stdout))
+ logger.debug("Restored process: %s", restored_proc)
+ assert restored_proc.exited is False, 'Restored process is not running'
+
+ logger.debug("Execute post-migration workload on destination VM")
+ assert igt_run_check(vm_dst, IgtType.EXEC_STORE)
+
+ logger.debug("Check driver health on host and destination VM")
+ assert driver_check(ts.host)
+ assert driver_check(vm_dst)
+
+
+@pytest.mark.parametrize('setup_vms', [(1, None), (2, None)], ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestCheckpoint:
+ """Verify a state can be saved for the future use and then loaded at the previous checkpoint."""
+
+ @pytest.fixture(scope='function', name='setup_destination_vm')
+ def fixture_setup_destination_vm(self, setup_vms):
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as a source
+ vm_dst: VirtualMachine = ts.get_vm[-1] # Last VM as a destination
+ num_vms = ts.get_num_vms()
+
+ if num_vms == 1:
+ logger.debug("Single VM: the same source and destination VM instance")
+ assert vm_src == vm_dst
+ return vm_dst
+
+ logger.debug("Multiple VMs: restart destination VM with the source image (with state checkpoint)")
+ vm_dst.poweroff()
+ # Source qcow2 must be copied because multiple VMs cannot run with the same image file
+ vm_dst.set_migration_source(duplicate_vm_image(vm_src.image))
+ vm_dst.poweron()
+ vm_dst.resume()
+ assert modprobe_driver_run_check(vm_dst, ts.get_vm_modprobe_params)
+
+ return vm_dst
+
+ @pytest.fixture(scope='class', name='run_source_workload')
+ def fixture_run_source_workload(self, setup_vms):
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as source
+
+ # Run IGT workload to check before and after a state checkpoint
+ return IgtExecutor(vm_src, IgtType.SPIN_BATCH)
+
+ def test_save(self, setup_vms, run_source_workload):
+ logger.info("Test VM state checkpoint save")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_src: VirtualMachine = ts.get_vm[0] # First VM as source
+ igt_src: IgtExecutor = run_source_workload
+
+ # Save state checkpoint
+ logger.debug("Save VM state checkpoint")
+ try:
+ vm_src.save_state()
+ except exceptions.GuestError as exc:
+ logger.warning("Migration error: %s", exc)
+ vm_src.poweroff()
+ assert False
+
+ # Verify workload submitted prior to the state checkpoint succeeds
+ assert igt_check(igt_src), 'Source IGT workload has failed'
+
+ logger.debug("Check driver health on host and source VM")
+ assert driver_check(ts.get_host)
+ assert driver_check(vm_src)
+
+ def test_load(self, setup_vms, setup_destination_vm, run_source_workload):
+ logger.info("Test VM state checkpoint load")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ vm_dst: VirtualMachine = setup_destination_vm
+ igt_src: IgtExecutor = run_source_workload # Get an instance of the IGT WL started in a save test
+
+ # Patch the source IgtExecutor instance with the current VM and clear results cache
+ igt_src.target = vm_dst
+ igt_src.results.clear()
+
+ # Workload submitted before the checkpoint should not be active before load
+ logger.debug("Verify IGT workload is not executing prior to the state restore (expected pgrep error)")
+ assert not cmd_run_check(vm_dst, 'pgrep igt_runner'), 'IGT workload is (unexpectedly) running'
+
+ # Load previously saved state checkpoint and resume on destination VM
+ logger.debug("Load VM state checkpoint")
+ vm_dst.load_state()
+
+ # Workload submitted before the checkpoint should be restored in running state after load
+ logger.debug("Verify IGT workload is executing again after the state restore")
+ assert not igt_src.status().exited, 'IGT workload is not running after checkpoint load'
+ assert igt_check(igt_src), 'IGT workload loaded on checkpoint has failed'
+
+ logger.debug("Check driver health on host and destination VM")
+ assert driver_check(ts.get_host)
+ assert driver_check(vm_dst)
+
+
+# Host suspend (ACPI S3) - IOT test scenarios
+def test_provisioning_after_host_S3(create_1host_1vm):
+ """ Verify PF/VF provisioning is properly restored after a host suspend cycle."""
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ profile_id: str = 'C1' if host.gpu_name is pci.GpuDevice.PVC else 'M1'
+
+ assert driver_check(host)
+
+ logger.debug("Set vGPU profile - %s", profile_id)
+ vgpu_profile = ts.get_host.get_vgpu_profile_by_id(profile_id)
+ host.set_vgpu_profile(vgpu_profile)
+
+ assert host.create_vf(1) == 1
+
+ # PF contexts are currently assigned by the driver, so read the actual value from the sysfs
+ pf_ctxs_pre_suspend = host.get_contexts_quota(0, 0)
+
+ host.suspend()
+ assert driver_check(host)
+
+ logger.debug("Verify PF provisioning after host suspend cycle")
+ assert host.get_pf_policy_sched_if_idle(0) == vgpu_profile.scheduleIfIdle
+ assert host.get_pf_policy_engine_reset(0) == vgpu_profile.resetAfterVfSwitch
+ assert host.get_contexts_quota(0, 0) == pf_ctxs_pre_suspend
+ assert host.get_doorbells_quota(0, 0) == vgpu_profile.pfDoorbells
+ assert host.get_exec_quantum_ms(0, 0) == vgpu_profile.pfExecutionQuanta
+ assert host.get_preempt_timeout_us(0, 0) == vgpu_profile.pfPreemptionTimeout
+
+ logger.debug("Verify VF provisioning after host suspend cycle")
+ assert host.get_ggtt_quota(1, 0) == vgpu_profile.vfGgtt
+ assert host.get_lmem_quota(1, 0) == vgpu_profile.vfLmem
+ assert host.get_contexts_quota(1, 0) == vgpu_profile.vfContexts
+ assert host.get_doorbells_quota(1, 0) == vgpu_profile.vfDoorbells
+ assert host.get_exec_quantum_ms(1, 0) == vgpu_profile.vfExecutionQuanta
+ assert host.get_preempt_timeout_us(1, 0) == vgpu_profile.vfPreemptionTimeout
+
+
+@pytest.mark.parametrize('setup_vms', [(1, None)], ids = idfn_num_vms_wl, indirect=['setup_vms'])
+class TestHostSuspend:
+ def test_vm_suspended(self, setup_vms):
+ logger.info("Host suspend scenario: VM has been also suspended to RAM")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+
+ logger.debug("Execute pre-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ vm.suspend()
+ host.suspend()
+ vm.wakeup()
+
+ logger.debug("Execute post-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ driver_check(host)
+ driver_check(vm)
+
+ def test_vm_saved(self, setup_vms):
+ logger.info("Host suspend scenario: VM state has been saved before host suspend")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+
+ logger.debug("Execute pre-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ vm.pause()
+ vm.save_state()
+
+ host.suspend()
+
+ vm.load_state()
+ vm.resume()
+
+ logger.debug("Execute post-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ driver_check(host)
+ driver_check(vm)
+
+ def test_vm_running(self, setup_vms):
+ logger.info("Host suspend scenario: VM has not been paused (VM in running state)")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+
+ logger.debug("Execute pre-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ host.suspend()
+
+ logger.debug("Execute post-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ driver_check(host)
+ driver_check(vm)
+
+ def test_vm_paused(self, setup_vms):
+ logger.info("Host suspend scenario: VM has been paused before host suspend")
+ ts: VmmTestingSetup
+ ts, _ = setup_vms
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+
+ logger.debug("Execute pre-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ vm.pause()
+ host.suspend()
+
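+        # Empirical settle delay: give the host a moment after resume before resuming the paused VM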
+ time.sleep(3)
+ vm.resume()
+
+ logger.debug("Execute post-suspend workload on VM")
+ assert igt_run_check(vm, IgtType.EXEC_STORE)
+
+ logger.debug("Check driver health on host and destination VM")
+ assert driver_check(host)
+ assert driver_check(vm)
+
+
+# Negative test scenarios
+def helper_negative_control(host: SriovHost, vf_num: int, operation: SriovHost.VfControl) -> bool:
+ """
+ Helper function for submitting illegal VF control operations.
+ Returns True on expected fail, False if illegal operation succeeds.
+ """
+ try:
+ host.set_vf_control(vf_num, operation)
+ except exceptions.HostError as exc:
+ logger.warning("VF%s: operation %s not allowed (%s)", vf_num, operation, exc)
+ return True
+
+ return False
+
+
+def helper_negative_vfs_disabled(host: SriovHost) -> None:
+ """Helper function to check illegal operations on disabled VFs."""
+ assert host.get_current_vfs() == 0
+
+ vf_first, vf_last = 1, host.get_total_vfs()
+ vf_random = random.randint(vf_first+1, vf_last-1)
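+    # Note: assumes total_vfs >= 3 so a random VF strictly between the first and the last exists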
+
+ logger.info("[Expected: Error] VF disabled: check 'pause'/'resume' on unavailable VF\n")
+ assert helper_negative_control(host, vf_first, host.VfControl.pause)
+ assert helper_negative_control(host, vf_first, host.VfControl.resume)
+
+ assert helper_negative_control(host, vf_last, host.VfControl.pause)
+ assert helper_negative_control(host, vf_last, host.VfControl.resume)
+
+ assert helper_negative_control(host, vf_random, host.VfControl.pause)
+ assert helper_negative_control(host, vf_random, host.VfControl.resume)
+
+
+def helper_negative_vfs_enabled(host: SriovHost) -> None:
+ """Helper function to check illegal operations on enabled VFs."""
+ assert host.get_current_vfs() == 2
+
+ logger.info("[Expected: Error] VF enabled: check 'resume' on running (not-paused) VF\n")
+ assert helper_negative_control(host, 1, host.VfControl.resume)
+ assert helper_negative_control(host, 2, host.VfControl.resume)
+
+ logger.info("[Expected: Success] VF enabled: check 'pause' on running (not-paused) VF\n")
+ assert not helper_negative_control(host, 1, host.VfControl.pause)
+ assert not helper_negative_control(host, 2, host.VfControl.pause)
+
+ logger.info("[Expected: Error] VF enabled: check double 'pause'\n")
+ assert helper_negative_control(host, 1, host.VfControl.pause)
+ assert helper_negative_control(host, 2, host.VfControl.pause)
+
+ logger.info("[Expected: Success] VF enabled: check 'resume' on paused VF\n")
+ assert not helper_negative_control(host, 1, host.VfControl.resume)
+ assert not helper_negative_control(host, 2, host.VfControl.resume)
+
+ logger.info("[Expected: Error] VF enabled: check double 'resume'\n")
+ assert helper_negative_control(host, 1, host.VfControl.resume)
+ assert helper_negative_control(host, 2, host.VfControl.resume)
+
+# TODO: Consider refactoring the negative subtests below:
+# isolate common flow (for auto/manual/multitile) in a single function with provisioning lib
+def test_negative_2vf_pause_resume_auto(create_1host_2vm):
+ """Negative test: verify illegal VF pause-resume with 2xVFs auto provisioned."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm0: VirtualMachine = ts.get_vm[0]
+ vm1: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ helper_negative_vfs_disabled(host)
+
+ assert host.create_vf(2) == 2
+ vf1, vf2 = host.get_vfs_bdf(1, 2)
+ vm0.assign_vf(vf1)
+ vm1.assign_vf(vf2)
+
+ ts.poweron_vms()
+
+ assert modprobe_driver_run_check(vm0, ts.get_vm_modprobe_params)
+ assert modprobe_driver_run_check(vm1, ts.get_vm_modprobe_params)
+
+ helper_negative_vfs_enabled(host)
+
+
+def test_negative_2vf_pause_resume_manual(create_1host_2vm):
+ """Negative test: verify illegal VF pause-resume with 2xVFs manual provisioned on a root tile."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm0: VirtualMachine = ts.get_vm[0]
+ vm1: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ helper_provision_strategy(ts, helper_configure_max_available_resources, 0)
+
+ helper_negative_vfs_disabled(host)
+
+ assert host.create_vf(2) == 2
+ vf1, vf2 = host.get_vfs_bdf(1, 2)
+ vm0.assign_vf(vf1)
+ vm1.assign_vf(vf2)
+
+ ts.poweron_vms()
+
+ assert modprobe_driver_run_check(vm0, ts.get_vm_modprobe_params)
+ assert modprobe_driver_run_check(vm1, ts.get_vm_modprobe_params)
+
+ helper_negative_vfs_enabled(host)
+
+
+def test_negative_2vf_pause_resume_manual_multitile(create_1host_2vm):
+ """Negative test: verify illegal VF pause-resume with 2xVFs manual provisioned on multi-tile."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm0: VirtualMachine = ts.get_vm[0]
+ vm1: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+    # Test can be executed only on a multi-tile device - skip if the requirement is not met
+ if host.get_num_gts() < 2:
+ pytest.skip("Test is not supported on single tile device")
+
+ vf_num, gt_num = 1, 0
+ sar_gt0 = SriovAvailableResources(host, gt_num)
+ sar_gt0.print_available_resources()
+ spc_vf1 = helper_configure_max_available_resources(vf_num, gt_num, sar_gt0, 1)
+ helper_apply_sriov_provisioning(host, spc_vf1)
+ logger.info("VF#%s received SRIOV provisioning config:\n", vf_num)
+ helper_fetch_sriov_provisioning(host, vf_num, gt_num).print_provisioning_config()
+
+ vf_num, gt_num = 2, 1
+ sar_gt1 = SriovAvailableResources(host, gt_num)
+ sar_gt1.print_available_resources()
+ spc_vf2 = helper_configure_max_available_resources(vf_num, gt_num, sar_gt1, 1)
+ helper_apply_sriov_provisioning(host, spc_vf2)
+ logger.info("VF#%s received SRIOV provisioning config:\n", vf_num)
+ helper_fetch_sriov_provisioning(host, vf_num, gt_num).print_provisioning_config()
+
+ helper_negative_vfs_disabled(host)
+
+ assert host.create_vf(2) == 2
+ vf1, vf2 = host.get_vfs_bdf(1, 2)
+ vm0.assign_vf(vf1)
+ vm1.assign_vf(vf2)
+
+ ts.poweron_vms()
+
+ assert modprobe_driver_run_check(vm0, ts.get_vm_modprobe_params)
+ assert modprobe_driver_run_check(vm1, ts.get_vm_modprobe_params)
+
+ helper_negative_vfs_enabled(host)
diff --git a/tools/vmtb/vmm_flows/test_provisioning.py b/tools/vmtb/vmm_flows/test_provisioning.py
new file mode 100644
index 000000000..fccf03e6c
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_provisioning.py
@@ -0,0 +1,555 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import random
+from bisect import bisect
+import enum
+import logging
+import math
+import typing
+import pytest
+
+from bench.helpers.helpers import (driver_check, igt_run_check, load_host_drivers, modprobe_driver,
+ igt_check, modprobe_driver_run_check, modprobe_driver_check)
+from bench.machines.host import SriovHost
+from bench.machines.virtual.vm import VirtualMachine
+from bench.machines.vgpu_profile import VgpuProfile
+from bench.executors.gem_wsim import gem_wsim_parallel_exec_and_check
+from bench.executors.igt import IgtExecutor, IgtType, igt_list_subtests
+from bench.executors.shell import ShellExecutor
+from vmm_flows.conftest import VmmTestingSetup, VmmTestingConfig
+
+logger = logging.getLogger(__name__)
+
+
+class SriovProvisioningConfig():
+ def __init__(self, vf_num, gt_num,
+ ggtt = 0, lmem = 0, ctxs_num = 0, dbs_num = 0, exec_quantum = 0, preempt_timeout = 0):
+ self.vf_num: int = vf_num
+ self.gt_num: int = gt_num
+ self.ggtt_quota: int = ggtt
+ self.lmem_quota: int = lmem
+ self.contexts_quota: int = ctxs_num
+ self.doorbells_quota: int = dbs_num
+ self.exec_quantum: int = exec_quantum
+ self.preempt_timeout: int = preempt_timeout
+
+ def print_provisioning_config(self):
+ logger.info(
+ "\nSRIOV Provisioning Settings (VF%s / GT%s):\n"
+ "\tggtt_quota = %s (%s) B\n"
+ "\tlmem_quota = %s (%s) B\n"
+ "\tcontexts_quota = %s (%s)\n"
+ "\tdoorbells_quota = %s (%s)\n"
+ "\texec_quantum = %s (%s) ms\n"
+ "\tpreempt_timeout = %s (%s) us\n",
+ self.vf_num, self.gt_num,
+ self.ggtt_quota, hex(self.ggtt_quota),
+ self.lmem_quota, hex(self.lmem_quota),
+ self.contexts_quota, hex(self.contexts_quota),
+ self.doorbells_quota, hex(self.doorbells_quota),
+ self.exec_quantum, hex(self.exec_quantum),
+ self.preempt_timeout, hex(self.preempt_timeout)
+ )
+
+class SriovAvailableResources():
+ def __init__(self, host: SriovHost, gt_num: int):
+ self.gt_num: int = gt_num
+ _, self.ggtt_available = host.get_debugfs_ggtt(gt_num)
+ self.lmem_max_quota = host.get_pf_lmem_max_quota(gt_num)
+ self.contexts_max_quota = host.get_pf_contexts_max_quota(gt_num)
+ self.doorbells_max_quota = host.get_pf_doorbells_max_quota(gt_num)
+
+ def print_available_resources(self):
+ logger.info(
+ "\nSRIOV Available Resources (GT%s):\n"
+ "\tggtt_available = %s (%s) B\n"
+ "\tlmem_max_quota = %s (%s) B\n"
+ "\tcontexts_max_quota = %s (%s)\n"
+ "\tdoorbells_max_quota = %s (%s)\n",
+ self.gt_num,
+ self.ggtt_available, hex(self.ggtt_available),
+ self.lmem_max_quota, hex(self.lmem_max_quota),
+ self.contexts_max_quota, hex(self.contexts_max_quota),
+ self.doorbells_max_quota, hex(self.doorbells_max_quota)
+ )
+
+
+# Perform the VM-only steps to create and boot VMs:
+# assign an already-enabled VF to each VM and power it on
+def helper_prepare_vms(ts: VmmTestingSetup):
+ host: SriovHost = ts.get_host
+ assert driver_check(host)
+
+ num_vms = ts.get_num_vms()
+
+ for i in range(num_vms):
+ vm = ts.get_vm[i]
+ pass_vf = host.get_vf_bdf(i + 1)
+
+ vm.assign_vf(pass_vf)
+ vm.poweron()
+
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+
+
+# Perform all required steps (VF/VM) to create and boot VMs:
+# enable VFs, then assign them to VMs, power on and run an IGT test
+def helper_create_run_vms(ts: VmmTestingSetup):
+ host: SriovHost = ts.get_host
+ assert driver_check(host)
+
+ num_vms = ts.get_num_vms()
+ logger.info("VmmTestingSetup requests %sxVM to enable\n", num_vms)
+
+ assert host.create_vf(num_vms) == num_vms
+
+ for i in range(num_vms):
+ vm = ts.get_vm[i]
+ pass_vf = host.get_vf_bdf(i + 1)
+
+ vm.assign_vf(pass_vf)
+ vm.poweron()
+
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm, IgtType.EXEC_BASIC)
+
+
+# Assert auto-provisioning is enabled
+def helper_ensure_auto_provisioning(ts: VmmTestingSetup):
+ host: SriovHost = ts.get_host
+ assert driver_check(host)
+
+ host.set_pf_auto_provisioning(True)
+ assert host.get_pf_auto_provisioning() is True
+
+
+# Provision minimal resources allowing the OS to boot on a VM
+def helper_configure_min_viable_resources(vf_num: int, gt_num: int, *_) -> SriovProvisioningConfig:
+ spc: SriovProvisioningConfig = SriovProvisioningConfig(vf_num, gt_num)
+
+    # Guest OS Ubuntu - minimal resources to boot a VM and successfully load i915 (found experimentally):
+ # GGTT: 16MB (64kB min to set)
+ # LMEM: 16MB (2MB min to set)
+ # Contexts: 128 min to set
+ spc.ggtt_quota = 0x1000000
+ spc.lmem_quota = 0x1000000
+ spc.contexts_quota = 1 # aligns to 128
+ spc.doorbells_quota = 1
+ spc.exec_quantum = 1
+ spc.preempt_timeout = 1
+
+ return spc
+
+
+# Provision maximal available resources divided between a given number of VFs
+def helper_configure_max_available_resources(vf_num: int,
+ gt_num: int,
+ sar: SriovAvailableResources,
+ num_vfs: int) -> SriovProvisioningConfig:
+
+ spc: SriovProvisioningConfig = SriovProvisioningConfig(vf_num, gt_num)
+
+ # Provide alignment margin when dividing max resources per VFs (64k for GGTT, 2M for LMEM, 128 for ctxs)
+ spc.ggtt_quota = int((sar.ggtt_available - (num_vfs * 0x10000)) / num_vfs)
+ spc.lmem_quota = int((sar.lmem_max_quota - (num_vfs * 0x200000)) / num_vfs)
+ spc.contexts_quota = int((sar.contexts_max_quota - (num_vfs * 128)) / num_vfs)
+ spc.doorbells_quota = int(sar.doorbells_max_quota / num_vfs)
+ spc.exec_quantum = 0 # infinity
+ spc.preempt_timeout = 0 # infinity
+
+ return spc
+
+
+# Apply SRIOV Provisioning test strategy: auto provisioning, minimal, maximal or random resources
+# Requested strategy is passed as the callback function 'func_strategy'
+def helper_provision_strategy(ts: VmmTestingSetup, func_strategy: typing.Callable, gt_num: int):
+ host: SriovHost = ts.get_host
+ assert driver_check(host)
+
+ num_vfs = ts.get_num_vms()
+ logger.info("[%s] Test requests %sxVF to provision\n", func_strategy.__name__, num_vfs)
+
+ for vf_num in range(1, num_vfs + 1):
+ sar: SriovAvailableResources = SriovAvailableResources(host, gt_num)
+ sar.print_available_resources()
+
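+        # The VFs not yet provisioned (num_vfs - vf_num + 1) share the resources still available after earlier VFs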
+ spc: SriovProvisioningConfig = func_strategy(vf_num, gt_num, sar, num_vfs - vf_num + 1)
+ logger.info("VF#%s requested SRIOV provisioning config:\n", vf_num)
+ spc.print_provisioning_config()
+
+ helper_apply_sriov_provisioning(host, spc)
+
+ logger.info("VF#%s received SRIOV provisioning config:\n", vf_num)
+ helper_fetch_sriov_provisioning(host, vf_num, gt_num).print_provisioning_config()
+
+
+def helper_fetch_sriov_provisioning(host: SriovHost, vf_num: int, gt_num: int) -> SriovProvisioningConfig:
+ return SriovProvisioningConfig(vf_num, gt_num,
+ host.get_ggtt_quota(vf_num, gt_num),
+ host.get_lmem_quota(vf_num, gt_num),
+ host.get_contexts_quota(vf_num, gt_num),
+ host.get_doorbells_quota(vf_num, gt_num),
+ host.get_exec_quantum_ms(vf_num, gt_num),
+ host.get_preempt_timeout_us(vf_num, gt_num))
+
+
+def helper_apply_sriov_provisioning(host: SriovHost, ps: SriovProvisioningConfig):
+ gt_num = ps.gt_num
+ vf_num = ps.vf_num
+
+ host.set_ggtt_quota(vf_num, gt_num, ps.ggtt_quota)
+ host.set_lmem_quota(vf_num, gt_num, ps.lmem_quota)
+ host.set_contexts_quota(vf_num, gt_num, ps.contexts_quota)
+ host.set_doorbells_quota(vf_num, gt_num, ps.doorbells_quota)
+ host.set_exec_quantum_ms(vf_num, gt_num, ps.exec_quantum)
+ host.set_preempt_timeout_us(vf_num, gt_num, ps.preempt_timeout)
+
+
+def test_provision_1vf_auto(create_1host_1vm):
+ """Enable 1xVF with auto provisioning"""
+ ts: VmmTestingSetup = create_1host_1vm
+
+ helper_ensure_auto_provisioning(ts)
+ helper_create_run_vms(ts)
+
+
+def test_provision_2vf_auto(create_1host_2vm):
+ """Enable 2xVF with auto provisioning"""
+ ts: VmmTestingSetup = create_1host_2vm
+
+ helper_ensure_auto_provisioning(ts)
+ helper_create_run_vms(ts)
+
+
+def test_provision_2vf_late(create_1host_2vm):
+ """Enable 2xVF, 1st provisioned early (before enabling), 2nd late (after enabling)"""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ assert driver_check(host)
+
+ early_vf_num, late_vf_num = 1, 2
+ gt_num = 0
+
+    # Early-provision the 1st VF with minimal resources; leave the 2nd VF unprovisioned (default config)
+ min_spc = vf1_spc_requested = helper_configure_min_viable_resources(early_vf_num, gt_num)
+ helper_apply_sriov_provisioning(host, vf1_spc_requested)
+
+ # Enable both VFs
+ num_vms = ts.get_num_vms()
+ assert host.create_vf(num_vms) == num_vms
+
+ # Then (late) provision already enabled 2nd VF with maximal available resources
+ late_sar = SriovAvailableResources(host, gt_num)
+ vf2_spc_requested = helper_configure_max_available_resources(late_vf_num, gt_num, late_sar, 1)
+ helper_apply_sriov_provisioning(host, vf2_spc_requested)
+
+ # Verify 1st VF provisioning is minimal as expected
+ vf1_spc = helper_fetch_sriov_provisioning(host, early_vf_num, gt_num)
+ logger.info("VF#%s received SRIOV (early) provisioning config:\n", early_vf_num)
+ vf1_spc.print_provisioning_config()
+
+ assert vf1_spc.ggtt_quota == min_spc.ggtt_quota
+ assert vf1_spc.lmem_quota == min_spc.lmem_quota if host.has_lmem() else True
+ assert vf1_spc.contexts_quota == 128 # min_spc.contexts_quota set to 128 automatically
+ assert vf1_spc.doorbells_quota == min_spc.doorbells_quota
+
+ # Verify 2nd VF provisioning is maximal as expected
+ vf2_spc = helper_fetch_sriov_provisioning(host, late_vf_num, gt_num)
+ logger.info("VF#%s received SRIOV (late) provisioning config:\n", late_vf_num)
+ vf2_spc.print_provisioning_config()
+
+ # GGTT max with 64kB of alignment included
+ assert late_sar.ggtt_available - 0x10000 <= vf2_spc.ggtt_quota <= late_sar.ggtt_available
+ # LMEM max with 2MB of alignment included
+ assert late_sar.lmem_max_quota - 0x200000 <= vf2_spc.lmem_quota <= late_sar.lmem_max_quota \
+ if host.has_lmem() else True
+ # Contexts max with 128 of alignment included
+ assert late_sar.contexts_max_quota - 128 <= vf2_spc.contexts_quota <= late_sar.contexts_max_quota
+ assert vf2_spc.doorbells_quota == late_sar.doorbells_max_quota
+
+ # Start VMs and execute basic test
+ helper_prepare_vms(ts)
+
+ for i in range(num_vms):
+ assert igt_run_check(ts.get_vm[i], IgtType.EXEC_BASIC)
+
+
+def check_selfconfigs(ts: VmmTestingSetup, vgpu_profile: VgpuProfile) -> None:
+ host: SriovHost = ts.get_host
+
+ for vm in ts.get_vm:
+ vf_num = vm.vmnum + 1
+ for gt_num in vm.gt_nums:
+ # VF provisioning config set on a host level (PF)
+ sysfs_ggtt = host.get_ggtt_quota(vf_num, gt_num)
+ sysfs_lmem = host.get_lmem_quota(vf_num, gt_num)
+ sysfs_ctxs = host.get_contexts_quota(vf_num, gt_num)
+ sysfs_dbs = host.get_doorbells_quota(vf_num, gt_num)
+
+ # VF provisioning config get on a guest level (VF)
+ vm.helper_get_debugfs_selfconfig(gt_num=gt_num)
+ selfconf_ggtt = vm.ggtt_size
+ selfconf_lmem = vm.lmem_size
+ selfconf_ctxs = vm.contexts
+ selfconf_dbs = vm.doorbells
+
+ logger.debug("Verify requested vGPU profile is applied to VF")
+ logger.debug(
+ "\nvGPU profile %s settings (VF%s / GT%s):\n"
+ "(Host sysfs config against guest debugfs VF self_config)\n"
+ "\tggtt_quota = (sysfs) %s / (self_config) %s B\n"
+ "\tlmem_quota = (sysfs) %s / (self_config) %s B\n"
+ "\tcontexts_quota = (sysfs) %s / (self_config) %s\n"
+ "\tdoorbells_quota = (sysfs) %s / (self_config) %s\n",
+ vgpu_profile.profileId, vf_num, gt_num,
+ sysfs_ggtt, selfconf_ggtt,
+ sysfs_lmem, selfconf_lmem,
+ sysfs_ctxs, selfconf_ctxs,
+ sysfs_dbs, selfconf_dbs
+ )
+
+ assert sysfs_ggtt == selfconf_ggtt
+ assert sysfs_lmem == selfconf_lmem if host.has_lmem() else True
+ assert sysfs_ctxs == selfconf_ctxs
+ assert sysfs_dbs == selfconf_dbs
+
+ if host.get_num_gts() > 1:
+ selfconf_tilemask = vm.tile_mask
+ logger.debug("Multi-tile device: tile_mask = %s, gt_num = %s", selfconf_tilemask, gt_num)
+ assert selfconf_tilemask & (1 << gt_num)
+ else:
+ logger.debug("Single-tile device: gt_num = %s", gt_num)
+ assert gt_num == 0
+
+
+#
+# vGPU profiles testing
+#
+# helper_test_vgpu_profile - helper to set requested vGPU profile
+# and check it is correctly applied from VM level
+# @ts: VM test setup
+# @vgpu_profile: profile instance to be set
+def helper_test_vgpu_profile(ts: VmmTestingSetup, vgpu_profile: VgpuProfile):
+ vf_num: int = 1
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+ assert driver_check(host)
+
+ host.set_vgpu_profile(vgpu_profile)
+ num_vfs = vgpu_profile.get_num_vfs()
+ assert host.create_vf(num_vfs) == num_vfs
+
+ vm.assign_vf(host.get_vf_bdf(vf_num))
+ vm.poweron()
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+
+ check_selfconfigs(ts, vgpu_profile)
+
+
+@pytest.fixture(scope='session', name='big_lmem_values')
+def fixture_big_lmem_values(get_host):
+ prefix = 'big-lmem-M'
+ prefix_length = len(prefix)
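+    # Extract the size suffix from subtest names like 'big-lmem-M<size>' (size presumably in MB, to match vfLmem below)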
+    return [int(t[prefix_length:]) for t in igt_list_subtests(get_host, 'gem_create') if t.startswith(prefix)]
+
+
+@pytest.fixture(scope='class', name='setup_vgpu_profile')
+def fixture_setup_vgpu_profile(get_os_image, get_vm_modparams, get_host, request):
+ profile_id, max_vms = request.param
+ host: SriovHost = get_host
+ vgpu_profile: VgpuProfile = host.get_vgpu_profile_by_vgpu_profile_id(profile_id)
+ ts: VmmTestingSetup = VmmTestingSetup(get_os_image, get_vm_modparams, host, VmmTestingConfig(vgpu_profile, max_vms))
+
+ def _teardown():
+ logger.info('[Teardown]')
+ ts.teardown()
+ request.addfinalizer(_teardown)
+
+ logger.info('[Setup]')
+
+ load_host_drivers(host)
+ host.set_vgpu_profile(vgpu_profile)
+ vgpu_profile.print_parameters()
+ num_vfs = vgpu_profile.get_num_vfs()
+ assert host.create_vf(num_vfs) == num_vfs
+
+ bdf_list = [host.get_vf_bdf(vf) for vf in range(1, len(ts.get_vm) + 1)]
+ for vm, bdf in zip(ts.get_vm, bdf_list):
+ vm.assign_vf(bdf)
+
+ ts.poweron_vms()
+
+ modprobe_cmds = [modprobe_driver(vm, ts.get_vm_modprobe_params) for vm in ts.get_vm]
+ for i, cmd in enumerate(modprobe_cmds):
+ assert modprobe_driver_check(ts.get_vm[i], cmd), f'modprobe failed on VM{i}'
+
+ logger.info('[Tests]')
+ return ts
+
+
+class WorkType(int, enum.Enum):
+ PREEMPT = 0
+ NOPREEMPT = 1
+
+
+class WorkDesc(typing.NamedTuple):
+ definition: str
+ iterations: int
+
+
+def get_work_desc(profile: VgpuProfile, worktype: WorkType, num_vms: int) -> WorkDesc:
+ limit_us = 10000000
+ extra_dur_us = 4000
+ durations_us = [profile.vfExecutionQuanta * 1000 + profile.vfPreemptionTimeout + extra_dur_us,
+ profile.vfExecutionQuanta * 1000]
+    iterations = [int(limit_us / dur_us) for dur_us in durations_us]
+    if profile.scheduleIfIdle:
+        iterations = [int(it / profile.get_num_vfs()) for it in iterations]
+    else:
+        iterations = [int(it / num_vms) for it in iterations]
+
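+    # Assumption: the 'X.1.0' prefix marks context 1 as non-preemptable in gem_wsim syntax, making the second descriptor the non-preemptable variant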
+ work_descs = [WorkDesc(f'1.DEFAULT.{durations_us[WorkType.PREEMPT]}.0.1', iterations[WorkType.PREEMPT]),
+ WorkDesc(f'X.1.0,1.DEFAULT.{durations_us[WorkType.NOPREEMPT]}.0.1', iterations[WorkType.NOPREEMPT])]
+
+ return work_descs[worktype]
+
+
+class ProfileIdNumVms(typing.NamedTuple):
+ profile_id: str
+ num_vms: int
+
+ def __str__(self) -> str:
+ short_id = self.profile_id[-2:] if self.profile_id[-3] == '_' else self.profile_id[-3:]
+ return f'{short_id}-{self.num_vms}VM'
+
+
+def vgpu_profile_test_params(max_vms: int) -> typing.List[ProfileIdNumVms]:
+ host = SriovHost()
+ return [ProfileIdNumVms(p.profileId, min(p.get_num_vfs(), max_vms)) for p in host.query_vgpu_profiles()]
+
+
+MAX_VMS = 2
+vgpu_profile_params = vgpu_profile_test_params(MAX_VMS)
+
+
+@pytest.mark.usefixtures("setup_vgpu_profile")
+@pytest.mark.parametrize('setup_vgpu_profile', vgpu_profile_params,
+ ids=[str(p) for p in vgpu_profile_params],
+ indirect=['setup_vgpu_profile'])
+class TestVgpuProfile:
+ def test_selfconfig(self, setup_vgpu_profile):
+ ts: VmmTestingSetup = setup_vgpu_profile
+ check_selfconfigs(ts, ts.get_vgpu_profile)
+
+ def test_sched_preemptable(self, setup_vgpu_profile):
+ ts: VmmTestingSetup = setup_vgpu_profile
+ work: WorkDesc = get_work_desc(ts.get_vgpu_profile, WorkType.PREEMPT, ts.get_num_vms())
+ gem_wsim_parallel_exec_and_check(ts.get_vm, work.definition, work.iterations)
+
+ def test_sched_non_preemptable(self, setup_vgpu_profile):
+ ts: VmmTestingSetup = setup_vgpu_profile
+ work: WorkDesc = get_work_desc(ts.get_vgpu_profile, WorkType.NOPREEMPT, ts.get_num_vms())
+ gem_wsim_parallel_exec_and_check(ts.get_vm, work.definition, work.iterations)
+
+ def test_lmem_sysfs(self, setup_vgpu_profile):
+ ts: VmmTestingSetup = setup_vgpu_profile
+
+ if not ts.get_host.has_lmem():
+ return
+
+ for lmem_filename in ["lmem_avail_bytes", "lmem_total_bytes"]:
+ cmd = f'cat /sys/class/drm/card0/{lmem_filename}'
+ lmem_infos = [ShellExecutor(vm, cmd) for vm in ts.get_vm]
+ for i, lmem_info in enumerate(lmem_infos):
+ gts_per_vf = len(ts.get_vm[i].gt_nums)
+ proc_result = lmem_info.wait()
+ assert proc_result.exit_code == 0
+ logger.info('VM%d: %s=%s', i, lmem_filename, proc_result.stdout)
+ if lmem_filename == "lmem_total_bytes":
+                    SIZE_2M = 2 * 1024 * 1024
+ lmem_rounded_2M = math.ceil(ts.get_vgpu_profile.vfLmem / SIZE_2M) * SIZE_2M * gts_per_vf
+ assert lmem_rounded_2M == int(proc_result.stdout)
+
+ def test_lmem_gem_create(self, big_lmem_values, setup_vgpu_profile):
+ ts: VmmTestingSetup = setup_vgpu_profile
+
+ if not ts.get_host.has_lmem():
+ return
+
+ lmem = ts.get_vgpu_profile.vfLmem / 1024 / 1024
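+        # bisect picks the largest available big-lmem subtest size that does not exceed the VF LMEM quota (in MB)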
+        testname = f'igt@gem_create@big-lmem-M{big_lmem_values[bisect(big_lmem_values, lmem) - 1]}'
+ gem_create_lmem = [IgtExecutor(vm, testname) for vm in ts.get_vm]
+ for i, gem in enumerate(gem_create_lmem):
+ assert igt_check(gem), f'{testname} failed on VM{i}'
+
+
+# vGPU custom profile
+# Provision random resources based on the minimal and maximal values from predefined vGPU profiles
+# Supported devices: all
+def test_vgpu_profile_custom(create_1host_1vm):
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ assert driver_check(host)
+
+ supported_profiles = host.query_vgpu_profiles()
+ max_num_vfs = 0
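+    # The 1xVF profile offers the largest per-VF quotas (hence 'max_profile'); the profile with the most VFs the smallest (hence 'min_profile')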
+
+ for profile in supported_profiles:
+ num_vfs = profile.get_num_vfs()
+ if num_vfs == 1:
+ max_profile = profile
+
+ if num_vfs > max_num_vfs:
+ max_num_vfs = num_vfs
+ min_profile = profile
+
+ # Custom provisioning in a range [min_profile_value, max_profile_value]
+ vf_ggtt = random.randint(min_profile.vfGgtt, max_profile.vfGgtt)
+ vf_lmem = random.randint(min_profile.vfLmem, max_profile.vfLmem)
+    # VF contexts are fixed in all predefined profiles (1024) - also verify some other values:
+ vf_contexts = random.randint(512, 4096)
+ vf_doorbells = random.randint(min_profile.vfDoorbells, max_profile.vfDoorbells)
+ vf_eq = random.randint(min_profile.vfExecutionQuanta, max_profile.vfExecutionQuanta)
+ vf_pt = random.randint(min_profile.vfPreemptionTimeout, max_profile.vfPreemptionTimeout)
+
+    # PF provisioning values (ctx, dbs, eq, pt) are usually constant for all vGPU profiles
+    # Randomize the PF config with values similar to those assigned to VFs:
+ pf_contexts = random.randint(512, 4096)
+ pf_doorbells = random.randint(1, max_profile.pfDoorbells)
+
+ pf_eq = random.randint(min_profile.vfExecutionQuanta, max_profile.vfExecutionQuanta)
+ pf_pt = random.randint(min_profile.vfPreemptionTimeout, max_profile.vfPreemptionTimeout)
+
+ # Only 1xVF enabling is guaranteed as the custom (randomized) values
+ # can be close to the maximal available resources:
+ custom_num_vfs = 1
+
+ custom_profile = VgpuProfile()
+ custom_profile.profileId = f'CUSTOM_A{custom_num_vfs}'
+ custom_profile.description = 'Random profile (user defined)'
+ custom_profile.schedulerMode = 'Custom'
+ custom_profile.pfExecutionQuanta = pf_eq
+ custom_profile.pfPreemptionTimeout = pf_pt
+ custom_profile.vfExecutionQuanta = vf_eq
+ custom_profile.vfPreemptionTimeout = vf_pt
+ custom_profile.scheduleIfIdle = random.choice([True, False])
+
+ custom_profile.resetAfterVfSwitch = random.choice([True, False])
+ custom_profile.provisioningMode = 1 if custom_num_vfs == 1 else 3
+ # PF LMEM is actually set by the i915, not user (from sysfs):
+ # custom_profile.pfLmem = min_profile.pfLmem
+ custom_profile.pfContexts = pf_contexts
+ custom_profile.pfDoorbells = pf_doorbells
+ # PF GGTT is actually set by the i915, not user (from sysfs):
+ # custom_profile.pfGgtt = min_profile.pfGgtt
+ custom_profile.vfLmem = vf_lmem
+ custom_profile.vfContexts = vf_contexts
+ custom_profile.vfDoorbells = vf_doorbells
+ custom_profile.vfGgtt = vf_ggtt
+
+ logger.info("Custom vGPU profile (random provisioning settings):")
+ custom_profile.print_parameters()
+ helper_test_vgpu_profile(ts, custom_profile)
diff --git a/tools/vmtb/vmm_flows/test_scheduling.py b/tools/vmtb/vmm_flows/test_scheduling.py
new file mode 100644
index 000000000..f875420d0
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_scheduling.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import typing
+from bench.executors.igt import IgtExecutor, IgtType
+from bench.executors.gem_wsim import (GemWsimResult, GemWsim, gem_wsim_parallel_exec_and_check,
+ PREEMPT_10MS_WORKLOAD, NON_PREEMPT_10MS_WORKLOAD,
+ ONE_CYCLE_DURATION_MS)
+from bench.helpers.helpers import (driver_check, igt_check, modprobe_driver, modprobe_driver_check,
+ modprobe_driver_run_check)
+from bench.machines.host import SriovHost
+from bench.machines.virtual.vm import VirtualMachine
+from bench.machines.machine_interface import MachineInterface
+from vmm_flows.conftest import VmmTestingSetup
+
+WL_ITERATIONS = 1000
+MS_IN_SEC = 1000
+
+def test_equal_workloads_per_second(create_1host_2vm) -> None:
+ """ Check workloads per second ratio on VMs is equal when run simultanoeusly.
+ VFs are autoprovisioned (same scheduling params), strict scheduling is off.
+ Check is done for preemptable and nonpreemptable workloads.
+ Then same checks are done with engine reset policy set to on.
+ """
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vms: typing.List[MachineInterface] = ts.get_vm
+ assert driver_check(host)
+
+ total_vfs = host.get_total_vfs()
+ assert host.create_vf(total_vfs) == total_vfs
+
+ for vm, bdf in zip(ts.get_vm, host.get_vfs_bdf(1, total_vfs)):
+ vm.assign_vf(bdf)
+
+ ts.poweron_vms()
+
+ modprobes = [modprobe_driver(vm, ts.get_vm_modprobe_params) for vm in vms]
+    for i, (vm, m) in enumerate(zip(vms, modprobes)):
+ assert modprobe_driver_check(vm, m), f'modprobe failed on VM{i}'
+
+ # sanity check
+ igt_workloads = [IgtExecutor(vm, IgtType.EXEC_BASIC) for vm in vms]
+ for i, igt_result in enumerate(igt_workloads):
+ assert igt_check(igt_result), f'IGT failed on VM{i}'
+
+ # Single workload takes 10ms GPU time, multiplied by 1000 iterations
+ # gives the expected 10s duration and 100 workloads/sec
+ # Adjust the expected values to number of VMs
+ expected = GemWsimResult(ONE_CYCLE_DURATION_MS * WL_ITERATIONS * len(vms) / MS_IN_SEC,
+ MS_IN_SEC/ONE_CYCLE_DURATION_MS / len(vms))
+
+ # check preemptable workload
+ result = gem_wsim_parallel_exec_and_check(vms, PREEMPT_10MS_WORKLOAD, WL_ITERATIONS, expected)
+
+ # check non-preemptable workload
+ nopreempt_result = gem_wsim_parallel_exec_and_check(vms, NON_PREEMPT_10MS_WORKLOAD, WL_ITERATIONS, expected)
+
+ # turn on engine reset policy
+ for gt_num in range(host.get_num_gts()):
+ host.set_pf_policy_engine_reset(gt_num, 1)
+
+ # repeat measurements
+ # check preemptable workload
+ results2 = gem_wsim_parallel_exec_and_check(vms, PREEMPT_10MS_WORKLOAD, WL_ITERATIONS)
+ # compare results engine_reset=on vs engine_reset=off
+    # as no ratio is specified by the architecture, assume no more than a 50% difference is allowed
+ assert 0.5 < results2.workloads_per_sec / result.workloads_per_sec < 1.5
+
+ # check non-preemptable workload
+ nopreempt_results2 = gem_wsim_parallel_exec_and_check(vms, NON_PREEMPT_10MS_WORKLOAD, WL_ITERATIONS)
+ # compare results engine_reset=on vs engine_reset=off
+    # as no ratio is specified by the architecture, assume no more than a 50% difference is allowed
+ assert 0.5 < nopreempt_results2.workloads_per_sec / nopreempt_result.workloads_per_sec < 1.5
+
+def test_pf_priority(create_1host_1vm) -> None:
+ """ Check if setting PF's scheduling priority to NORMAL and HIGH causes appropriate
+ behavior.
+ """
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+ machines: typing.List[MachineInterface] = [host, vm]
+ assert driver_check(host)
+
+ assert host.create_vf(1) == 1
+ vf = host.get_vf_bdf(1)
+ vm.assign_vf(vf)
+ vm.poweron()
+
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+
+ for gt_num in range(host.get_num_gts()):
+ host.set_exec_quantum_ms(0, gt_num, 10)
+ host.set_exec_quantum_ms(1, gt_num, 10)
+ host.set_pf_sched_priority(gt_num, host.SchedulingPriority.NORMAL)
+
+ wl_duration_ms = 1000
+ wl_iterations = 1
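+    # gem_wsim step (assumed format <context>.<engine>.<duration_us>.<dependency>.<wait>): a 1s batch on the DEFAULT engine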
+ workload = f'1.DEFAULT.{int(wl_duration_ms * 1000)}.0.1'
+
+ gem_wsim_vm = GemWsim(vm, 1, wl_iterations, workload)
+ gem_wsim_vm_result = gem_wsim_vm.wait_results()
+
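+    # With PF and VF sharing equal 10ms quanta, the VM workload is expected to take ~len(machines) x its nominal duration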
+ vm_expected_elapsed_sec = wl_duration_ms * wl_iterations / MS_IN_SEC * len(machines)
+ assert vm_expected_elapsed_sec * 0.9 < gem_wsim_vm_result.elapsed_sec < vm_expected_elapsed_sec * 1.1
+
+ for gt_num in range(host.get_num_gts()):
+ host.set_pf_sched_priority(gt_num, host.SchedulingPriority.HIGH)
+
+ gem_wsim_host = GemWsim(host, 1, wl_iterations, workload)
+ gem_wsim_vm = GemWsim(vm, 1, wl_iterations, workload)
+
+ gem_wsim_host_result = gem_wsim_host.wait_results()
+ gem_wsim_vm_result = gem_wsim_vm.wait_results()
+
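+    # With the PF at HIGH priority, its workload should finish in nominal time while the VM workload additionally absorbs the PF's run time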
+ host_expected_elapsed_sec = wl_duration_ms * wl_iterations / MS_IN_SEC
+ vm_expected_elapsed_sec = wl_duration_ms * wl_iterations / MS_IN_SEC + host_expected_elapsed_sec
+
+ assert host_expected_elapsed_sec * 0.9 < gem_wsim_host_result.elapsed_sec < host_expected_elapsed_sec * 1.1
+ assert vm_expected_elapsed_sec * 0.9 < gem_wsim_vm_result.elapsed_sec < vm_expected_elapsed_sec * 1.1
diff --git a/tools/vmtb/vmm_flows/test_vm_panic.py b/tools/vmtb/vmm_flows/test_vm_panic.py
new file mode 100644
index 000000000..cb729a638
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_vm_panic.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+from bench.executors.igt import IgtExecutor, IgtType
+from bench.executors.shell import ShellExecutor
+from bench.helpers.helpers import (driver_check, igt_check, igt_run_check,
+ modprobe_driver, modprobe_driver_check)
+from bench.machines.host import SriovHost
+from bench.machines.virtual.vm import VirtualMachine
+from vmm_flows.conftest import VmmTestingSetup
+
+def local_init_first_last(ts: VmmTestingSetup):
+ host: SriovHost = ts.get_host
+ vm_first: VirtualMachine = ts.get_vm[0]
+ vm_last: VirtualMachine = ts.get_vm[1]
+
+ assert driver_check(host)
+ assert igt_run_check(host, IgtType.EXEC_STORE)
+
+ total_vfs = host.get_total_vfs()
+ assert host.create_vf(total_vfs) == total_vfs
+ vf_first, vf_last = host.get_vfs_bdf(1, total_vfs)
+
+ vm_first.assign_vf(vf_first)
+ vm_last.assign_vf(vf_last)
+
+ ts.poweron_vms()
+
+ modprobe_first = modprobe_driver(vm_first, ts.get_vm_modprobe_params)
+ modprobe_last = modprobe_driver(vm_last, ts.get_vm_modprobe_params)
+
+ assert modprobe_driver_check(vm_first, modprobe_first)
+ assert modprobe_driver_check(vm_last, modprobe_last)
+
+ assert igt_run_check(vm_first, IgtType.EXEC_STORE)
+ assert igt_run_check(vm_last, IgtType.EXEC_STORE)
+
+def local_fini(ts: VmmTestingSetup):
+ host: SriovHost = ts.get_host
+
+    # Workaround (wakeref): disable VFs before running IGT tests on the PF
+    # to avoid a stuck/timeout on DROP_IDLE
+ ts.poweroff_vms()
+ host.clear_vf()
+
+ assert igt_run_check(host, IgtType.EXEC_BASIC)
+
+def local_crash_and_check(vm_to_crash: VirtualMachine, vm_to_check: VirtualMachine):
+ IgtExecutor(vm_to_crash, IgtType.EXEC_STORE)
+ workload_to_check = IgtExecutor(vm_to_check, IgtType.EXEC_STORE)
+
+ # Trigger VM kernel panic
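+    # (backgrounded after a short sleep so the shell command returns to the
+    # host before the guest panics)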
+ ShellExecutor(vm_to_crash, "sh -c '(sleep 1; echo c >/proc/sysrq-trigger)&'")
+ # TODO: check if the VM has really crashed
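+    # A possible (untested) approach, assuming the VM exposes its QMP monitor
+    # and the guest is configured with a pvpanic device:
+    #   assert vm_to_crash.qmp.query_status() == 'guest-panicked'
+    # where 'query_status' is a hypothetical wrapper for the QMP
+    # 'query-status' command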
+
+ assert igt_check(workload_to_check)
+
+    # Destroy the crashed VM
+    # TODO: improve the crashed-VM teardown code (or, better, the entire test);
+    # relying on an explicit del to trigger __del__ should be avoided.
+ vm_to_crash.process.terminate()
+ vm_to_crash.process.communicate(timeout=10)
+ del vm_to_crash
+
+def test_panic_first(create_1host_2vm):
+ """ Check VM kernel panic in MultiVM execution. Crash first VM."""
+ ts: VmmTestingSetup = create_1host_2vm
+
+ local_init_first_last(ts)
+
+ local_crash_and_check(ts.get_vm[0], ts.get_vm[1])
+
+ local_fini(ts)
+
+def test_panic_last(create_1host_2vm):
+ """ Check VM kernel panic in MultiVM execution. Crash last VM."""
+ ts: VmmTestingSetup = create_1host_2vm
+
+ local_init_first_last(ts)
+
+ local_crash_and_check(ts.get_vm[1], ts.get_vm[0])
+
+ local_fini(ts)
diff --git a/tools/vmtb/vmm_flows/test_vm_states_control.py b/tools/vmtb/vmm_flows/test_vm_states_control.py
new file mode 100644
index 000000000..c6ef8b02f
--- /dev/null
+++ b/tools/vmtb/vmm_flows/test_vm_states_control.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+
+## Copyright (C) 2024 Intel Corporation ##
+
+import time
+from bench.helpers.helpers import (driver_check, igt_run_check,
+ modprobe_driver_run_check)
+from bench.machines.host import SriovHost
+from bench.machines.virtual.vm import VirtualMachine
+from bench.executors.gem_wsim import GemWsim, PREEMPT_10MS_WORKLOAD, ONE_CYCLE_DURATION_MS
+from bench.executors.igt import IgtType
+from vmm_flows.conftest import VmmTestingSetup
+
+DELAY_FOR_WORKLOAD_SEC = 2 # Time to wait for gem_wsim to start running [seconds]
+DELAY_RESUME_SEC = 10 # Time the VM stays suspended before being resumed [seconds]
+MS_IN_SEC = 1000
+
+def test_boot_reboot_one_vm(create_1host_1vm):
+ """Running workload on VM after its reboot is possible."""
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+ assert driver_check(host)
+
+ assert host.create_vf(1) == 1
+ vf = host.get_vf_bdf(1)
+ vm.assign_vf(vf)
+
+ vm.poweron()
+
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm, IgtType.EXEC_BASIC)
+
+ vm.reboot()
+
+ assert igt_run_check(vm, IgtType.EXEC_BASIC)
+
+def test_boot_reboot_one_of_vms(create_1host_2vm):
+ """Reboot of one of VMs doesn't affect workload running on second one."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm_first: VirtualMachine = ts.get_vm[0]
+ vm_second: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ assert host.create_vf(2) == 2
+ vf_first, vf_second = host.get_vfs_bdf(1, 2)
+ vm_first.assign_vf(vf_first)
+ vm_second.assign_vf(vf_second)
+
+ ts.poweron_vms()
+
+ assert modprobe_driver_run_check(vm_first, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm_first, IgtType.EXEC_BASIC)
+ assert modprobe_driver_run_check(vm_second, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm_second, IgtType.EXEC_BASIC)
+
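+    # 3000 x 10 ms cycles ~= 30 s of work on the first VM, long enough to
+    # span the second VM's reboot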
+ iterations = 3000
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * iterations / MS_IN_SEC
+ gem_wsim = GemWsim(vm_first, 1, iterations, PREEMPT_10MS_WORKLOAD)
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+ assert gem_wsim.is_running()
+
+ vm_second.reboot()
+
+ result = gem_wsim.wait_results()
+ assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
+ assert igt_run_check(vm_second, IgtType.EXEC_BASIC)
+
+def test_suspend_resume_one_vm(create_1host_1vm):
+ """Suspend/Resume of one VM doesn't affect workload running on it."""
+ ts: VmmTestingSetup = create_1host_1vm
+ host: SriovHost = ts.get_host
+ vm: VirtualMachine = ts.get_vm[0]
+ assert driver_check(host)
+
+ assert host.create_vf(1) == 1
+ vf = host.get_vf_bdf(1)
+ vm.assign_vf(vf)
+
+ vm.poweron()
+
+ assert modprobe_driver_run_check(vm, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm, IgtType.EXEC_BASIC)
+
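+    # 2000 x 10 ms cycles ~= 20 s of work, long enough to span the 10 s
+    # suspend window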
+ iterations = 2000
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * iterations / MS_IN_SEC
+ gem_wsim = GemWsim(vm, 1, iterations, PREEMPT_10MS_WORKLOAD)
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+
+ assert gem_wsim.is_running()
+ vm.suspend()
+ time.sleep(DELAY_RESUME_SEC)
+ vm.wakeup()
+
+ assert gem_wsim.is_running()
+ result = gem_wsim.wait_results()
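+    # Only a lower bound is asserted: time spent suspended can extend the
+    # measured elapsed time beyond the nominal estimate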
+ assert expected_elapsed_sec * 0.8 < result.elapsed_sec
+
+def test_suspend_resume_one_of_vms(create_1host_2vm):
+ """Suspend/Resume of one of VMs doesn't affect workload running on them."""
+ ts: VmmTestingSetup = create_1host_2vm
+ host: SriovHost = ts.get_host
+ vm_first: VirtualMachine = ts.get_vm[0]
+ vm_second: VirtualMachine = ts.get_vm[1]
+ assert driver_check(host)
+
+ assert host.create_vf(2) == 2
+ vf_first, vf_second = host.get_vfs_bdf(1, 2)
+ vm_first.assign_vf(vf_first)
+ vm_second.assign_vf(vf_second)
+
+ ts.poweron_vms()
+
+ assert modprobe_driver_run_check(vm_first, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm_first, IgtType.EXEC_BASIC)
+ assert modprobe_driver_run_check(vm_second, ts.get_vm_modprobe_params)
+ assert igt_run_check(vm_second, IgtType.EXEC_BASIC)
+
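+    # The non-suspended VM's workload (1000 x 10 ms ~= 10 s) is bounded on
+    # both sides below; the suspended VM's longer workload (2000 x 10 ms
+    # ~= 20 s) spans the suspend window, so only its lower bound is asserted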
+ iterations_first = 1000
+ iterations_second = 2000
+ expected_first = ONE_CYCLE_DURATION_MS * iterations_first / MS_IN_SEC + DELAY_FOR_WORKLOAD_SEC
+ expected_second = ONE_CYCLE_DURATION_MS * iterations_second / MS_IN_SEC + DELAY_FOR_WORKLOAD_SEC
+ gem_wsim_vm_first = GemWsim(vm_first, 1, iterations_first, PREEMPT_10MS_WORKLOAD)
+ gem_wsim_vm_second = GemWsim(vm_second, 1, iterations_second, PREEMPT_10MS_WORKLOAD)
+
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+ assert gem_wsim_vm_first.is_running()
+ assert gem_wsim_vm_second.is_running()
+
+ vm_second.suspend()
+ time.sleep(DELAY_RESUME_SEC)
+ vm_second.wakeup()
+
+ assert gem_wsim_vm_second.is_running()
+ result1 = gem_wsim_vm_first.wait_results()
+ result2 = gem_wsim_vm_second.wait_results()
+ assert expected_first * 0.8 < result1.elapsed_sec < expected_first * 1.2
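+    # Lower bound only for the suspended VM: the suspend window may extend
+    # its measured elapsed time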
+ assert expected_second * 0.8 < result2.elapsed_sec
--
2.39.1