[PATCH i-g-t] vmtb: Introduce SR-IOV VM-level testing tool
Adam Miszczak
adam.miszczak at linux.intel.com
Thu Nov 7 11:22:34 UTC 2024
VM Test Bench (VMTB) is a tool for testing virtualization
(SR-IOV) supported by the xe driver.
It allows enabling and provisioning VFs (Virtual Functions)
and facilitates manipulation of VMs (Virtual Machines)
running virtual GPUs.
This includes starting and accessing the KVM/QEMU VMs,
running workloads or shell commands (Guest/Host),
handling power states, saving and restoring VF state etc.
Initially only basic test scenarios are provided:
- enable VFs, pass them to VMs and boot guest OS
- submit basic workloads on a guest with virtualized GPU
- exercise VF driver probe and remove
but generally, the tool also targets complex test cases, like:
- VF save/restore (VM migration)
- VF provisioning
- VF scheduling
- VM power states
- VF FLR
- VM crash
- GuC FW versioning
Proposed location for the new tool is the root IGT directory:
igt-gpu-tools/vmtb
but some other options can be also considered, for example:
tools/vmtb
tests/vmtb
Signed-off-by: Adam Miszczak <adam.miszczak at linux.intel.com>
---
vmtb/MANIFEST.in | 3 +
vmtb/README.md | 86 +++
vmtb/bench/__init__.py | 43 ++
vmtb/bench/configurators/__init__.py | 0
vmtb/bench/configurators/pci.py | 48 ++
vmtb/bench/configurators/vgpu_profile.py | 264 ++++++++
.../configurators/vgpu_profile_config.py | 148 +++++
vmtb/bench/configurators/vmtb_config.py | 110 ++++
vmtb/bench/drivers/__init__.py | 0
vmtb/bench/drivers/driver_interface.py | 198 ++++++
vmtb/bench/drivers/xe.py | 307 +++++++++
vmtb/bench/exceptions.py | 40 ++
vmtb/bench/executors/__init__.py | 0
vmtb/bench/executors/executor_interface.py | 22 +
vmtb/bench/executors/gem_wsim.py | 70 ++
vmtb/bench/executors/igt.py | 117 ++++
vmtb/bench/executors/shell.py | 30 +
vmtb/bench/helpers/__init__.py | 0
vmtb/bench/helpers/helpers.py | 77 +++
vmtb/bench/helpers/log.py | 75 +++
vmtb/bench/machines/__init__.py | 0
vmtb/bench/machines/device_interface.py | 23 +
vmtb/bench/machines/host.py | 197 ++++++
vmtb/bench/machines/machine_interface.py | 65 ++
vmtb/bench/machines/physical/__init__.py | 0
vmtb/bench/machines/physical/device.py | 240 +++++++
vmtb/bench/machines/virtual/__init__.py | 0
.../machines/virtual/backends/__init__.py | 0
.../virtual/backends/backend_interface.py | 40 ++
.../machines/virtual/backends/guestagent.py | 99 +++
.../machines/virtual/backends/qmp_monitor.py | 161 +++++
vmtb/bench/machines/virtual/vm.py | 619 ++++++++++++++++++
vmtb/dev-requirements.txt | 5 +
vmtb/pyproject.toml | 25 +
vmtb/pytest.ini | 0
vmtb/requirements.txt | 2 +
vmtb/vmm_flows/__init__.py | 0
vmtb/vmm_flows/conftest.py | 340 ++++++++++
.../resources/vgpu_profiles/Flex170.json | 113 ++++
vmtb/vmm_flows/test_basic.py | 160 +++++
vmtb/vmtb_config.json | 31 +
41 files changed, 3758 insertions(+)
create mode 100644 vmtb/MANIFEST.in
create mode 100644 vmtb/README.md
create mode 100644 vmtb/bench/__init__.py
create mode 100644 vmtb/bench/configurators/__init__.py
create mode 100644 vmtb/bench/configurators/pci.py
create mode 100644 vmtb/bench/configurators/vgpu_profile.py
create mode 100644 vmtb/bench/configurators/vgpu_profile_config.py
create mode 100644 vmtb/bench/configurators/vmtb_config.py
create mode 100644 vmtb/bench/drivers/__init__.py
create mode 100644 vmtb/bench/drivers/driver_interface.py
create mode 100644 vmtb/bench/drivers/xe.py
create mode 100644 vmtb/bench/exceptions.py
create mode 100644 vmtb/bench/executors/__init__.py
create mode 100644 vmtb/bench/executors/executor_interface.py
create mode 100644 vmtb/bench/executors/gem_wsim.py
create mode 100644 vmtb/bench/executors/igt.py
create mode 100644 vmtb/bench/executors/shell.py
create mode 100644 vmtb/bench/helpers/__init__.py
create mode 100644 vmtb/bench/helpers/helpers.py
create mode 100644 vmtb/bench/helpers/log.py
create mode 100644 vmtb/bench/machines/__init__.py
create mode 100644 vmtb/bench/machines/device_interface.py
create mode 100644 vmtb/bench/machines/host.py
create mode 100644 vmtb/bench/machines/machine_interface.py
create mode 100644 vmtb/bench/machines/physical/__init__.py
create mode 100644 vmtb/bench/machines/physical/device.py
create mode 100644 vmtb/bench/machines/virtual/__init__.py
create mode 100644 vmtb/bench/machines/virtual/backends/__init__.py
create mode 100644 vmtb/bench/machines/virtual/backends/backend_interface.py
create mode 100644 vmtb/bench/machines/virtual/backends/guestagent.py
create mode 100644 vmtb/bench/machines/virtual/backends/qmp_monitor.py
create mode 100644 vmtb/bench/machines/virtual/vm.py
create mode 100644 vmtb/dev-requirements.txt
create mode 100644 vmtb/pyproject.toml
create mode 100644 vmtb/pytest.ini
create mode 100644 vmtb/requirements.txt
create mode 100644 vmtb/vmm_flows/__init__.py
create mode 100644 vmtb/vmm_flows/conftest.py
create mode 100644 vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json
create mode 100644 vmtb/vmm_flows/test_basic.py
create mode 100644 vmtb/vmtb_config.json
diff --git a/vmtb/MANIFEST.in b/vmtb/MANIFEST.in
new file mode 100644
index 000000000..7674c199d
--- /dev/null
+++ b/vmtb/MANIFEST.in
@@ -0,0 +1,3 @@
+include pytest.ini
+include vmtb_config.json
+include vmm_flows/resources/vgpu_profiles/*
diff --git a/vmtb/README.md b/vmtb/README.md
new file mode 100644
index 000000000..49b034d12
--- /dev/null
+++ b/vmtb/README.md
@@ -0,0 +1,86 @@
+VM Test Bench
+=============
+
+Description
+-----------
+VM Test Bench (VMTB) is a tool for testing virtualization (SR-IOV)
+supported by the xe driver.
+It allows enabling and provisioning VFs (Virtual Functions) and facilitates
+manipulation of VMs (Virtual Machines) running virtual GPUs.
+This includes starting and accessing the KVM/QEMU VMs,
+running workloads or shell commands (Guest/Host),
+handling power states, saving and restoring VF state etc.
+
+Requirements
+------------
+VMTB is implemented in Python using pytest testing framework.
+
+Host OS is expected to provide:
+- xe PF driver with SR-IOV support
+- VFIO driver (VF save/restore requires vendor specific driver variant)
+- QEMU (VF save/restore requires QEMU 8.1+)
+- IGT binaries
+- Python 3.11+ with pytest installed
+- VM Test Bench tool deployed
+
+Guest OS is expected to contain:
+- xe VF driver
+- QEMU Guest-Agent service for operating on Guest OS
+- IGT binaries to execute workloads on the VM
+
+The usual VMTB testing environment is based on Ubuntu 24.04 installed
+on Host and Guest, but execution on other distros should also be possible.
+
+Building
+--------
+The VMTB source distribution package can be built with:
+
+ python -m build --sdist
+
+that runs Python's `build` frontend
+in an isolated virtual environment (`venv`).
+
+The output tarball is created in the `dist/` subdirectory;
+it should be copied to and extracted on the host device under test.
+
+Running tests
+-------------
+Tests implemented by VM Test Bench are called VMM Flows and are located in
+`vmm_flows/` directory. Test files are prefixed with `test_` and encapsulate
+related validation scenarios. Each test file can contain multiple test classes
+(`TestXYZ`) or functions (`test_xyz`), that can be executed independently.
+
+Run the VMM Flows test in the following way (as root):
+
+ $ pytest-3 -v ./vmtb-1.0.0/vmm_flows/<test_file_name>.py::<test_class_or_function_name> --vm-image=/path/to/<guest_os.img>
+
+For example, the simplest 1xVF/VM test scenario can be executed as:
+
+ # sudo pytest-3 -v ./vmtb-1.0.0/vmm_flows/test_basic.py::TestVmSetup::test_vm_boot[2VF] --vm-image=/home/vmuser/guest_os.img
+
+(in case `pytest-3` command cannot be found, check with just `pytest`)
+
+Name of test class/function can be omitted to execute all tests in file.
+File name can also be omitted, then all tests in
+`vmm_flows` directory will be executed.
+
+Test log (including VM dmesg) is available in `logfile.log` output file.
+Test results are presented as a standard pytest output on a terminal.
+VM (Guest OS) can be accessed manually over VNC on [host_IP]:5900
+(where port is incremented for the consecutive VMs).
+
+Structure
+---------
+VMTB is divided into the following components:
+
+#### `bench/`
+Contains 'core' part of the tool, including Host, Device, Driver and
+Virtual Machine abstractions, means to execute workloads (or other tasks),
+various helper and configuration functions etc.
+VMTB utilizes QMP (QEMU Machine Protocol) to communicate and operate with VMs
+and QGA (QEMU Guest Agent) to interact with the Guest OS.
+
+#### `vmm_flows/`
+Contains actual functional VM-level tests (`test_*.py`)
+as well as setup and tear-down fixtures (`conftest.py`).
+New test files/scenarios shall be placed in this location.
diff --git a/vmtb/bench/__init__.py b/vmtb/bench/__init__.py
new file mode 100644
index 000000000..ed5d7527d
--- /dev/null
+++ b/vmtb/bench/__init__.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+import logging.config
+
+# Logging set-up, applied once when the `bench` package is imported.
+# The root logger accepts DEBUG and above and feeds two handlers:
+#  - "console": WARNING and above, written to stdout
+#  - "file": everything, written to a rotating `logfile.log`
+LOG_CONFIG = {
+    "version": 1,
+    "formatters": {
+        "detailed": {
+            "format": "%(asctime)s [%(levelname)s]: %(name)s (%(funcName)s:%(lineno)d) - %(message)s"
+        },
+        "simple": {"format": "%(levelname)s - %(message)s"},
+    },
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "detailed",
+            "level": "WARNING",
+            "stream": "ext://sys.stdout",
+        },
+        "file": {
+            "backupCount": 5,
+            "class": "logging.handlers.RotatingFileHandler",
+            "filename": "logfile.log",
+            "formatter": "detailed",
+            "maxBytes": 5 * 1024 * 1024,  # 5 MiB per log file
+        },
+    },
+    "root": {
+        "handlers": ["console", "file"],
+        "level": "DEBUG",
+    },
+}
+
+logging.config.dictConfig(LOG_CONFIG)
+
+logger = logging.getLogger('VmtbInit')
+
+# Start-up banner: INFO level, so it passes the file handler only
+for _banner_line in (
+    '###########################################',
+    '# VM Test Bench #',
+    '# SR-IOV VM-level validation suite #',
+    '###########################################',
+):
+    logger.info(_banner_line)
diff --git a/vmtb/bench/configurators/__init__.py b/vmtb/bench/configurators/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/configurators/pci.py b/vmtb/bench/configurators/pci.py
new file mode 100644
index 000000000..8e8afb138
--- /dev/null
+++ b/vmtb/bench/configurators/pci.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import enum
+import typing
+
+
+class GpuModel(str, enum.Enum):
+    """GPU models known to VMTB; each member's value is its display name."""
+
+    ATSM150 = 'Arctic Sound M150 (ATS-M1)'
+    ATSM75 = 'Arctic Sound M75 (ATS-M3)'
+    Unknown = 'Unknown'
+
+    def __str__(self) -> str:
+        # Render as the plain display name instead of 'GpuModel.<MEMBER>'
+        return self.value
+
+
+def get_gpu_model(pci_id: str) -> GpuModel:
+    """Map a PCI Device ID (matched case-insensitively) to its GPU model.
+
+    IDs missing from the `pci_ids` table yield GpuModel.Unknown.
+    """
+    normalized_id = pci_id.upper()
+    if normalized_id in pci_ids:
+        return pci_ids[normalized_id]
+    return GpuModel.Unknown
+
+
+def get_vgpu_profiles_file(gpu_model: GpuModel) -> str:
+    """Return the name of the vGPU profile definition JSON file for a GPU model.
+
+    Any model without a dedicated file (e.g. GpuModel.Unknown) yields 'N/A'.
+    """
+    if gpu_model == GpuModel.ATSM150:
+        return 'Flex170.json'
+    if gpu_model == GpuModel.ATSM75:
+        return 'Flex140.json'
+    # GpuModel.Unknown
+    return 'N/A'
+
+
+# PCI Device IDs: ATS-M150 (M1)
+_atsm150_pci_ids = dict.fromkeys(('56C0', '56C2'), GpuModel.ATSM150)
+
+
+# PCI Device IDs: ATS-M75 (M3)
+_atsm75_pci_ids = dict.fromkeys(('56C1',), GpuModel.ATSM75)
+
+
+# All PCI Device IDs to GPU Device Names mapping
+# (keys are upper-case hex strings; get_gpu_model() upper-cases its argument before the lookup)
+pci_ids: typing.Dict[str, GpuModel] = {}
+pci_ids.update(_atsm150_pci_ids)
+pci_ids.update(_atsm75_pci_ids)
diff --git a/vmtb/bench/configurators/vgpu_profile.py b/vmtb/bench/configurators/vgpu_profile.py
new file mode 100644
index 000000000..c4fa7ef39
--- /dev/null
+++ b/vmtb/bench/configurators/vgpu_profile.py
@@ -0,0 +1,264 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import json
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List
+
+from bench import exceptions
+
+logger = logging.getLogger('VgpuProfile')
+
+
+@dataclass
+class VgpuResourcesConfig:
+    """PF and VF resource values of a vGPU profile; every field defaults to 0."""
+
+    # PF resources (LMEM and GGTT are logged in bytes by VgpuProfile.print_parameters)
+    pfLmem: int = 0
+    pfContexts: int = 0
+    pfDoorbells: int = 0
+    pfGgtt: int = 0
+    # VF resources (same meaning as the PF fields above)
+    vfLmem: int = 0
+    vfContexts: int = 0
+    vfDoorbells: int = 0
+    vfGgtt: int = 0
+
+
+@dataclass
+class VgpuSchedulerConfig:
+    """Scheduling values of a vGPU profile; the defaults are False / 0."""
+
+    scheduleIfIdle: bool = False
+    # Execution quanta are logged in ms and preemption timeouts in us
+    # by VgpuProfile.print_parameters
+    pfExecutionQuanta: int = 0
+    pfPreemptionTimeout: int = 0
+    vfExecutionQuanta: int = 0
+    vfPreemptionTimeout: int = 0
+
+
+@dataclass
+class VgpuSecurityConfig:
+    """Security values of a vGPU profile; the defaults are False / 0."""
+
+    reset_after_vf_switch: bool = False
+    guc_sampling_period: int = 0
+    guc_threshold_cat_error: int = 0
+    guc_threshold_page_fault: int = 0
+    guc_threshold_h2g_storm: int = 0
+    guc_threshold_db_storm: int = 0
+    # 'treshold' (sic): the spelling is kept because other modules of this patch
+    # access the field under this exact name
+    guc_treshold_gt_irq_storm: int = 0
+    guc_threshold_engine_reset: int = 0
+
+
+@dataclass
+class VgpuProfile:
+    """Complete vGPU profile: number of VFs plus scheduler, resources and security settings."""
+
+    num_vfs: int = 0
+    # default_factory gives every profile its own (mutable) sub-config instances
+    scheduler: VgpuSchedulerConfig = field(default_factory=VgpuSchedulerConfig)
+    resources: VgpuResourcesConfig = field(default_factory=VgpuResourcesConfig)
+    security: VgpuSecurityConfig = field(default_factory=VgpuSecurityConfig)
+
+    def print_parameters(self) -> None:
+        """Log the profile at INFO level.
+
+        From the security settings only the reset-after-VF-switch flag is listed.
+        """
+        logger.info(
+            "\nvGPU Profile:\n"
+            " Num VFs = %s\n"
+            "\nResources:\n"
+            " PF:\n"
+            "\tLMEM = %s B\n"
+            "\tContexts = %s\n"
+            "\tDoorbells = %s\n"
+            "\tGGTT = %s B\n"
+            " VF:\n"
+            "\tLMEM = %s B\n"
+            "\tContexts = %s\n"
+            "\tDoorbells = %s\n"
+            "\tGGTT = %s B\n"
+            "\nScheduling:\n"
+            " Schedule If Idle = %s\n"
+            " PF:\n"
+            "\tExecution Quanta = %s ms\n"
+            "\tPreemption Timeout = %s us\n"
+            " VF:\n"
+            "\tExecution Quanta = %s ms\n"
+            "\tPreemption Timeout = %s us\n"
+            "\nSecurity:\n"
+            " Reset After Vf Switch = %s\n",
+            self.num_vfs,
+            self.resources.pfLmem, self.resources.pfContexts, self.resources.pfDoorbells, self.resources.pfGgtt,
+            self.resources.vfLmem, self.resources.vfContexts, self.resources.vfDoorbells, self.resources.vfGgtt,
+            self.scheduler.scheduleIfIdle,
+            self.scheduler.pfExecutionQuanta, self.scheduler.pfPreemptionTimeout,
+            self.scheduler.vfExecutionQuanta, self.scheduler.vfPreemptionTimeout,
+            self.security.reset_after_vf_switch
+        )
+
+
+# Structures for mapping vGPU profiles definition from JSON files
+@dataclass
+class VgpuProfilePfResourcesDefinition:
+    """One PF resources profile as read from the vGPU profiles JSON file."""
+
+    profile_name: str
+    local_memory_ecc_off: int  # JSON key 'LocalMemoryEccOff'
+    local_memory_ecc_on: int  # JSON key 'LocalMemoryEccOn'
+    contexts: int
+    doorbells: int
+    ggtt_size: int
+
+
+@dataclass
+class VgpuProfileVfResourcesDefinition:
+    """One VF resources profile as read from the vGPU profiles JSON file."""
+
+    profile_name: str
+    vf_count: int  # JSON key 'VFCount'
+    local_memory_ecc_off: int  # JSON key 'LocalMemoryEccOff'
+    local_memory_ecc_on: int  # JSON key 'LocalMemoryEccOn'
+    contexts: int
+    doorbells: int
+    ggtt_size: int
+
+
+@dataclass
+class VgpuProfileSchedulerDefinition:
+    """One scheduler profile as read from the vGPU profiles JSON file."""
+
+    profile_name: str = 'N/A'
+    schedule_if_idle: bool = False
+    pf_execution_quanta: int = 0
+    pf_preemption_timeout: int = 0
+    # The VF values are kept as strings, not numbers
+    vf_execution_quanta: str = ''  # To calculate based on number of VFs
+    vf_preemption_timeout: str = ''  # To calculate based on number of VFs
+
+
+@dataclass
+class VgpuProfileSecurityDefinition(VgpuSecurityConfig):
+    """One security profile as read from the vGPU profiles JSON file.
+
+    Adds the profile name to the inherited VgpuSecurityConfig fields.
+    """
+
+    # Declared in the subclass, so it is the LAST parameter of the generated __init__
+    profile_name: str = 'N/A'
+
+
+@dataclass
+class VgpuProfilesDefinitions:
+    """All profile definitions read from one vGPU profiles JSON file.
+
+    Each `*_default` field holds the name of the default profile of the list that follows it.
+    """
+
+    pf_resource_default: str
+    pf_resources: List[VgpuProfilePfResourcesDefinition]
+    vf_resource_default: str
+    vf_resources: List[VgpuProfileVfResourcesDefinition]
+    scheduler_config_default: str
+    scheduler_configs: List[VgpuProfileSchedulerDefinition]
+    security_config_default: str
+    security_configs: List[VgpuProfileSecurityDefinition]
+
+
+class VgpuProfilesJsonReader:
+    """Load a vGPU profiles definition JSON file into VgpuProfilesDefinitions.
+
+    The file is read and parsed by the constructor; the result is available
+    as the `vgpu_profiles` attribute.
+    """
+
+    def __init__(self, vgpu_json_path: Path) -> None:
+        vgpu_profile_data = self.read_json_file(vgpu_json_path)
+        self.vgpu_profiles: VgpuProfilesDefinitions = self.parse_json_file(vgpu_profile_data)
+
+    def read_json_file(self, vgpu_json_file: Path) -> Any:
+        """Read and decode the given JSON file.
+
+        Raises:
+            exceptions.VgpuProfileError: the file does not exist or is not valid JSON.
+        """
+        if not Path(vgpu_json_file).exists():
+            logger.error("vGPU profile JSON file not found: %s", vgpu_json_file)
+            raise exceptions.VgpuProfileError(f'vGPU profile JSON file not found: {vgpu_json_file}')
+
+        with open(vgpu_json_file, mode='r', encoding='utf-8') as json_file:
+            try:
+                vgpu_json = json.load(json_file)
+            except json.JSONDecodeError as exc:
+                logger.error("Invalid vGPU profile JSON format: %s", exc)
+                # Chain the decoder error so its line/column details stay in the traceback
+                raise exceptions.VgpuProfileError('Invalid vGPU profile defintion JSON format') from exc
+
+        return vgpu_json
+
+    def __parse_pf_resource_profiles(self, pf_profiles: Dict) -> List[VgpuProfilePfResourcesDefinition]:
+        """Convert the PF resources profiles (profile name -> attributes) into definitions."""
+        return [
+            VgpuProfilePfResourcesDefinition(
+                profile_name,
+                profile['LocalMemoryEccOff'],
+                profile['LocalMemoryEccOn'],
+                profile['Contexts'],
+                profile['Doorbells'],
+                profile['GGTTSize'])
+            for profile_name, profile in pf_profiles.items()
+        ]
+
+    def __parse_vf_resource_profiles(self, vf_profiles: Dict) -> List[VgpuProfileVfResourcesDefinition]:
+        """Convert the VF resources profiles (profile name -> attributes) into definitions."""
+        return [
+            VgpuProfileVfResourcesDefinition(
+                profile_name,
+                profile['VFCount'],
+                profile['LocalMemoryEccOff'],
+                profile['LocalMemoryEccOn'],
+                profile['Contexts'],
+                profile['Doorbells'],
+                profile['GGTTSize'])
+            for profile_name, profile in vf_profiles.items()
+        ]
+
+    def __parse_scheduler_profiles(self, scheduler_profiles: Dict) -> List[VgpuProfileSchedulerDefinition]:
+        """Convert the scheduler profiles (profile name -> attributes) into definitions.
+
+        All values are read from the profile's 'GPUTimeSlicing' section;
+        the VF ones from its nested 'VFAttributes' section.
+        """
+        scheduler_configs: List[VgpuProfileSchedulerDefinition] = []
+
+        for profile_name, profile in scheduler_profiles.items():
+            time_slicing = profile['GPUTimeSlicing']
+            schedule_if_idle = time_slicing['ScheduleIfIdle']
+            pf_eq = time_slicing['PFExecutionQuantum']
+            pf_pt = time_slicing['PFPreemptionTimeout']
+            vf_attributes = time_slicing['VFAttributes']
+
+            scheduler_configs.append(
+                VgpuProfileSchedulerDefinition(profile_name,
+                                               schedule_if_idle,
+                                               pf_eq, pf_pt,
+                                               vf_attributes['VFExecutionQuantum'],
+                                               vf_attributes['VFPreemptionTimeout']))
+
+        return scheduler_configs
+
+    def __parse_security_profiles(self, security_profiles: Dict) -> List[VgpuProfileSecurityDefinition]:
+        """Convert the security profiles (profile name -> attributes) into definitions."""
+        # Keyword arguments are used on purpose: in VgpuProfileSecurityDefinition the
+        # inherited VgpuSecurityConfig fields come first and profile_name comes last,
+        # which makes a positional call easy to get wrong.
+        return [
+            VgpuProfileSecurityDefinition(
+                reset_after_vf_switch=profile['ResetAfterVfSwitch'],
+                guc_sampling_period=profile['GuCSamplingPeriod'],
+                guc_threshold_cat_error=profile['GuCThresholdCATError'],
+                guc_threshold_page_fault=profile['GuCThresholdPageFault'],
+                guc_threshold_h2g_storm=profile['GuCThresholdH2GStorm'],
+                guc_threshold_db_storm=profile['GuCThresholdDbStorm'],
+                guc_treshold_gt_irq_storm=profile['GuCThresholdGTIrqStorm'],
+                guc_threshold_engine_reset=profile['GuCThresholdEngineReset'],
+                profile_name=profile_name)
+            for profile_name, profile in security_profiles.items()
+        ]
+
+    def parse_json_file(self, vgpu_json: Dict) -> VgpuProfilesDefinitions:
+        """Build VgpuProfilesDefinitions from the decoded JSON document.
+
+        The sections 'PFResources', 'vGPUResources', 'vGPUScheduler' and 'vGPUSecurity'
+        are each expected to hold a 'Default' profile name and a 'Profile' mapping;
+        a missing key raises KeyError.
+        """
+        pf_section = vgpu_json['PFResources']
+        pf_resource_default = pf_section['Default']
+        pf_resources = self.__parse_pf_resource_profiles(pf_section['Profile'])
+
+        vf_section = vgpu_json['vGPUResources']
+        vf_resource_default = vf_section['Default']
+        vf_resources = self.__parse_vf_resource_profiles(vf_section['Profile'])
+
+        scheduler_section = vgpu_json['vGPUScheduler']
+        scheduler_default = scheduler_section['Default']
+        scheduler_configs = self.__parse_scheduler_profiles(scheduler_section['Profile'])
+
+        security_section = vgpu_json['vGPUSecurity']
+        security_default = security_section['Default']
+        security_configs = self.__parse_security_profiles(security_section['Profile'])
+
+        return VgpuProfilesDefinitions(pf_resource_default, pf_resources, vf_resource_default, vf_resources,
+                                       scheduler_default, scheduler_configs, security_default, security_configs)
diff --git a/vmtb/bench/configurators/vgpu_profile_config.py b/vmtb/bench/configurators/vgpu_profile_config.py
new file mode 100644
index 000000000..6a4ef0334
--- /dev/null
+++ b/vmtb/bench/configurators/vgpu_profile_config.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+from enum import Enum
+from pathlib import Path
+
+from bench import exceptions
+from bench.configurators.pci import GpuModel, get_vgpu_profiles_file
+from bench.configurators.vgpu_profile import (VgpuProfile,
+ VgpuProfilesDefinitions,
+ VgpuProfilesJsonReader,
+ VgpuResourcesConfig,
+ VgpuSchedulerConfig,
+ VgpuSecurityConfig)
+
+logger = logging.getLogger('DeviceConfigurator')
+
+
+class VfSchedulingMode(str, Enum):
+    """Selectable VF scheduling modes; member values are plain strings."""
+
+    INFINITE = 'Infinite'  # Infinite EQ/PT - HW default
+    DEFAULT_PROFILE = 'Default_Profile'  # Default vGPU scheduler profile
+    FLEXIBLE_30FPS = 'Flexible_30fps_GPUTimeSlicing'
+    FIXED_30FPS = 'Fixed_30fps_GPUTimeSlicing'
+    FLEXIBLE_BURSTABLE_QOS = 'Flexible_BurstableQoS_GPUTimeSlicing'
+
+    def __str__(self) -> str:
+        # Render as the plain value instead of 'VfSchedulingMode.<MEMBER>'
+        return self.value
+
+
+class VgpuProfileConfigurator:
+    """Select vGPU profile settings (resources, scheduler, security) for a GPU model.
+
+    The profile definitions of the given GPU model are loaded from a JSON file
+    in `vgpu_profiles_dir` when the configurator is constructed.
+    """
+
+    def __init__(self, vgpu_profiles_dir: Path, gpu_model: GpuModel = GpuModel.Unknown) -> None:
+        self.gpu_model: GpuModel = gpu_model
+        self.vgpu_profiles_dir: Path = vgpu_profiles_dir
+        self.supported_vgpu_profiles: VgpuProfilesDefinitions = self.query_vgpu_profiles()
+
+    def __helper_create_vgpu_json_path(self, vgpu_resource_dir: Path) -> Path:
+        """Return the path of the profiles JSON file for `self.gpu_model`.
+
+        Raises:
+            exceptions.VgpuProfileError: the file does not exist in `vgpu_resource_dir`.
+        """
+        vgpu_device_file = get_vgpu_profiles_file(self.gpu_model)
+        vgpu_json_file_path = vgpu_resource_dir / vgpu_device_file
+
+        if not vgpu_json_file_path.exists():
+            logger.error("vGPU profiles JSON file not found in %s", vgpu_resource_dir)
+            raise exceptions.VgpuProfileError(f'vGPU profiles JSON file not found in {vgpu_resource_dir}')
+
+        return vgpu_json_file_path
+
+    def query_vgpu_profiles(self) -> VgpuProfilesDefinitions:
+        """Get all vGPU profiles supported for a given GPU device."""
+        json_reader = VgpuProfilesJsonReader(self.__helper_create_vgpu_json_path(self.vgpu_profiles_dir))
+        return json_reader.vgpu_profiles
+
+    def select_vgpu_resources_profile(self, requested_num_vfs: int) -> VgpuResourcesConfig:
+        """Find vGPU profile matching requested number of VFs.
+
+        In case exact match cannot be found, try to fit the closest profile with up to 2 more VFs,
+        for example:
+        - if requested profile with 3 VFs is not available, return close config with 4 VFs.
+        - if requested profile with neither 9 VFs, nor with 10 or 11 VFs is available,
+          raise a 'not found' exception.
+
+        PF values are taken from the default PF resources profile.
+
+        Raises:
+            exceptions.VgpuProfileError: no exact or similar VF resources profile exists.
+        """
+        vgpu_resources_config = VgpuResourcesConfig()
+
+        for pf_resource in self.supported_vgpu_profiles.pf_resources:
+            if pf_resource.profile_name == self.supported_vgpu_profiles.pf_resource_default:
+                vgpu_resources_config.pfLmem = pf_resource.local_memory_ecc_on
+                vgpu_resources_config.pfContexts = pf_resource.contexts
+                vgpu_resources_config.pfDoorbells = pf_resource.doorbells
+                vgpu_resources_config.pfGgtt = pf_resource.ggtt_size
+
+        vf_resources = self.supported_vgpu_profiles.vf_resources
+
+        # An exact match always wins, wherever it is placed in the definitions list
+        selected = next((vf for vf in vf_resources if vf.vf_count == requested_num_vfs), None)
+
+        if selected is None:
+            # Otherwise fall back to the closest profile with up to 2 more VFs
+            similar = [vf for vf in vf_resources
+                       if requested_num_vfs < vf.vf_count <= requested_num_vfs + 2]
+            if similar:
+                selected = min(similar, key=lambda vf: vf.vf_count)
+                logger.debug("Unable to find accurate vGPU profile but have similar: %s", selected.profile_name)
+
+        if selected is None:
+            logger.error("vGPU VF resources profile %sxVF not found!", requested_num_vfs)
+            raise exceptions.VgpuProfileError(f'vGPU VF resources profile {requested_num_vfs}xVF not found!')
+
+        vgpu_resources_config.vfLmem = selected.local_memory_ecc_on
+        vgpu_resources_config.vfContexts = selected.contexts
+        vgpu_resources_config.vfDoorbells = selected.doorbells
+        vgpu_resources_config.vfGgtt = selected.ggtt_size
+
+        return vgpu_resources_config
+
+    def select_vgpu_scheduler_profile(self, requested_num_vfs: int,
+                                      requested_scheduler: VfSchedulingMode) -> VgpuSchedulerConfig:
+        """Return scheduler settings of the requested profile for the given number of VFs.
+
+        VfSchedulingMode.INFINITE yields the default (all-zero) VgpuSchedulerConfig.
+        The same default config is returned, with a warning logged, when the requested
+        profile is not among the supported scheduler profiles.
+        """
+        # Function eval is needed to calculate VF EQ/PT for num_vfs
+        # Disable eval warning
+        # pylint: disable=W0123
+        vgpu_scheduler_config = VgpuSchedulerConfig()
+
+        if requested_scheduler is VfSchedulingMode.INFINITE:
+            return vgpu_scheduler_config
+
+        for scheduler in self.supported_vgpu_profiles.scheduler_configs:
+            if scheduler.profile_name != requested_scheduler:
+                continue
+
+            vgpu_scheduler_config.scheduleIfIdle = scheduler.schedule_if_idle
+            vgpu_scheduler_config.pfExecutionQuanta = scheduler.pf_execution_quanta
+            vgpu_scheduler_config.pfPreemptionTimeout = scheduler.pf_preemption_timeout
+
+            # NOTE(review): eval() executes the expression strings stored in the profile
+            # definition (each must evaluate to a callable taking the number of VFs).
+            # The definitions must come from a trusted source only.
+            lambda_vf_eq = eval(scheduler.vf_execution_quanta)
+            vgpu_scheduler_config.vfExecutionQuanta = lambda_vf_eq(requested_num_vfs)
+
+            lambda_vf_pt = eval(scheduler.vf_preemption_timeout)
+            vgpu_scheduler_config.vfPreemptionTimeout = lambda_vf_pt(requested_num_vfs)
+            break
+        else:
+            # Do not fail, but do not hide the fallback either
+            logger.warning("vGPU scheduler profile %s not found - default (zero) EQ/PT values are used",
+                           requested_scheduler)
+
+        return vgpu_scheduler_config
+
+    def select_vgpu_security_profile(self) -> VgpuSecurityConfig:
+        """Return security settings copied from the default security profile.
+
+        If no profile carries the default name, an all-default VgpuSecurityConfig is returned.
+        """
+        # Currently supports only default security profile
+        vgpu_security_config = VgpuSecurityConfig()
+
+        for security_profile in self.supported_vgpu_profiles.security_configs:
+            if security_profile.profile_name == self.supported_vgpu_profiles.security_config_default:
+                vgpu_security_config.reset_after_vf_switch = security_profile.reset_after_vf_switch
+                vgpu_security_config.guc_sampling_period = security_profile.guc_sampling_period
+                vgpu_security_config.guc_threshold_cat_error = security_profile.guc_threshold_cat_error
+                vgpu_security_config.guc_threshold_page_fault = security_profile.guc_threshold_page_fault
+                vgpu_security_config.guc_threshold_h2g_storm = security_profile.guc_threshold_h2g_storm
+                vgpu_security_config.guc_threshold_db_storm = security_profile.guc_threshold_db_storm
+                vgpu_security_config.guc_treshold_gt_irq_storm = security_profile.guc_treshold_gt_irq_storm
+                vgpu_security_config.guc_threshold_engine_reset = security_profile.guc_threshold_engine_reset
+
+        return vgpu_security_config
+
+    def get_vgpu_profile(self, requested_num_vfs: int, requested_scheduler: VfSchedulingMode) -> VgpuProfile:
+        """Get vGPU profile for requested number of VFs, scheduler and security modes.
+
+        VfSchedulingMode.DEFAULT_PROFILE is resolved to the scheduler profile named as
+        default in the definitions; that name must be a VfSchedulingMode value,
+        otherwise the enum lookup raises ValueError.
+        """
+        logger.info("Requested vGPU profile: %s VFs / scheduling: %s", requested_num_vfs, requested_scheduler)
+
+        vgpu_profile: VgpuProfile = VgpuProfile()
+        vgpu_profile.num_vfs = requested_num_vfs
+        vgpu_profile.resources = self.select_vgpu_resources_profile(requested_num_vfs)
+
+        if requested_scheduler is VfSchedulingMode.DEFAULT_PROFILE:
+            requested_scheduler = VfSchedulingMode(self.supported_vgpu_profiles.scheduler_config_default)
+
+        vgpu_profile.scheduler = self.select_vgpu_scheduler_profile(requested_num_vfs, requested_scheduler)
+        vgpu_profile.security = self.select_vgpu_security_profile()
+
+        return vgpu_profile
diff --git a/vmtb/bench/configurators/vmtb_config.py b/vmtb/bench/configurators/vmtb_config.py
new file mode 100644
index 000000000..49dde4589
--- /dev/null
+++ b/vmtb/bench/configurators/vmtb_config.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict
+
+from bench import exceptions
+
+logger = logging.getLogger('VmtbConfigurator')
+
+
+@dataclass
+class VmtbIgtConfig:
+    """The 'igt' section of a host or guest entry in the VMTB configuration file."""
+
+    test_dir: str
+    tool_dir: str
+    lib_dir: str
+    result_dir: str
+    options: str
+
+
+@dataclass
+class VmtbHostConfig:
+    """The 'host' section of the VMTB configuration file."""
+
+    card_index: int
+    driver: str
+    igt_config: VmtbIgtConfig
+
+
+@dataclass
+class VmtbGuestConfig:
+    """The 'guest' section of the VMTB configuration file."""
+
+    os_image_path: str  # JSON key 'os_image'
+    driver: str
+    igt_config: VmtbIgtConfig
+
+
+@dataclass
+class VmtbConfig:
+    """Complete VMTB configuration as read from the configuration JSON file."""
+
+    host_config: VmtbHostConfig
+    guest_config: VmtbGuestConfig
+    vgpu_profiles_path: str  # JSON key 'resources' / 'vgpu_profiles_path'
+    guc_ver_path: str  # JSON key 'resources' / 'guc_ver_path'
+    ci_host_dmesg_file: str  # JSON key 'ci' / 'host_dmesg_file'
+
+
+class VmtbConfigurator:
+    """Holds the VMTB configuration, loaded from a JSON file at construction time."""
+
+    def __init__(self, vmtb_config_file_path: Path) -> None:
+        self.vmtb_config_file: Path = vmtb_config_file_path
+        self.config: VmtbConfig = self.query_vmtb_config()
+
+    def query_vmtb_config(self) -> VmtbConfig:
+        """Parse `self.vmtb_config_file` and return the resulting configuration."""
+        return VmtbConfigJsonReader(self.vmtb_config_file).vmtb_config
+
+    def get_host_config(self) -> VmtbHostConfig:
+        """Return the host part of the loaded configuration."""
+        host_part = self.config.host_config
+        return host_part
+
+    def get_guest_config(self) -> VmtbGuestConfig:
+        """Return the guest part of the loaded configuration."""
+        guest_part = self.config.guest_config
+        return guest_part
+
+
+class VmtbConfigJsonReader:
+    """Load the VMTB configuration JSON file into a VmtbConfig.
+
+    The file is read and parsed by the constructor; the result is available
+    as the `vmtb_config` attribute.
+    """
+
+    def __init__(self, config_json_path: Path) -> None:
+        config_data = self.read_json_file(config_json_path)
+        self.vmtb_config: VmtbConfig = self.parse_json_file(config_data)
+
+    def read_json_file(self, config_json_file: Path) -> Any:
+        """Read and decode the given JSON file.
+
+        Raises:
+            exceptions.VmtbConfigError: the file does not exist or is not valid JSON.
+        """
+        if not config_json_file.exists():
+            logger.error("VMTB config JSON file not found: %s", config_json_file)
+            raise exceptions.VmtbConfigError(f'VMTB config JSON file not found: {config_json_file}')
+
+        with open(config_json_file, mode='r', encoding='utf-8') as json_file:
+            try:
+                config_json = json.load(json_file)
+            except json.JSONDecodeError as exc:
+                logger.error("Invalid VMTB config JSON format: %s", exc)
+                # Chain the decoder error so the original traceback is preserved
+                raise exceptions.VmtbConfigError(f'Invalid VMTB config JSON format: {exc}') from exc
+
+        return config_json
+
+    def get_igt_config(self, igt_config_json: Dict) -> VmtbIgtConfig:
+        """Build VmtbIgtConfig from the 'igt' entry of a host or guest section."""
+        igt_section = igt_config_json['igt']
+
+        return VmtbIgtConfig(
+            test_dir=igt_section['test_dir'],
+            tool_dir=igt_section['tool_dir'],
+            lib_dir=igt_section['lib_dir'],
+            result_dir=igt_section['result_dir'],
+            options=igt_section['options'])
+
+    def parse_json_file(self, config_json: Dict) -> VmtbConfig:
+        """Build VmtbConfig from the decoded JSON document.
+
+        The top-level sections 'host', 'guest', 'resources' and 'ci' are required;
+        a missing key raises KeyError.
+        """
+        host_section = config_json['host']
+        vmtb_host_config = VmtbHostConfig(
+            card_index=host_section['card_index'],
+            driver=host_section['driver'],
+            igt_config=self.get_igt_config(host_section))
+
+        guest_section = config_json['guest']
+        vmtb_guest_config = VmtbGuestConfig(
+            os_image_path=guest_section['os_image'],
+            driver=guest_section['driver'],
+            igt_config=self.get_igt_config(guest_section))
+
+        return VmtbConfig(
+            host_config=vmtb_host_config,
+            guest_config=vmtb_guest_config,
+            vgpu_profiles_path=config_json['resources']['vgpu_profiles_path'],
+            guc_ver_path=config_json['resources']['guc_ver_path'],
+            ci_host_dmesg_file=config_json['ci']['host_dmesg_file'])
diff --git a/vmtb/bench/drivers/__init__.py b/vmtb/bench/drivers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/drivers/driver_interface.py b/vmtb/bench/drivers/driver_interface.py
new file mode 100644
index 000000000..af2f96837
--- /dev/null
+++ b/vmtb/bench/drivers/driver_interface.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import abc
+import enum
+import typing
+
+
@enum.unique
class SchedulingPriority(enum.Enum):
    """Scheduling priority levels used by the PF sched_priority accessors."""

    LOW = 0
    NORMAL = 1
    HIGH = 2
+
+
class VfControl(str, enum.Enum):
    """VF control commands; each member is also a plain ``str`` with the command text."""

    pause = 'pause'
    resume = 'resume'
    stop = 'stop'
    clear = 'clear'

    def __str__(self) -> str:
        # Render as the bare command text rather than 'VfControl.<name>'.
        return self.value
+
+
class DriverInterface(abc.ABC):
    """Abstract contract for a GPU driver backend with SR-IOV controls.

    Conventions used by the accessors below: ``gt_num`` is a GT instance
    number and ``vf_num`` is a VF number (1-based), with 0 denoting the PF.
    """

    # --- Driver identity and binding ---

    @staticmethod
    @abc.abstractmethod
    def get_name() -> str:
        """Return the driver name."""
        raise NotImplementedError

    @abc.abstractmethod
    def bind(self, bdf: str) -> None:
        """Bind the driver to the device given by ``bdf``."""
        raise NotImplementedError

    @abc.abstractmethod
    def unbind(self, bdf: str) -> None:
        """Unbind the driver from the device given by ``bdf``."""
        raise NotImplementedError

    # --- SR-IOV device attributes ---

    @abc.abstractmethod
    def get_totalvfs(self) -> int:
        """Return the device's total VFs value."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_numvfs(self) -> int:
        """Return the current number of VFs."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_numvfs(self, val: int) -> None:
        """Set the number of VFs to ``val``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_drivers_autoprobe(self) -> int:
        """Return the drivers autoprobe setting."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_drivers_autoprobe(self, val: int) -> None:
        """Set the drivers autoprobe setting to ``val``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_num_gts(self) -> int:
        """Return the number of GTs."""
        raise NotImplementedError

    @abc.abstractmethod
    def has_lmem(self) -> bool:
        """Return True if LMEM is present."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_auto_provisioning(self) -> bool:
        """Return the auto-provisioning setting."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_auto_provisioning(self, val: bool) -> None:
        """Set the auto-provisioning setting to ``val``."""
        raise NotImplementedError

    @abc.abstractmethod
    def cancel_work(self) -> None:
        """Cancel remaining work on the GPU."""
        raise NotImplementedError

    # --- PF spare resources (per GT) ---

    @abc.abstractmethod
    def get_pf_ggtt_spare(self, gt_num: int) -> int:
        """Return the PF GGTT spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_ggtt_spare(self, gt_num: int, val: int) -> None:
        """Set the PF GGTT spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_lmem_spare(self, gt_num: int) -> int:
        """Return the PF LMEM spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_lmem_spare(self, gt_num: int, val: int) -> None:
        """Set the PF LMEM spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_contexts_spare(self, gt_num: int) -> int:
        """Return the PF contexts spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_contexts_spare(self, gt_num: int, val: int) -> None:
        """Set the PF contexts spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_doorbells_spare(self, gt_num: int) -> int:
        """Return the PF doorbells spare value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_doorbells_spare(self, gt_num: int, val: int) -> None:
        """Set the PF doorbells spare value for ``gt_num``."""
        raise NotImplementedError

    # --- PF scheduling priority and policies (per GT) ---

    @abc.abstractmethod
    def get_pf_sched_priority(self, gt_num: int) -> SchedulingPriority:
        """Return the PF scheduling priority for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_sched_priority(self, gt_num: int, val: SchedulingPriority) -> None:
        """Set the PF scheduling priority for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_policy_reset_engine(self, gt_num: int) -> int:
        """Return the PF 'reset_engine' policy value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_policy_reset_engine(self, gt_num: int, val: int) -> None:
        """Set the PF 'reset_engine' policy value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_policy_sample_period_ms(self, gt_num: int) -> int:
        """Return the PF 'sample_period_ms' policy value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_policy_sample_period_ms(self, gt_num: int, val: int) -> None:
        """Set the PF 'sample_period_ms' policy value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_policy_sched_if_idle(self, gt_num: int) -> int:
        """Return the PF 'sched_if_idle' policy value for ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_policy_sched_if_idle(self, gt_num: int, val: int) -> None:
        """Set the PF 'sched_if_idle' policy value for ``gt_num``."""
        raise NotImplementedError

    # --- VF/PF provisioning parameters (per VF and GT) ---

    @abc.abstractmethod
    def get_ggtt_quota(self, vf_num: int, gt_num: int) -> int:
        """Return the GGTT quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_ggtt_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the GGTT quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_lmem_quota(self, vf_num: int, gt_num: int) -> int:
        """Return the LMEM quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_lmem_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the LMEM quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_contexts_quota(self, vf_num: int, gt_num: int) -> int:
        """Return the contexts quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_contexts_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the contexts quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_doorbells_quota(self, vf_num: int, gt_num: int) -> int:
        """Return the doorbells quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_doorbells_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the doorbells quota of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_exec_quantum_ms(self, vf_num: int, gt_num: int) -> int:
        """Return the 'exec_quantum_ms' value of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_exec_quantum_ms(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the 'exec_quantum_ms' value of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_preempt_timeout_us(self, vf_num: int, gt_num: int) -> int:
        """Return the 'preempt_timeout_us' value of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def set_preempt_timeout_us(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the 'preempt_timeout_us' value of ``vf_num`` on ``gt_num``."""
        raise NotImplementedError

    # --- VF control and resource availability ---

    @abc.abstractmethod
    def set_vf_control(self, vf_num: int, val: VfControl) -> None:
        """Send the control command ``val`` to ``vf_num``."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_ggtt_available(self, gt_num: int) -> typing.Tuple[int, int]:
        """Return the (total, available) GGTT size for ``gt_num``."""
        raise NotImplementedError
diff --git a/vmtb/bench/drivers/xe.py b/vmtb/bench/drivers/xe.py
new file mode 100644
index 000000000..009cec5be
--- /dev/null
+++ b/vmtb/bench/drivers/xe.py
@@ -0,0 +1,307 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+import typing
+from pathlib import Path
+
+from bench import exceptions
+from bench.drivers.driver_interface import (DriverInterface,
+ SchedulingPriority, VfControl)
+from bench.helpers.log import LogDecorators
+
+logger = logging.getLogger('XeDriver')
+
+
class XeDriver(DriverInterface):
    """DriverInterface implementation for the xe driver.

    Device attributes are accessed under ``/sys/class/drm/card<N>/device``
    (sysfs) and IOV provisioning attributes under
    ``/sys/kernel/debug/dri/<N>`` (debugfs). Every file access goes through
    ``__read_fs``/``__write_fs``, which turn failures into
    ``exceptions.HostError`` and are wrapped by ``LogDecorators.parse_kmsg``.
    """

    def __init__(self, card_index: int) -> None:
        self.sysfs_card_path = Path(f'/sys/class/drm/card{card_index}')
        self.debugfs_path = Path(f'/sys/kernel/debug/dri/{card_index}')

    @staticmethod
    def get_name() -> str:
        """Return the driver name ('xe')."""
        return 'xe'

    # ------------------------------------------------------------------
    # Low-level file access
    # ------------------------------------------------------------------
    @LogDecorators.parse_kmsg
    def __write_fs(self, base_path: Path, name: str, value: str) -> None:
        """Write ``value`` to ``base_path/name``; raise HostError on failure."""
        path = base_path / name
        try:
            path.write_text(value)
            logger.debug("Write: %s -> %s", value, path)
        except Exception as exc:
            logger.error("Unable to write %s -> %s", value, path)
            raise exceptions.HostError(f'Could not write to {path}. Error: {exc}') from exc

    @LogDecorators.parse_kmsg
    def __read_fs(self, base_path: Path, name: str) -> str:
        """Return the text of ``base_path/name``; raise HostError on failure."""
        path = base_path / name
        try:
            ret = path.read_text()
        except Exception as exc:
            logger.error("Unable to read %s", path)
            raise exceptions.HostError(f'Could not read from {path}. Error: {exc}') from exc

        logger.debug("Read: %s -> %s", path, ret.strip())
        return ret

    def __write_sysfs(self, name: str, value: str) -> None:
        self.__write_fs(self.sysfs_card_path / 'device', name, value)

    def __read_sysfs(self, name: str) -> str:
        return str(self.__read_fs(self.sysfs_card_path / 'device', name))

    def __write_debugfs(self, name: str, value: str) -> None:
        self.__write_fs(self.debugfs_path, name, value)

    def __read_debugfs(self, name: str) -> str:
        return str(self.__read_fs(self.debugfs_path, name))

    # ------------------------------------------------------------------
    # sysfs device attributes
    # ------------------------------------------------------------------
    def bind(self, bdf: str) -> None:
        self.__write_sysfs('driver/bind', bdf)

    def unbind(self, bdf: str) -> None:
        self.__write_sysfs('driver/unbind', bdf)

    def get_totalvfs(self) -> int:
        return int(self.__read_sysfs('sriov_totalvfs'))

    def get_numvfs(self) -> int:
        return int(self.__read_sysfs('sriov_numvfs'))

    def set_numvfs(self, val: int) -> None:
        self.__write_sysfs('sriov_numvfs', str(val))

    def get_drivers_autoprobe(self) -> int:
        return int(self.__read_sysfs('sriov_drivers_autoprobe'))

    def set_drivers_autoprobe(self, val: int) -> None:
        self.__write_sysfs('sriov_drivers_autoprobe', str(val))

    def get_num_gts(self) -> int:
        """Return the number of GT directories found under tile0.

        A plain ``tile0/gt`` entry counts as one GT; otherwise consecutive
        ``tile0/gt0``, ``tile0/gt1``, ... entries are counted.
        """
        # FIXME: tile0 only at the moment, add support for multiple tiles if needed
        path = self.sysfs_card_path / 'device' / 'tile0' / 'gt'
        if path.exists():
            return 1

        gt_num = 0
        while Path(f'{path}{gt_num}').exists():
            gt_num += 1
        return gt_num

    def has_lmem(self) -> bool:
        """Return True if the gt0 PF 'lmem_spare' debugfs attribute exists."""
        # XXX: is this the best way to check if LMEM is present?
        return (self.debugfs_path / 'gt0' / 'pf' / 'lmem_spare').exists()

    def get_auto_provisioning(self) -> bool:
        raise exceptions.NotAvailableError('auto_provisioning attribute not available')

    def set_auto_provisioning(self, val: bool) -> None:
        raise exceptions.NotAvailableError('auto_provisioning attribute not available')

    def cancel_work(self) -> None:
        """Cancel all remaining work on the GPU (for test cleanup); currently a no-op.

        Forcing reset (debugfs/gtM/force_reset_sync) shouldn't be used to idle GPU.
        """

    # ------------------------------------------------------------------
    # debugfs IOV attribute helpers
    # ------------------------------------------------------------------
    def __helper_create_debugfs_path(self, vf_num: int, gt_num: int, subdir: str, attr: str) -> str:
        """Create the debugfs path (without the base part) to an IOV attribute.

        Layout: gt<gt_num>/[pf|vf<vf_num>]/[<subdir>/]<attr>

        Args:
            vf_num: VF number (1-based) or 0 for PF.
            gt_num: GT instance number.
            subdir: Subdirectory of the attribute, or an empty string if none.
            attr: IOV parameter name.
        """
        function_dir = 'pf' if vf_num == 0 else f'vf{vf_num}'
        # Skip empty components so an empty subdir does not produce 'pf//attr'.
        return '/'.join(part for part in (f'gt{gt_num}', function_dir, subdir, attr) if part)

    def __read_iov_int(self, vf_num: int, gt_num: int, attr: str) -> int:
        """Read an IOV attribute and return it as an int."""
        return int(self.__read_debugfs(self.__helper_create_debugfs_path(vf_num, gt_num, '', attr)))

    def __write_iov(self, vf_num: int, gt_num: int, attr: str, val: typing.Any) -> None:
        """Write ``str(val)`` to an IOV attribute."""
        self.__write_debugfs(self.__helper_create_debugfs_path(vf_num, gt_num, '', attr), str(val))

    def __read_vf_quota(self, vf_num: int, gt_num: int, attr: str) -> int:
        """Read a quota attribute; for the PF (vf_num == 0) warn and return 0."""
        if vf_num == 0:
            logger.warning("PF %s not available", attr)
            return 0
        return self.__read_iov_int(vf_num, gt_num, attr)

    def __write_vf_quota(self, vf_num: int, gt_num: int, attr: str, val: int) -> None:
        """Write a quota attribute; for the PF (vf_num == 0) warn and do nothing."""
        if vf_num == 0:
            logger.warning("PF %s not available", attr)
            return
        self.__write_iov(vf_num, gt_num, attr, val)

    # ------------------------------------------------------------------
    # PF spare resources
    # Debugfs location: [SRIOV debugfs base path]/gtM/pf/xxx_spare
    # ------------------------------------------------------------------
    def get_pf_ggtt_spare(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'ggtt_spare')

    def set_pf_ggtt_spare(self, gt_num: int, val: int) -> None:
        self.__write_iov(0, gt_num, 'ggtt_spare', val)

    def get_pf_lmem_spare(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'lmem_spare')

    def set_pf_lmem_spare(self, gt_num: int, val: int) -> None:
        self.__write_iov(0, gt_num, 'lmem_spare', val)

    def get_pf_contexts_spare(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'contexts_spare')

    def set_pf_contexts_spare(self, gt_num: int, val: int) -> None:
        self.__write_iov(0, gt_num, 'contexts_spare', val)

    def get_pf_doorbells_spare(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'doorbells_spare')

    def set_pf_doorbells_spare(self, gt_num: int, val: int) -> None:
        self.__write_iov(0, gt_num, 'doorbells_spare', val)

    # ------------------------------------------------------------------
    # PF specific provisioning parameters
    # Debugfs location: [SRIOV debugfs base path]/gtM/pf
    # ------------------------------------------------------------------
    def get_pf_sched_priority(self, gt_num: int) -> SchedulingPriority:
        """Not available here: log a warning and return SchedulingPriority.LOW."""
        logger.warning("PF sched_priority param not available")
        return SchedulingPriority.LOW

    def set_pf_sched_priority(self, gt_num: int, val: SchedulingPriority) -> None:
        """Not available here: log a warning and do nothing."""
        logger.warning("PF sched_priority param not available")

    def get_pf_policy_reset_engine(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'reset_engine')

    def set_pf_policy_reset_engine(self, gt_num: int, val: int) -> None:
        self.__write_iov(0, gt_num, 'reset_engine', val)

    def get_pf_policy_sample_period_ms(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'sample_period_ms')

    def set_pf_policy_sample_period_ms(self, gt_num: int, val: int) -> None:
        self.__write_iov(0, gt_num, 'sample_period_ms', val)

    def get_pf_policy_sched_if_idle(self, gt_num: int) -> int:
        return self.__read_iov_int(0, gt_num, 'sched_if_idle')

    def set_pf_policy_sched_if_idle(self, gt_num: int, val: int) -> None:
        # In order to set strict scheduling policy, PF scheduling priority needs to be default
        self.__write_iov(0, gt_num, 'sched_if_idle', val)

    # ------------------------------------------------------------------
    # VF and PF provisioning parameters
    # Debugfs location: [SRIOV debugfs base path]/gtM/[pf|vfN]
    # vf_num: VF number (1-based) or 0 for PF
    # ------------------------------------------------------------------
    def get_ggtt_quota(self, vf_num: int, gt_num: int) -> int:
        return self.__read_vf_quota(vf_num, gt_num, 'ggtt_quota')

    def set_ggtt_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__write_vf_quota(vf_num, gt_num, 'ggtt_quota', val)

    def get_lmem_quota(self, vf_num: int, gt_num: int) -> int:
        """Return the VF LMEM quota, or 0 for the PF or when LMEM is absent."""
        if vf_num != 0 and not self.has_lmem():
            return 0
        return self.__read_vf_quota(vf_num, gt_num, 'lmem_quota')

    def set_lmem_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        """Set the VF LMEM quota; no-op for the PF or when LMEM is absent."""
        if vf_num != 0 and not self.has_lmem():
            return
        self.__write_vf_quota(vf_num, gt_num, 'lmem_quota', val)

    def get_contexts_quota(self, vf_num: int, gt_num: int) -> int:
        return self.__read_vf_quota(vf_num, gt_num, 'contexts_quota')

    def set_contexts_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__write_vf_quota(vf_num, gt_num, 'contexts_quota', val)

    def get_doorbells_quota(self, vf_num: int, gt_num: int) -> int:
        return self.__read_vf_quota(vf_num, gt_num, 'doorbells_quota')

    def set_doorbells_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__write_vf_quota(vf_num, gt_num, 'doorbells_quota', val)

    def get_exec_quantum_ms(self, vf_num: int, gt_num: int) -> int:
        return self.__read_iov_int(vf_num, gt_num, 'exec_quantum_ms')

    def set_exec_quantum_ms(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__write_iov(vf_num, gt_num, 'exec_quantum_ms', val)

    def get_preempt_timeout_us(self, vf_num: int, gt_num: int) -> int:
        return self.__read_iov_int(vf_num, gt_num, 'preempt_timeout_us')

    def set_preempt_timeout_us(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__write_iov(vf_num, gt_num, 'preempt_timeout_us', val)

    def set_vf_control(self, vf_num: int, val: VfControl) -> None:
        """Control state of the running VF (write-only attribute).

        Debugfs location: [SRIOV debugfs base path]/gtM/vfN/control
        Allows PF admin to pause, resume or stop handling submission requests
        from given VF and clear provisioning ("pause|resume|stop|clear").
        For debug purposes only. The command is always written to gt0.
        """
        self.__write_iov(vf_num, 0, 'control', val)

    def __helper_get_debugfs_available(self, gt_num: int, attr: str) -> typing.Tuple[int, int]:
        """Read the ``<attr>_available`` PF attribute of a GT.

        Reads /sys/kernel/debug/dri/[card_index]/gt<gt_num>/pf/<attr>_available
        and picks the values of the 'total' and 'avail' lines.

        Args:
            gt_num: GT instance number.
            attr: IOV parameter name.

        Returns:
            (total, available) size for ``attr``; a value whose line is
            missing is reported as 0.
        """
        out = self.__read_debugfs(f'gt{gt_num}/pf/{attr}_available')
        total = available = 0

        for line in out.splitlines():
            # partition() tolerates blank lines and values that contain ':'.
            param, separator, value = line.partition(':')
            if not separator:
                continue
            value = value.lstrip().split('\t')[0]

            if param == 'total':
                total = int(value)
            elif param == 'avail':
                available = int(value)

        return (total, available)

    # Resources total availability
    # Debugfs location: [SRIOV debugfs base path]/gtM/pf/
    def get_ggtt_available(self, gt_num: int) -> typing.Tuple[int, int]:
        """Get total and available GGTT size."""
        return self.__helper_get_debugfs_available(gt_num, 'ggtt')
diff --git a/vmtb/bench/exceptions.py b/vmtb/bench/exceptions.py
new file mode 100644
index 000000000..95ca2aa9b
--- /dev/null
+++ b/vmtb/bench/exceptions.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
class BenchError(Exception):
    """Base class of all errors defined in this module."""


# Host errors:
class HostError(BenchError):
    """Host-side error."""


# Guest errors:
class GuestError(BenchError):
    """Guest-side error."""


class GuestAgentError(GuestError):
    """Guest agent error."""


class AlarmTimeoutError(GuestError):
    """Alarm timeout error (a GuestError subclass)."""


# Generic errors:
class GemWsimError(BenchError):
    """gem_wsim error."""


class VgpuProfileError(BenchError):
    """vGPU profile error."""


class NotAvailableError(BenchError):
    """Requested attribute or feature is not available."""


class VmtbConfigError(BenchError):
    """VMTB configuration error."""
diff --git a/vmtb/bench/executors/__init__.py b/vmtb/bench/executors/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/executors/executor_interface.py b/vmtb/bench/executors/executor_interface.py
new file mode 100644
index 000000000..e1598fd29
--- /dev/null
+++ b/vmtb/bench/executors/executor_interface.py
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import abc
+import signal
+
+from bench.machines.machine_interface import ProcessResult
+
+
class ExecutorInterface(metaclass=abc.ABCMeta):
    """Abstract contract for an executor controlling a started process."""

    @abc.abstractmethod
    def status(self) -> ProcessResult:
        """Return the process result/state."""
        raise NotImplementedError

    @abc.abstractmethod
    def wait(self) -> ProcessResult:
        """Wait for the process and return its result."""
        raise NotImplementedError

    @abc.abstractmethod
    def sendsig(self, sig: signal.Signals) -> None:
        """Send signal ``sig`` to the process."""
        raise NotImplementedError
diff --git a/vmtb/bench/executors/gem_wsim.py b/vmtb/bench/executors/gem_wsim.py
new file mode 100644
index 000000000..46fa2291c
--- /dev/null
+++ b/vmtb/bench/executors/gem_wsim.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+import re
+import typing
+
+from bench import exceptions
+from bench.executors.shell import ShellExecutor
+from bench.machines.machine_interface import DEFAULT_TIMEOUT, MachineInterface
+
+logger = logging.getLogger('GemWsim')
+
+
class GemWsimResult(typing.NamedTuple):
    """Figures parsed from gem_wsim output, e.g. '19.449s elapsed (102.836 workloads/s)'."""

    # Elapsed time reported by gem_wsim, in seconds
    elapsed_sec: float
    # Throughput reported by gem_wsim, in workloads per second
    workloads_per_sec: float
+
# Basic workloads
ONE_CYCLE_DURATION_MS = 10
# Two steps, each taking half of the cycle duration (value expressed as ms * 1000 / 2).
_HALF_CYCLE = int(ONE_CYCLE_DURATION_MS * 1000 / 2)
PREEMPT_10MS_WORKLOAD = ','.join((f'1.DEFAULT.{_HALF_CYCLE}.0.0',
                                  f'2.DEFAULT.{_HALF_CYCLE}.-1.1'))
# Same workload with the 'X.1.0,X.2.0' steps prepended.
NON_PREEMPT_10MS_WORKLOAD = 'X.1.0,X.2.0,' + PREEMPT_10MS_WORKLOAD
+
class GemWsim(ShellExecutor):
    """ShellExecutor that starts the gem_wsim benchmark on a machine."""

    # Matches e.g.: 19.449s elapsed (102.836 workloads/s)
    _RESULT_RE = re.compile(
        r'(?P<elapsed>\d+(\.\d*)?|\.\d+)s elapsed \((?P<wps>\d+(\.\d*)?|\.\d+) workloads/s\)',
        re.MULTILINE)

    def __init__(self, machine: MachineInterface, num_clients: int = 1, num_repeats: int = 1,
                 workload: str = PREEMPT_10MS_WORKLOAD, timeout: int = DEFAULT_TIMEOUT) -> None:
        """Start gem_wsim on ``machine`` with the given workload, clients and repeats."""
        command = ('/usr/local/libexec/igt-gpu-tools/benchmarks/gem_wsim'
                   f' -w {workload} -c {num_clients} -r {num_repeats}')
        super().__init__(machine, command, timeout)
        self.machine_id = str(machine)

    def __str__(self) -> str:
        return f'gem_wsim({self.machine_id}:{self.pid})'

    def is_running(self) -> bool:
        """Return True while the process status does not report 'exited'."""
        return not self.status().exited

    def wait_results(self) -> GemWsimResult:
        """Wait for gem_wsim and return its parsed elapsed time and throughput.

        Raises:
            exceptions.GemWsimError: If the exit code is non-zero or the
                expected summary line is not found in stdout.
        """
        outcome = self.wait()
        if outcome.exit_code == 0:
            logger.info('%s: %s', self, outcome.stdout)
            found = self._RESULT_RE.search(outcome.stdout)
            if found:
                return GemWsimResult(float(found.group('elapsed')), float(found.group('wps')))

        raise exceptions.GemWsimError(f'{self}: exit_code: {outcome.exit_code}'
                                      f' stdout: {outcome.stdout} stderr: {outcome.stderr}')
+
+
def gem_wsim_parallel_exec_and_check(vms: typing.List[MachineInterface], workload: str, iterations: int,
                                     expected: typing.Optional[GemWsimResult] = None) -> GemWsimResult:
    """Run gem_wsim on all given machines in parallel and check the results agree.

    Args:
        vms: Machines to run the workload on (at least one).
        workload: gem_wsim workload description passed to each instance.
        iterations: Number of workload repeats per instance.
        expected: Optional reference; when given, the first result must
            exceed 90% of both its elapsed time and its workloads/s.

    Returns:
        The first machine's result; all others are asserted to be within a
        10% tolerance of it.

    Raises:
        ValueError: If ``vms`` is empty.
        AssertionError: If an instance fails to start or a check fails.
    """
    if not vms:
        raise ValueError('gem_wsim_parallel_exec_and_check requires at least one machine')

    # launch on each VM in parallel
    wsim_procs = [GemWsim(vm, 1, iterations, workload) for vm in vms]
    for i, wsim in enumerate(wsim_procs):
        assert wsim.is_running(), f'GemWsim failed to start on VM{i}'

    results = [wsim.wait_results() for wsim in wsim_procs]
    reference = results[0]

    if expected is not None:
        assert reference.elapsed_sec > expected.elapsed_sec * 0.9, \
            f'Elapsed time {reference.elapsed_sec}s below 90% of expected {expected.elapsed_sec}s'
        assert reference.workloads_per_sec > expected.workloads_per_sec * 0.9, \
            f'{reference.workloads_per_sec} workloads/s below 90% of expected {expected.workloads_per_sec}'

    for i, result in enumerate(results[1:], start=1):
        # Bounds are scaled instead of dividing by the reference, so a zero
        # reference fails the assertion rather than raising ZeroDivisionError.
        # check wps ratio ~1.0 with 10% tolerance
        assert 0.9 * reference.workloads_per_sec < result.workloads_per_sec < 1.1 * reference.workloads_per_sec, \
            f'VM{i}: {result.workloads_per_sec} workloads/s vs VM0: {reference.workloads_per_sec}'
        # check elapsed ratio ~1.0 with 10% tolerance
        assert 0.9 * reference.elapsed_sec < result.elapsed_sec < 1.1 * reference.elapsed_sec, \
            f'VM{i}: {result.elapsed_sec}s elapsed vs VM0: {reference.elapsed_sec}s'

    # return first result, all other are asserted to be ~same
    return reference
diff --git a/vmtb/bench/executors/igt.py b/vmtb/bench/executors/igt.py
new file mode 100644
index 000000000..4296464c2
--- /dev/null
+++ b/vmtb/bench/executors/igt.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import enum
+import json
+import logging
+import posixpath
+import signal
+import typing
+
+from bench.executors.executor_interface import ExecutorInterface
+from bench.executors.shell import ShellExecutor
+from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
+ MachineInterface, ProcessResult)
+
+logger = logging.getLogger('IgtExecutor')
+
+
+class IgtType(enum.Enum):
+ EXEC_BASIC = 1
+ EXEC_STORE = 2
+ SPIN_BATCH = 3
+
+
+# Mappings of driver specific (i915/xe) IGT instances:
+# {IGT type: (i915 IGT name, xe IGT name)}
+igt_tests: typing.Dict[IgtType, typing.Tuple[str, str]] = {
+ IgtType.EXEC_BASIC: ('igt at gem_exec_basic@basic', 'igt at xe_exec_basic@once-basic'),
+ IgtType.EXEC_STORE: ('igt at gem_exec_store@dword', 'igt at xe_exec_store@basic-store'),
+ IgtType.SPIN_BATCH: ('igt at gem_spin_batch@legacy', 'igt at xe_spin_batch@spin-basic')
+ }
+
+
class IgtExecutor(ExecutorInterface):
    """Start one IGT test on a machine via igt_runner and evaluate its results."""

    def __init__(self, target: MachineInterface,
                 test: typing.Union[str, IgtType],
                 timeout: int = DEFAULT_TIMEOUT) -> None:
        """Write the test list on ``target`` and start igt_runner.

        Args:
            target: Machine to run the test on.
            test: Explicit IGT test name, or an IgtType resolved to the
                variant matching the target's DRM driver.
            timeout: Timeout passed to ``execute_wait`` by ``wait()``.
        """
        self.igt_config = target.get_igt_config()

        # TODO ld_library_path not used now, need a way to pass this to guest
        #      (LD_LIBRARY_PATH=<igt_config.lib_dir>)
        testlist = '/tmp/igt_executor.testlist'
        runner = posixpath.join(self.igt_config.tool_dir, 'igt_runner')
        command = ' '.join((f'{runner} {self.igt_config.options}',
                            f'--test-list {testlist} {self.igt_config.test_dir} {self.igt_config.result_dir}'))

        self.results: typing.Dict[str, typing.Any] = {}
        self.target: MachineInterface = target
        if isinstance(test, str):
            self.igt: str = test
        else:
            self.igt = self.select_igt_variant(target.get_drm_driver_name(), test)
        self.target.write_file_content(testlist, self.igt)
        self.timeout: int = timeout

        logger.info("[%s] Execute IGT test: %s", target, self.igt)
        self.pid: int = self.target.execute(command)

    # Executor interface implementation
    def status(self) -> ProcessResult:
        return self.target.execute_status(self.pid)

    def wait(self) -> ProcessResult:
        return self.target.execute_wait(self.pid, self.timeout)

    def sendsig(self, sig: signal.Signals) -> None:
        self.target.execute_signal(self.pid, sig)

    def terminate(self) -> None:
        """Send SIGTERM to the runner process."""
        self.sendsig(signal.SIGTERM)

    def kill(self) -> None:
        """Send SIGKILL to the runner process."""
        self.sendsig(signal.SIGKILL)

    # IGT specific methods
    def get_results_log(self) -> typing.Dict:
        """Return the parsed results.json from the result directory (cached once non-empty)."""
        if not self.results:
            results_path = posixpath.join(self.igt_config.result_dir, 'results.json')
            self.results = json.loads(self.target.read_file_content(results_path))
        return self.results

    def did_pass(self) -> bool:
        """Return True if results['totals']['root'] is present and reports no failures.

        Every counter other than 'pass', 'warn' and 'dmesg-warn' counts as a
        failure.
        """
        results = self.get_results_log()
        aggregate = (results.get('totals') or {}).get('root')
        if not aggregate:
            return False

        passing_keys = ('pass', 'warn', 'dmesg-warn')
        fail_case = sum(count for key, count in aggregate.items() if key not in passing_keys)

        logger.debug('Full IGT test results:\n%s', json.dumps(results, indent=4))

        if fail_case > 0:
            logger.error('Test failed!')
            return False

        return True

    def select_igt_variant(self, driver: str, igt_type: IgtType) -> str:
        """Select IGT variant dedicated for a given drm driver: xe or i915."""
        i915_name, xe_name = igt_tests[igt_type]
        return xe_name if driver == 'xe' else i915_name
+
+
def igt_list_subtests(target: MachineInterface, test_name: str) -> typing.List[str]:
    """Return the subtests reported by ``<test_dir>/<test_name> --list-subtests``.

    Args:
        target: Machine on which the IGT test binary is executed.
        test_name: Name of the test binary inside the configured test_dir.

    Returns:
        One entry per non-blank output line, or an empty list when the
        command exits with a non-zero code.
    """
    # Join as a path so test_dir works with or without a trailing slash.
    test_binary = posixpath.join(target.get_igt_config().test_dir, test_name)
    proc_result = ShellExecutor(target, f'{test_binary} --list-subtests').wait()
    if proc_result.exit_code != 0:
        return []
    # splitlines() plus the filter drops the empty entry left by the trailing newline.
    return [line.strip() for line in proc_result.stdout.splitlines() if line.strip()]
diff --git a/vmtb/bench/executors/shell.py b/vmtb/bench/executors/shell.py
new file mode 100644
index 000000000..c05a82a86
--- /dev/null
+++ b/vmtb/bench/executors/shell.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import signal
+
+from bench.executors.executor_interface import ExecutorInterface
+from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
+ MachineInterface, ProcessResult)
+
+
class ShellExecutor(ExecutorInterface):
    """Start a shell command on a machine and control the resulting process."""

    def __init__(self, target: MachineInterface, command: str, timeout: int = DEFAULT_TIMEOUT) -> None:
        """Start ``command`` on ``target`` immediately.

        Args:
            target: Machine on which the command is executed.
            command: Command line passed to ``target.execute``.
            timeout: Timeout passed to ``execute_wait`` by ``wait()``.
        """
        self.target = target
        self.timeout = timeout
        # Kept so log messages can name the command instead of an object address.
        self.command = command
        self.pid = self.target.execute(command)

    def __str__(self) -> str:
        return f'{self.command} (pid {self.pid})'

    def status(self) -> ProcessResult:
        return self.target.execute_status(self.pid)

    def wait(self) -> ProcessResult:
        return self.target.execute_wait(self.pid, self.timeout)

    def sendsig(self, sig: signal.Signals) -> None:
        self.target.execute_signal(self.pid, sig)

    def terminate(self) -> None:
        """Send SIGTERM to the process."""
        self.sendsig(signal.SIGTERM)

    def kill(self) -> None:
        """Send SIGKILL to the process."""
        self.sendsig(signal.SIGKILL)
diff --git a/vmtb/bench/helpers/__init__.py b/vmtb/bench/helpers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/helpers/helpers.py b/vmtb/bench/helpers/helpers.py
new file mode 100644
index 000000000..8c81fd486
--- /dev/null
+++ b/vmtb/bench/helpers/helpers.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+
+from bench.executors.igt import IgtExecutor
+from bench.executors.shell import ShellExecutor
+from bench.machines.machine_interface import MachineInterface
+
+logger = logging.getLogger('Helpers')
+
+
def driver_check(machine: MachineInterface, card: int = 0) -> bool:
    """Return True if the machine's DRM driver module exposes its PCI driver directory.

    Checks for /sys/module/<driver>/drivers/pci:<driver>/ on ``machine``.
    ``card`` is only used in the error message.
    """
    drm_driver = machine.get_drm_driver_name()
    if not machine.dir_exists(f'/sys/module/{drm_driver}/drivers/pci:{drm_driver}/'):
        # Pass the driver name as a lazy %-argument rather than pre-formatting
        # it into the logging format string.
        logger.error('%s module not loaded on card %s', drm_driver, card)
        return False

    return True
+
+
def igt_check(igt_test: IgtExecutor) -> bool:
    """Wait for an IGT test and return True if it exited with 0 and passed."""
    outcome = igt_test.wait()
    if outcome.exit_code != 0 or not igt_test.did_pass():
        logger.error('IGT failed with %s', outcome)
        return False
    return True
+
+
def igt_run_check(machine: MachineInterface, test: str) -> bool:
    """Start an IGT test on ``machine``, wait for it and return whether it passed."""
    return igt_check(IgtExecutor(machine, test))
+
+
def cmd_check(cmd: ShellExecutor) -> bool:
    """Wait for a shell command and return True if it exited with code 0."""
    outcome = cmd.wait()
    if outcome.exit_code != 0:
        logger.error('%s failed with %s', cmd, outcome)
        return False
    return True
+
+
def cmd_run_check(machine: MachineInterface, cmd: str) -> bool:
    """Start a shell command on ``machine``, wait for it and return whether it succeeded."""
    return cmd_check(ShellExecutor(machine, cmd))
+
+
def modprobe_driver(machine: MachineInterface, parameters: str = '', options: str = '') -> ShellExecutor:
    """Start 'modprobe <driver> <options> <parameters>' on ``machine``.

    The result is not checked; the started ShellExecutor is returned so the
    caller can wait for it.
    """
    module = machine.get_drm_driver_name()
    return ShellExecutor(machine, f'modprobe {module} {options} {parameters}')
+
+
def modprobe_driver_check(machine: MachineInterface, cmd: ShellExecutor) -> bool:
    """Wait for a started modprobe command and verify the driver is loaded.

    Returns the result of ``driver_check`` when the command succeeded,
    otherwise logs an error and returns False.
    """
    if not cmd_check(cmd):
        logger.error('Modprobe failed')
        return False

    return driver_check(machine)
+
+
def modprobe_driver_run_check(machine: MachineInterface, parameters: str = '', options: str = '') -> bool:
    """Load (modprobe) a driver and check a result (waits until operation ends).

    Args:
        machine: Machine on which the driver is loaded.
        parameters: Text appended after the options on the modprobe command line.
        options: Text placed after the driver name on the modprobe command line.

    Returns:
        True if modprobe succeeded and the driver check passed.
    """
    modprobe_cmd = modprobe_driver(machine, parameters, options)
    # modprobe_driver_check() already runs driver_check() and logs the
    # failure, so its verdict is returned directly instead of repeating both.
    return modprobe_driver_check(machine, modprobe_cmd)
diff --git a/vmtb/bench/helpers/log.py b/vmtb/bench/helpers/log.py
new file mode 100644
index 000000000..665bb6cf9
--- /dev/null
+++ b/vmtb/bench/helpers/log.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import errno
+import fcntl
+import functools
+import logging
+import os
+import typing
+from pathlib import Path
+
+from bench import exceptions
+
+logger = logging.getLogger('Host-kmsg')
+
+HOST_DMESG_FILE = Path("/tmp/vm-test-bench-host_dmesg.log.tmp")
+
+
class LogDecorators():
    """Read and parse kernel log buffer.
    https://www.kernel.org/doc/Documentation/ABI/testing/dev-kmsg
    """
    @staticmethod
    def read_messages(fd: int) -> typing.List[str]:
        """Drain all currently available data from a non-blocking descriptor.

        Args:
            fd: Descriptor opened on /dev/kmsg and set to O_NONBLOCK.

        Returns:
            The decoded chunks in read order; undecodable bytes are replaced.

        Raises:
            OSError: For read errors other than EAGAIN and EPIPE.
        """
        buf_size = 4096
        kmsgs = []
        while True:
            try:
                kmsg = os.read(fd, buf_size)
            except OSError as exc:
                if exc.errno == errno.EAGAIN:
                    # Nothing more to read right now.
                    break
                if exc.errno == errno.EPIPE:
                    # Keep reading after EPIPE (see the dev-kmsg ABI document).
                    continue
                raise

            if not kmsg:
                # End of file: stop instead of spinning on empty reads.
                break
            kmsgs.append(kmsg.decode(errors='replace'))
        return kmsgs

    @staticmethod
    def parse_messages(kmsgs: typing.List[str]) -> None:
        """Log kernel messages and fail on critical ones.

        Each message is expected as '<prio_fac>,<seq>,<time>,...;<text>'.

        Raises:
            exceptions.HostError: On the first message with syslog priority
                KERN_CRIT or more severe (level <= 2).
        """
        for msg in kmsgs:
            header, separator, human = msg.partition(';')
            # Get priority/facility field (seq, time, other unused for now)
            prio_fac = header.split(',', 1)[0]
            if not separator or not prio_fac.isdigit():
                # Not a well-formed record: keep the text in the debug log only.
                logger.debug("%s", msg.strip())
                continue

            level = int(prio_fac) & 0x7  # Syslog priority

            if level <= 2:  # KERN_CRIT/ALERT/EMERG
                logger.error("[Error: %s]: %s", level, human.strip())
                raise exceptions.HostError(f'Error in dmesg: {human.strip()}')

            logger.debug("%s", human.strip())

    @classmethod
    def parse_kmsg(cls, func: typing.Callable) -> typing.Callable:
        """Decorator: check kernel messages emitted while ``func`` runs.

        The wrapper seeks /dev/kmsg to its end, calls ``func``, then reads the
        messages that are available, appends them to HOST_DMESG_FILE and
        parses them (which may raise exceptions.HostError).
        """
        @functools.wraps(func)
        def parse_wrapper(*args: typing.Any, **kwargs: typing.Optional[typing.Any]) -> typing.Any:
            with open('/dev/kmsg', 'r', encoding='utf-8') as f, \
                 open(HOST_DMESG_FILE, 'a', encoding='utf-8') as dmesg_file:

                fd = f.fileno()
                # Offset 0 relative to the end: skip everything already logged.
                os.lseek(fd, 0, os.SEEK_END)
                flags = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)

                # Execute actual function
                result = func(*args, **kwargs)

                kmsgs = cls.read_messages(fd)
                dmesg_file.writelines(kmsgs)
                cls.parse_messages(kmsgs)

                return result
        return parse_wrapper
diff --git a/vmtb/bench/machines/__init__.py b/vmtb/bench/machines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/machines/device_interface.py b/vmtb/bench/machines/device_interface.py
new file mode 100644
index 000000000..e8d4068e8
--- /dev/null
+++ b/vmtb/bench/machines/device_interface.py
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import abc
+
+
class DeviceInterface(abc.ABC):
    """Abstract interface of a device supporting VFs creation and driver (un)binding."""

    @abc.abstractmethod
    def create_vf(self, num: int) -> int:
        """Enable the requested number (num) of VFs and return an int status/count.

        NOTE(review): presumably the number of VFs actually enabled - confirm with implementations.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def remove_vfs(self) -> int:
        """Disable existing VFs and return an int status/count.

        NOTE(review): presumably the number of VFs left enabled (0 on success) - confirm with implementations.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def bind_driver(self) -> None:
        """Bind a driver to the device."""
        raise NotImplementedError

    @abc.abstractmethod
    def unbind_driver(self) -> None:
        """Unbind a driver from the device."""
        raise NotImplementedError
diff --git a/vmtb/bench/machines/host.py b/vmtb/bench/machines/host.py
new file mode 100644
index 000000000..3c25530d4
--- /dev/null
+++ b/vmtb/bench/machines/host.py
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+import re
+import shlex
+import signal
+import subprocess
+import typing
+from pathlib import Path
+
+from bench import exceptions
+from bench.configurators.vmtb_config import VmtbIgtConfig
+from bench.helpers.log import LogDecorators
+from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
+ MachineInterface, ProcessResult,
+ SuspendMode)
+from bench.machines.physical.device import Device
+
+logger = logging.getLogger('Host')
+
+
class Host(MachineInterface):
    """Physical host machine.

    Executes shell commands locally (spawned processes are tracked by PID),
    accesses host files and manages host drivers and GPU devices.
    """
    def __init__(self) -> None:
        # Processes started by execute(), indexed by PID
        self.running_procs: typing.Dict[int, subprocess.Popen] = {}
        # Devices detected by discover_devices()
        self.gpu_devices: typing.List[Device] = []
        # Index (in gpu_devices) of the device used in __str__
        self.dut_index: int = 0
        # Initialize in conftest/VmmTestingSetup:
        self.drm_driver_name: str
        self.igt_config: VmtbIgtConfig

    def __str__(self) -> str:
        return f'Host-{self.gpu_devices[self.dut_index].pci_info.bdf}'

    def _get_process(self, pid: int) -> subprocess.Popen:
        """Return a process started by execute() or raise HostError for an unknown PID."""
        proc = self.running_procs.get(pid, None)
        if not proc:
            logger.error("No process with PID: %s", pid)
            raise exceptions.HostError(f'No process with PID: {pid}')
        return proc

    @LogDecorators.parse_kmsg
    def execute(self, command: str) -> int:
        """Start a command (split with shlex, no shell involved) and return its PID without waiting."""
        cmd_arr = shlex.split(command)
        # We don't want to kill the process created here (like 'with' would do) so disable the following linter issue:
        # R1732: consider-using-with (Consider using 'with' for resource-allocating operations)
        # pylint: disable=R1732
        # TODO: but maybe 'subprocess.run' function would fit instead of Popen constructor?
        process = subprocess.Popen(cmd_arr,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)

        self.running_procs[process.pid] = process
        logger.debug("Run command: %s (PID: %s)", command, process.pid)
        return process.pid

    @LogDecorators.parse_kmsg
    def execute_status(self, pid: int) -> ProcessResult:
        """Poll a process started by execute() without blocking.

        Returns ProcessResult(False, None, '', '') while the process is still running,
        otherwise the exit code with captured stdout/stderr.
        Raises exceptions.HostError for an unknown PID.
        """
        proc = self._get_process(pid)

        exit_code: typing.Optional[int] = proc.poll()
        logger.debug("PID %s -> exit code %s", pid, exit_code)
        if exit_code is None:
            return ProcessResult(False, exit_code, '', '')

        out, err = proc.communicate()
        return ProcessResult(True, exit_code, out, err)

    @LogDecorators.parse_kmsg
    def execute_wait(self, pid: int, timeout: int = DEFAULT_TIMEOUT) -> ProcessResult:
        """Wait (up to timeout seconds) for a process started by execute() to finish.

        Raises exceptions.HostError for an unknown PID and
        re-raises subprocess.TimeoutExpired when the timeout expires.
        """
        proc = self._get_process(pid)

        try:
            out, err = proc.communicate(timeout)
        except subprocess.TimeoutExpired as exc:
            logger.warning("Timeout (%ss) expired for PID: %s", exc.timeout, pid)
            raise

        return ProcessResult(True, proc.poll(), out, err)

    @LogDecorators.parse_kmsg
    def execute_signal(self, pid: int, sig: signal.Signals) -> None:
        """Send a signal to a process started by execute(); raises HostError for an unknown PID."""
        self._get_process(pid).send_signal(sig)

    def read_file_content(self, path: str) -> str:
        """Return the whole content of a (UTF-8) text file."""
        with open(path, encoding='utf-8') as f:
            return f.read()

    def write_file_content(self, path: str, content: str) -> int:
        """Overwrite a (UTF-8) text file with content and return the number of characters written."""
        with open(path, 'w', encoding='utf-8') as f:
            return f.write(content)

    def dir_exists(self, path: str) -> bool:
        """Check if path is an existing directory."""
        return Path(path).is_dir()

    def get_drm_driver_name(self) -> str:
        # Used as a part of MachineInterface for helpers
        return self.drm_driver_name

    def get_igt_config(self) -> VmtbIgtConfig:
        # Used as a part of MachineInterface to initialize IgtExecutor
        return self.igt_config

    def is_driver_loaded(self, driver_name: str) -> bool:
        """Check if the driver is registered on the PCI bus (has a /sys/bus/pci/drivers entry)."""
        driver_path = Path('/sys/bus/pci/drivers/') / driver_name
        return driver_path.exists()

    def is_driver_available(self, driver_name: str) -> bool:
        """Check if the driver module is known to modinfo."""
        modinfo_pid = self.execute(f'modinfo -F filename {driver_name}')
        modinfo_result: ProcessResult = self.execute_wait(modinfo_pid)
        return modinfo_result.exit_code == 0

    def load_drivers(self) -> None:
        """Load (modprobe) required host drivers (DRM and VFIO)."""
        drivers_to_probe = [self.drm_driver_name, f'{self.drm_driver_name}-vfio-pci']
        # If vendor specific VFIO (ex. xe-vfio-pci) is not present, probe a regular vfio-pci
        if not self.is_driver_available(drivers_to_probe[1]):
            logger.warning("VFIO driver: '%s' is not available - use 'vfio-pci'", drivers_to_probe[1])
            drivers_to_probe[1] = 'vfio-pci'

        for driver in drivers_to_probe:
            if self.is_driver_loaded(driver):
                continue

            logger.info("%s driver is not loaded - probe module", driver)
            drv_probe_pid = self.execute(f'modprobe {driver}')
            if self.execute_wait(drv_probe_pid).exit_code != 0:
                logger.error("%s driver probe failed!", driver)
                raise exceptions.HostError(f'{driver} driver probe failed!')

    def unload_drivers(self) -> None:
        """Unload (remove) host drivers (DRM and VFIO)."""
        logger.debug("Cleanup - unload drivers\n")
        vfio_driver = f'{self.drm_driver_name}-vfio-pci'
        if not self.is_driver_loaded(vfio_driver):
            vfio_driver = 'vfio-pci'

        rmmod_pid = self.execute(f'modprobe -rf {vfio_driver}')
        if self.execute_wait(rmmod_pid).exit_code != 0:
            logger.error("VFIO driver remove failed!")
            raise exceptions.HostError('VFIO driver remove failed!')

        # Unbind the DRM driver from all devices before the module is removed
        for device in self.gpu_devices:
            logger.debug("Unbind %s from device %s", self.drm_driver_name, device.pci_info.bdf)
            device.unbind_driver()

        rmmod_pid = self.execute(f'modprobe -rf {self.drm_driver_name}')
        if self.execute_wait(rmmod_pid).exit_code != 0:
            logger.error("DRM driver remove failed!")
            raise exceptions.HostError('DRM driver remove failed!')

        logger.debug("%s/%s successfully removed", self.drm_driver_name, vfio_driver)

    def discover_devices(self, vendor_id: str = '8086') -> None:
        """Detect all PCI GPU devices on the host (with given Vendor ID) and initialize Device list.

        Raises exceptions.HostError if the DRM driver is not loaded or when the device ID
        reported by lspci differs from the one read by the Device object.
        """
        logger.debug("Discover GPU PCI devices")
        if not self.is_driver_loaded(self.drm_driver_name):
            logger.error("Unable to discover devices - %s driver is not loaded!", self.drm_driver_name)
            raise exceptions.HostError(f'Unable to discover devices - {self.drm_driver_name} driver is not loaded!')

        detected_devices: typing.List[Device] = []
        out = subprocess.check_output(['lspci', '-nm'], universal_newlines=True)
        # Match function 0 of class 0300/0380 devices; vendor ID is escaped to be matched literally
        pattern = r'(?P<bdf>.*\.0) .*03[08]0.*' + re.escape(vendor_id) + r'.*' \
                  + r'"(?P<devid>[0-9a-fA-F]{4})"( -r.*)?( "[0-9a-fA-F]{0,4}"){2}.*'

        for match in re.finditer(pattern, out, re.MULTILINE):
            bdf, devid = f"0000:{match.group('bdf')}", match.group('devid')

            device: Device = Device(bdf, self.drm_driver_name)
            # Explicit check instead of 'assert', which is stripped when running with -O
            if devid != device.pci_info.devid:
                logger.error("Device ID mismatch for %s - lspci: %s, sysfs: %s",
                             bdf, devid, device.pci_info.devid)
                raise exceptions.HostError(
                    f'Device ID mismatch for {bdf} - lspci: {devid}, sysfs: {device.pci_info.devid}')
            detected_devices.append(device)

            logger.debug("PCI BDF: %s / DevID: %s (%s)",
                         device.pci_info.bdf, device.pci_info.devid, device.gpu_model)

        logger.debug("Detected %s GPU device(s)", len(detected_devices))

        self.gpu_devices = detected_devices

    def suspend(self, mode: SuspendMode = SuspendMode.ACPI_S3) -> None:
        """Perform host suspend cycle (ACPI S3 by default) via rtcwake tool.

        Raises exceptions.HostError when rtcwake exits with a non-zero code.
        """
        wakeup_delay = 10  # wakeup timer in seconds
        logger.debug("Suspend-resume via rtcwake (mode: %s, wakeup delay: %ss)", mode, wakeup_delay)

        suspend_pid = self.execute(f'rtcwake -s {wakeup_delay} -m {mode}')
        suspend_result: ProcessResult = self.execute_wait(suspend_pid)
        if suspend_result.exit_code != 0:
            logger.error("Suspend failed - error: %s", suspend_result.stderr)
            raise exceptions.HostError(f'Suspend failed - error: {suspend_result.stderr}')
diff --git a/vmtb/bench/machines/machine_interface.py b/vmtb/bench/machines/machine_interface.py
new file mode 100644
index 000000000..8daa2cda3
--- /dev/null
+++ b/vmtb/bench/machines/machine_interface.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import abc
+import enum
+import signal
+import typing
+
+from bench.configurators.vmtb_config import VmtbIgtConfig
+
+DEFAULT_TIMEOUT: int = 1200 # Default machine execution wait timeout in seconds
+
+
class ProcessResult(typing.NamedTuple):
    """Result of a command executed on a machine."""
    # True when the process has finished
    exited: bool = False
    # Process exit code, None when not available (ex. process still running)
    exit_code: typing.Optional[int] = None
    # Captured standard output
    stdout: str = ''
    # Captured standard error
    stderr: str = ''
+
+
class SuspendMode(str, enum.Enum):
    """Machine suspend modes; each member's value is the mode name string ('mem'/'disk')."""
    ACPI_S3 = 'mem'    # Suspend to RAM aka sleep
    ACPI_S4 = 'disk'   # Suspend to disk aka hibernation

    def __str__(self) -> str:
        # Render a member as its plain string value (ex. 'mem') instead of 'SuspendMode.ACPI_S3'
        return self.value
+
+
class MachineInterface(metaclass=abc.ABCMeta):
    """Abstract interface of a machine able to execute commands and access files."""

    @abc.abstractmethod
    def execute(self, command: str) -> int:
        """Start a command on the machine and return an int identifying the started process (PID)."""
        raise NotImplementedError

    @abc.abstractmethod
    def execute_status(self, pid: int) -> ProcessResult:
        """Return the current status of a process identified by pid."""
        raise NotImplementedError

    @abc.abstractmethod
    def execute_wait(self, pid: int, timeout: int) -> ProcessResult:
        """Wait for a process identified by pid and return its result.

        NOTE(review): timeout is presumably expressed in seconds (see DEFAULT_TIMEOUT) - confirm.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def execute_signal(self, pid: int, sig: signal.Signals) -> None:
        """Send a signal to a process identified by pid."""
        raise NotImplementedError

    @abc.abstractmethod
    def read_file_content(self, path: str) -> str:
        """Return the content of a file located on the machine."""
        raise NotImplementedError

    @abc.abstractmethod
    def write_file_content(self, path: str, content: str) -> int:
        """Write content to a file located on the machine and return an int result.

        NOTE(review): presumably the amount of data written - confirm with implementations.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def dir_exists(self, path: str) -> bool:
        """Check if a directory exists on the machine."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_drm_driver_name(self) -> str:
        """Return the name of the DRM driver used on the machine."""
        raise NotImplementedError

    @abc.abstractmethod
    def get_igt_config(self) -> VmtbIgtConfig:
        """Return the IGT configuration of the machine."""
        raise NotImplementedError
diff --git a/vmtb/bench/machines/physical/__init__.py b/vmtb/bench/machines/physical/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/machines/physical/device.py b/vmtb/bench/machines/physical/device.py
new file mode 100644
index 000000000..8a0368ae0
--- /dev/null
+++ b/vmtb/bench/machines/physical/device.py
@@ -0,0 +1,240 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import importlib
+import logging
+import re
+from pathlib import Path
+from typing import Any, List
+
+from bench import exceptions
+from bench.configurators import pci
+from bench.configurators.vgpu_profile import (VgpuProfile, VgpuResourcesConfig,
+ VgpuSchedulerConfig)
+from bench.drivers.driver_interface import DriverInterface, SchedulingPriority
+from bench.helpers.log import LogDecorators
+from bench.machines.device_interface import DeviceInterface
+
+logger = logging.getLogger('Device')
+
+
class Device(DeviceInterface):
    """Physical GPU device identified by PCI BDF and controlled via a DRM driver object."""

    class PciInfo:
        """PCI data of the device read from sysfs: BDF, device ID and DRM card index."""

        def __init__(self, bdf: str) -> None:
            self.bdf: str = bdf
            self.devid: str = self.get_device_id(self.bdf)
            self.minor_number: int = self.get_device_minor_number(self.bdf)

        def get_device_minor_number(self, bdf: str) -> int:
            """Return N of the 'cardN' entry found in the device's sysfs 'drm' directory.

            Raises exceptions.HostError when the directory cannot be read or has no such entry.
            """
            drm_dir = Path('/sys/bus/pci/devices/') / bdf / 'drm'

            try:
                drm_entries = list(drm_dir.iterdir())
            except OSError as exc:
                # Ex. missing 'drm' directory - report as HostError like other discovery failures
                logger.error("Could not read %s: %s", drm_dir, exc)
                raise exceptions.HostError(f'Could not determine card index for device {bdf}') from exc

            for file_path in drm_entries:
                if file_path.match('card*'):
                    index_match = re.search(r'card(?P<card_index>\d+)', file_path.name)
                    if index_match:
                        return int(index_match.group('card_index'))

            logger.error("Could not determine card index for device %s", bdf)
            raise exceptions.HostError(f'Could not determine card index for device {bdf}')

        def get_device_id(self, bdf: str) -> str:
            """Return the PCI device ID read from sysfs, without the '0x' prefix."""
            device_file = Path('/sys/bus/pci/devices/') / bdf / 'device'
            devid = device_file.read_text(encoding='utf-8')

            return devid.strip()[2:]  # Strip whitespaces and 0x

    def __init__(self, bdf: str, driver: str) -> None:
        self.pci_info = self.PciInfo(bdf)
        self.gpu_model: str = pci.get_gpu_model(self.pci_info.devid)
        self.driver: DriverInterface = self.instantiate_driver(driver, self.pci_info.minor_number)
        # Number of VFs requested by the last create_vf() call
        self.numvf: int = 0

    def instantiate_driver(self, driver_name: str, card_index: int) -> Any:
        """Create a driver object of class '<Driver_name>Driver' from module bench.drivers.<driver_name>.

        Raises exceptions.VmtbConfigError when the module or the class is not available.
        """
        module_name = f'bench.drivers.{driver_name}'
        class_name = f'{driver_name.capitalize()}Driver'

        try:
            driver_module = importlib.import_module(module_name)
            driver_class = getattr(driver_module, class_name)
        except (ImportError, AttributeError) as exc:
            logger.error("Driver module/class is not available: %s", exc)
            raise exceptions.VmtbConfigError(f'Requested driver module {driver_name} is not available!') from exc

        return driver_class(card_index)

    def set_drivers_autoprobe(self, val: bool) -> None:
        """Set drivers autoprobe and verify the value by reading it back.

        Raises exceptions.HostError on mismatch.
        """
        self.driver.set_drivers_autoprobe(int(val))
        ret = self.driver.get_drivers_autoprobe()
        if ret != int(val):
            logger.error("Autoprobe value mismatch - requested: %s, got: %s", val, ret)
            raise exceptions.HostError(f'Autoprobe value mismatch - requested: {val}, got: {ret}')

    def get_total_vfs(self) -> int:
        return self.driver.get_totalvfs()

    def get_current_vfs(self) -> int:
        return self.driver.get_numvfs()

    def get_num_gts(self) -> int:
        return self.driver.get_num_gts()

    def has_lmem(self) -> bool:
        return self.driver.has_lmem()

    def create_vf(self, num: int) -> int:
        """Enable a requested number of VFs.
        Disable SRIOV drivers autoprobe to allow VFIO driver override for VFs.

        Already enabled VFs are removed first.
        Returns the number of VFs read back from the driver,
        raises exceptions.HostError if it differs from the requested one.
        """
        logger.info("[%s] Enable %s VFs", self.pci_info.bdf, num)
        if self.get_current_vfs() != 0:
            self.remove_vfs()

        self.numvf = num

        # Disable driver autoprobe to avoid driver load on VF (override to vfio is required)
        logger.debug("[%s] Disable drivers autoprobe", self.pci_info.bdf)
        self.set_drivers_autoprobe(False)

        self.driver.set_numvfs(num)
        ret = self.driver.get_numvfs()
        # Explicit check instead of 'assert', which is stripped when running with -O
        if ret != num:
            logger.error("[%s] VFs number mismatch - requested: %s, got: %s", self.pci_info.bdf, num, ret)
            raise exceptions.HostError(f'VFs number mismatch - requested: {num}, got: {ret}')

        return ret

    def remove_vfs(self) -> int:
        """Disable all existing VFs.
        Re-enable SRIOV drivers autoprobe.

        Returns the number of VFs read back from the driver (0),
        raises exceptions.HostError if VFs are still enabled.
        """
        logger.info("[%s] Disable VFs", self.pci_info.bdf)
        self.driver.set_numvfs(0)
        ret = self.driver.get_numvfs()
        if ret != 0:
            raise exceptions.HostError('VFs not disabled after 0 write')

        logger.debug("[%s] Enable drivers autoprobe", self.pci_info.bdf)
        self.set_drivers_autoprobe(True)

        return ret

    def bind_driver(self) -> None:
        self.driver.bind(self.pci_info.bdf)

    def unbind_driver(self) -> None:
        self.driver.unbind(self.pci_info.bdf)

    def override_vf_driver(self, vf_num: int) -> str:
        """Set VFIO as VF driver.

        Writes the VFIO driver name ('<driver>-vfio-pci' if registered, else 'vfio-pci')
        to the VF's 'driver_override' sysfs file and returns the VF BDF.
        """
        pci_devices_path = Path('/sys/bus/pci/devices/')
        vfio_driver = f'{self.driver.get_name()}-vfio-pci'
        if not Path(f'/sys/bus/pci/drivers/{vfio_driver}').exists():
            vfio_driver = 'vfio-pci'

        # virtfnN is a symlink - get the last part of the absolute path, ie. VF BDF like 00:12:00.1
        # TODO: replace by Path.readlink() when Python 3.9 supported
        pass_vf_bdf = (pci_devices_path / self.pci_info.bdf / f'virtfn{vf_num - 1}').resolve().name
        override_path = pci_devices_path / pass_vf_bdf / 'driver_override'
        override_path.write_text(vfio_driver, encoding='utf-8')
        logger.debug("VF%s VFIO driver: %s", vf_num, override_path.read_text(encoding='utf-8'))

        return pass_vf_bdf

    @LogDecorators.parse_kmsg
    def get_vf_bdf(self, vf_num: int) -> str:
        """Provide BDF of VF prepared for pass to VM - with VFIO driver override and probe."""
        pass_vf_bdf = self.override_vf_driver(vf_num)

        drivers_probe = Path('/sys/bus/pci/drivers_probe')
        drivers_probe.write_text(pass_vf_bdf, encoding='utf-8')

        logger.info("[%s] VF%s ready for pass to VM", pass_vf_bdf, vf_num)
        return pass_vf_bdf

    def get_vfs_bdf(self, *args: int) -> List[str]:
        """Provide BDFs of the requested VFs (duplicates dropped), ordered by ascending VF number."""
        # sorted() makes the result order deterministic (set iteration order is not guaranteed)
        return [self.get_vf_bdf(vf) for vf in sorted(set(args))]

    def provision(self, profile: VgpuProfile) -> None:
        """Apply scheduling, security and resources settings of a vGPU profile to PF and VFs."""
        logger.info("[%s] Provision VFs - set vGPU profile for %s VFs", self.pci_info.bdf, profile.num_vfs)

        num_vfs = profile.num_vfs
        num_gts = self.get_num_gts()  # Number of tiles (GTs)
        all_gt_nums = [0] if num_gts == 1 else [0, 1]  # Tile (GT) numbers/indexes

        for gt_num in all_gt_nums:
            self.driver.set_pf_policy_sched_if_idle(gt_num, int(profile.scheduler.scheduleIfIdle))
            self.driver.set_pf_policy_reset_engine(gt_num, int(profile.security.reset_after_vf_switch))
            self.driver.set_exec_quantum_ms(0, gt_num, profile.scheduler.pfExecutionQuanta)
            self.driver.set_preempt_timeout_us(0, gt_num, profile.scheduler.pfPreemptionTimeout)
            self.driver.set_doorbells_quota(0, gt_num, profile.resources.pfDoorbells)
            # PF contexts are currently assigned by the driver and cannot be reprovisioned from sysfs

        for vf_num in range(1, num_vfs + 1):
            if num_gts > 1 and num_vfs > 1:
                # Multi-tile device Mode 2|3 - odd VFs on GT0, even on GT1
                vf_gt_nums = [0] if vf_num % 2 else [1]
            else:
                vf_gt_nums = all_gt_nums

            for gt_num in vf_gt_nums:
                self.driver.set_lmem_quota(vf_num, gt_num, profile.resources.vfLmem)
                self.driver.set_ggtt_quota(vf_num, gt_num, profile.resources.vfGgtt)
                self.driver.set_contexts_quota(vf_num, gt_num, profile.resources.vfContexts)
                self.driver.set_doorbells_quota(vf_num, gt_num, profile.resources.vfDoorbells)
                self.driver.set_exec_quantum_ms(vf_num, gt_num, profile.scheduler.vfExecutionQuanta)
                self.driver.set_preempt_timeout_us(vf_num, gt_num, profile.scheduler.vfPreemptionTimeout)

    # fn_num = 0 for PF, 1..n for VF
    def set_scheduling(self, fn_num: int, gt_num: int, scheduling_config: VgpuSchedulerConfig) -> None:
        """Apply scheduling settings (PF or VF variant, depending on fn_num) on a given GT."""
        logger.info("[%s] Provision scheduling config for PCI Function %s", self.pci_info.bdf, fn_num)
        if fn_num == 0:
            self.driver.set_pf_policy_sched_if_idle(gt_num, int(scheduling_config.scheduleIfIdle))
            self.driver.set_exec_quantum_ms(0, gt_num, scheduling_config.pfExecutionQuanta)
            self.driver.set_preempt_timeout_us(0, gt_num, scheduling_config.pfPreemptionTimeout)
        else:
            self.driver.set_exec_quantum_ms(fn_num, gt_num, scheduling_config.vfExecutionQuanta)
            self.driver.set_preempt_timeout_us(fn_num, gt_num, scheduling_config.vfPreemptionTimeout)

    def set_resources(self, fn_num: int, gt_num: int, resources_config: VgpuResourcesConfig) -> None:
        """Apply resources settings on a given GT: spare values for PF (fn_num 0), quotas for VF."""
        logger.info("[%s] Provision resources config for PCI Function %s", self.pci_info.bdf, fn_num)
        if fn_num == 0:
            self.driver.set_pf_ggtt_spare(gt_num, resources_config.pfGgtt)
            self.driver.set_pf_lmem_spare(gt_num, resources_config.pfLmem)
            self.driver.set_pf_contexts_spare(gt_num, resources_config.pfContexts)
            self.driver.set_pf_doorbells_spare(gt_num, resources_config.pfDoorbells)
        else:
            self.driver.set_ggtt_quota(fn_num, gt_num, resources_config.vfGgtt)
            self.driver.set_lmem_quota(fn_num, gt_num, resources_config.vfLmem)
            self.driver.set_contexts_quota(fn_num, gt_num, resources_config.vfContexts)
            self.driver.set_doorbells_quota(fn_num, gt_num, resources_config.vfDoorbells)

    def reset_provisioning(self, num_vfs: int) -> None:
        """Clear provisioning config for a requested number of VFs.
        Function calls the sysfs control interface to clear VF provisioning settings
        and restores the auto provisioning mode.
        """
        logger.info("[%s] Reset %s VFs provisioning configuraton", self.pci_info.bdf, num_vfs)
        for gt_num in range(self.get_num_gts()):
            if self.get_scheduling_priority(gt_num) != SchedulingPriority.LOW:
                self.set_scheduling_priority(gt_num, SchedulingPriority.LOW)
            self.driver.set_pf_policy_sched_if_idle(gt_num, 0)
            self.driver.set_pf_policy_reset_engine(gt_num, 0)
            self.driver.set_exec_quantum_ms(0, gt_num, 0)
            self.driver.set_preempt_timeout_us(0, gt_num, 0)
            self.driver.set_doorbells_quota(0, gt_num, 0)
            # PF contexts cannot be set from sysfs

            for vf_num in range(1, num_vfs + 1):
                self.driver.set_contexts_quota(vf_num, gt_num, 0)
                self.driver.set_doorbells_quota(vf_num, gt_num, 0)
                self.driver.set_ggtt_quota(vf_num, gt_num, 0)
                self.driver.set_lmem_quota(vf_num, gt_num, 0)

    def cancel_work(self) -> None:
        """Drop and reset remaining GPU execution at exit."""
        self.driver.cancel_work()

    def get_scheduling_priority(self, gt_num: int) -> SchedulingPriority:
        return self.driver.get_pf_sched_priority(gt_num)

    def set_scheduling_priority(self, gt_num: int, val: SchedulingPriority) -> None:
        # In order to set scheduling priority, strict scheduling policy needs to be default
        # self.drm_driver.set_pf_policy_sched_if_idle(gt_num, 0)
        self.driver.set_pf_sched_priority(gt_num, val)
diff --git a/vmtb/bench/machines/virtual/__init__.py b/vmtb/bench/machines/virtual/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/machines/virtual/backends/__init__.py b/vmtb/bench/machines/virtual/backends/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/bench/machines/virtual/backends/backend_interface.py b/vmtb/bench/machines/virtual/backends/backend_interface.py
new file mode 100644
index 000000000..dfa29cc01
--- /dev/null
+++ b/vmtb/bench/machines/virtual/backends/backend_interface.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import abc
+import typing
+
+
class BackendInterface(metaclass=abc.ABCMeta):
    """Abstract interface of a backend used to communicate with and control a virtual machine."""

    @abc.abstractmethod
    def sync(self, idnum: int) -> typing.Optional[typing.Dict]:
        """Synchronize with the backend using a given identifier; returns the backend response, if any."""
        raise NotImplementedError

    @abc.abstractmethod
    def ping(self) -> typing.Optional[typing.Dict]:
        """Check that the backend responds; returns the backend response, if any."""
        raise NotImplementedError

    @abc.abstractmethod
    def execute(self, command: str, args: typing.List[str]) -> typing.Optional[typing.Dict]:
        """Request execution of a command with arguments; returns the backend response, if any."""
        raise NotImplementedError

    @abc.abstractmethod
    def execute_status(self, pid: int) -> typing.Optional[typing.Dict]:
        """Query status of a process identified by pid; returns the backend response, if any."""
        raise NotImplementedError

    @abc.abstractmethod
    def suspend_disk(self) -> None:
        """Request suspend to disk."""
        raise NotImplementedError

    @abc.abstractmethod
    def suspend_ram(self) -> None:
        """Request suspend to RAM."""
        raise NotImplementedError

    @abc.abstractmethod
    def reboot(self) -> None:
        """Request reboot."""
        raise NotImplementedError

    @abc.abstractmethod
    def poweroff(self) -> None:
        """Request power off."""
        raise NotImplementedError
diff --git a/vmtb/bench/machines/virtual/backends/guestagent.py b/vmtb/bench/machines/virtual/backends/guestagent.py
new file mode 100644
index 000000000..6ac366b99
--- /dev/null
+++ b/vmtb/bench/machines/virtual/backends/guestagent.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import json
+import logging
+import socket
+import typing
+
+from bench import exceptions
+from bench.machines.virtual.backends.backend_interface import BackendInterface
+
+logger = logging.getLogger('GuestAgent')
+
+
class GuestAgentBackend(BackendInterface):
    """Guest agent client exchanging JSON commands over a UNIX stream socket."""

    def __init__(self, socket_path: str, socket_timeout: int) -> None:
        self.sockpath = socket_path
        # NOTE(review): the timeout is only stored here, it is not applied to the socket
        # (no settimeout() call in this class) - confirm whether it is set elsewhere.
        self.timeout = socket_timeout
        self.sock: socket.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self.sock.connect(self.sockpath)
        self.sockf: typing.TextIO = self.sock.makefile(mode='rw', errors='strict')

    def __send_no_reply(self, data: typing.Dict) -> None:
        """Write a JSON request to the socket without reading any response."""
        json.dump(data, self.sockf)
        self.sockf.flush()

    def __send(self, command: str, arguments: typing.Optional[typing.Dict] = None) -> typing.Dict:
        """Send a command with arguments and return the decoded one-line JSON response.

        A response containing the 'error' key is only logged - it is still returned to the caller.
        Raises exceptions.GuestAgentError on socket timeout (the socket gets closed)
        or when no response line is received.
        """
        if arguments is None:
            arguments = {}

        self.__send_no_reply({'execute': command, 'arguments': arguments})
        try:
            out: str = self.sockf.readline()
        except socket.timeout as soc_to_exc:
            logger.error('Socket readline timeout on command %s', command)
            self.sock.close()
            self.sockf.close()
            raise exceptions.GuestAgentError(f'Socket timed out on {command}') from soc_to_exc

        # readline() returns an empty string (never None) when the stream is closed
        if not out:
            logger.error('Command %s, args %s returned with no output', command, arguments)
            raise exceptions.GuestAgentError(f'Command {command} did not retunrned output')

        # Only logging errors for now
        ret: typing.Dict = json.loads(out)
        if 'error' in ret:
            logger.error('Command: %s got error %s', command, ret)

        return ret

    def sync(self, idnum: int) -> typing.Dict:
        """Send 'guest-sync' with a given id."""
        return self.__send('guest-sync', {'id': idnum})

    def ping(self) -> typing.Optional[typing.Dict]:
        """Send 'guest-ping'."""
        return self.__send('guest-ping')

    def execute(self, command: str, args: typing.Optional[typing.List[str]] = None) -> typing.Dict:
        """Send 'guest-exec' for a given path and arguments, with output capture enabled."""
        if args is None:
            args = []
        arguments = {'path': command, 'arg': args, 'capture-output': True}
        return self.__send('guest-exec', arguments)

    def execute_status(self, pid: int) -> typing.Dict:
        """Send 'guest-exec-status' for a given PID."""
        return self.__send('guest-exec-status', {'pid': pid})

    # TODO add qmp-query mechanism for all powerstate changes
    def suspend_disk(self) -> None:
        # self.__send('guest-suspend-disk')
        raise NotImplementedError

    def suspend_ram(self) -> None:
        """Ping the agent, then request 'guest-suspend-ram'."""
        self.ping()
        # guest-suspend-ram does not return anything, that's why no __send
        self.__send_no_reply({'execute': 'guest-suspend-ram'})

    def reboot(self) -> None:
        """Ping the agent, then request 'guest-shutdown' in 'reboot' mode."""
        self.ping()
        # guest-shutdown does not return anything, that's why no __send
        self.__send_no_reply({'execute': 'guest-shutdown', 'arguments': {'mode': 'reboot'}})

    def poweroff(self) -> None:
        """Ping the agent, then request 'guest-shutdown' in 'powerdown' mode."""
        self.ping()
        # guest-shutdown does not return anything, that's why no __send
        self.__send_no_reply({'execute': 'guest-shutdown', 'arguments': {'mode': 'powerdown'}})

    def guest_file_open(self, path: str, mode: str) -> typing.Dict:
        """Send 'guest-file-open' for a given path and mode."""
        return self.__send('guest-file-open', {'path': path, 'mode': mode})

    def guest_file_close(self, handle: int) -> typing.Dict:
        """Send 'guest-file-close' for a given handle."""
        return self.__send('guest-file-close', {'handle': handle})

    def guest_file_write(self, handle: int, content: str) -> typing.Dict:
        """Send 'guest-file-write'; content is passed as the 'buf-b64' argument."""
        return self.__send('guest-file-write', {'handle': handle, 'buf-b64': content})

    def guest_file_read(self, handle: int) -> typing.Dict:
        """Send 'guest-file-read' for a given handle."""
        return self.__send('guest-file-read', {'handle': handle})
diff --git a/vmtb/bench/machines/virtual/backends/qmp_monitor.py b/vmtb/bench/machines/virtual/backends/qmp_monitor.py
new file mode 100644
index 000000000..7d2645abe
--- /dev/null
+++ b/vmtb/bench/machines/virtual/backends/qmp_monitor.py
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import json
+import logging
+import queue
+import socket
+import threading
+import time
+import typing
+
+logger = logging.getLogger('QmpMonitor')
+
+
class QmpMonitor():
    """QMP client: sends commands over a UNIX socket and queues all received messages.

    A daemon thread reads the socket line by line, decodes each line as JSON
    and puts the result into a queue consumed by the query/event methods.
    """
    def __init__(self, socket_path: str, socket_timeout: int) -> None:
        self.sockpath = socket_path
        # NOTE(review): the timeout is only stored here, it is not applied to the socket - confirm intent
        self.timeout = socket_timeout
        self.sock: socket.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self.sock.connect(self.sockpath)
        self.sockf: typing.TextIO = self.sock.makefile(mode='rw', errors='strict')
        self.qmp_queue: queue.Queue = queue.Queue()
        self.monitor_thread: threading.Thread = threading.Thread(target=self.__queue_qmp_output,
                                                                 args=(self.sockf, self.qmp_queue),
                                                                 daemon=True)
        self.monitor_thread.start()
        # It is required to enable capabilities before using QMP
        self.__enable_qmp_capabilities()

    def __send_command(self, command: str, arguments: typing.Optional[typing.Dict] = None) -> None:
        """Write a QMP command (with optional arguments) to the socket; the response is not read here."""
        request: typing.Dict = {'execute': command}
        if arguments is not None:
            request['arguments'] = arguments
        json.dump(request, self.sockf)
        self.sockf.flush()

    def __enable_qmp_capabilities(self) -> None:
        self.__send_command('qmp_capabilities')

    def __queue_qmp_output(self, out: typing.TextIO, q: queue.Queue) -> None:
        """Reader thread body: queue every received JSON line until the stream ends."""
        for line in iter(out.readline, ''):
            logger.debug('[QMP RSP] <- %s', line)
            qmp_msg = json.loads(line)
            q.put(qmp_msg)

    @property
    def monitor_queue(self) -> queue.Queue:
        return self.qmp_queue

    def query_status(self) -> str:
        """Send 'query-status' and wait for a 'return' message containing the 'status' key.

        Other queued messages taken in the meantime are dropped.
        """
        self.__send_command('query-status')

        ret: typing.Dict = {}
        while 'status' not in ret:
            qmp_msg = self.qmp_queue.get()
            if 'return' in qmp_msg:
                ret = qmp_msg.get('return')

        status: str = ret['status']
        logger.debug('Machine status: %s', status)
        return status

    def query_jobs(self, requested_type: str) -> typing.Tuple[str, str]:
        """Send 'query-jobs' and return (status, error) of the first job of the requested type.

        Only the next queued message is examined. ('', '') is returned when it carries
        no job of the requested type; the second element is None when the matching job
        reports no 'error'.
        """
        self.__send_command('query-jobs')

        qmp_msg = self.qmp_queue.get()
        # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
        jobs = qmp_msg.get('return', [])
        if not isinstance(jobs, list):
            # 'return' of another command (ex. dict) - no jobs to examine
            jobs = []

        for job in jobs:
            # Report only a job of the requested type, never a status of an unrelated job
            if job.get('type') == requested_type:
                return (job.get('status'), job.get('error'))

        return ('', '')

    def get_qmp_event(self) -> str:
        """Take the next queued message and return its 'event' name ('' if it is not an event)."""
        qmp_msg = self.qmp_queue.get()
        # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
        event: str = qmp_msg.get('event', '')
        return event

    def get_qmp_event_job(self) -> str:
        """Take the next queued message and return the job status of a JOB_STATUS_CHANGE event ('' otherwise)."""
        qmp_msg = self.qmp_queue.get()
        # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)

        status: str = ''
        if qmp_msg.get('event') == 'JOB_STATUS_CHANGE':
            status = qmp_msg.get('data', {}).get('status', '')

        return status

    def system_reset(self) -> None:
        self.__send_command('system_reset')

    def system_wakeup(self) -> None:
        self.__send_command('system_wakeup')

    def stop(self) -> None:
        self.__send_command('stop')

    def cont(self) -> None:
        self.__send_command('cont')

    def quit(self) -> None:
        self.__send_command('quit')

    def __query_snapshot(self) -> typing.Tuple[str, str]:
        """Send 'query-named-block-nodes' and return (node name, latest snapshot tag) of the first qcow2 node.

        Only the next queued message is examined; missing values are returned as ''.
        """
        self.__send_command('query-named-block-nodes')

        node_name: str = ''
        snapshot_tag: str = ''

        qmp_msg = self.qmp_queue.get()
        # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
        blocks = qmp_msg.get('return', [])
        if not isinstance(blocks, list):
            blocks = []

        for block in blocks:
            if block.get('drv') == 'qcow2':
                node_name = block.get('node-name')
                # Get the most recent state snapshot from the snapshots list:
                snapshots = (block.get('image') or {}).get('snapshots')
                if snapshots:
                    snapshot_tag = snapshots[-1].get('name')
                break

        return (node_name, snapshot_tag)

    def save_snapshot(self) -> None:
        """Request 'snapshot-save' of the VM state to the qcow2 block node, with a time-based tag."""
        job_id: str = f'savevm_{time.time()}'
        snapshot_tag = f'vm_state_{time.time()}'
        node_name, _ = self.__query_snapshot()
        logger.debug('[QMP snapshot-save] snapshot_tag: %s, block device node: %s', snapshot_tag, node_name)

        # Note: command 'snapshot-save' is supported since QEMU 6.0
        self.__send_command('snapshot-save',
                            {'job-id': job_id, 'tag': snapshot_tag, 'vmstate': node_name, 'devices': [node_name]})

    def load_snapshot(self) -> None:
        """Request 'snapshot-load' of the most recent snapshot found on the qcow2 block node."""
        job_id: str = f'loadvm_{time.time()}'
        node_name, snapshot_tag = self.__query_snapshot()
        logger.debug('[QMP snapshot-load] snapshot_tag: %s, block device node: %s', snapshot_tag, node_name)

        # Note: command 'snapshot-load' is supported since QEMU 6.0
        self.__send_command('snapshot-load',
                            {'job-id': job_id, 'tag': snapshot_tag, 'vmstate': node_name, 'devices': [node_name]})
diff --git a/vmtb/bench/machines/virtual/vm.py b/vmtb/bench/machines/virtual/vm.py
new file mode 100644
index 000000000..1439ec081
--- /dev/null
+++ b/vmtb/bench/machines/virtual/vm.py
@@ -0,0 +1,619 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import base64
+import functools
+import inspect
+import json
+import logging
+import os
+import posixpath
+import shlex
+import signal
+import subprocess
+import threading
+import time
+import typing
+from types import FrameType
+
+from bench import exceptions
+from bench.configurators.vmtb_config import VmtbIgtConfig
+from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
+ MachineInterface, ProcessResult,
+ SuspendMode)
+from bench.machines.virtual.backends.guestagent import GuestAgentBackend
+from bench.machines.virtual.backends.qmp_monitor import QmpMonitor
+
+logger = logging.getLogger('VirtualMachine')
+
+
+class VirtualMachine(MachineInterface):
+    class Decorators():
+        """Decorators used by VirtualMachine methods."""
+
+        @staticmethod
+        def alarm_handler(sig: signal.Signals, tb: FrameType) -> typing.Any:
+            """SIGALRM handler: abort the interrupted call with AlarmTimeoutError."""
+            raise exceptions.AlarmTimeoutError('Alarm timeout occured')
+
+        @classmethod
+        def timeout_signal(cls, func: typing.Callable) -> typing.Callable:
+            """Limit the run time of func with a SIGALRM-based timeout.
+
+            The timeout (seconds) is taken from the call's 'timeout' argument,
+            whether passed positionally or by keyword, when it is an int;
+            otherwise DEFAULT_TIMEOUT is used. On expiry AlarmTimeoutError is
+            logged and re-raised. The alarm is always cancelled and the
+            previously installed SIGALRM handler is put back afterwards.
+            """
+            signature = inspect.signature(func)
+
+            @functools.wraps(func)
+            def timeout_wrapper(*args: typing.Any, **kwargs: typing.Optional[typing.Any]) -> typing.Any:
+                timeout: int = DEFAULT_TIMEOUT
+                try:
+                    # Resolve 'timeout' by name so its position in the signature does not matter
+                    requested = signature.bind_partial(*args, **kwargs).arguments.get('timeout')
+                except TypeError:
+                    # Arguments do not fit the signature; the func() call below reports it
+                    requested = None
+                if isinstance(requested, int):
+                    timeout = requested
+
+                # mypy: silence the following problem in signal.signal() call:
+                # error: Argument 2 to "signal" has incompatible type "Callable[[Signals, FrameType], Any]";
+                # expected "Union[Callable[[int, Optional[FrameType]], Any], int, Handlers, None]"  [arg-type]
+                previous_handler = signal.signal(signal.SIGALRM, cls.alarm_handler)  # type: ignore[arg-type]
+                signal.alarm(timeout)
+                try:
+                    proc_ret = func(*args, **kwargs)
+                except exceptions.AlarmTimeoutError:
+                    logger.warning('Timeout (%ss) on %s', timeout, func.__name__)
+                    raise
+                finally:
+                    signal.alarm(0)  # Cancel alarm
+                    # signal.signal() returns None for handlers not installed from Python
+                    if previous_handler is not None:
+                        signal.signal(signal.SIGALRM, previous_handler)
+
+                return proc_ret
+
+            return timeout_wrapper
+
+    def __init__(self, vm_number: int, backing_image: str, driver: str, igt_config: VmtbIgtConfig) -> None:
+        """Prepare a VM description and create its qcow2 overlay image.
+
+        Raises:
+            exceptions.GuestError: backing_image does not exist, or the overlay
+                image could not be created.
+        """
+        self.vf_bdf: typing.Optional[str] = None
+        self.process: typing.Optional[subprocess.Popen] = None
+        self.vmnum: int = vm_number
+        self.card_num: int = 0
+        self.sysfs_prefix_path = posixpath.join('/sys/class/drm/', f'card{self.card_num}')
+        # Per-VM UNIX sockets for the guest agent and the QMP monitor
+        socket_dir = '/tmp'
+        self.questagent_sockpath = posixpath.join(socket_dir, f'qga{self.vmnum}.sock')
+        self.qmp_sockpath = posixpath.join(socket_dir, f'mon{self.vmnum}.sock')
+        self.drm_driver_name: str = driver
+        self.igt_config: VmtbIgtConfig = igt_config
+
+        image_found = posixpath.exists(backing_image)
+        if not image_found:
+            logger.error('No image for VM%s', self.vmnum)
+            raise exceptions.GuestError(f'No image for VM{self.vmnum}')
+        self.image: str = self.__create_qemu_image(backing_image)
+        self.migrate_source_image: typing.Optional[str] = None
+        self.migrate_destination_vm: bool = False
+
+        # Resources provisioned to the VF/VM:
+        self._lmem_size: typing.Optional[int] = None
+        self._ggtt_size: typing.Optional[int] = None
+        self._contexts: typing.Optional[int] = None
+        self._doorbells: typing.Optional[int] = None
+
+        # GT number and tile is relevant mainly for multi-tile devices
+        # List of all GTs used by a given VF:
+        # - for single-tile: only root [0]
+        # - for multi-tile Mode 2/3: either root [0] or remote [1]
+        # - for multi-tile Mode 1: spans on both tiles [0, 1]
+        self._gt_nums: typing.List[int] = []
+        self._tile_mask: typing.Optional[int] = None
+
+    def __str__(self) -> str:
+        """Return 'VM<number>_<VF BDF>'; the BDF part reads 'None' until a VF is assigned."""
+        number, bdf = self.vmnum, self.vf_bdf
+        return f'VM{number}_{bdf}'
+
+    def __del__(self) -> None:
+        """Terminate a QEMU process that is still running when the object is destroyed."""
+        if not self.is_running():
+            return
+
+        # printing and not logging because loggers have some issues
+        # in late deinitialization
+        print(f'VM{self.vmnum} was not powered off')
+        qemu = self.process
+        if not qemu:
+            return
+        qemu.terminate()
+        # self.__close_qemu_output()
+        # Give QEMU up to 30s to shut down before killing it
+        try:
+            qemu.communicate(timeout=30)
+        except subprocess.TimeoutExpired:
+            print('QEMU did not terminate, killing it')
+            qemu.kill()
+
+    def __get_backing_file_format(self, backing_file: str) -> typing.Any:
+        """Get the format of the backing image file using qemu-img info.
+
+        Returns:
+            Value of the 'format' key in the JSON output of 'qemu-img info'.
+
+        Raises:
+            exceptions.GuestError: qemu-img failed, printed invalid JSON, or
+                its output has no 'format' key.
+        """
+        command = ['qemu-img', 'info', '--output=json', backing_file]
+        try:
+            result = subprocess.run(command, capture_output=True, check=True)
+            return json.loads(result.stdout)['format']
+        except subprocess.CalledProcessError as exc:
+            logger.error("Error executing qemu-img info: %s", exc.stderr)
+            raise exceptions.GuestError('Error executing qemu-img info') from exc
+        except json.JSONDecodeError as exc:
+            logger.error("Invalid JSON output from qemu-img info: %s", exc)
+            raise exceptions.GuestError('Invalid JSON output from qemu-img info') from exc
+        except KeyError as exc:
+            # Report a missing key the same way as the other qemu-img failures
+            logger.error("Missing key %s in qemu-img info output", exc)
+            raise exceptions.GuestError('No image format in qemu-img info output') from exc
+
+    def __create_qemu_image(self, backing_file: str) -> str:
+        """Create a new qcow2 image with the specified backing file.
+
+        The image is written to the current directory under a name built from
+        the VM number and the current time.
+
+        Returns:
+            Path of the created image.
+
+        Raises:
+            exceptions.GuestError: 'qemu-img create' failed.
+        """
+        output_image = f'./vm{self.vmnum}_{time.time()}_image.qcow2'
+        backing_format = self.__get_backing_file_format(backing_file)
+
+        command = ['qemu-img', 'create', '-f', 'qcow2']
+        command += ['-b', f'{backing_file}']
+        command += ['-F', f'{backing_format}']
+        command.append(f'{output_image}')
+        try:
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as exc:
+            logger.error('[VM%s] Error creating qcow2 image: %s', self.vmnum, exc)
+            raise exceptions.GuestError('Error creating qcow2 image') from exc
+        else:
+            logger.debug("[VM%s] Created image %s (backing file: %s, format: %s)",
+                         self.vmnum, output_image, backing_file, backing_format)
+
+        return output_image
+
+ # def __open_qemu_output(self) -> None:
+ # self.qemu_stdout = open(f'./qemu_vm{self.vmnum}_stdout.log', 'w')
+ # self.qemu_stderr = open(f'./qemu_vm{self.vmnum}_stderr.log', 'w')
+
+    def __log_qemu_output(self, out: typing.TextIO) -> None:
+        """Forward each line read from out to the 'VM<number>-kmsg' logger at DEBUG level.
+
+        Returns once readline() yields an empty string.
+        """
+        kmsg_log = logging.getLogger(f'VM{self.vmnum}-kmsg')
+        while True:
+            text = out.readline()
+            if text == '':
+                break
+            kmsg_log.debug(text.strip())
+
+ # def __close_qemu_output(self) -> None:
+ # self.qemu_stderr.close()
+ # self.qemu_stdout.close()
+
+    def __sockets_exists(self) -> bool:
+        """Return True only when both the guest-agent and the QMP socket paths exist."""
+        socket_paths = (self.questagent_sockpath, self.qmp_sockpath)
+        return all(os.path.exists(path) for path in socket_paths)
+
+    def __get_popen_command(self) -> typing.List[str]:
+        """Build the qemu-system-x86_64 command line for this VM.
+
+        The guest-agent and QMP chardevs are bound to self.questagent_sockpath
+        and self.qmp_sockpath, the same paths the backends connect to later.
+        KVM and VFIO passthrough are added only when a VF is assigned. A
+        migration destination boots from the source image and starts stopped.
+        """
+        # self.__open_qemu_output()
+        drive_image = self.migrate_source_image if self.migrate_destination_vm else self.image
+        command = ['qemu-system-x86_64',
+                   '-vnc', f':{self.vmnum}',
+                   '-serial', 'stdio',
+                   '-m', '4096',
+                   '-drive', f'file={drive_image}',
+                   '-chardev', f'socket,path={self.questagent_sockpath},server=on,wait=off,id=qga{self.vmnum}',
+                   '-device', 'virtio-serial',
+                   '-device', f'virtserialport,chardev=qga{self.vmnum},name=org.qemu.guest_agent.0',
+                   '-chardev', f'socket,id=mon{self.vmnum},path={self.qmp_sockpath},server=on,wait=off',
+                   '-mon', f'chardev=mon{self.vmnum},mode=control']
+
+        if self.vf_bdf:
+            command.extend(['-enable-kvm', '-cpu', 'host'])
+            command.extend(['-device', f'vfio-pci,host={self.vf_bdf},enable-migration=on'])
+
+        if self.migrate_destination_vm:
+            # If VM is migration destination - run in stopped/prelaunch state (explicit resume required)
+            command.extend(['-S'])
+
+        logger.debug('QEMU command: %s', ' '.join(command))
+        return command
+
+    def __get_key(self, base: typing.Dict, path: typing.List[str]) -> typing.Any:
+        """Walk nested mappings along path and return the value found at its end.
+
+        Raises:
+            ValueError: a level is None or does not contain the next key.
+        """
+        node = base
+        for step in path:
+            if node is not None and step in node:
+                node = node[step]
+                continue
+            raise ValueError(f'The key {path} does not exist, aborting!')
+        return node
+
+    @property
+    def get_vm_num(self) -> int:
+        """Number of this VM (exposed as a property, so read it without calling)."""
+        return self.vmnum
+
+    def assign_vf(self, vf_bdf: str) -> None:
+        """Record the PCI BDF of the VF that the QEMU command line passes through."""
+        self.vf_bdf = vf_bdf
+
+    def set_migration_source(self, src_image: str) -> None:
+        """Mark this VM as a migration destination that boots from src_image."""
+        self.migrate_source_image = src_image
+        self.migrate_destination_vm = True
+
+    @property
+    def lmem_size(self) -> typing.Optional[int]:
+        """LMEM size in bytes; read from the guest debugfs self_config while still unset."""
+        if self._lmem_size is None:
+            self.helper_get_debugfs_selfconfig()
+
+        return self._lmem_size
+
+    @property
+    def ggtt_size(self) -> typing.Optional[int]:
+        """GGTT size in bytes; read from the guest debugfs self_config while still unset."""
+        if self._ggtt_size is None:
+            self.helper_get_debugfs_selfconfig()
+
+        return self._ggtt_size
+
+    @property
+    def contexts(self) -> typing.Optional[int]:
+        """Number of contexts; read from the guest debugfs self_config while still unset."""
+        if self._contexts is None:
+            self.helper_get_debugfs_selfconfig()
+
+        return self._contexts
+
+    @property
+    def doorbells(self) -> typing.Optional[int]:
+        """Number of doorbells; read from the guest debugfs self_config while still unset."""
+        if self._doorbells is None:
+            self.helper_get_debugfs_selfconfig()
+
+        return self._doorbells
+
+    @property
+    def tile_mask(self) -> typing.Optional[int]:
+        """Tile mask; read from the guest debugfs self_config while still unset."""
+        if self._tile_mask is None:
+            self.helper_get_debugfs_selfconfig()
+
+        return self._tile_mask
+
+    @property
+    def gt_nums(self) -> typing.List[int]:
+        """GT indexes found in the guest sysfs, queried on every access.
+
+        Falls back to [0] with a warning when no GT directory is found.
+        """
+        detected = self.get_gt_num_from_sysfs()
+        if not detected:
+            logger.warning("VM sysfs: missing GT index")
+            detected = [0]
+
+        self._gt_nums = detected
+        return self._gt_nums
+
+    def get_gt_num_from_sysfs(self) -> typing.List[int]:
+        """Return the GT indexes (0 and/or 1) whose 'gt/gt<N>' sysfs directory exists on the guest."""
+        # Get GT number of VF passed to a VM, based on an existing sysfs path
+        return [gt for gt in (0, 1)
+                if self.dir_exists(posixpath.join(self.sysfs_prefix_path, f'gt/gt{gt}'))]
+
+    def get_drm_driver_name(self) -> str:
+        """Return the DRM driver name passed to the constructor."""
+        return self.drm_driver_name
+
+    def get_igt_config(self) -> VmtbIgtConfig:
+        """Return the IGT configuration passed to the constructor."""
+        return self.igt_config
+
+    @Decorators.timeout_signal
+    def poweron(self) -> None:
+        """Start QEMU for this VM and connect the guest-agent and QMP backends.
+
+        Does nothing (with a warning) when the VM is already running. QEMU
+        stdout and stderr are forwarded to the log by daemon threads. The call
+        waits until both sockets exist, then checks the VM status, which must
+        be 'running' unless the VM is a migration destination.
+
+        Raises:
+            exceptions.GuestError: QEMU did not start, exited while its sockets
+                were awaited, reported an unexpected status, or any other error
+                (including the decorator's timeout) occurred during boot.
+        """
+        logger.debug('Powering on VM%s', self.vmnum)
+        if self.is_running():
+            logger.warning('VM%s already running', self.vmnum)
+            return
+
+        command = self.__get_popen_command()
+        # We don't want to kill the process created here (like 'with' would do) so disable the following linter issue:
+        # R1732: consider-using-with (Consider using 'with' for resource-allocating operations)
+        # pylint: disable=R1732
+        # TODO: but maybe 'subprocess.run' function would fit instead of Popen constructor?
+        self.process = subprocess.Popen(
+            args=command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            # 'stdout': self.qemu_stdout,
+            # 'stderr': self.qemu_stderr,
+            universal_newlines=True)
+
+        # One daemon logger thread per QEMU output stream (stdout first, then stderr)
+        for stream in (self.process.stdout, self.process.stderr):
+            threading.Thread(target=self.__log_qemu_output, args=(stream,), daemon=True).start()
+
+        if not self.is_running():
+            logger.error('VM%s did not boot', self.vmnum)
+            raise exceptions.GuestError(f'VM{self.vmnum} did not start')
+
+        try:
+            while not self.__sockets_exists():
+                # Fail right away if QEMU died instead of polling until the timeout
+                if not self.is_running():
+                    raise exceptions.GuestError(f'VM{self.vmnum} exited before creating its sockets')
+                logger.info('waiting for socket')
+                time.sleep(1)
+            # Passing five minutes timeout for every command
+            self.ga = GuestAgentBackend(self.questagent_sockpath, 300)
+            self.qm = QmpMonitor(self.qmp_sockpath, 300)
+            vm_status = self.qm.query_status()
+
+            if not self.migrate_destination_vm and vm_status != 'running':
+                # QEMU is terminated by the handler below
+                logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
+                raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+        except Exception as exc:
+            logger.error('Error while booting VM%s: %s', self.vmnum, exc)
+            self.process.terminate()
+            raise exceptions.GuestError(f'VM{self.vmnum} crashed with {exc}') from exc
+
+    def is_running(self) -> bool:
+        """Return True when a QEMU process was started and has not exited yet."""
+        qemu = self.process
+        # poll() returns None while the process is alive
+        # self.__close_qemu_output()
+        return qemu is not None and qemu.poll() is None
+
+    @Decorators.timeout_signal
+    def poweroff(self) -> None:
+        """Shut the guest down through the guest agent and wait for QEMU to exit.
+
+        Does nothing (with a warning) when the VM is not running. If the
+        decorator's timeout fires while waiting for the SHUTDOWN event, QEMU is
+        terminated, and killed if it still has not exited 30s later.
+
+        Raises:
+            exceptions.GuestError: both VM sockets still exist after QEMU
+                exited; the leftovers are removed first.
+        """
+        logger.debug('Powering off VM%s', self.vmnum)
+        # Explicit check instead of an assert, which is stripped under 'python -O'
+        if self.process is None or not self.is_running():
+            logger.warning('VM%s not running', self.vmnum)
+            return
+
+        forced = False
+        try:
+            self.ga.poweroff()
+            # Wait for shutdown event
+            event: str = self.qm.get_qmp_event()
+            while event != 'SHUTDOWN':
+                event = self.qm.get_qmp_event()
+        except exceptions.AlarmTimeoutError:
+            logger.warning('VM%s hanged on poweroff. Initiating forced termination', self.vmnum)
+            self.process.terminate()
+            forced = True
+        finally:
+            # Wait and make sure that qemu shutdown.
+            # After a forced termination the alarm is already spent, so bound the wait here.
+            try:
+                self.process.communicate(timeout=30 if forced else None)
+            except subprocess.TimeoutExpired:
+                logger.warning('QEMU did not terminate, killing it')
+                self.process.kill()
+                self.process.communicate()
+            # self.__close_qemu_output()
+
+            if self.__sockets_exists():
+                # Remove leftovers and notify about unclear qemu shutdown
+                os.remove(self.questagent_sockpath)
+                os.remove(self.qmp_sockpath)
+                raise exceptions.GuestError(f'VM{self.vmnum} was not gracefully powered off - sockets exist')
+
+    def reboot(self) -> None:
+        """Reboot VM via the Guest-Agent guest-shutdown(reboot) command.
+
+        Returns after two RESET events have been received over QMP.
+        """
+        logger.debug('Rebooting VM%s', self.vmnum)
+        self.ga.reboot()
+
+        # Wait for 2x RESET event (guest-reset)
+        for _ in range(2):
+            while self.qm.get_qmp_event() != 'RESET':
+                pass
+
+    def reset(self) -> None:
+        """Reset VM via the QMP system_reset command.
+
+        Returns after two RESET events have been received over QMP.
+        """
+        logger.debug('Resetting VM%s', self.vmnum)
+        self.qm.system_reset()
+
+        # Wait for 2x RESET event (host-qmp-system-reset, guest-reset)
+        for _ in range(2):
+            while self.qm.get_qmp_event() != 'RESET':
+                pass
+
+    def pause(self) -> None:
+        """Stop the VM over QMP and verify that it reports the 'paused' status.
+
+        Raises:
+            exceptions.GuestError: the reported status differs; QEMU is
+                terminated first.
+        """
+        logger.debug('Pausing VM%s', self.vmnum)
+        self.qm.stop()
+        vm_status = self.qm.query_status()
+        if vm_status == 'paused':
+            return
+
+        if self.process:
+            self.process.terminate()
+        logger.error('VM%s status not "paused", instead: %s', self.vmnum, vm_status)
+        raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+    def resume(self) -> None:
+        """Continue the VM over QMP and verify that it reports the 'running' status.
+
+        Raises:
+            exceptions.GuestError: the reported status differs; QEMU is
+                terminated first.
+        """
+        logger.debug('Resuming VM%s', self.vmnum)
+        self.qm.cont()
+        vm_status = self.qm.query_status()
+        if vm_status == 'running':
+            return
+
+        if self.process:
+            self.process.terminate()
+        logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
+        raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+    def quit(self) -> None:
+        """Send the QMP quit command and wait for the SHUTDOWN event."""
+        logger.debug('Quitting VM%s', self.vmnum)
+        self.qm.quit()
+        while self.qm.get_qmp_event() != 'SHUTDOWN':
+            pass
+
+    def _enable_suspend(self) -> None:
+        """Unmask the systemd suspend/sleep targets on the guest when suspend.target is a symlink.
+
+        Waits for the unmask command to finish, so that a suspend request
+        issued right afterwards cannot race with it.
+        """
+        if self.link_exists('/etc/systemd/system/suspend.target'):
+            logger.debug('Enable (unmask) systemd suspend/sleep')
+            pid = self.execute('systemctl unmask suspend.target sleep.target')
+            status = self.execute_wait(pid)
+            if status.exit_code:
+                logger.warning('Unmasking systemd suspend/sleep failed (exit code: %s)', status.exit_code)
+
+    def suspend(self, mode: SuspendMode = SuspendMode.ACPI_S3) -> None:
+        """Suspend the guest to RAM (ACPI S3) and verify the 'suspended' status.
+
+        Waits for the SUSPEND event over QMP before checking the status.
+
+        Raises:
+            exceptions.GuestError: mode is ACPI_S4 (not implemented) or
+                unknown, or the reported status is not 'suspended' (QEMU is
+                terminated first in that case).
+        """
+        logger.debug('Suspending VM%s (mode: %s)', self.vmnum, mode)
+        self._enable_suspend()
+        if mode == SuspendMode.ACPI_S3:
+            self.ga.suspend_ram()
+        else:
+            if mode == SuspendMode.ACPI_S4:
+                # self.ga.suspend_disk()
+                raise exceptions.GuestError('Guest S4 support not implemented')
+            raise exceptions.GuestError('Unknown suspend mode')
+
+        while self.qm.get_qmp_event() != 'SUSPEND':
+            pass
+
+        vm_status = self.qm.query_status()
+        if vm_status == 'suspended':
+            return
+
+        if self.process:
+            self.process.terminate()
+        logger.error('VM%s status not "suspended", instead: %s', self.vmnum, vm_status)
+        raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+    def wakeup(self) -> None:
+        """Wake the guest up over QMP and verify that it reports the 'running' status.
+
+        Waits for the WAKEUP event over QMP before checking the status.
+
+        Raises:
+            exceptions.GuestError: the reported status differs; QEMU is
+                terminated first.
+        """
+        logger.debug('Waking up VM%s', self.vmnum)
+        self.qm.system_wakeup()
+
+        while self.qm.get_qmp_event() != 'WAKEUP':
+            pass
+
+        vm_status = self.qm.query_status()
+        if vm_status == 'running':
+            return
+
+        if self.process:
+            self.process.terminate()
+        logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
+        raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
+
+ # {"execute": "guest-exec", "arguments":{"path": "/some/path", "arg": [], "capture-output": true}}
+ # {"error": {"class": "GenericError", "desc": "Guest... "}}
+    def execute(self, command: str) -> int:
+        """Start command on the guest through the guest agent, without waiting for it.
+
+        The command string is split shell-style; the first word is the program
+        and the remaining words are its arguments.
+
+        Returns:
+            PID of the started guest process.
+
+        Raises:
+            exceptions.GuestError: command is empty, or the guest agent reply
+                carries no 'return' data.
+        """
+        arr_cmd = shlex.split(command)
+        if not arr_cmd:
+            logger.error('Empty command passed to VM%s', self.vmnum)
+            raise exceptions.GuestError('Cannot execute an empty command')
+
+        execout: typing.Dict = self.ga.execute(arr_cmd[0], arr_cmd[1:])
+        ret = execout.get('return')
+        if ret:
+            pid: int = ret.get('pid')
+            logger.debug('Running %s on VM%s with pid %s', command, self.vmnum, pid)
+            return pid
+
+        logger.error('Command %s did not return pid', command)
+        raise exceptions.GuestError(f'No pid returned: {execout}')
+
+ # {'error': {'class': 'GenericError', 'desc': "Invalid parameter 'pid'"}}
+    def execute_status(self, pid: int) -> ProcessResult:
+        """Query the guest agent for the state and captured output of a guest process.
+
+        Output is base64-decoded and then decoded as UTF-8; bytes that are not
+        valid UTF-8 are replaced rather than raising, since a guest process may
+        print arbitrary data.
+
+        Returns:
+            ProcessResult(exited, exit code or None, stdout, stderr).
+
+        Raises:
+            exceptions.GuestError: the guest agent reply carries no 'return' data.
+        """
+        out = self.ga.execute_status(pid)
+        status = out.get('return')
+        if not status:
+            raise exceptions.GuestError(f'Not output from guest agent: {out}')
+
+        b64stdout = status.get('out-data', '')
+        stdout = base64.b64decode(b64stdout).decode('utf-8', errors='replace')
+
+        b64stderr = status.get('err-data', '')
+        stderr = base64.b64decode(b64stderr).decode('utf-8', errors='replace')
+
+        return ProcessResult(status.get('exited'), status.get('exitcode', None), stdout, stderr)
+
+    @Decorators.timeout_signal
+    def execute_wait(self, pid: int, timeout: int = DEFAULT_TIMEOUT) -> ProcessResult:
+        """Poll a guest process once per second until it has exited.
+
+        The timeout argument is not used in the body; it is consumed by the
+        timeout_signal decorator. Returns as soon as the process is reported
+        as exited, without a trailing sleep.
+        """
+        while True:
+            exec_status = self.execute_status(pid)
+            if exec_status.exited:
+                return exec_status
+            time.sleep(1)
+
+    def execute_signal(self, pid: int, sig: signal.Signals) -> None:
+        """Send signal sig to guest process pid by running 'kill' on the guest and waiting for it."""
+        kill_command = f'kill -{int(sig)} {pid}'
+        self.execute_wait(self.execute(kill_command))
+
+    def read_file_content(self, path: str) -> str:
+        """Read a whole guest file through the guest agent and return it as text.
+
+        Chunks are collected as raw bytes and decoded as UTF-8 once at the end,
+        so a multi-byte character split across two reads is decoded correctly.
+        The guest file handle is always closed.
+
+        Raises:
+            exceptions.GuestError: the file could not be opened on the guest.
+            ValueError: a read reply lacks the 'eof' or 'buf-b64' field.
+        """
+        out = self.ga.guest_file_open(path, 'r')
+        handle = out.get('return')
+        if not handle:
+            raise exceptions.GuestError('Could not open file on guest')
+
+        chunks: typing.List[bytes] = []
+        try:
+            eof: bool = False
+            while not eof:
+                ret = self.ga.guest_file_read(handle)
+                eof = self.__get_key(ret, ['return', 'eof'])
+                b64buf: str = self.__get_key(ret, ['return', 'buf-b64'])
+                chunks.append(base64.b64decode(b64buf))
+        finally:
+            self.ga.guest_file_close(handle)
+
+        return b''.join(chunks).decode('utf-8')
+
+    def write_file_content(self, path: str, content: str) -> int:
+        """Write content to a guest file through the guest agent in a single write.
+
+        The guest file handle is always closed.
+
+        Returns:
+            The 'count' value reported by the guest agent for the write.
+
+        Raises:
+            exceptions.GuestError: the file could not be opened on the guest.
+            ValueError: the write reply lacks the 'count' field.
+        """
+        open_reply: typing.Dict = self.ga.guest_file_open(path, 'w')
+        handle = open_reply.get('return')
+        if not handle:
+            raise exceptions.GuestError('Could not open file on guest')
+
+        payload: str = base64.b64encode(content.encode()).decode('utf-8')
+
+        try:
+            write_reply = self.ga.guest_file_write(handle, payload)
+            count: int = self.__get_key(write_reply, ['return', 'count'])
+        finally:
+            self.ga.guest_file_close(handle)
+
+        return count
+
+    def dir_exists(self, path: str) -> bool:
+        """Return True when path is a directory on the guest (shell test '[ -d path ]').
+
+        The path is shell-quoted, so spaces or shell metacharacters in it do
+        not break the test.
+        """
+        command = shlex.join(['/bin/sh', '-c', f'[ -d {shlex.quote(path)} ]'])
+        status = self.execute_wait(self.execute(command))
+        return not status.exit_code
+
+    def link_exists(self, path: str) -> bool:
+        """Return True when path is a symbolic link on the guest (shell test '[ -h path ]').
+
+        The path is shell-quoted, so spaces or shell metacharacters in it do
+        not break the test.
+        """
+        command = shlex.join(['/bin/sh', '-c', f'[ -h {shlex.quote(path)} ]'])
+        status = self.execute_wait(self.execute(command))
+        return not status.exit_code
+
+    @Decorators.timeout_signal
+    def ping(self, timeout: int = DEFAULT_TIMEOUT) -> bool:
+        """Ping guest and return true if responding, false otherwise.
+
+        The timeout argument is not used in the body; it is consumed by the
+        timeout_signal decorator.
+        """
+        logger.debug('Ping VM%s', self.vmnum)
+        try:
+            self.ga.ping()
+        except exceptions.AlarmTimeoutError:
+            logger.warning('VM%s not responded to ping', self.vmnum)
+            responded = False
+        else:
+            responded = True
+
+        return responded
+
+    @Decorators.timeout_signal
+    def save_state(self) -> None:
+        """Save the VM state as a snapshot and wait for the job to conclude.
+
+        Raises:
+            exceptions.GuestError: the concluded 'snapshot-save' job reports an error.
+        """
+        logger.debug('Saving VM%s state (snapshot)', self.vmnum)
+        self.qm.save_snapshot()
+
+        while self.qm.get_qmp_event_job() != 'concluded':
+            pass
+
+        job_status, job_error = self.qm.query_jobs('snapshot-save')
+        save_failed = job_status == 'concluded' and job_error is not None
+        if save_failed:
+            raise exceptions.GuestError(f'VM{self.vmnum} state save error: {job_error}')
+
+        logger.debug('VM%s state save finished successfully', self.vmnum)
+
+    @Decorators.timeout_signal
+    def load_state(self) -> None:
+        """Load the VM state from a snapshot and wait for the job to conclude.
+
+        Raises:
+            exceptions.GuestError: the concluded 'snapshot-load' job reports an error.
+        """
+        logger.debug('Loading VM state (snapshot)')
+        self.qm.load_snapshot()
+
+        while self.qm.get_qmp_event_job() != 'concluded':
+            pass
+
+        job_status, job_error = self.qm.query_jobs('snapshot-load')
+        load_failed = job_status == 'concluded' and job_error is not None
+        if load_failed:
+            raise exceptions.GuestError(f'VM{self.vmnum} state load error: {job_error}')
+
+        logger.debug('VM state load finished successfully')
+
+ # helper_convert_units_to_bytes - convert size with units to bytes
+ # @size_str: multiple-byte unit size with suffix (K/M/G)
+ # Returns: size in bytes
+ # TODO: function perhaps could be moved to some new utils module
+ # improve - consider regex to handle various formats eg. both M and MB
+    def helper_convert_units_to_bytes(self, size_str: str) -> int:
+        """Convert a size string with an optional binary unit suffix to bytes.
+
+        Accepted forms (case-insensitive, surrounding whitespace ignored):
+        a bare number ('4096'), bytes ('16B'), and K/M/G/T multiples written
+        as 'K', 'KB' or 'KiB' (likewise for M, G and T). Units are powers of 1024.
+
+        Raises:
+            ValueError: size_str has no parsable integer part (including an
+                empty string) or carries an unknown suffix.
+        """
+        factors = {'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4}
+        text = size_str.strip().upper()
+
+        # Drop the optional byte marker: 'KIB' -> 'K', 'KB' -> 'K', 'B' -> ''
+        if text.endswith('IB'):
+            text = text[:-2]
+        elif text.endswith('B'):
+            text = text[:-1]
+
+        multiplier = 1
+        if text and text[-1] in factors:
+            multiplier = factors[text[-1]]
+            text = text[:-1]
+
+        return int(text) * multiplier
+
+ # helper_get_debugfs_selfconfig - read resources allocated to VF from debugfs:
+    # /sys/kernel/debug/dri/@card/gt@gt_num/iov/self_config
+ # @card: card number
+ # @gt_num: GT instance number
+    def helper_get_debugfs_selfconfig(self, card: int = 0, gt_num: int = 0) -> None:
+        """Read the resources allocated to the VF from the guest debugfs self_config file.
+
+        Each 'name: value' line is parsed and the recognized entries (GGTT size,
+        LMEM size, contexts, doorbells, tile mask) are stored on the instance.
+        Lines without a colon, such as blank lines, and unrecognized names are
+        skipped.
+
+        Args:
+            card: card number used in the debugfs path.
+            gt_num: GT instance number used in the debugfs path.
+        """
+        path = f'/sys/kernel/debug/dri/{card}/gt{gt_num}/iov/self_config'
+        out = self.read_file_content(path)
+
+        for line in out.splitlines():
+            # partition() tolerates lines with no colon or with several of them
+            param, separator, value = line.partition(':')
+            if not separator:
+                continue
+            param = param.strip()
+            value = value.strip()
+
+            if param == 'GGTT size':
+                self._ggtt_size = self.helper_convert_units_to_bytes(value)
+            elif param == 'LMEM size':
+                self._lmem_size = self.helper_convert_units_to_bytes(value)
+            elif param == 'contexts':
+                self._contexts = int(value)
+            elif param == 'doorbells':
+                self._doorbells = int(value)
+            elif param == 'tile mask':
+                self._tile_mask = int(value, base=16)
diff --git a/vmtb/dev-requirements.txt b/vmtb/dev-requirements.txt
new file mode 100644
index 000000000..66a7c21e4
--- /dev/null
+++ b/vmtb/dev-requirements.txt
@@ -0,0 +1,5 @@
+# Testing
+pytest
+
+# Building
+build
diff --git a/vmtb/pyproject.toml b/vmtb/pyproject.toml
new file mode 100644
index 000000000..7b8a63da2
--- /dev/null
+++ b/vmtb/pyproject.toml
@@ -0,0 +1,25 @@
+[build-system]
+requires = ["setuptools >= 70.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "vmtb"
+version = "1.0.0"
+description = "SR-IOV VM-level test tool"
+readme = "README.md"
+requires-python = ">=3.11"
+
+authors = [
+ {name = "Intel Corporation"}
+]
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: MIT License",
+]
+dependencies = [
+ "pytest",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["*"]
diff --git a/vmtb/pytest.ini b/vmtb/pytest.ini
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/requirements.txt b/vmtb/requirements.txt
new file mode 100644
index 000000000..5d80ceeab
--- /dev/null
+++ b/vmtb/requirements.txt
@@ -0,0 +1,2 @@
+# Used for running tests
+pytest
diff --git a/vmtb/vmm_flows/__init__.py b/vmtb/vmm_flows/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vmtb/vmm_flows/conftest.py b/vmtb/vmm_flows/conftest.py
new file mode 100644
index 000000000..dc9141436
--- /dev/null
+++ b/vmtb/vmm_flows/conftest.py
@@ -0,0 +1,340 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import json
+import logging
+import re
+import typing
+
+from dataclasses import dataclass
+from pathlib import Path
+
+import pytest
+
+from bench import exceptions
+from bench.helpers.helpers import (modprobe_driver, modprobe_driver_check)
+from bench.helpers.log import HOST_DMESG_FILE
+from bench.configurators.vgpu_profile_config import VgpuProfileConfigurator, VfSchedulingMode
+from bench.configurators.vgpu_profile import VgpuProfile
+from bench.configurators.vmtb_config import VmtbConfigurator
+from bench.machines.host import Host, Device
+from bench.machines.virtual.vm import VirtualMachine
+
+
+logger = logging.getLogger('Conftest')
+
+
+def pytest_addoption(parser):
+    """Register the VMTB command-line options: --vm-image and --card."""
+    option_help = (
+        ('--vm-image', 'OS image to boot on VM'),
+        ('--card', 'Device card index for test execution'),
+    )
+    for flag, description in option_help:
+        parser.addoption(flag, action='store', help=description)
+
+
+@dataclass
+class VmmTestingConfig:
+    """Structure represents test configuration used by a setup fixture.
+
+    Available settings:
+    - num_vfs: requested number of VFs to enable
+    - max_num_vms: maximal number of VMs (the value can be different than enabled number of VFs)
+    - scheduling_mode: requested vGPU scheduling profile (infinite maps to default 0's)
+    - auto_poweron_vm: assign VFs and power on VMs automatically in setup fixture
+    - auto_probe_vm_driver: probe guest DRM driver in setup fixture (VM must be powered on)
+    - unload_host_drivers_on_teardown: unload host DRM drivers in teardown fixture
+    - wa_reduce_vf_lmem: workaround to reduce VF LMEM (for save-restore/migration tests speed-up)
+    """
+    num_vfs: int = 1
+    max_num_vms: int = 2
+    scheduling_mode: VfSchedulingMode = VfSchedulingMode.INFINITE
+
+    auto_poweron_vm: bool = True
+    auto_probe_vm_driver: bool = True
+    unload_host_drivers_on_teardown: bool = False
+    # Temporary W/A: reduce size of LMEM assigned to VFs to speed up a VF state save-restore process
+    wa_reduce_vf_lmem: bool = False
+
+    def __str__(self) -> str:
+        """Short form used as the parametrized test ID, e.g. '4VF'."""
+        return f'{self.num_vfs}VF'
+
+    def __repr__(self) -> str:
+        """Multi-line dump of all settings, used for debug logging."""
+        return (f'\nVmmTestingConfig:'
+                f'\nNum VFs = {self.num_vfs} / max num VMs = {self.max_num_vms}'
+                f'\nVF scheduling mode = {self.scheduling_mode}'
+                f'\nSetup flags:'
+                f'\n\tVM - auto power-on = {self.auto_poweron_vm}'
+                f'\n\tVM - auto DRM driver probe = {self.auto_probe_vm_driver}'
+                f'\n\tHost - unload drivers on teardown = {self.unload_host_drivers_on_teardown}'
+                f'\n\tW/A - reduce VF LMEM (improves migration time) = {self.wa_reduce_vf_lmem}')
+
+
+class VmmTestingSetup:
+    def __init__(self, vmtb_config: VmtbConfigurator, cmdline_config, host, testing_config):
+        """Load host drivers, discover the DUT, pick a vGPU profile and create the VM objects.
+
+        Command-line values ('card_index', 'vm_image'), when not None, take
+        precedence over the values from the VMTB config file.
+        """
+        self.testing_config: VmmTestingConfig = testing_config
+        self.host: Host = host
+
+        card_override = cmdline_config['card_index']
+        if card_override is None:
+            self.dut_index = vmtb_config.get_host_config().card_index
+        else:
+            self.dut_index = int(card_override)
+
+        image_override = cmdline_config['vm_image']
+        if image_override is None:
+            self.guest_os_image = vmtb_config.get_guest_config().os_image_path
+        else:
+            self.guest_os_image = image_override
+
+        self.vgpu_profiles_dir = vmtb_config.vmtb_config_file.parent / vmtb_config.config.vgpu_profiles_path
+
+        self.host.dut_index = self.dut_index
+        self.host.drm_driver_name = vmtb_config.get_host_config().driver
+        self.host.igt_config = vmtb_config.get_host_config().igt_config
+
+        self.host.load_drivers()
+        self.host.discover_devices()
+
+        logger.info("\nDUT info:"
+                    "\n\tCard index: %s"
+                    "\n\tPCI BDF: %s "
+                    "\n\tDevice ID: %s (%s)"
+                    "\n\tHost DRM driver: %s",
+                    self.host.dut_index,
+                    self.get_dut().pci_info.bdf,
+                    self.get_dut().pci_info.devid, self.get_dut().gpu_model,
+                    self.get_dut().driver.get_name())
+
+        self.vgpu_profile: VgpuProfile = self.get_vgpu_profile()
+
+        # Start maximum requested number of VMs, but not more than VFs supported by the given vGPU profile
+        vm_count = min(self.vgpu_profile.num_vfs, self.testing_config.max_num_vms)
+        self.vms: typing.List[VirtualMachine] = []
+        for vm_idx in range(vm_count):
+            self.vms.append(VirtualMachine(vm_idx, self.guest_os_image,
+                                           vmtb_config.get_guest_config().driver,
+                                           vmtb_config.get_guest_config().igt_config))
+
+    def get_vgpu_profile(self) -> VgpuProfile:
+        """Select the vGPU profile matching the requested VF count and scheduling mode.
+
+        The selected profile's parameters are printed before it is returned.
+
+        Raises:
+            exceptions.VgpuProfileError: no suitable profile was found; the
+                configurator's error is chained as the cause.
+        """
+        configurator = VgpuProfileConfigurator(self.vgpu_profiles_dir, self.get_dut().gpu_model)
+        try:
+            vgpu_profile = configurator.get_vgpu_profile(self.testing_config.num_vfs,
+                                                         self.testing_config.scheduling_mode)
+        except exceptions.VgpuProfileError as exc:
+            logger.error("Suitable vGPU profile not found: %s", exc)
+            raise exceptions.VgpuProfileError('Invalid test setup - vGPU profile not found!') from exc
+
+        vgpu_profile.print_parameters()
+
+        return vgpu_profile
+
+    def get_dut(self) -> Device:
+        """Return the device under test: the host GPU device at index dut_index.
+
+        Raises:
+            exceptions.VmtbConfigError: dut_index is out of range of the discovered devices.
+        """
+        devices = self.host.gpu_devices
+        try:
+            dut = devices[self.dut_index]
+        except IndexError as exc:
+            logger.error("Invalid VMTB config - device card index = %s not available", self.dut_index)
+            raise exceptions.VmtbConfigError(f'Device card index = {self.dut_index} not available') from exc
+        return dut
+
+    @property
+    def get_vm(self):
+        """List of the VirtualMachine objects (exposed as a property, so read it without calling)."""
+        return self.vms
+
+    def get_num_vms(self) -> int:
+        """Return the number of VirtualMachine objects created for this setup."""
+        return len(self.vms)
+
+    def poweron_vms(self):
+        """Power on every VM of this setup, one after another."""
+        for vm in self.vms:
+            vm.poweron()
+
+    def poweroff_vms(self):
+        """Power off every running VM; keep going when a single VM fails.
+
+        A poweroff failure is logged and also switches on the
+        unload_host_drivers_on_teardown flag, so that teardown reloads the
+        host drivers.
+
+        Raises:
+            exceptions.GuestError: at least one VM failed to power off. The
+                error is not raised merely because the test configuration
+                requested a driver unload on teardown.
+        """
+        poweroff_failed = False
+        for vm in self.vms:
+            if not vm.is_running():
+                continue
+            try:
+                vm.poweroff()
+            except Exception as exc:
+                poweroff_failed = True
+                self.testing_config.unload_host_drivers_on_teardown = True
+                logger.warning("Error on VM%s poweroff (%s)", vm.vmnum, exc)
+
+        if poweroff_failed:
+            raise exceptions.GuestError('VM poweroff issue - cleanup on test teardown')
+
+    def teardown(self):
+        """Power off the VMs and return the device to an unprovisioned state.
+
+        Poweroff errors are logged, not propagated. The device cleanup runs
+        regardless: remove VFs, reset provisioning for the VFs that were
+        enabled, cancel work, and unload the host drivers when the test
+        configuration flag is set.
+        """
+        try:
+            self.poweroff_vms()
+        except Exception as exc:
+            logger.error("Error on test teardown (%s)", exc)
+        finally:
+            # Read the VF count before removing VFs; reset_provisioning() needs it afterwards
+            num_vfs = self.get_dut().get_current_vfs()
+            self.get_dut().remove_vfs()
+            self.get_dut().reset_provisioning(num_vfs)
+            self.get_dut().cancel_work()
+
+            if self.testing_config.unload_host_drivers_on_teardown:
+                self.host.unload_drivers()
+
+
+@pytest.fixture(scope='session', name='get_vmtb_config')
+def fixture_get_vmtb_config(create_host_log, pytestconfig):
+    """Load the VMTB configuration from 'vmtb_config.json' in the pytest root directory.
+
+    Depends on 'create_host_log', so the host log file is recreated first.
+    """
+    VMTB_CONFIG_FILE = 'vmtb_config.json'
+    # Pytest Config.rootpath points to the VMTB base directory
+    vmtb_config_file_path: Path = pytestconfig.rootpath / VMTB_CONFIG_FILE
+    return VmtbConfigurator(vmtb_config_file_path)
+
+
+@pytest.fixture(scope='session', name='create_host_log')
+def fixture_create_host_log():
+    """Start the session with a fresh, empty host dmesg log file."""
+    if HOST_DMESG_FILE.exists():
+        HOST_DMESG_FILE.unlink()
+    HOST_DMESG_FILE.touch()
+
+
+@pytest.fixture(scope='session', name='get_cmdline_config')
+def fixture_get_cmdline_config(request):
+    """Collect the VMTB command-line options into a dict.
+
+    Keys: 'vm_image' (--vm-image) and 'card_index' (--card).
+    """
+    return {
+        'vm_image': request.config.getoption('--vm-image'),
+        'card_index': request.config.getoption('--card'),
+    }
+
+
+@pytest.fixture(scope='session', name='get_host')
+def fixture_get_host():
+    """Provide a single Host object shared by the whole test session."""
+    return Host()
+
+
+@pytest.fixture(scope='class', name='setup_vms')
+def fixture_setup_vms(get_vmtb_config, get_cmdline_config, get_host, request):
+    """Arrange VM environment for the VMM Flows test execution.
+
+    VM setup steps follow the configuration provided as VmmTestingConfig parameter, including:
+    host drivers probe (DRM and VFIO), provision and enable VFs, boot VMs and load guest DRM driver.
+    Tear-down phase covers test environment cleanup:
+    shutdown VMs, reset provisioning, disable VFs and optional host drivers unload.
+    The same cleanup also runs when the setup phase itself fails, so that VFs and VMs
+    are not left behind for the next test.
+
+    The fixture is designed for test parametrization, as the input to the following test class decorator:
+    @pytest.mark.parametrize('setup_vms', set_test_config(max_vms=N), ids=idfn_test_config, indirect=['setup_vms'])
+    where 'set_test_config' provides request parameter with a VmmTestingConfig (usually list of configs).
+    """
+    tc: VmmTestingConfig = request.param
+    logger.debug(repr(tc))
+
+    host: Host = get_host
+    ts: VmmTestingSetup = VmmTestingSetup(get_vmtb_config, get_cmdline_config, host, tc)
+
+    device: Device = ts.get_dut()
+    num_vfs = ts.vgpu_profile.num_vfs
+    num_vms = ts.get_num_vms()
+    # Original VF LMEM quota; stays None unless the LMEM workaround below is applied
+    org_vgpu_profile_vfLmem = None
+
+    logger.info('[Test setup: %sVF-%sVM]', num_vfs, num_vms)
+
+    try:
+        # XXX: VF migration on discrete devices (with LMEM) is currently quite slow.
+        # As a temporary workaround, reduce size of LMEM assigned to VFs to speed up a state save/load process.
+        if tc.wa_reduce_vf_lmem and device.has_lmem():
+            logger.debug("W/A: reduce VFs LMEM quota to accelerate state save/restore")
+            org_vgpu_profile_vfLmem = ts.vgpu_profile.resources.vfLmem
+            # Assign max 512 MB to VF
+            ts.vgpu_profile.resources.vfLmem = min(ts.vgpu_profile.resources.vfLmem // 2, 536870912)
+
+        device.provision(ts.vgpu_profile)
+
+        assert device.create_vf(num_vfs) == num_vfs
+
+        if tc.auto_poweron_vm:
+            bdf_list = [device.get_vf_bdf(vf) for vf in range(1, num_vms + 1)]
+            for vm, bdf in zip(ts.get_vm, bdf_list):
+                vm.assign_vf(bdf)
+
+            ts.poweron_vms()
+
+            if tc.auto_probe_vm_driver:
+                # Start modprobe on all VMs first, then check each result
+                modprobe_cmds = [modprobe_driver(vm) for vm in ts.get_vm]
+                for i, (vm, cmd) in enumerate(zip(ts.get_vm, modprobe_cmds)):
+                    assert modprobe_driver_check(vm, cmd), f'modprobe failed on VM{i}'
+    except Exception:
+        # pytest skips the code after 'yield' when setup fails, so clean up here
+        logger.error('[Test setup failed: %sVF-%sVM] - cleaning up', num_vfs, num_vms)
+        if org_vgpu_profile_vfLmem is not None:
+            ts.vgpu_profile.resources.vfLmem = org_vgpu_profile_vfLmem
+        ts.teardown()
+        raise
+
+    logger.info('[Test execution: %sVF-%sVM]', num_vfs, num_vms)
+    yield ts
+
+    logger.info('[Test teardown: %sVF-%sVM]', num_vfs, num_vms)
+    # XXX: cleanup counterpart for VFs LMEM quota workaround - restore original value
+    if org_vgpu_profile_vfLmem is not None:
+        ts.vgpu_profile.resources.vfLmem = org_vgpu_profile_vfLmem
+
+    ts.teardown()
+
+
+# Obsolete fixtures 'create_Xhost_Yvm' - 'fixture_setup_vms' is preferred
+@pytest.fixture(scope='function')
+def create_1host_1vm(get_vmtb_config, get_cmdline_config, get_host):
+    """Provide a 1VF-1VM test setup and tear it down after the test.
+
+    Only the setup object is created here: no VF is provisioned or enabled
+    and no VM is powered on.
+    """
+    num_vfs, num_vms = 1, 1
+    ts: VmmTestingSetup = VmmTestingSetup(get_vmtb_config, get_cmdline_config, get_host,
+                                          VmmTestingConfig(num_vfs, num_vms))
+
+    logger.info('[Test setup: %sVF-%sVM]', num_vfs, num_vms)
+    logger.debug(repr(ts.testing_config))
+
+    logger.info('[Test execution: %sVF-%sVM]', num_vfs, num_vms)
+    yield ts
+
+    logger.info('[Test teardown: %sVF-%sVM]', num_vfs, num_vms)
+    ts.teardown()
+
+
+@pytest.fixture(scope='function')
+def create_1host_2vm(get_vmtb_config, get_cmdline_config, get_host):
+    """Provide a 2VF-2VM test setup and tear it down after the test.
+
+    Only the setup object is created here: no VF is provisioned or enabled
+    and no VM is powered on.
+    """
+    num_vfs, num_vms = 2, 2
+    ts: VmmTestingSetup = VmmTestingSetup(get_vmtb_config, get_cmdline_config, get_host,
+                                          VmmTestingConfig(num_vfs, num_vms))
+
+    logger.info('[Test setup: %sVF-%sVM]', num_vfs, num_vms)
+    logger.debug(repr(ts.testing_config))
+
+    logger.info('[Test execution: %sVF-%sVM]', num_vfs, num_vms)
+    yield ts
+
+    logger.info('[Test teardown: %sVF-%sVM]', num_vfs, num_vms)
+    ts.teardown()
+
+
+def idfn_test_config(test_config: VmmTestingConfig):
+    """Provide test config ID in parametrized tests (e.g. test_something[4VF]).
+
+    The ID is str(test_config), i.e. the number of VFs followed by 'VF'.
+    Usage: @pytest.mark.parametrize([...], ids=idfn_test_config, [...])
+    """
+    return str(test_config)
+
+
+# JSON report written at session end, relative to the current working directory
+RESULTS_FILE = Path() / "results.json"
+# In-memory report: filled per test by pytest_report_teststatus, dumped by pytest_sessionfinish
+results = {
+    "results_version": 10,
+    "name": "results",
+    "tests": {},
+}
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_report_teststatus(report):
+    """Record the outcome of a test phase in the module-level results dict.
+
+    'call' reports are stored as pass/fail/skip; failed 'setup' reports are
+    stored as 'crash'. Each stored entry takes the current content of the
+    host dmesg file, which is truncated afterwards. Reports whose node ID
+    has no '::' part are ignored.
+    """
+    yield
+    matches = re.findall(r'[A-Za-z_.]*::.*', report.nodeid)
+    if not matches:
+        # Nothing to record for a node ID without a '<file>::<test>' part
+        return
+    test_string = matches[0]
+
+    with open(HOST_DMESG_FILE, 'r+', encoding='utf-8') as dmesg_file:
+        dmesg = dmesg_file.read()
+        results["name"] = f"vmtb_{test_string}"
+        test_name = f"vmtb@{test_string}"
+        if report.when == 'call':
+            out = report.capstdout
+            if report.passed:
+                outcome = "pass"
+                out = f"{test_name} passed"
+            elif report.failed:
+                outcome = "fail"
+            else:
+                outcome = "skip"
+            results["tests"][test_name] = {"out": out, "result": outcome,
+                                           "time": {"start": 0, "end": report.duration},
+                                           "err": report.longreprtext, "dmesg": dmesg}
+            dmesg_file.truncate(0)
+        elif report.when == 'setup' and report.failed:
+            results["tests"][test_name] = {"out": report.capstdout, "result": "crash",
+                                           "time": {"start": 0, "end": report.duration},
+                                           "err": report.longreprtext, "dmesg": dmesg}
+            dmesg_file.truncate(0)
+
+
+@pytest.hookimpl()
+def pytest_sessionfinish():
+    """Write the collected results to RESULTS_FILE as indented JSON, replacing any previous content."""
+    # write_text() creates or truncates the file, so no unlink/touch is needed first
+    RESULTS_FILE.write_text(json.dumps(results, indent=2), encoding='utf-8')
diff --git a/vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json b/vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json
new file mode 100644
index 000000000..ff1fa7e20
--- /dev/null
+++ b/vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json
@@ -0,0 +1,113 @@
+{
+ "version": "1.1",
+ "PFResources": {
+ "Default": "MinimumPFResources",
+ "Profile": {
+ "MinimumPFResources": {
+ "LocalMemoryEccOn": 402653184,
+ "LocalMemoryEccOff": 402653184,
+ "Contexts": 1024,
+ "Doorbells": 16,
+ "GGTTSize": 268435456
+ }
+ }
+ },
+ "vGPUResources": {
+ "Default": null,
+ "Profile": {
+ "Flex170_16": {
+ "VFCount": 1,
+ "LocalMemoryEccOff": 16777216000,
+ "LocalMemoryEccOn": 2147483648,
+ "Contexts": 1024,
+ "Doorbells": 240,
+ "GGTTSize": 4026531840
+ },
+ "Flex170_8": {
+ "VFCount": 2,
+ "LocalMemoryEccOff": 8388608000,
+ "LocalMemoryEccOn": 2147483648,
+ "Contexts": 1024,
+ "Doorbells": 120,
+ "GGTTSize": 2013265920
+ },
+ "Flex170_4": {
+ "VFCount": 4,
+ "LocalMemoryEccOff": 4194304000,
+ "LocalMemoryEccOn": 2147483648,
+ "Contexts": 1024,
+ "Doorbells": 60,
+ "GGTTSize": 1006632960
+ },
+ "Flex170_2": {
+ "VFCount": 8,
+ "LocalMemoryEccOff": 2097152000,
+ "LocalMemoryEccOn": 1073741824,
+ "Contexts": 1024,
+ "Doorbells": 30,
+ "GGTTSize": 503316480
+ },
+ "Flex170_1": {
+ "VFCount": 16,
+ "LocalMemoryEccOff": 1048576000,
+ "LocalMemoryEccOn": 536870912,
+ "Contexts": 1024,
+ "Doorbells": 15,
+ "GGTTSize": 251658240
+ }
+ }
+ },
+ "vGPUScheduler": {
+ "Default": "Flexible_30fps_GPUTimeSlicing",
+ "Profile": {
+ "Flexible_30fps_GPUTimeSlicing": {
+ "GPUTimeSlicing": {
+ "ScheduleIfIdle": false,
+ "PFExecutionQuantum": 20,
+ "PFPreemptionTimeout": 20000,
+ "VFAttributes": {
+ "VFExecutionQuantum": "lambda VFCount : max( 32 // VFCount, 1)",
+ "VFPreemptionTimeout": "lambda VFCount : 128000 if (VFCount == 1) else max( 64000 // VFCount, 16000)"
+ }
+ }
+ },
+ "Fixed_30fps_GPUTimeSlicing": {
+ "GPUTimeSlicing": {
+ "ScheduleIfIdle": true,
+ "PFExecutionQuantum": 20,
+ "PFPreemptionTimeout": 20000,
+ "VFAttributes": {
+ "VFExecutionQuantum": "lambda VFCount : max( 32 // VFCount, 1)",
+ "VFPreemptionTimeout": "lambda VFCount : 128000 if (VFCount == 1) else max( 64000 // VFCount, 16000)"
+ }
+ }
+ },
+ "Flexible_BurstableQoS_GPUTimeSlicing": {
+ "GPUTimeSlicing": {
+ "ScheduleIfIdle": false,
+ "PFExecutionQuantum": 20,
+ "PFPreemptionTimeout": 20000,
+ "VFAttributes": {
+ "VFExecutionQuantum": "lambda VFCount : min((2000 // max(VFCount-1,1)*0.5, 50))",
+ "VFPreemptionTimeout": "lambda VFCount : (2000 // max(VFCount-1,1) - min((2000 // max(VFCount-1,1))*0.5, 50))*1000"
+ }
+ }
+ }
+ }
+ },
+ "vGPUSecurity": {
+ "Default": "Disabled",
+ "Profile": {
+ "Disabled": {
+ "ResetAfterVfSwitch": false,
+ "GuCSamplingPeriod": 0,
+ "GuCThresholdCATError": 0,
+ "GuCThresholdPageFault": 0,
+ "GuCThresholdH2GStorm": 0,
+ "GuCThresholdDbStorm": 0,
+ "GuCThresholdGTIrqStorm": 0,
+ "GuCThresholdEngineReset": 0
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/vmtb/vmm_flows/test_basic.py b/vmtb/vmm_flows/test_basic.py
new file mode 100644
index 000000000..b8155c610
--- /dev/null
+++ b/vmtb/vmm_flows/test_basic.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: MIT
+# Copyright © 2024 Intel Corporation
+
+import logging
+import time
+from typing import List, Tuple
+
+import pytest
+
+from bench.configurators.vgpu_profile_config import VfSchedulingMode
+from bench.executors.gem_wsim import (ONE_CYCLE_DURATION_MS,
+ PREEMPT_10MS_WORKLOAD, GemWsim,
+ GemWsimResult,
+ gem_wsim_parallel_exec_and_check)
+from bench.executors.igt import IgtExecutor, IgtType
+from bench.helpers.helpers import (driver_check, igt_check, igt_run_check,
+ modprobe_driver_run_check)
+from vmm_flows.conftest import (VmmTestingConfig, VmmTestingSetup,
+ idfn_test_config)
+
+logger = logging.getLogger(__name__)
+
+# Number of gem_wsim iterations: with a 10 ms workload cycle,
+# 1000 iterations are expected to run for ~10 s and 3000 iterations for ~30 s
+WL_ITERATIONS_10S = 1000
+WL_ITERATIONS_30S = 3000
+# Milliseconds per second, used to convert workload durations
+MS_IN_SEC = 1000
+DELAY_FOR_WORKLOAD_SEC = 2 # Time to wait for gem_wsim to be running [seconds]
+DELAY_FOR_RELOAD_SEC = 3 # Time to wait before reloading the driver [seconds]
+
+
+def set_test_config(test_variants: List[Tuple[int, VfSchedulingMode]],
+ max_vms: int = 2, vf_driver_load: bool = True) -> List[VmmTestingConfig]:
+ """Helper function to provide a parametrized test with a list of test configuration variants."""
+ logger.debug("Init test variants: %s", test_variants)
+ test_configs: List[VmmTestingConfig] = []
+
+ for config in test_variants:
+ (num_vfs, scheduling_mode) = config
+ test_configs.append(VmmTestingConfig(num_vfs, max_vms, scheduling_mode, auto_probe_vm_driver=vf_driver_load))
+
+ return test_configs
+
+
+test_variants_1 = [(1, VfSchedulingMode.DEFAULT_PROFILE), (2, VfSchedulingMode.DEFAULT_PROFILE)]
+
+ at pytest.mark.parametrize('setup_vms', set_test_config(test_variants_1), ids=idfn_test_config, indirect=['setup_vms'])
+class TestVmSetup:
+ """Verify basic virtualization setup:
+ - probe PF and VFIO drivers (host)
+ - enable and provision VFs (automatic or manual with vGPU profile)
+ - power on VMs with assigned VFs
+ - probe VF driver (guest)
+ - shutdown VMs, reset provisioning and disable VFs
+ """
+ def test_vm_boot(self, setup_vms):
+ logger.info("Test VM boot: power on VM and probe VF driver")
+ ts: VmmTestingSetup = setup_vms
+
+ for vm in ts.vms:
+ logger.info("[%s] Verify VF DRM driver is loaded in a guest OS", vm)
+ assert driver_check(vm)
+
+
+test_variants_2 = [(1, VfSchedulingMode.DEFAULT_PROFILE), (2, VfSchedulingMode.DEFAULT_PROFILE),
+ (4, VfSchedulingMode.DEFAULT_PROFILE)]
+
+ at pytest.mark.parametrize('setup_vms', set_test_config(test_variants_2), ids=idfn_test_config, indirect=['setup_vms'])
+class TestVmWorkload:
+ """Verify basic IGT workload execution a VM(s):
+ - exec_store: basic store submissions on single/multiple VMs
+ - gem_wsim: workload simulator running in parallel on multiple VMs
+ """
+ def test_store(self, setup_vms):
+ logger.info("Test VM execution: exec_store")
+ ts: VmmTestingSetup = setup_vms
+ igt_worklads: List[IgtExecutor] = []
+
+ for vm in ts.vms:
+ logger.info("[%s] Execute basic WL", vm)
+ igt_worklads.append(IgtExecutor(vm, IgtType.EXEC_STORE))
+
+ for igt in igt_worklads:
+ logger.info("[%s] Verify result of basic WL", igt.target)
+ assert igt_check(igt)
+
+ logger.info("[%s] Verify result of basic WL", ts.host)
+ igt_run_check(ts.host, IgtType.EXEC_STORE)
+
+ def test_wsim(self, setup_vms):
+ logger.info("Test VM execution: gem_wsim")
+ ts: VmmTestingSetup = setup_vms
+
+ if ts.get_num_vms() < 2:
+ pytest.skip("Test scenario not supported for 1xVM setup ")
+
+ # Single workload takes 10ms GPU time, multiplied by 1000 iterations
+ # gives the expected 10s duration and 100 workloads/sec
+ expected = GemWsimResult(ONE_CYCLE_DURATION_MS * WL_ITERATIONS_10S * len(ts.vms) / MS_IN_SEC,
+ MS_IN_SEC/ONE_CYCLE_DURATION_MS / len(ts.vms))
+
+ # Check preemptable workload
+ result = gem_wsim_parallel_exec_and_check(ts.vms, PREEMPT_10MS_WORKLOAD, WL_ITERATIONS_10S, expected)
+ logger.info("Execute wsim parallel on VMs - results: %s", result)
+
+
+test_variants_3 = [(2, VfSchedulingMode.DEFAULT_PROFILE), (4, VfSchedulingMode.DEFAULT_PROFILE)]
+
+ at pytest.mark.parametrize('setup_vms', set_test_config(test_variants=test_variants_3, max_vms=4, vf_driver_load=False),
+ ids = idfn_test_config, indirect=['setup_vms'])
+class TestVfDriverLoadRemove:
+ """Verify VF (guest) driver load or remove doesn't affect execution on the other VM:
+ - probe VF driver on the last VM while the first VM is running workload
+ - remove VF driver on the first VM while the last VM is running workload
+ - reload previosuly removed VF driver on the same VM
+ """
+ def test_load(self, setup_vms):
+ logger.info("Test VM driver load: VF driver probe while other VM executes workload")
+ ts: VmmTestingSetup = setup_vms
+
+ vm_first = ts.vms[0]
+ vm_last = ts.vms[-1]
+
+ logger.info("[%s] Load VF driver and run basic WL - first VM", vm_first)
+ assert modprobe_driver_run_check(vm_first)
+
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
+ gem_wsim = GemWsim(vm_first, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+ assert gem_wsim.is_running()
+
+ logger.info("[%s] Load VF driver - last VM", vm_last)
+ assert modprobe_driver_run_check(vm_last)
+
+ result = gem_wsim.wait_results()
+ assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
+
+ def test_reload(self, setup_vms):
+ logger.info("Test VM driver reload: VF driver remove is followed by probe while other VM executes workload")
+ ts: VmmTestingSetup = setup_vms
+
+ vm_first = ts.vms[0]
+ vm_last = ts.vms[-1]
+
+ logger.info("[%s] Run basic WL - last VM", vm_last)
+ expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
+ gem_wsim = GemWsim(vm_last, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
+ time.sleep(DELAY_FOR_WORKLOAD_SEC)
+ assert gem_wsim.is_running()
+
+ logger.info("[%s] Remove VF driver - first VM", vm_first)
+ rmmod_pid = vm_first.execute(f'modprobe -rf {vm_first.get_drm_driver_name()}')
+ assert vm_first.execute_wait(rmmod_pid).exit_code == 0
+
+ time.sleep(DELAY_FOR_RELOAD_SEC)
+
+ logger.info("[%s] Reload VF driver and run basic WL - first VM", vm_first)
+ assert modprobe_driver_run_check(vm_first)
+ assert igt_run_check(vm_first, IgtType.EXEC_STORE)
+
+ result = gem_wsim.wait_results()
+ assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
diff --git a/vmtb/vmtb_config.json b/vmtb/vmtb_config.json
new file mode 100644
index 000000000..640a64123
--- /dev/null
+++ b/vmtb/vmtb_config.json
@@ -0,0 +1,31 @@
+{
+ "host": {
+ "card_index": 0,
+ "driver": "xe",
+ "igt": {
+ "test_dir": "/usr/local/libexec/igt-gpu-tools/",
+ "tool_dir": "/usr/local/bin/",
+ "lib_dir": "/usr/local/lib/x86_64-linux-gnu",
+ "result_dir": "/usr/local/results",
+ "options": "--piglit-style-dmesg --dmesg-warn-level=4 --abort-on-monitored-error=taint --overwrite"
+ }
+ },
+ "guest": {
+ "os_image": "guest_os.img",
+ "driver": "xe",
+ "igt": {
+ "test_dir": "/usr/local/libexec/igt-gpu-tools/",
+ "tool_dir": "/usr/local/bin/",
+ "lib_dir": "/usr/local/lib/x86_64-linux-gnu",
+ "result_dir": "/usr/local/results",
+ "options": "--piglit-style-dmesg --dmesg-warn-level=4 --abort-on-monitored-error=taint --overwrite"
+ }
+ },
+ "resources": {
+ "vgpu_profiles_path": "vmm_flows/resources/vgpu_profiles",
+ "guc_ver_path": "vmm_flows/resources/guc"
+ },
+ "ci": {
+ "host_dmesg_file": "/tmp/vm-test-bench-host_dmesg.log.tmp"
+ }
+}
--
2.39.1
More information about the igt-dev
mailing list