[PATCH i-g-t] vmtb: Introduce SR-IOV VM-level testing tool
Kamil Konieczny
kamil.konieczny at linux.intel.com
Wed Nov 20 19:08:12 UTC 2024
Hi Adam,
On 2024-11-19 at 08:47:04 +0100, Adam Miszczak wrote:
> VM Test Bench (VMTB) is a tool for testing virtualization
> (SR-IOV) supported by the xe driver.
> It allows to enable and provision VFs (Virtual Functions)
> and facilitates manipulation of VMs (Virtual Machines)
> running virtual GPUs.
> This includes starting and accessing the KVM/QEMU VMs,
> running workloads or shell commands (Guest/Host),
> handling power states, saving and restoring VF state etc.
>
> Initially only basic test scenarios are provided:
> - enable VFs, pass it to VMs and boot guest OS
> - submit basic workloads on a guest with virtualized GPU
> - exercise VF driver probe and remove
>
> but generally, the tool targets also complex test cases, like:
> - VF save/restore (VM migration)
> - VF provisioning
> - VF scheduling
> - VM power states
> - VF FLR
> - VM crash
> - GuC FW versioning
>
> Proposed location for the new tool is the root IGT directory:
> igt-gpu-tools/vmtb
> but some other options can be also considered, for example:
> tools/vmtb
> tests/vmtb
These comments should be in a cover letter; could you start
using one? Please also add a version after PATCH (e.g. "[PATCH i-g-t v2]").
IMHO I would prefer to see this in scripts/vmtb or tools/vmtb.
One thing to check is whether all needed files would be installed
with 'meson -C build install'.
Btw, could you split this into smaller patches, with the first one
executing a command like 'tools/lsgpu -L' and checking
whether a GPU card is present in the VM (Virtual Machine)?
It is not a strong suggestion, but it could help in review.
Also adding Pawel to Cc pawel.sikora at intel.com
Regards,
Kamil
>
> v2:
> - improve device detection function:
> instead of parsing lspci output with regex, iterate over
> sysfs driver directory to get bound devices' BDFs (Marcin)
> - remove obsolete fixtures and other unused code (Marcin)
>
> Signed-off-by: Adam Miszczak <adam.miszczak at linux.intel.com>
> ---
> vmtb/MANIFEST.in | 3 +
> vmtb/README.md | 86 +++
> vmtb/bench/__init__.py | 43 ++
> vmtb/bench/configurators/__init__.py | 0
> vmtb/bench/configurators/pci.py | 48 ++
> vmtb/bench/configurators/vgpu_profile.py | 264 ++++++++
> .../configurators/vgpu_profile_config.py | 148 +++++
> vmtb/bench/configurators/vmtb_config.py | 110 ++++
> vmtb/bench/drivers/__init__.py | 0
> vmtb/bench/drivers/driver_interface.py | 198 ++++++
> vmtb/bench/drivers/xe.py | 307 +++++++++
> vmtb/bench/exceptions.py | 40 ++
> vmtb/bench/executors/__init__.py | 0
> vmtb/bench/executors/executor_interface.py | 22 +
> vmtb/bench/executors/gem_wsim.py | 70 ++
> vmtb/bench/executors/igt.py | 117 ++++
> vmtb/bench/executors/shell.py | 30 +
> vmtb/bench/helpers/__init__.py | 0
> vmtb/bench/helpers/helpers.py | 77 +++
> vmtb/bench/helpers/log.py | 75 +++
> vmtb/bench/machines/__init__.py | 0
> vmtb/bench/machines/device_interface.py | 23 +
> vmtb/bench/machines/host.py | 196 ++++++
> vmtb/bench/machines/machine_interface.py | 65 ++
> vmtb/bench/machines/physical/__init__.py | 0
> vmtb/bench/machines/physical/device.py | 240 +++++++
> vmtb/bench/machines/virtual/__init__.py | 0
> .../machines/virtual/backends/__init__.py | 0
> .../virtual/backends/backend_interface.py | 40 ++
> .../machines/virtual/backends/guestagent.py | 99 +++
> .../machines/virtual/backends/qmp_monitor.py | 161 +++++
> vmtb/bench/machines/virtual/vm.py | 604 ++++++++++++++++++
> vmtb/dev-requirements.txt | 5 +
> vmtb/pyproject.toml | 25 +
> vmtb/pytest.ini | 0
> vmtb/requirements.txt | 2 +
> vmtb/vmm_flows/__init__.py | 0
> vmtb/vmm_flows/conftest.py | 307 +++++++++
> .../resources/vgpu_profiles/Flex170.json | 113 ++++
> vmtb/vmm_flows/test_basic.py | 160 +++++
> vmtb/vmtb_config.json | 31 +
> 41 files changed, 3709 insertions(+)
> create mode 100644 vmtb/MANIFEST.in
> create mode 100644 vmtb/README.md
> create mode 100644 vmtb/bench/__init__.py
> create mode 100644 vmtb/bench/configurators/__init__.py
> create mode 100644 vmtb/bench/configurators/pci.py
> create mode 100644 vmtb/bench/configurators/vgpu_profile.py
> create mode 100644 vmtb/bench/configurators/vgpu_profile_config.py
> create mode 100644 vmtb/bench/configurators/vmtb_config.py
> create mode 100644 vmtb/bench/drivers/__init__.py
> create mode 100644 vmtb/bench/drivers/driver_interface.py
> create mode 100644 vmtb/bench/drivers/xe.py
> create mode 100644 vmtb/bench/exceptions.py
> create mode 100644 vmtb/bench/executors/__init__.py
> create mode 100644 vmtb/bench/executors/executor_interface.py
> create mode 100644 vmtb/bench/executors/gem_wsim.py
> create mode 100644 vmtb/bench/executors/igt.py
> create mode 100644 vmtb/bench/executors/shell.py
> create mode 100644 vmtb/bench/helpers/__init__.py
> create mode 100644 vmtb/bench/helpers/helpers.py
> create mode 100644 vmtb/bench/helpers/log.py
> create mode 100644 vmtb/bench/machines/__init__.py
> create mode 100644 vmtb/bench/machines/device_interface.py
> create mode 100644 vmtb/bench/machines/host.py
> create mode 100644 vmtb/bench/machines/machine_interface.py
> create mode 100644 vmtb/bench/machines/physical/__init__.py
> create mode 100644 vmtb/bench/machines/physical/device.py
> create mode 100644 vmtb/bench/machines/virtual/__init__.py
> create mode 100644 vmtb/bench/machines/virtual/backends/__init__.py
> create mode 100644 vmtb/bench/machines/virtual/backends/backend_interface.py
> create mode 100644 vmtb/bench/machines/virtual/backends/guestagent.py
> create mode 100644 vmtb/bench/machines/virtual/backends/qmp_monitor.py
> create mode 100644 vmtb/bench/machines/virtual/vm.py
> create mode 100644 vmtb/dev-requirements.txt
> create mode 100644 vmtb/pyproject.toml
> create mode 100644 vmtb/pytest.ini
> create mode 100644 vmtb/requirements.txt
> create mode 100644 vmtb/vmm_flows/__init__.py
> create mode 100644 vmtb/vmm_flows/conftest.py
> create mode 100644 vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json
> create mode 100644 vmtb/vmm_flows/test_basic.py
> create mode 100644 vmtb/vmtb_config.json
>
> diff --git a/vmtb/MANIFEST.in b/vmtb/MANIFEST.in
> new file mode 100644
> index 000000000..7674c199d
> --- /dev/null
> +++ b/vmtb/MANIFEST.in
> @@ -0,0 +1,3 @@
> +include pytest.ini
> +include vmtb_config.json
> +include vmm_flows/resources/vgpu_profiles/*
> diff --git a/vmtb/README.md b/vmtb/README.md
> new file mode 100644
> index 000000000..49b034d12
> --- /dev/null
> +++ b/vmtb/README.md
> @@ -0,0 +1,86 @@
> +VM Test Bench
> +=============
> +
> +Description
> +-----------
> +VM Test Bench (VMTB) is a tool for testing virtualization (SR-IOV)
> +supported by the xe driver.
> +It allows to enable and provision VFs (Virtual Functions) and facilitates
> +manipulation of VMs (Virtual Machines) running virtual GPUs.
> +This includes starting and accessing the KVM/QEMU VMs,
> +running workloads or shell commands (Guest/Host),
> +handling power states, saving and restoring VF state etc.
> +
> +Requirements
> +------------
> +VMTB is implemented in Python using pytest testing framework.
> +
> +Host OS is expected to provide:
> +- xe PF driver with SR-IOV support
> +- VFIO driver (VF save/restore requires vendor specific driver variant)
> +- QEMU (VF save/restore requires QEMU 8.1+)
> +- IGT binaries
> +- Python 3.11+ with pytest installed
> +- VM Test Bench tool deployed
> +
> +Guest OS is expected to contain:
> +- xe VF driver
> +- QEMU Guest-Agent service for operating on Guest OS
> +- IGT binaries to execute worklads on VM
> +
> +Usual VMTB testing environment bases on Ubuntu 24.04 installed
> +on Host and Guest, but execution on other distros should be also possible.
> +
> +Building
> +--------
> +The VMTB source distribution package can be built with:
> +
> + python -m build --sdist
> +
> +that runs Python's `build` frontend
> +in an isolated virtual environment (`venv`).
> +
> +The output tarball is created in the `dist/` subdirectory,
> +that should be copied and extracted on a host device under test.
> +
> +Running tests
> +-------------
> +Test implemented by VM Test Bench are called VMM Flows and located in
> +`vmm_flows/` directory. Test files are prefixed with `test_` and encapsulate
> +related validation scenarios. Each test file can contain multiple test classes
> +(`TestXYZ`) or functions (`test_xyz`), that can be executed independently.
> +
> +Run the VMM Flows test in the following way (as root):
> +
> + $ pytest-3 -v ./vmtb-1.0.0/vmm_flows/<test_file_name>.py::<test_class_or_function_name> --vm-image=/path/to/<guest_os.img>
> +
> +For example, the simplest 1xVF/VM test scenario can be executed as:
> +
> + # sudo pytest-3 -v ./vmtb-1.0.0/vmm_flows/test_basic.py::TestVmSetup::test_vm_boot[2VF] --vm-image=/home/vmuser/guest_os.img
> +
> +(in case `pytest-3` command cannot be found, check with just `pytest`)
> +
> +Name of test class/function can be omitted to execute all tests in file.
> +File name can also be omitted, then all tests in
> +`vmm_flows` directory will be executed.
> +
> +Test log (including VM dmesg) is available in `logfile.log` output file.
> +Test results are presented as a standard pytest output on a terminal.
> +VM (Guest OS) can be accessed manually over VNC on [host_IP]:5900
> +(where port is incremented for the consecutive VMs).
> +
> +Structure
> +---------
> +VMTB is divided into the following components:
> +
> +#### `bench/`
> +Contains 'core' part of the tool, including Host, Device, Driver and
> +Virtual Machine abstractions, means to execute workloads (or other tasks),
> +various helper and configuration functions etc.
> +VMTB utilizes QMP (QEMU Machine Protocol) to communicate and operate with VMs
> +and QGA (QEMU Guest Agent) to interact with the Guest OS.
> +
> +#### `vmm_flows/`
> +Contains actual functional VM-level tests (`test_*.py`)
> +as well as a setup and tear-down fixtures (`conftest.py`).
> +New test files/scenarios shall be placed in this location.
> diff --git a/vmtb/bench/__init__.py b/vmtb/bench/__init__.py
> new file mode 100644
> index 000000000..ed5d7527d
> --- /dev/null
> +++ b/vmtb/bench/__init__.py
> @@ -0,0 +1,43 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +import logging.config
> +
> +LOG_CONFIG = {
> + "version": 1,
> + "formatters": {
> + "detailed": {
> + "format": "%(asctime)s [%(levelname)s]: %(name)s (%(funcName)s:%(lineno)d) - %(message)s"
> + },
> + "simple": {"format": "%(levelname)s - %(message)s"},
> + },
> + "handlers": {
> + "console": {
> + "class": "logging.StreamHandler",
> + "formatter": "detailed",
> + "level": "WARNING",
> + "stream": "ext://sys.stdout",
> + },
> + "file": {
> + "backupCount": 5,
> + "class": "logging.handlers.RotatingFileHandler",
> + "filename": "logfile.log",
> + "formatter": "detailed",
> + "maxBytes": 5242880,
> + },
> + },
> + "root": {
> + "handlers": ["console", "file"],
> + "level": "DEBUG"
> + }
> +}
> +
> +logging.config.dictConfig(LOG_CONFIG)
> +
> +logger = logging.getLogger('VmtbInit')
> +
> +logger.info('###########################################')
> +logger.info('# VM Test Bench #')
> +logger.info('# SR-IOV VM-level validation suite #')
> +logger.info('###########################################')
> diff --git a/vmtb/bench/configurators/__init__.py b/vmtb/bench/configurators/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/configurators/pci.py b/vmtb/bench/configurators/pci.py
> new file mode 100644
> index 000000000..8e8afb138
> --- /dev/null
> +++ b/vmtb/bench/configurators/pci.py
> @@ -0,0 +1,48 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import enum
> +import typing
> +
> +
> +class GpuModel(str, enum.Enum):
> + ATSM150 = 'Arctic Sound M150 (ATS-M1)'
> + ATSM75 = 'Arctic Sound M75 (ATS-M3)'
> + Unknown = 'Unknown'
> +
> + def __str__(self) -> str:
> + return str.__str__(self)
> +
> +
> +def get_gpu_model(pci_id: str) -> GpuModel:
> + """Return GPU model associated with a given PCI Device ID."""
> + return pci_ids.get(pci_id.upper(), GpuModel.Unknown)
> +
> +
> +def get_vgpu_profiles_file(gpu_model: GpuModel) -> str:
> + """Return vGPU profile definition JSON file for a given GPU model."""
> + if gpu_model == GpuModel.ATSM150:
> + vgpu_device_file = 'Flex170.json'
> + elif gpu_model == GpuModel.ATSM75:
> + vgpu_device_file = 'Flex140.json'
> + else: # GpuModel.Unknown
> + vgpu_device_file = 'N/A'
> +
> + return vgpu_device_file
> +
> +
> +# PCI Device IDs: ATS-M150 (M1)
> +_atsm150_pci_ids = {
> + '56C0': GpuModel.ATSM150,
> + '56C2': GpuModel.ATSM150
> +}
> +
> +
> +# PCI Device IDs: ATS-M75 (M3)
> +_atsm75_pci_ids = {
> + '56C1': GpuModel.ATSM75
> +}
> +
> +
> +# All PCI Device IDs to GPU Device Names mapping
> +pci_ids: typing.Dict[str, GpuModel] = {**_atsm150_pci_ids, **_atsm75_pci_ids}
> diff --git a/vmtb/bench/configurators/vgpu_profile.py b/vmtb/bench/configurators/vgpu_profile.py
> new file mode 100644
> index 000000000..c4fa7ef39
> --- /dev/null
> +++ b/vmtb/bench/configurators/vgpu_profile.py
> @@ -0,0 +1,264 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import json
> +import logging
> +from dataclasses import dataclass, field
> +from pathlib import Path
> +from typing import Any, Dict, List
> +
> +from bench import exceptions
> +
> +logger = logging.getLogger('VgpuProfile')
> +
> +
> + at dataclass
> +class VgpuResourcesConfig:
> + pfLmem: int = 0
> + pfContexts: int = 0
> + pfDoorbells: int = 0
> + pfGgtt: int = 0
> + vfLmem: int = 0
> + vfContexts: int = 0
> + vfDoorbells: int = 0
> + vfGgtt: int = 0
> +
> +
> + at dataclass
> +class VgpuSchedulerConfig:
> + scheduleIfIdle: bool = False
> + pfExecutionQuanta: int = 0
> + pfPreemptionTimeout: int = 0
> + vfExecutionQuanta: int = 0
> + vfPreemptionTimeout: int = 0
> +
> +
> + at dataclass
> +class VgpuSecurityConfig:
> + reset_after_vf_switch: bool = False
> + guc_sampling_period: int = 0
> + guc_threshold_cat_error: int = 0
> + guc_threshold_page_fault: int = 0
> + guc_threshold_h2g_storm: int = 0
> + guc_threshold_db_storm: int = 0
> + guc_treshold_gt_irq_storm: int = 0
> + guc_threshold_engine_reset: int = 0
> +
> +
> + at dataclass
> +class VgpuProfile:
> + num_vfs: int = 0
> + scheduler: VgpuSchedulerConfig = field(default_factory=VgpuSchedulerConfig)
> + resources: VgpuResourcesConfig = field(default_factory=VgpuResourcesConfig)
> + security: VgpuSecurityConfig = field(default_factory=VgpuSecurityConfig)
> +
> + def print_parameters(self) -> None:
> + logger.info(
> + "\nvGPU Profile:\n"
> + " Num VFs = %s\n"
> + "\nResources:\n"
> + " PF:\n"
> + "\tLMEM = %s B\n"
> + "\tContexts = %s\n"
> + "\tDoorbells = %s\n"
> + "\tGGTT = %s B\n"
> + " VF:\n"
> + "\tLMEM = %s B\n"
> + "\tContexts = %s\n"
> + "\tDoorbells = %s\n"
> + "\tGGTT = %s B\n"
> + "\nScheduling:\n"
> + " Schedule If Idle = %s\n"
> + " PF:\n"
> + "\tExecution Quanta = %s ms\n"
> + "\tPreemption Timeout = %s us\n"
> + " VF:\n"
> + "\tExecution Quanta = %s ms\n"
> + "\tPreemption Timeout = %s us\n"
> + "\nSecurity:\n"
> + " Reset After Vf Switch = %s\n",
> + self.num_vfs,
> + self.resources.pfLmem, self.resources.pfContexts, self.resources.pfDoorbells, self.resources.pfGgtt,
> + self.resources.vfLmem, self.resources.vfContexts, self.resources.vfDoorbells, self.resources.vfGgtt,
> + self.scheduler.scheduleIfIdle,
> + self.scheduler.pfExecutionQuanta, self.scheduler.pfPreemptionTimeout,
> + self.scheduler.vfExecutionQuanta, self.scheduler.vfPreemptionTimeout,
> + self.security.reset_after_vf_switch
> + )
> +
> +
> +# Structures for mapping vGPU profiles definition from JSON files
> + at dataclass
> +class VgpuProfilePfResourcesDefinition:
> + profile_name: str
> + local_memory_ecc_off: int
> + local_memory_ecc_on: int
> + contexts: int
> + doorbells: int
> + ggtt_size: int
> +
> +
> + at dataclass
> +class VgpuProfileVfResourcesDefinition:
> + profile_name: str
> + vf_count: int
> + local_memory_ecc_off: int
> + local_memory_ecc_on: int
> + contexts: int
> + doorbells: int
> + ggtt_size: int
> +
> +
> + at dataclass
> +class VgpuProfileSchedulerDefinition:
> + profile_name: str = 'N/A'
> + schedule_if_idle: bool = False
> + pf_execution_quanta: int = 0
> + pf_preemption_timeout: int = 0
> + vf_execution_quanta: str = '' # To calculate based on number of VFs
> + vf_preemption_timeout: str = '' # To calculate based on number of VFs
> +
> +
> + at dataclass
> +class VgpuProfileSecurityDefinition(VgpuSecurityConfig):
> + profile_name: str = 'N/A'
> +
> +
> + at dataclass
> +class VgpuProfilesDefinitions:
> + pf_resource_default: str
> + pf_resources: List[VgpuProfilePfResourcesDefinition]
> + vf_resource_default: str
> + vf_resources: List[VgpuProfileVfResourcesDefinition]
> + scheduler_config_default: str
> + scheduler_configs: List[VgpuProfileSchedulerDefinition]
> + security_config_default: str
> + security_configs: List[VgpuProfileSecurityDefinition]
> +
> +
> +class VgpuProfilesJsonReader:
> + def __init__(self, vgpu_json_path: Path) -> None:
> + vgpu_profile_data = self.read_json_file(vgpu_json_path)
> + self.vgpu_profiles: VgpuProfilesDefinitions = self.parse_json_file(vgpu_profile_data)
> +
> + def read_json_file(self, vgpu_json_file: Path) -> Any:
> + if not Path(vgpu_json_file).exists():
> + logger.error("vGPU profile JSON file not found: %s", vgpu_json_file)
> + raise exceptions.VgpuProfileError(f'vGPU profile JSON file not found: {vgpu_json_file}')
> +
> + with open(vgpu_json_file, mode='r', encoding='utf-8') as json_file:
> + try:
> + vgpu_json = json.load(json_file)
> + except json.JSONDecodeError as exc:
> + logger.error("Invalid vGPU profile JSON format: %s", exc)
> + raise exceptions.VgpuProfileError('Invalid vGPU profile defintion JSON format')
> +
> + return vgpu_json
> +
> + def __parse_pf_resource_profiles(self, pf_profiles: Dict) -> List[VgpuProfilePfResourcesDefinition]:
> + pf_resources: List[VgpuProfilePfResourcesDefinition] = []
> +
> + for pf_profile_name in pf_profiles.keys():
> + lmem_ecc_off = pf_profiles[pf_profile_name]['LocalMemoryEccOff']
> + lmem_ecc_on = pf_profiles[pf_profile_name]['LocalMemoryEccOn']
> + contexts = pf_profiles[pf_profile_name]['Contexts']
> + doorbells = pf_profiles[pf_profile_name]['Doorbells']
> + ggtt_size = pf_profiles[pf_profile_name]['GGTTSize']
> +
> + current_pf_resource = VgpuProfilePfResourcesDefinition(pf_profile_name,
> + lmem_ecc_off,
> + lmem_ecc_on,
> + contexts,
> + doorbells,
> + ggtt_size)
> +
> + pf_resources.append(current_pf_resource)
> +
> + return pf_resources
> +
> + def __parse_vf_resource_profiles(self, vf_profiles: Dict) -> List[VgpuProfileVfResourcesDefinition]:
> + vf_resources: List[VgpuProfileVfResourcesDefinition] = []
> +
> + for vf_profile_name in vf_profiles.keys():
> + vf_count = vf_profiles[vf_profile_name]['VFCount']
> + lmem_ecc_off = vf_profiles[vf_profile_name]['LocalMemoryEccOff']
> + lmem_ecc_on = vf_profiles[vf_profile_name]['LocalMemoryEccOn']
> + contexts = vf_profiles[vf_profile_name]['Contexts']
> + doorbells = vf_profiles[vf_profile_name]['Doorbells']
> + ggtt_size = vf_profiles[vf_profile_name]['GGTTSize']
> +
> + current_vf_resource = VgpuProfileVfResourcesDefinition(vf_profile_name,
> + vf_count,
> + lmem_ecc_off,
> + lmem_ecc_on,
> + contexts,
> + doorbells,
> + ggtt_size)
> +
> + vf_resources.append(current_vf_resource)
> +
> + return vf_resources
> +
> + def __parse_scheduler_profiles(self, scheduler_profiles: Dict) -> List[VgpuProfileSchedulerDefinition]:
> + scheduler_configs: List[VgpuProfileSchedulerDefinition] = []
> +
> + for scheduler_profile_name in scheduler_profiles.keys():
> + schedule_if_idle = scheduler_profiles[scheduler_profile_name]['GPUTimeSlicing']['ScheduleIfIdle']
> + pf_eq = scheduler_profiles[scheduler_profile_name]['GPUTimeSlicing']['PFExecutionQuantum']
> + pf_pt = scheduler_profiles[scheduler_profile_name]['GPUTimeSlicing']['PFPreemptionTimeout']
> + vf_eq = scheduler_profiles[scheduler_profile_name]['GPUTimeSlicing']['VFAttributes']['VFExecutionQuantum']
> + vf_pt = scheduler_profiles[scheduler_profile_name]['GPUTimeSlicing']['VFAttributes']['VFPreemptionTimeout']
> +
> + current_scheduler = VgpuProfileSchedulerDefinition(scheduler_profile_name,
> + schedule_if_idle,
> + pf_eq, pf_pt,
> + vf_eq, vf_pt)
> +
> + scheduler_configs.append(current_scheduler)
> +
> + return scheduler_configs
> +
> + def __parse_security_profiles(self, security_profiles: Dict) -> List[VgpuProfileSecurityDefinition]:
> + security_configs: List[VgpuProfileSecurityDefinition] = []
> +
> + for security_profile_name in security_profiles.keys():
> + reset_after_vf_switch = security_profiles[security_profile_name]['ResetAfterVfSwitch']
> + guc_sampling_period = security_profiles[security_profile_name]['GuCSamplingPeriod']
> + guc_threshold_cat_error = security_profiles[security_profile_name]['GuCThresholdCATError']
> + guc_threshold_page_fault = security_profiles[security_profile_name]['GuCThresholdPageFault']
> + guc_threshold_h2g_storm = security_profiles[security_profile_name]['GuCThresholdH2GStorm']
> + guc_threshold_db_storm = security_profiles[security_profile_name]['GuCThresholdDbStorm']
> + guc_treshold_gt_irq_storm = security_profiles[security_profile_name]['GuCThresholdGTIrqStorm']
> + guc_threshold_engine_reset = security_profiles[security_profile_name]['GuCThresholdEngineReset']
> +
> + # VgpuSecurityConfig (base class) params go first, therefore profile name
> + # is the last param on the VgpuProfileSecurityDefinition initialization list in this case
> + current_security_config = VgpuProfileSecurityDefinition(reset_after_vf_switch,
> + guc_sampling_period,
> + guc_threshold_cat_error,
> + guc_threshold_page_fault,
> + guc_threshold_h2g_storm,
> + guc_threshold_db_storm,
> + guc_treshold_gt_irq_storm,
> + guc_threshold_engine_reset,
> + security_profile_name)
> +
> + security_configs.append(current_security_config)
> +
> + return security_configs
> +
> + def parse_json_file(self, vgpu_json: Dict) -> VgpuProfilesDefinitions:
> + pf_resource_default = vgpu_json['PFResources']['Default']
> + pf_resources = self.__parse_pf_resource_profiles(vgpu_json['PFResources']['Profile'])
> +
> + vf_resource_default = vgpu_json['vGPUResources']['Default']
> + vf_resources = self.__parse_vf_resource_profiles(vgpu_json['vGPUResources']['Profile'])
> +
> + scheduler_default = vgpu_json['vGPUScheduler']['Default']
> + scheduler_configs = self.__parse_scheduler_profiles(vgpu_json['vGPUScheduler']['Profile'])
> +
> + security_default = vgpu_json['vGPUSecurity']['Default']
> + security_configs = self.__parse_security_profiles(vgpu_json['vGPUSecurity']['Profile'])
> +
> + return VgpuProfilesDefinitions(pf_resource_default, pf_resources, vf_resource_default, vf_resources,
> + scheduler_default, scheduler_configs, security_default, security_configs)
> diff --git a/vmtb/bench/configurators/vgpu_profile_config.py b/vmtb/bench/configurators/vgpu_profile_config.py
> new file mode 100644
> index 000000000..6a4ef0334
> --- /dev/null
> +++ b/vmtb/bench/configurators/vgpu_profile_config.py
> @@ -0,0 +1,148 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +from enum import Enum
> +from pathlib import Path
> +
> +from bench import exceptions
> +from bench.configurators.pci import GpuModel, get_vgpu_profiles_file
> +from bench.configurators.vgpu_profile import (VgpuProfile,
> + VgpuProfilesDefinitions,
> + VgpuProfilesJsonReader,
> + VgpuResourcesConfig,
> + VgpuSchedulerConfig,
> + VgpuSecurityConfig)
> +
> +logger = logging.getLogger('DeviceConfigurator')
> +
> +
> +class VfSchedulingMode(str, Enum):
> + INFINITE = 'Infinite' # Infinite EQ/PT - HW default
> + DEFAULT_PROFILE = 'Default_Profile' # Default vGPU scheduler profile
> + FLEXIBLE_30FPS = 'Flexible_30fps_GPUTimeSlicing'
> + FIXED_30FPS = 'Fixed_30fps_GPUTimeSlicing'
> + FLEXIBLE_BURSTABLE_QOS = 'Flexible_BurstableQoS_GPUTimeSlicing'
> +
> + def __str__(self) -> str:
> + return str.__str__(self)
> +
> +
> +class VgpuProfileConfigurator:
> + def __init__(self, vgpu_profiles_dir: Path, gpu_model: GpuModel = GpuModel.Unknown) -> None:
> + self.gpu_model: GpuModel = gpu_model
> + self.vgpu_profiles_dir: Path = vgpu_profiles_dir
> + self.supported_vgpu_profiles: VgpuProfilesDefinitions = self.query_vgpu_profiles()
> +
> + def __helper_create_vgpu_json_path(self, vgpu_resource_dir: Path) -> Path:
> + vgpu_device_file = get_vgpu_profiles_file(self.gpu_model)
> + vgpu_json_file_path = vgpu_resource_dir / vgpu_device_file
> +
> + if not vgpu_json_file_path.exists():
> + logger.error("vGPU profiles JSON file not found in %s", vgpu_resource_dir)
> + raise exceptions.VgpuProfileError(f'vGPU profiles JSON file not found in {vgpu_resource_dir}')
> +
> + return vgpu_json_file_path
> +
> + def query_vgpu_profiles(self) -> VgpuProfilesDefinitions:
> + """Get all vGPU profiles supported for a given GPU device."""
> + json_reader = VgpuProfilesJsonReader(self.__helper_create_vgpu_json_path(self.vgpu_profiles_dir))
> + return json_reader.vgpu_profiles
> +
> + def select_vgpu_resources_profile(self, requested_num_vfs: int) -> VgpuResourcesConfig:
> + """Find vGPU profile matching requested number of VFs.
> + In case exact match cannot be found, try to fit similar profile with up to 2 more VFs, for example:
> + - if requested profile with 3 VFs is not available, return close config with 4 VFs.
> + - if requested profile with neither 9 VFs, nor with 10 or 11 VFs is available - throw 'not found' exeception.
> + """
> + vgpu_resources_config = VgpuResourcesConfig()
> +
> + for pf_resource in self.supported_vgpu_profiles.pf_resources:
> + if pf_resource.profile_name == self.supported_vgpu_profiles.pf_resource_default:
> + vgpu_resources_config.pfLmem = pf_resource.local_memory_ecc_on
> + vgpu_resources_config.pfContexts = pf_resource.contexts
> + vgpu_resources_config.pfDoorbells = pf_resource.doorbells
> + vgpu_resources_config.pfGgtt = pf_resource.ggtt_size
> +
> + is_vf_resource_found = False
> + for vf_resource in self.supported_vgpu_profiles.vf_resources:
> + current_num_vfs = vf_resource.vf_count
> +
> + if current_num_vfs == requested_num_vfs:
> + is_vf_resource_found = True # Exact match
> + elif requested_num_vfs < current_num_vfs <= requested_num_vfs + 2:
> + logger.debug("Unable to find accurate vGPU profile but have similar: %s", vf_resource.profile_name)
> + is_vf_resource_found = True # Approximate match
> +
> + if is_vf_resource_found:
> + vgpu_resources_config.vfLmem = vf_resource.local_memory_ecc_on
> + vgpu_resources_config.vfContexts = vf_resource.contexts
> + vgpu_resources_config.vfDoorbells = vf_resource.doorbells
> + vgpu_resources_config.vfGgtt = vf_resource.ggtt_size
> + break
> +
> + if not is_vf_resource_found:
> + logger.error("vGPU VF resources profile %sxVF not found!", requested_num_vfs)
> + raise exceptions.VgpuProfileError(f'vGPU VF resources profile {requested_num_vfs}xVF not found!')
> +
> + return vgpu_resources_config
> +
> + def select_vgpu_scheduler_profile(self, requested_num_vfs: int,
> + requested_scheduler: VfSchedulingMode) -> VgpuSchedulerConfig:
> + # Function eval is needed to calculate VF EQ/PT for num_vfs
> + # Disable eval warning
> + # pylint: disable=W0123
> + vgpu_scheduler_config = VgpuSchedulerConfig()
> +
> + if requested_scheduler is VfSchedulingMode.INFINITE:
> + return vgpu_scheduler_config
> +
> + for scheduler in self.supported_vgpu_profiles.scheduler_configs:
> + if scheduler.profile_name == requested_scheduler:
> + vgpu_scheduler_config.scheduleIfIdle = scheduler.schedule_if_idle
> + vgpu_scheduler_config.pfExecutionQuanta = scheduler.pf_execution_quanta
> + vgpu_scheduler_config.pfPreemptionTimeout = scheduler.pf_preemption_timeout
> +
> + lambda_vf_eq = eval(scheduler.vf_execution_quanta)
> + lambda_vf_eq_result = lambda_vf_eq(requested_num_vfs)
> +
> + lambda_vf_pt = eval(scheduler.vf_preemption_timeout)
> + lambda_vf_pt_result = lambda_vf_pt(requested_num_vfs)
> +
> + vgpu_scheduler_config.vfExecutionQuanta = lambda_vf_eq_result
> + vgpu_scheduler_config.vfPreemptionTimeout = lambda_vf_pt_result
> +
> + return vgpu_scheduler_config
> +
> + def select_vgpu_security_profile(self) -> VgpuSecurityConfig:
> + # Currently supports only default security profile
> + vgpu_security_config = VgpuSecurityConfig()
> +
> + for security_profile in self.supported_vgpu_profiles.security_configs:
> + if security_profile.profile_name == self.supported_vgpu_profiles.security_config_default:
> + vgpu_security_config.reset_after_vf_switch = security_profile.reset_after_vf_switch
> + vgpu_security_config.guc_sampling_period = security_profile.guc_sampling_period
> + vgpu_security_config.guc_threshold_cat_error = security_profile.guc_threshold_cat_error
> + vgpu_security_config.guc_threshold_page_fault = security_profile.guc_threshold_page_fault
> + vgpu_security_config.guc_threshold_h2g_storm = security_profile.guc_threshold_h2g_storm
> + vgpu_security_config.guc_threshold_db_storm = security_profile.guc_threshold_db_storm
> + vgpu_security_config.guc_treshold_gt_irq_storm = security_profile.guc_treshold_gt_irq_storm
> + vgpu_security_config.guc_threshold_engine_reset = security_profile.guc_threshold_engine_reset
> +
> + return vgpu_security_config
> +
> + def get_vgpu_profile(self, requested_num_vfs: int, requested_scheduler: VfSchedulingMode) -> VgpuProfile:
> + """Get vGPU profile for requested number of VFs, scheduler and security modes."""
> + logger.info("Requested vGPU profile: %s VFs / scheduling: %s", requested_num_vfs, requested_scheduler)
> +
> + vgpu_profile: VgpuProfile = VgpuProfile()
> + vgpu_profile.num_vfs = requested_num_vfs
> + vgpu_profile.resources = self.select_vgpu_resources_profile(requested_num_vfs)
> +
> + if requested_scheduler is VfSchedulingMode.DEFAULT_PROFILE:
> + requested_scheduler = VfSchedulingMode(self.supported_vgpu_profiles.scheduler_config_default)
> +
> + vgpu_profile.scheduler = self.select_vgpu_scheduler_profile(requested_num_vfs, requested_scheduler)
> + vgpu_profile.security = self.select_vgpu_security_profile()
> +
> + return vgpu_profile
> diff --git a/vmtb/bench/configurators/vmtb_config.py b/vmtb/bench/configurators/vmtb_config.py
> new file mode 100644
> index 000000000..49dde4589
> --- /dev/null
> +++ b/vmtb/bench/configurators/vmtb_config.py
> @@ -0,0 +1,110 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import json
> +import logging
> +from dataclasses import dataclass
> +from pathlib import Path
> +from typing import Any, Dict
> +
> +from bench import exceptions
> +
> +logger = logging.getLogger('VmtbConfigurator')
> +
> +
> + at dataclass
> +class VmtbIgtConfig:
> + test_dir: str
> + tool_dir: str
> + lib_dir: str
> + result_dir: str
> + options: str
> +
> +
> + at dataclass
> +class VmtbHostConfig:
> + card_index: int
> + driver: str
> + igt_config: VmtbIgtConfig
> +
> +
> + at dataclass
> +class VmtbGuestConfig:
> + os_image_path: str
> + driver: str
> + igt_config: VmtbIgtConfig
> +
> +
> + at dataclass
> +class VmtbConfig:
> + host_config: VmtbHostConfig
> + guest_config: VmtbGuestConfig
> + vgpu_profiles_path: str
> + guc_ver_path: str
> + ci_host_dmesg_file: str
> +
> +
> +class VmtbConfigurator:
> + def __init__(self, vmtb_config_file_path: Path) -> None:
> + self.vmtb_config_file: Path = vmtb_config_file_path
> + self.config: VmtbConfig = self.query_vmtb_config()
> +
> + def query_vmtb_config(self) -> VmtbConfig:
> + json_reader = VmtbConfigJsonReader(self.vmtb_config_file)
> + return json_reader.vmtb_config
> +
> + def get_host_config(self) -> VmtbHostConfig:
> + return self.config.host_config
> +
> + def get_guest_config(self) -> VmtbGuestConfig:
> + return self.config.guest_config
> +
> +
class VmtbConfigJsonReader:
    """Read the VMTB JSON configuration file and convert it into a VmtbConfig."""

    def __init__(self, config_json_path: Path) -> None:
        config_data = self.read_json_file(config_json_path)
        self.vmtb_config: VmtbConfig = self.parse_json_file(config_data)

    def read_json_file(self, config_json_file: Path) -> Any:
        """Return the decoded JSON content of config_json_file.

        Raises:
            exceptions.VmtbConfigError: the file does not exist or is not valid JSON.
        """
        if not config_json_file.exists():
            logger.error("VMTB config JSON file not found: %s", config_json_file)
            raise exceptions.VmtbConfigError(f'VMTB config JSON file not found: {config_json_file}')

        with open(config_json_file, mode='r', encoding='utf-8') as json_file:
            try:
                return json.load(json_file)
            except json.JSONDecodeError as exc:
                logger.error("Invalid VMTB config JSON format: %s", exc)
                # Chain the decoder error so the original traceback is not lost.
                raise exceptions.VmtbConfigError(f'Invalid VMTB config JSON format: {exc}') from exc

    def get_igt_config(self, igt_config_json: Dict) -> VmtbIgtConfig:
        """Build a VmtbIgtConfig from the 'igt' object of a host or guest section."""
        igt_json = igt_config_json['igt']
        return VmtbIgtConfig(
            test_dir=igt_json['test_dir'],
            tool_dir=igt_json['tool_dir'],
            lib_dir=igt_json['lib_dir'],
            result_dir=igt_json['result_dir'],
            options=igt_json['options'])

    def parse_json_file(self, config_json: Dict) -> VmtbConfig:
        """Map the decoded JSON document onto the VmtbConfig dataclasses.

        A missing key propagates as KeyError.
        """
        host_json = config_json['host']
        vmtb_host_config = VmtbHostConfig(
            card_index=host_json['card_index'],
            driver=host_json['driver'],
            igt_config=self.get_igt_config(host_json))

        guest_json = config_json['guest']
        vmtb_guest_config = VmtbGuestConfig(
            os_image_path=guest_json['os_image'],
            driver=guest_json['driver'],
            igt_config=self.get_igt_config(guest_json))

        return VmtbConfig(
            host_config=vmtb_host_config,
            guest_config=vmtb_guest_config,
            vgpu_profiles_path=config_json['resources']['vgpu_profiles_path'],
            guc_ver_path=config_json['resources']['guc_ver_path'],
            ci_host_dmesg_file=config_json['ci']['host_dmesg_file'])
> diff --git a/vmtb/bench/drivers/__init__.py b/vmtb/bench/drivers/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/drivers/driver_interface.py b/vmtb/bench/drivers/driver_interface.py
> new file mode 100644
> index 000000000..af2f96837
> --- /dev/null
> +++ b/vmtb/bench/drivers/driver_interface.py
> @@ -0,0 +1,198 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import abc
> +import enum
> +import typing
> +
> +
class SchedulingPriority(enum.Enum):
    """Scheduling priority levels used by the PF sched_priority accessors."""
    LOW = 0
    NORMAL = 1
    HIGH = 2
> +
> +
class VfControl(str, enum.Enum):
    """Commands accepted by set_vf_control(); each member is also a plain string."""
    pause = 'pause'
    resume = 'resume'
    stop = 'stop'
    clear = 'clear'

    def __str__(self) -> str:
        # Render the bare command word instead of the default 'VfControl.<name>'.
        return self.value
> +
> +
class DriverInterface(abc.ABC):
    """Abstract interface of a GPU driver backend used for SR-IOV provisioning.

    Every method is abstract; a concrete driver class must implement all of them.
    In the provisioning accessors gt_num selects a GT instance and vf_num selects
    a function (the xe implementation in this patch treats vf_num 0 as the PF
    and 1..N as VFs).
    """

    @staticmethod
    @abc.abstractmethod
    def get_name() -> str:
        """Return the driver name."""
        raise NotImplementedError

    # Binding a device (given by its BDF string) to / from the driver.
    @abc.abstractmethod
    def bind(self, bdf: str) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def unbind(self, bdf: str) -> None:
        raise NotImplementedError

    # SR-IOV enabling attributes.
    @abc.abstractmethod
    def get_totalvfs(self) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def get_numvfs(self) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_numvfs(self, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_drivers_autoprobe(self) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_drivers_autoprobe(self, val: int) -> None:
        raise NotImplementedError

    # Device capabilities and global controls.
    @abc.abstractmethod
    def get_num_gts(self) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def has_lmem(self) -> bool:
        raise NotImplementedError

    @abc.abstractmethod
    def get_auto_provisioning(self) -> bool:
        raise NotImplementedError

    @abc.abstractmethod
    def set_auto_provisioning(self, val: bool) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def cancel_work(self) -> None:
        raise NotImplementedError

    # PF spare resources, per GT.
    @abc.abstractmethod
    def get_pf_ggtt_spare(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_ggtt_spare(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_lmem_spare(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_lmem_spare(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_contexts_spare(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_contexts_spare(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_doorbells_spare(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_doorbells_spare(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    # PF scheduling priority and policies, per GT.
    @abc.abstractmethod
    def get_pf_sched_priority(self, gt_num: int) -> SchedulingPriority:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_sched_priority(self, gt_num: int, val: SchedulingPriority) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_policy_reset_engine(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_policy_reset_engine(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_policy_sample_period_ms(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_policy_sample_period_ms(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_pf_policy_sched_if_idle(self, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_pf_policy_sched_if_idle(self, gt_num: int, val: int) -> None:
        raise NotImplementedError

    # Resource quotas, per function (vf_num) and GT.
    @abc.abstractmethod
    def get_ggtt_quota(self, vf_num: int, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_ggtt_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_lmem_quota(self, vf_num: int, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_lmem_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_contexts_quota(self, vf_num: int, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_contexts_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_doorbells_quota(self, vf_num: int, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_doorbells_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        raise NotImplementedError

    # Scheduling parameters, per function (vf_num) and GT.
    @abc.abstractmethod
    def get_exec_quantum_ms(self, vf_num: int, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_exec_quantum_ms(self, vf_num: int, gt_num: int, val: int) -> None:
        raise NotImplementedError

    @abc.abstractmethod
    def get_preempt_timeout_us(self, vf_num: int, gt_num: int) -> int:
        raise NotImplementedError

    @abc.abstractmethod
    def set_preempt_timeout_us(self, vf_num: int, gt_num: int, val: int) -> None:
        raise NotImplementedError

    # VF state control (pause / resume / stop / clear, see VfControl).
    @abc.abstractmethod
    def set_vf_control(self, vf_num: int, val: VfControl) -> None:
        raise NotImplementedError

    # Resource availability; the xe implementation returns (total, available).
    @abc.abstractmethod
    def get_ggtt_available(self, gt_num: int) -> typing.Tuple[int, int]:
        raise NotImplementedError
> diff --git a/vmtb/bench/drivers/xe.py b/vmtb/bench/drivers/xe.py
> new file mode 100644
> index 000000000..009cec5be
> --- /dev/null
> +++ b/vmtb/bench/drivers/xe.py
> @@ -0,0 +1,307 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +import typing
> +from pathlib import Path
> +
> +from bench import exceptions
> +from bench.drivers.driver_interface import (DriverInterface,
> + SchedulingPriority, VfControl)
> +from bench.helpers.log import LogDecorators
> +
> +logger = logging.getLogger('XeDriver')
> +
> +
class XeDriver(DriverInterface):
    """DriverInterface implementation for the xe driver.

    Attributes are read from and written to files below
    /sys/class/drm/card<N>/device (sysfs) and /sys/kernel/debug/dri/<N> (debugfs).
    """

    def __init__(self, card_index: int) -> None:
        self.sysfs_card_path = Path(f'/sys/class/drm/card{card_index}')
        self.debugfs_path = Path(f'/sys/kernel/debug/dri/{card_index}')

    @staticmethod
    def get_name() -> str:
        return 'xe'

    @LogDecorators.parse_kmsg
    def __write_fs(self, base_path: Path, name: str, value: str) -> None:
        """Write value to base_path/name; any failure is re-raised as HostError."""
        path = base_path / name
        try:
            path.write_text(value)
            logger.debug("Write: %s -> %s", value, path)
        except Exception as exc:
            logger.error("Unable to write %s -> %s", value, path)
            raise exceptions.HostError(f'Could not write to {path}. Error: {exc}') from exc

    @LogDecorators.parse_kmsg
    def __read_fs(self, base_path: Path, name: str) -> str:
        """Return the raw content of base_path/name; any failure is re-raised as HostError."""
        path = base_path / name
        try:
            ret = path.read_text()
        except Exception as exc:
            logger.error("Unable to read %s", path)
            raise exceptions.HostError(f'Could not read from {path}. Error: {exc}') from exc

        logger.debug("Read: %s -> %s", path, ret.strip())
        return ret

    def __write_sysfs(self, name: str, value: str) -> None:
        self.__write_fs(self.sysfs_card_path / 'device', name, value)

    def __read_sysfs(self, name: str) -> str:
        return str(self.__read_fs(self.sysfs_card_path / 'device', name))

    def __write_debugfs(self, name: str, value: str) -> None:
        self.__write_fs(self.debugfs_path, name, value)

    def __read_debugfs(self, name: str) -> str:
        return str(self.__read_fs(self.debugfs_path, name))

    def bind(self, bdf: str) -> None:
        self.__write_sysfs('driver/bind', bdf)

    def unbind(self, bdf: str) -> None:
        self.__write_sysfs('driver/unbind', bdf)

    def get_totalvfs(self) -> int:
        return int(self.__read_sysfs('sriov_totalvfs'))

    def get_numvfs(self) -> int:
        return int(self.__read_sysfs('sriov_numvfs'))

    def set_numvfs(self, val: int) -> None:
        self.__write_sysfs('sriov_numvfs', str(val))

    def get_drivers_autoprobe(self) -> int:
        return int(self.__read_sysfs('sriov_drivers_autoprobe'))

    def set_drivers_autoprobe(self, val: int) -> None:
        self.__write_sysfs('sriov_drivers_autoprobe', str(val))

    def get_num_gts(self) -> int:
        """Return the number of GT directories found under tile0.

        A single 'gt' directory counts as one GT; otherwise consecutive
        'gt0', 'gt1', ... directories are counted.
        """
        gt_num = 0
        # FIXME: tile0 only at the moment, add support for multiple tiles if needed
        path = self.sysfs_card_path / 'device' / 'tile0' / 'gt'

        if path.exists():
            gt_num = 1
        else:
            while Path(f'{path}{gt_num}').exists():
                gt_num += 1

        return gt_num

    def has_lmem(self) -> bool:
        # XXX: is this the best way to check if LMEM is present?
        path = self.debugfs_path / 'gt0' / 'pf' / 'lmem_spare'
        return path.exists()

    def get_auto_provisioning(self) -> bool:
        raise exceptions.NotAvailableError('auto_provisioning attribute not available')

    def set_auto_provisioning(self, val: bool) -> None:
        raise exceptions.NotAvailableError('auto_provisioning attribute not available')

    def cancel_work(self) -> None:
        # Function to cancel all remaining work on GPU (for test cleanup).
        # Forcing reset (debugfs/gtM/force_reset_sync) shouldn't be used to idle GPU.
        pass

    # Create debugfs path to given parameter (without a base part):
    # gt@gt_num/[pf|vf@vf_num]/@attr
    # @vf_num: VF number (1-based) or 0 for PF
    # @gt_num: GT instance number
    # @subdir: subdirectory for attribute or empty string if not exists
    # @attr: iov parameter name
    # Returns: iov debugfs path to @attr
    def __helper_create_debugfs_path(self, vf_num: int, gt_num: int, subdir: str, attr: str) -> str:
        vf_gt_part = f'gt{gt_num}/pf' if vf_num == 0 else f'gt{gt_num}/vf{vf_num}'
        # Drop an empty subdir so the path does not contain a '//' sequence.
        return '/'.join(part for part in (vf_gt_part, subdir, attr) if part)

    def __get_iov_attr(self, vf_num: int, gt_num: int, attr: str) -> int:
        """Read an integer iov attribute of the PF (vf_num 0) or of a VF."""
        path = self.__helper_create_debugfs_path(vf_num, gt_num, '', attr)
        return int(self.__read_debugfs(path))

    def __set_iov_attr(self, vf_num: int, gt_num: int, attr: str, val: int) -> None:
        """Write an integer iov attribute of the PF (vf_num 0) or of a VF."""
        path = self.__helper_create_debugfs_path(vf_num, gt_num, '', attr)
        self.__write_debugfs(path, str(val))

    # PF spare resources
    # Debugfs location: [SRIOV debugfs base path]/gtM/pf/xxx_spare
    def get_pf_ggtt_spare(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'ggtt_spare')

    def set_pf_ggtt_spare(self, gt_num: int, val: int) -> None:
        self.__set_iov_attr(0, gt_num, 'ggtt_spare', val)

    def get_pf_lmem_spare(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'lmem_spare')

    def set_pf_lmem_spare(self, gt_num: int, val: int) -> None:
        self.__set_iov_attr(0, gt_num, 'lmem_spare', val)

    def get_pf_contexts_spare(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'contexts_spare')

    def set_pf_contexts_spare(self, gt_num: int, val: int) -> None:
        self.__set_iov_attr(0, gt_num, 'contexts_spare', val)

    def get_pf_doorbells_spare(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'doorbells_spare')

    def set_pf_doorbells_spare(self, gt_num: int, val: int) -> None:
        self.__set_iov_attr(0, gt_num, 'doorbells_spare', val)

    # PF specific provisioning parameters
    # Debugfs location: [SRIOV debugfs base path]/gtM/pf
    def get_pf_sched_priority(self, gt_num: int) -> SchedulingPriority:
        logger.warning("PF sched_priority param not available")
        return SchedulingPriority.LOW

    def set_pf_sched_priority(self, gt_num: int, val: SchedulingPriority) -> None:
        logger.warning("PF sched_priority param not available")

    def get_pf_policy_reset_engine(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'reset_engine')

    def set_pf_policy_reset_engine(self, gt_num: int, val: int) -> None:
        self.__set_iov_attr(0, gt_num, 'reset_engine', val)

    def get_pf_policy_sample_period_ms(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'sample_period_ms')

    def set_pf_policy_sample_period_ms(self, gt_num: int, val: int) -> None:
        self.__set_iov_attr(0, gt_num, 'sample_period_ms', val)

    def get_pf_policy_sched_if_idle(self, gt_num: int) -> int:
        return self.__get_iov_attr(0, gt_num, 'sched_if_idle')

    def set_pf_policy_sched_if_idle(self, gt_num: int, val: int) -> None:
        # In order to set strict scheduling policy, PF scheduling priority needs to be default
        self.__set_iov_attr(0, gt_num, 'sched_if_idle', val)

    # VF and PF provisioning parameters
    # Debugfs location: [SRIOV debugfs base path]/gtM/[pf|vfN]
    # @vf_num: VF number (1-based) or 0 for PF
    # The *_quota attributes are VF-only: for vf_num 0 the getters log a warning
    # and return 0, the setters log a warning and do nothing.
    def get_ggtt_quota(self, vf_num: int, gt_num: int) -> int:
        if vf_num == 0:
            logger.warning("PF ggtt_quota not available")
            return 0

        return self.__get_iov_attr(vf_num, gt_num, 'ggtt_quota')

    def set_ggtt_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        if vf_num == 0:
            logger.warning("PF ggtt_quota not available")
            return

        self.__set_iov_attr(vf_num, gt_num, 'ggtt_quota', val)

    def get_lmem_quota(self, vf_num: int, gt_num: int) -> int:
        if vf_num == 0:
            logger.warning("PF lmem_quota not available")
            return 0

        # Without LMEM the attribute is not read and the quota is reported as 0.
        if not self.has_lmem():
            return 0
        return self.__get_iov_attr(vf_num, gt_num, 'lmem_quota')

    def set_lmem_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        if vf_num == 0:
            logger.warning("PF lmem_quota not available")
            return

        # Without LMEM the request is silently ignored.
        if self.has_lmem():
            self.__set_iov_attr(vf_num, gt_num, 'lmem_quota', val)

    def get_contexts_quota(self, vf_num: int, gt_num: int) -> int:
        if vf_num == 0:
            logger.warning("PF contexts_quota not available")
            return 0

        return self.__get_iov_attr(vf_num, gt_num, 'contexts_quota')

    def set_contexts_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        if vf_num == 0:
            logger.warning("PF contexts_quota not available")
            return

        self.__set_iov_attr(vf_num, gt_num, 'contexts_quota', val)

    def get_doorbells_quota(self, vf_num: int, gt_num: int) -> int:
        if vf_num == 0:
            logger.warning("PF doorbells_quota not available")
            return 0

        return self.__get_iov_attr(vf_num, gt_num, 'doorbells_quota')

    def set_doorbells_quota(self, vf_num: int, gt_num: int, val: int) -> None:
        if vf_num == 0:
            logger.warning("PF doorbells_quota not available")
            return

        self.__set_iov_attr(vf_num, gt_num, 'doorbells_quota', val)

    def get_exec_quantum_ms(self, vf_num: int, gt_num: int) -> int:
        return self.__get_iov_attr(vf_num, gt_num, 'exec_quantum_ms')

    def set_exec_quantum_ms(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__set_iov_attr(vf_num, gt_num, 'exec_quantum_ms', val)

    def get_preempt_timeout_us(self, vf_num: int, gt_num: int) -> int:
        return self.__get_iov_attr(vf_num, gt_num, 'preempt_timeout_us')

    def set_preempt_timeout_us(self, vf_num: int, gt_num: int, val: int) -> None:
        self.__set_iov_attr(vf_num, gt_num, 'preempt_timeout_us', val)

    # Control state of the running VF (WO)
    # Debugfs location: [SRIOV debugfs base path]/gtM/vfN/control
    # Allows PF admin to pause, resume or stop handling
    # submission requests from given VF and clear provisioning.
    # control: "pause|resume|stop|clear"
    # For debug purposes only.
    def set_vf_control(self, vf_num: int, val: VfControl) -> None:
        # The control file of GT0 is always used.
        path = self.__helper_create_debugfs_path(vf_num, 0, '', 'control')
        # Pass the plain command word, as the str-typed writer expects.
        self.__write_debugfs(path, str(val))

    # Read [attribute]_available value from debugfs:
    # /sys/kernel/debug/dri/[card_index]/gt@gt_num/pf/@attr_available
    # @gt_num: GT instance number
    # @attr: iov parameter name
    # Returns: total and available size for @attr
    def __helper_get_debugfs_available(self, gt_num: int, attr: str) -> typing.Tuple[int, int]:
        path = self.debugfs_path / f'gt{gt_num}' / 'pf' / f'{attr}_available'
        total = available = 0

        for line in path.read_text().splitlines():
            # Expected line shape: '<param>:<spaces><value>\t<rest>'. Split on the
            # first colon only and skip lines without one (e.g. blank lines).
            param, separator, value = line.partition(':')
            if not separator:
                continue
            param = param.strip()
            value = value.lstrip().split('\t')[0]

            if param == 'total':
                total = int(value)
            elif param == 'avail':
                available = int(value)

        return (total, available)

    # Resources total availability
    # Debugfs location: [SRIOV debugfs base path]/gtM/pf/
    def get_ggtt_available(self, gt_num: int) -> typing.Tuple[int, int]:
        """Get total and available GGTT size."""
        return self.__helper_get_debugfs_available(gt_num, 'ggtt')
> diff --git a/vmtb/bench/exceptions.py b/vmtb/bench/exceptions.py
> new file mode 100644
> index 000000000..95ca2aa9b
> --- /dev/null
> +++ b/vmtb/bench/exceptions.py
> @@ -0,0 +1,40 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
class BenchError(Exception):
    """Root of the bench exception hierarchy."""


# Host errors:
class HostError(BenchError):
    """Error reported for an operation on the host."""


# Guest errors:
class GuestError(BenchError):
    """Base class of the guest-related errors."""


class GuestAgentError(GuestError):
    """Guest error in the guest-agent category."""


class AlarmTimeoutError(GuestError):
    """Guest error in the alarm-timeout category."""


# Generic errors:
class GemWsimError(BenchError):
    """Error related to a gem_wsim workload."""


class VgpuProfileError(BenchError):
    """Error related to vGPU profiles."""


class NotAvailableError(BenchError):
    """A requested attribute or feature is not available."""


class VmtbConfigError(BenchError):
    """Error related to the VMTB configuration."""
> diff --git a/vmtb/bench/executors/__init__.py b/vmtb/bench/executors/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/executors/executor_interface.py b/vmtb/bench/executors/executor_interface.py
> new file mode 100644
> index 000000000..e1598fd29
> --- /dev/null
> +++ b/vmtb/bench/executors/executor_interface.py
> @@ -0,0 +1,22 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import abc
> +import signal
> +
> +from bench.machines.machine_interface import ProcessResult
> +
> +
class ExecutorInterface(abc.ABC):
    """Abstract interface of an executor controlling one started process."""

    @abc.abstractmethod
    def status(self) -> ProcessResult:
        """Return the current ProcessResult of the process."""
        raise NotImplementedError

    @abc.abstractmethod
    def wait(self) -> ProcessResult:
        """Wait for the process and return its ProcessResult."""
        raise NotImplementedError

    @abc.abstractmethod
    def sendsig(self, sig: signal.Signals) -> None:
        """Send signal sig to the process."""
        raise NotImplementedError
> diff --git a/vmtb/bench/executors/gem_wsim.py b/vmtb/bench/executors/gem_wsim.py
> new file mode 100644
> index 000000000..46fa2291c
> --- /dev/null
> +++ b/vmtb/bench/executors/gem_wsim.py
> @@ -0,0 +1,70 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +import re
> +import typing
> +
> +from bench import exceptions
> +from bench.executors.shell import ShellExecutor
> +from bench.machines.machine_interface import DEFAULT_TIMEOUT, MachineInterface
> +
> +logger = logging.getLogger('GemWsim')
> +
> +
class GemWsimResult(typing.NamedTuple):
    """Figures parsed from the gem_wsim summary line."""
    elapsed_sec: float  # number in front of 's elapsed'
    workloads_per_sec: float  # number in front of 'workloads/s'
> +
# Basic workloads
# One cycle consists of two steps, each taking half of the cycle duration
# (the duration field of a step is expressed in microseconds).
ONE_CYCLE_DURATION_MS = 10
PREEMPT_10MS_WORKLOAD = (f'1.DEFAULT.{ONE_CYCLE_DURATION_MS * 1000 // 2}.0.0'
                         f',2.DEFAULT.{ONE_CYCLE_DURATION_MS * 1000 // 2}.-1.1')
# Same two steps, prefixed with the 'X.1.0' and 'X.2.0' directives.
NON_PREEMPT_10MS_WORKLOAD = f'X.1.0,X.2.0,{PREEMPT_10MS_WORKLOAD}'
> +
class GemWsim(ShellExecutor):
    """Run the gem_wsim benchmark on a machine and parse its summary line."""

    def __init__(self, machine: MachineInterface, num_clients: int = 1, num_repeats: int = 1,
                 workload: str = PREEMPT_10MS_WORKLOAD, timeout: int = DEFAULT_TIMEOUT) -> None:
        command = f'/usr/local/libexec/igt-gpu-tools/benchmarks/gem_wsim -w {workload} -c {num_clients} -r {num_repeats}'
        super().__init__(machine, command, timeout)
        self.machine_id = str(machine)

    def __str__(self) -> str:
        return f'gem_wsim({self.machine_id}:{self.pid})'

    def is_running(self) -> bool:
        """Return True while the process has not exited."""
        return not self.status().exited

    def wait_results(self) -> GemWsimResult:
        """Wait for gem_wsim to finish and return the parsed elapsed time and throughput.

        Raises:
            exceptions.GemWsimError: non-zero exit code, or no summary line in stdout.
        """
        outcome = self.wait()
        summary = None
        if outcome.exit_code == 0:
            logger.info('%s: %s', self, outcome.stdout)
            # Try parse output ex.: 19.449s elapsed (102.836 workloads/s)
            pattern = r'(?P<elapsed>\d+(\.\d*)?|\.\d+)s elapsed \((?P<wps>\d+(\.\d*)?|\.\d+) workloads/s\)'
            summary = re.search(pattern, outcome.stdout, re.MULTILINE)

        if not summary:
            raise exceptions.GemWsimError(f'{self}: exit_code: {outcome.exit_code}'
                                          f' stdout: {outcome.stdout} stderr: {outcome.stderr}')
        return GemWsimResult(float(summary.group('elapsed')), float(summary.group('wps')))
> +
> +
def gem_wsim_parallel_exec_and_check(vms: typing.List[MachineInterface], workload: str, iterations: int,
                                     expected: typing.Optional[GemWsimResult] = None) -> GemWsimResult:
    """Run the same gem_wsim workload on all VMs at once and compare the results.

    Asserts that every instance started, that the first result is above 90% of
    expected (when given), and that all other results are within 10% of the
    first one. Returns the first result.
    """
    # launch on each VM in parallel
    runners = [GemWsim(vm, 1, iterations, workload) for vm in vms]
    for vm_index, runner in enumerate(runners):
        assert runner.is_running(), f'GemWsim failed to start on VM{vm_index}'

    results = [runner.wait_results() for runner in runners]
    if expected is not None:
        assert results[0].elapsed_sec > expected.elapsed_sec * 0.9
        assert results[0].workloads_per_sec > expected.workloads_per_sec * 0.9

    for other in results[1:]:
        # check wps ratio ~1.0 with 10% tolerance
        wps_ratio = other.workloads_per_sec / results[0].workloads_per_sec
        assert 0.9 < wps_ratio < 1.1
        # check elapsed ratio ~1.0 with 10% tolerance
        elapsed_ratio = other.elapsed_sec / results[0].elapsed_sec
        assert 0.9 < elapsed_ratio < 1.1

    # return first result, all other are asserted to be ~same
    return results[0]
> diff --git a/vmtb/bench/executors/igt.py b/vmtb/bench/executors/igt.py
> new file mode 100644
> index 000000000..4296464c2
> --- /dev/null
> +++ b/vmtb/bench/executors/igt.py
> @@ -0,0 +1,117 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import enum
> +import json
> +import logging
> +import posixpath
> +import signal
> +import typing
> +
> +from bench.executors.executor_interface import ExecutorInterface
> +from bench.executors.shell import ShellExecutor
> +from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
> + MachineInterface, ProcessResult)
> +
> +logger = logging.getLogger('IgtExecutor')
> +
> +
> +class IgtType(enum.Enum):
> + EXEC_BASIC = 1
> + EXEC_STORE = 2
> + SPIN_BATCH = 3
> +
> +
> +# Mappings of driver specific (i915/xe) IGT instances:
> +# {IGT type: (i915 IGT name, xe IGT name)}
> +igt_tests: typing.Dict[IgtType, typing.Tuple[str, str]] = {
> + IgtType.EXEC_BASIC: ('igt at gem_exec_basic@basic', 'igt at xe_exec_basic@once-basic'),
> + IgtType.EXEC_STORE: ('igt at gem_exec_store@dword', 'igt at xe_exec_store@basic-store'),
> + IgtType.SPIN_BATCH: ('igt at gem_spin_batch@legacy', 'igt at xe_spin_batch@spin-basic')
> + }
> +
> +
class IgtExecutor(ExecutorInterface):
    """Start a single IGT test through igt_runner on a machine and evaluate its results."""

    def __init__(self, target: MachineInterface,
                 test: typing.Union[str, IgtType],
                 timeout: int = DEFAULT_TIMEOUT) -> None:
        """Write the test list to the target and start igt_runner there.

        test is either a ready test name or an IgtType, which is resolved to the
        variant matching the target's DRM driver.
        """
        self.igt_config = target.get_igt_config()

        # TODO: igt_config.lib_dir (LD_LIBRARY_PATH) is not used now,
        # need a way to pass this to guest
        runner = posixpath.join(self.igt_config.tool_dir, 'igt_runner')
        testlist = '/tmp/igt_executor.testlist'
        command = f'{runner} {self.igt_config.options} ' \
                  f'--test-list {testlist} {self.igt_config.test_dir} {self.igt_config.result_dir}'
        self.results: typing.Dict[str, typing.Any] = {}
        self.target: MachineInterface = target
        self.igt: str = test if isinstance(test, str) else self.select_igt_variant(target.get_drm_driver_name(), test)
        self.target.write_file_content(testlist, self.igt)
        self.timeout: int = timeout

        logger.info("[%s] Execute IGT test: %s", target, self.igt)
        self.pid: int = self.target.execute(command)

    # Executor interface implementation
    def status(self) -> ProcessResult:
        return self.target.execute_status(self.pid)

    def wait(self) -> ProcessResult:
        return self.target.execute_wait(self.pid, self.timeout)

    def sendsig(self, sig: signal.Signals) -> None:
        self.target.execute_signal(self.pid, sig)

    def terminate(self) -> None:
        self.sendsig(signal.SIGTERM)

    def kill(self) -> None:
        self.sendsig(signal.SIGKILL)

    # IGT specific methods
    def get_results_log(self) -> typing.Dict:
        """Return the parsed results.json from the result directory (cached once non-empty)."""
        # Results are cached
        if self.results:
            return self.results
        path = posixpath.join(self.igt_config.result_dir, 'results.json')
        result = self.target.read_file_content(path)
        self.results = json.loads(result)
        return self.results

    def did_pass(self) -> bool:
        """Return True when results.json reports no failing test.

        Every 'totals.root' counter other than 'pass', 'warn' and 'dmesg-warn'
        counts as a failure. Missing or empty 'totals'/'root' yields False.
        """
        results = self.get_results_log()
        totals = results.get('totals')
        if not totals:
            return False
        aggregate = totals.get('root')
        if not aggregate:
            return False

        passing_keys = ('pass', 'warn', 'dmesg-warn')
        fail_case = sum(count for key, count in aggregate.items() if key not in passing_keys)

        logger.debug('Full IGT test results:\n%s', json.dumps(results, indent=4))

        # NOTE(review): a run in which all counters are zero is reported as passed.
        if fail_case > 0:
            logger.error('Test failed!')
            return False

        return True

    def select_igt_variant(self, driver: str, igt_type: IgtType) -> str:
        """Return the IGT name for igt_type: the xe variant for 'xe', else the i915 one."""
        # Select IGT variant dedicated for a given drm driver: xe or i915
        igt = igt_tests[igt_type]
        return igt[1] if driver == 'xe' else igt[0]
> +
> +
def igt_list_subtests(target: MachineInterface, test_name: str) -> typing.List[str]:
    """Return the subtests printed by '<test_dir>/<test_name> --list-subtests'.

    Returns an empty list when the command exits with a non-zero code.
    """
    # Join with a separator so test_dir works with or without a trailing slash.
    test_binary = posixpath.join(target.get_igt_config().test_dir, test_name)
    proc_result = ShellExecutor(target, f'{test_binary} --list-subtests').wait()
    if proc_result.exit_code != 0:
        return []
    # Skip blank lines, so the trailing newline does not produce an empty subtest name.
    return [line for line in proc_result.stdout.splitlines() if line.strip()]
> diff --git a/vmtb/bench/executors/shell.py b/vmtb/bench/executors/shell.py
> new file mode 100644
> index 000000000..c05a82a86
> --- /dev/null
> +++ b/vmtb/bench/executors/shell.py
> @@ -0,0 +1,30 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import signal
> +
> +from bench.executors.executor_interface import ExecutorInterface
> +from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
> + MachineInterface, ProcessResult)
> +
> +
class ShellExecutor(ExecutorInterface):
    """Start a shell command on a machine and control the resulting process."""

    def __init__(self, target: MachineInterface, command: str, timeout: int = DEFAULT_TIMEOUT) -> None:
        self.target = target
        self.timeout = timeout
        # The command is started right away; execute() returns the process id.
        self.pid = self.target.execute(command)

    def status(self) -> ProcessResult:
        return self.target.execute_status(self.pid)

    def wait(self) -> ProcessResult:
        # self.timeout is forwarded to the machine's execute_wait().
        return self.target.execute_wait(self.pid, self.timeout)

    def sendsig(self, sig: signal.Signals) -> None:
        self.target.execute_signal(self.pid, sig)

    def terminate(self) -> None:
        self.sendsig(signal.SIGTERM)

    def kill(self) -> None:
        self.sendsig(signal.SIGKILL)
> diff --git a/vmtb/bench/helpers/__init__.py b/vmtb/bench/helpers/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/helpers/helpers.py b/vmtb/bench/helpers/helpers.py
> new file mode 100644
> index 000000000..8c81fd486
> --- /dev/null
> +++ b/vmtb/bench/helpers/helpers.py
> @@ -0,0 +1,77 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +
> +from bench.executors.igt import IgtExecutor
> +from bench.executors.shell import ShellExecutor
> +from bench.machines.machine_interface import MachineInterface
> +
> +logger = logging.getLogger('Helpers')
> +
> +
def driver_check(machine: MachineInterface, card: int = 0) -> bool:
    """Return True when the machine's DRM driver is registered as a PCI driver.

    Checks that /sys/module/<driver>/drivers/pci:<driver>/ exists on the machine.
    card is used in the error message only.
    """
    drm_driver = machine.get_drm_driver_name()
    if not machine.dir_exists(f'/sys/module/{drm_driver}/drivers/pci:{drm_driver}/'):
        # Pass the values as lazy logging arguments instead of mixing in an f-string.
        logger.error('%s module not loaded on card %s', drm_driver, card)
        return False

    return True
> +
> +
def igt_check(igt_test: IgtExecutor) -> bool:
    """Wait for an IGT test and report whether it succeeded.

    Success requires a zero exit code and IgtExecutor.did_pass(); a failure is logged.
    """
    outcome = igt_test.wait()
    passed = outcome.exit_code == 0 and igt_test.did_pass()
    if not passed:
        logger.error('IGT failed with %s', outcome)
    return passed
> +
> +
def igt_run_check(machine: MachineInterface, test: str) -> bool:
    """Start an IGT test on the machine, wait for it and return whether it passed."""
    return igt_check(IgtExecutor(machine, test))
> +
> +
def cmd_check(cmd: ShellExecutor) -> bool:
    """Wait for a shell command and return True when it exited with code 0."""
    outcome = cmd.wait()
    if outcome.exit_code != 0:
        logger.error('%s failed with %s', cmd, outcome)
        return False
    return True
> +
> +
def cmd_run_check(machine: MachineInterface, cmd: str) -> bool:
    """Start a shell command on the machine, wait for it and return whether it succeeded."""
    return cmd_check(ShellExecutor(machine, cmd))
> +
> +
> +def modprobe_driver(machine: MachineInterface, parameters: str = '', options: str = '') -> ShellExecutor:
> + """Load driver (modprobe [driver_module]) and return ShellExecutor instance (do not check a result)."""
> + drm_driver = machine.get_drm_driver_name()
> + modprobe_cmd = ShellExecutor(machine, f'modprobe {drm_driver} {options} {parameters}')
> + return modprobe_cmd
> +
> +
> +def modprobe_driver_check(machine: MachineInterface, cmd: ShellExecutor) -> bool:
> + """Check result of a driver load (modprobe) based on a given ShellExecutor instance."""
> + modprobe_success = cmd_check(cmd)
> + if modprobe_success:
> + return driver_check(machine)
> +
> + logger.error('Modprobe failed')
> + return False
> +
> +
> +def modprobe_driver_run_check(machine: MachineInterface, parameters: str = '', options: str = '') -> bool:
> + """Load (modprobe) a driver and check a result (waits until operation ends)."""
> + modprobe_cmd = modprobe_driver(machine, parameters, options)
> + modprobe_success = modprobe_driver_check(machine, modprobe_cmd)
> + if modprobe_success:
> + return driver_check(machine)
> +
> + logger.error('Modprobe failed')
> + return False
> diff --git a/vmtb/bench/helpers/log.py b/vmtb/bench/helpers/log.py
> new file mode 100644
> index 000000000..665bb6cf9
> --- /dev/null
> +++ b/vmtb/bench/helpers/log.py
> @@ -0,0 +1,75 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import errno
> +import fcntl
> +import functools
> +import logging
> +import os
> +import typing
> +from pathlib import Path
> +
> +from bench import exceptions
> +
> +logger = logging.getLogger('Host-kmsg')
> +
> +HOST_DMESG_FILE = Path("/tmp/vm-test-bench-host_dmesg.log.tmp")
> +
> +
> +class LogDecorators():
> + """Read and parse kernel log buffer.
> + https://www.kernel.org/doc/Documentation/ABI/testing/dev-kmsg
> + """
> + @staticmethod
> + def read_messages(fd: int) -> typing.List[str]:
> + buf_size = 4096
> + kmsgs = []
> + while True:
> + try:
> + kmsg = os.read(fd, buf_size)
> + kmsgs.append(kmsg.decode())
> + except OSError as exc:
> + if exc.errno == errno.EAGAIN:
> + break
> +
> + if exc.errno == errno.EPIPE:
> + pass
> + else:
> + raise
> + return kmsgs
> +
> + @staticmethod
> + def parse_messages(kmsgs: typing.List[str]) -> None:
> + for msg in kmsgs:
> + header, human = msg.split(';', 1)
> + # Get priority/facility field (seq, time, other unused for now)
> + prio_fac, _, _, _ = header.split(',', 3)
> + level = int(prio_fac) & 0x7 # Syslog priority
> +
> + if level <= 2: # KERN_CRIT/ALERT/EMERG
> + logger.error("[Error: %s]: %s", level, human.strip())
> + raise exceptions.HostError(f'Error in dmesg: {human.strip()}')
> +
> + logger.debug("%s", human.strip())
> +
> + @classmethod
> + def parse_kmsg(cls, func: typing.Callable) -> typing.Callable:
> + @functools.wraps(func)
> + def parse_wrapper(*args: typing.Any, **kwargs: typing.Optional[typing.Any]) -> typing.Any:
> + with open('/dev/kmsg', 'r', encoding='utf-8') as f, \
> + open(HOST_DMESG_FILE, 'a', encoding='utf-8') as dmesg_file:
> +
> + fd = f.fileno()
> + os.lseek(fd, os.SEEK_SET, os.SEEK_END)
> + flags = fcntl.fcntl(fd, fcntl.F_GETFL)
> + fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
> +
> + # Execute actual function
> + result = func(*args, **kwargs)
> +
> + kmsgs = cls.read_messages(fd)
> + dmesg_file.writelines(kmsgs)
> + cls.parse_messages(kmsgs)
> +
> + return result
> + return parse_wrapper
> diff --git a/vmtb/bench/machines/__init__.py b/vmtb/bench/machines/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/machines/device_interface.py b/vmtb/bench/machines/device_interface.py
> new file mode 100644
> index 000000000..e8d4068e8
> --- /dev/null
> +++ b/vmtb/bench/machines/device_interface.py
> @@ -0,0 +1,23 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import abc
> +
> +
> +class DeviceInterface(abc.ABC):
> +
> + @abc.abstractmethod
> + def create_vf(self, num: int) -> int:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def remove_vfs(self) -> int:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def bind_driver(self) -> None:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def unbind_driver(self) -> None:
> + raise NotImplementedError
> diff --git a/vmtb/bench/machines/host.py b/vmtb/bench/machines/host.py
> new file mode 100644
> index 000000000..666f35c26
> --- /dev/null
> +++ b/vmtb/bench/machines/host.py
> @@ -0,0 +1,196 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +import re
> +import shlex
> +import signal
> +import subprocess
> +import typing
> +from pathlib import Path
> +
> +from bench import exceptions
> +from bench.configurators.vmtb_config import VmtbIgtConfig
> +from bench.helpers.log import LogDecorators
> +from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
> + MachineInterface, ProcessResult,
> + SuspendMode)
> +from bench.machines.physical.device import Device
> +
> +logger = logging.getLogger('Host')
> +
> +
> +class Host(MachineInterface):
> + def __init__(self) -> None:
> + self.running_procs: typing.Dict[int, subprocess.Popen] = {}
> + self.gpu_devices: typing.List[Device] = []
> + self.dut_index: int = 0
> + # Initialize in conftest/VmmTestingSetup:
> + self.drm_driver_name: str
> + self.igt_config: VmtbIgtConfig
> +
> + def __str__(self) -> str:
> + return f'Host-{self.gpu_devices[self.dut_index].pci_info.bdf}'
> +
> + @LogDecorators.parse_kmsg
> + def execute(self, command: str) -> int:
> + cmd_arr = shlex.split(command)
> + # We don't want to kill the process created here (like 'with' would do) so disable the following linter issue:
> + # R1732: consider-using-with (Consider using 'with' for resource-allocating operations)
> + # pylint: disable=R1732
> + # TODO: but maybe 'subprocess.run' function would fit instead of Popen constructor?
> + process = subprocess.Popen(cmd_arr,
> + stdout=subprocess.PIPE,
> + stderr=subprocess.PIPE,
> + universal_newlines=True)
> +
> + self.running_procs[process.pid] = process
> + logger.debug("Run command: %s (PID: %s)", command, process.pid)
> + return process.pid
> +
> + @LogDecorators.parse_kmsg
> + def execute_status(self, pid: int) -> ProcessResult:
> + proc = self.running_procs.get(pid, None)
> + if not proc:
> + logger.error("No process with PID: %s", pid)
> + raise exceptions.HostError(f'No process with PID: {pid}')
> +
> + exit_code: typing.Optional[int] = proc.poll()
> + logger.debug("PID %s -> exit code %s", pid, exit_code)
> + if exit_code is None:
> + return ProcessResult(False, exit_code, '', '')
> +
> + out, err = proc.communicate()
> + return ProcessResult(True, exit_code, out, err)
> +
> + @LogDecorators.parse_kmsg
> + def execute_wait(self, pid: int, timeout: int = DEFAULT_TIMEOUT) -> ProcessResult:
> + proc = self.running_procs.get(pid, None)
> + if not proc:
> + logger.error("No process with PID: %s", pid)
> + raise exceptions.HostError(f'No process with PID: {pid}')
> +
> + out = ''
> + err = ''
> + try:
> + out, err = proc.communicate(timeout)
> + except subprocess.TimeoutExpired as exc:
> + logger.warning("Timeout (%ss) expired for PID: %s", exc.timeout, pid)
> + raise
> +
> + return ProcessResult(True, proc.poll(), out, err)
> +
> + @LogDecorators.parse_kmsg
> + def execute_signal(self, pid: int, sig: signal.Signals) -> None:
> + proc = self.running_procs.get(pid, None)
> + if not proc:
> + logger.error("No process with PID: %s", pid)
> + raise exceptions.HostError(f'No process with PID: {pid}')
> +
> + proc.send_signal(sig)
> +
> + def read_file_content(self, path: str) -> str:
> + with open(path, encoding='utf-8') as f:
> + content = f.read()
> + return content
> +
> + def write_file_content(self, path: str, content: str) -> int:
> + with open(path, 'w', encoding='utf-8') as f:
> + return f.write(content)
> +
> + def dir_exists(self, path: str) -> bool:
> + return Path(path).is_dir()
> +
> + def get_drm_driver_name(self) -> str:
> + # Used as a part of MachineInterface for helpers
> + return self.drm_driver_name
> +
> + def get_igt_config(self) -> VmtbIgtConfig:
> + # Used as a part of MachineInterface to initialize IgtExecutor
> + return self.igt_config
> +
> + def is_driver_loaded(self, driver_name: str) -> bool:
> + driver_path = Path('/sys/bus/pci/drivers/') / driver_name
> + return driver_path.exists()
> +
> + def is_driver_available(self, driver_name: str) -> bool:
> + modinfo_pid = self.execute(f'modinfo -F filename {driver_name}')
> + modinfo_result: ProcessResult = self.execute_wait(modinfo_pid)
> + return modinfo_result.exit_code == 0
> +
> + def load_drivers(self) -> None:
> + """Load (modprobe) required host drivers (DRM and VFIO)."""
> + drivers_to_probe = [self.drm_driver_name, f'{self.drm_driver_name}-vfio-pci']
> + # If a vendor-specific VFIO driver (e.g. xe-vfio-pci) is not present, probe a regular vfio-pci
> + if not self.is_driver_available(drivers_to_probe[1]):
> + logger.warning("VFIO driver: '%s' is not available - use 'vfio-pci'", drivers_to_probe[1])
> + drivers_to_probe[1] = 'vfio-pci'
> +
> + for driver in drivers_to_probe:
> + if not self.is_driver_loaded(driver):
> + logger.info("%s driver is not loaded - probe module", driver)
> + drv_probe_pid = self.execute(f'modprobe {driver}')
> + if self.execute_wait(drv_probe_pid).exit_code != 0:
> + logger.error("%s driver probe failed!", driver)
> + raise exceptions.HostError(f'{driver} driver probe failed!')
> +
> + def unload_drivers(self) -> None:
> + """Unload (remove) host drivers (DRM and VFIO)."""
> + logger.debug("Cleanup - unload drivers\n")
> + vfio_driver = f'{self.drm_driver_name}-vfio-pci'
> + if not self.is_driver_loaded(vfio_driver):
> + vfio_driver = 'vfio-pci'
> +
> + rmmod_pid = self.execute(f'modprobe -rf {vfio_driver}')
> + if self.execute_wait(rmmod_pid).exit_code != 0:
> + logger.error("VFIO driver remove failed!")
> + raise exceptions.HostError('VFIO driver remove failed!')
> +
> + for device in self.gpu_devices:
> + logger.debug("Unbind %s from device %s", self.drm_driver_name, device.pci_info.bdf)
> + device.unbind_driver()
> +
> + rmmod_pid = self.execute(f'modprobe -rf {self.drm_driver_name}')
> + if self.execute_wait(rmmod_pid).exit_code != 0:
> + logger.error("DRM driver remove failed!")
> + raise exceptions.HostError('DRM driver remove failed!')
> +
> + logger.debug("%s/%s successfully removed", self.drm_driver_name, vfio_driver)
> +
> + def discover_devices(self) -> None:
> + """Detect all PCI GPU devices on the host and initialize Device list."""
> + if not self.is_driver_loaded(self.drm_driver_name):
> + logger.error("Unable to discover devices - %s driver is not loaded!", self.drm_driver_name)
> + raise exceptions.HostError(f'Unable to discover devices - {self.drm_driver_name} driver is not loaded!')
> +
> + detected_devices: typing.List[Device] = []
> + drv_path = Path('/sys/bus/pci/drivers/') / self.drm_driver_name
> +
> + # Look for a directory name with a PCI BDF (e.g. 0000:1a:00.0)
> + for dev_bdf_dir in drv_path.glob('*:*:*.[0-7]'):
> + bdf = dev_bdf_dir.name
> + device = Device(bdf, self.drm_driver_name)
> + detected_devices.append(device)
> +
> + # Output list of detected devices sorted by an ascending card index (device minor number)
> + self.gpu_devices = sorted(detected_devices, key=lambda dev: dev.pci_info.minor_number)
> +
> + if not self.gpu_devices:
> + logger.error("GPU PCI device (bound to %s driver) not detected!", self.drm_driver_name)
> + raise exceptions.HostError(f'GPU PCI device (bound to {self.drm_driver_name} driver) not detected!')
> +
> + logger.debug("Detected GPU PCI device(s):")
> + for dev in self.gpu_devices:
> + logger.debug("[%s] PCI BDF: %s / DevID: %s (%s)",
> + dev.pci_info.minor_number, dev.pci_info.bdf, dev.pci_info.devid, dev.gpu_model)
> +
> + def suspend(self, mode: SuspendMode = SuspendMode.ACPI_S3) -> None:
> + """Perform host suspend cycle (ACPI S3) via rtcwake tool."""
> + wakeup_delay = 10 # wakeup timer in seconds
> + logger.debug("Suspend-resume via rtcwake (mode: %s, wakeup delay: %ss)", mode, wakeup_delay)
> +
> + suspend_pid = self.execute(f'rtcwake -s {wakeup_delay} -m {mode}')
> + suspend_result: ProcessResult = self.execute_wait(suspend_pid)
> + if suspend_result.exit_code != 0:
> + logger.error("Suspend failed - error: %s", suspend_result.stderr)
> + raise exceptions.HostError(f'Suspend failed - error: {suspend_result.stderr}')
> diff --git a/vmtb/bench/machines/machine_interface.py b/vmtb/bench/machines/machine_interface.py
> new file mode 100644
> index 000000000..8daa2cda3
> --- /dev/null
> +++ b/vmtb/bench/machines/machine_interface.py
> @@ -0,0 +1,65 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import abc
> +import enum
> +import signal
> +import typing
> +
> +from bench.configurators.vmtb_config import VmtbIgtConfig
> +
> +DEFAULT_TIMEOUT: int = 1200 # Default machine execution wait timeout in seconds
> +
> +
> +class ProcessResult(typing.NamedTuple):
> + exited: bool = False
> + exit_code: typing.Optional[int] = None
> + stdout: str = ''
> + stderr: str = ''
> +
> +
> +class SuspendMode(str, enum.Enum):
> + ACPI_S3 = 'mem' # Suspend to RAM aka sleep
> + ACPI_S4 = 'disk' # Suspend to disk aka hibernation
> +
> + def __str__(self) -> str:
> + return str.__str__(self)
> +
> +
> +class MachineInterface(metaclass=abc.ABCMeta):
> +
> + @abc.abstractmethod
> + def execute(self, command: str) -> int:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def execute_status(self, pid: int) -> ProcessResult:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def execute_wait(self, pid: int, timeout: int) -> ProcessResult:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def execute_signal(self, pid: int, sig: signal.Signals) -> None:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def read_file_content(self, path: str) -> str:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def write_file_content(self, path: str, content: str) -> int:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def dir_exists(self, path: str) -> bool:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def get_drm_driver_name(self) -> str:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def get_igt_config(self) -> VmtbIgtConfig:
> + raise NotImplementedError
> diff --git a/vmtb/bench/machines/physical/__init__.py b/vmtb/bench/machines/physical/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/machines/physical/device.py b/vmtb/bench/machines/physical/device.py
> new file mode 100644
> index 000000000..8a0368ae0
> --- /dev/null
> +++ b/vmtb/bench/machines/physical/device.py
> @@ -0,0 +1,240 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import importlib
> +import logging
> +import re
> +from pathlib import Path
> +from typing import Any, List
> +
> +from bench import exceptions
> +from bench.configurators import pci
> +from bench.configurators.vgpu_profile import (VgpuProfile, VgpuResourcesConfig,
> + VgpuSchedulerConfig)
> +from bench.drivers.driver_interface import DriverInterface, SchedulingPriority
> +from bench.helpers.log import LogDecorators
> +from bench.machines.device_interface import DeviceInterface
> +
> +logger = logging.getLogger('Device')
> +
> +
> +class Device(DeviceInterface):
> + class PciInfo:
> + def __init__(self, bdf: str) -> None:
> + self.bdf: str = bdf
> + self.devid: str = self.get_device_id(self.bdf)
> + self.minor_number: int = self.get_device_minor_number(self.bdf)
> +
> + def get_device_minor_number(self, bdf: str) -> int:
> + drm_dir = Path('/sys/bus/pci/devices/') / bdf / 'drm'
> +
> + for file_path in drm_dir.iterdir():
> + if file_path.match('card*'):
> + index_match = re.search(r'card(?P<card_index>\d+)', file_path.name)
> + if index_match:
> + return int(index_match.group('card_index'))
> +
> + logger.error("Could not determine card index for device %s", bdf)
> + raise exceptions.HostError(f'Could not determine card index for device {bdf}')
> +
> + def get_device_id(self, bdf: str) -> str:
> + device_file = Path('/sys/bus/pci/devices/') / bdf / 'device'
> + devid = device_file.read_text()
> +
> + return devid.strip()[2:] # Strip whitespaces and 0x
> +
> + def __init__(self, bdf: str, driver: str) -> None:
> + self.pci_info = self.PciInfo(bdf)
> + self.gpu_model: str = pci.get_gpu_model(self.pci_info.devid)
> + self.driver: DriverInterface = self.instantiate_driver(driver, self.pci_info.minor_number)
> +
> + def instantiate_driver(self, driver_name: str, card_index: int) -> Any:
> + module_name = f'bench.drivers.{driver_name}'
> + class_name = f'{driver_name.capitalize()}Driver'
> +
> + try:
> + driver_module = importlib.import_module(module_name)
> + driver_class = getattr(driver_module, class_name)
> + except (ImportError, AttributeError) as exc:
> + logging.error("Driver module/class is not available: %s", exc)
> + raise exceptions.VmtbConfigError(f'Requested driver module {driver_name} is not available!')
> +
> + return driver_class(card_index)
> +
> + def set_drivers_autoprobe(self, val: bool) -> None:
> + self.driver.set_drivers_autoprobe(int(val))
> + ret = self.driver.get_drivers_autoprobe()
> + if ret != int(val):
> + logger.error("Autoprobe value mismatch - requested: %s, got: %s", val, ret)
> + raise exceptions.HostError(f'Autoprobe value mismatch - requested: {val}, got: {ret}')
> +
> + def get_total_vfs(self) -> int:
> + return self.driver.get_totalvfs()
> +
> + def get_current_vfs(self) -> int:
> + return self.driver.get_numvfs()
> +
> + def get_num_gts(self) -> int:
> + return self.driver.get_num_gts()
> +
> + def has_lmem(self) -> bool:
> + return self.driver.has_lmem()
> +
> + def create_vf(self, num: int) -> int:
> + """Enable a requested number of VFs.
> + Disable SRIOV drivers autoprobe to allow VFIO driver override for VFs.
> + """
> + logger.info("[%s] Enable %s VFs", self.pci_info.bdf, num)
> + if self.get_current_vfs() != 0:
> + self.remove_vfs()
> +
> + self.numvf = num
> +
> + # Disable driver autoprobe to avoid driver load on VF (override to vfio is required)
> + logger.debug("[%s] Disable drivers autoprobe", self.pci_info.bdf)
> + self.set_drivers_autoprobe(False)
> +
> + self.driver.set_numvfs(num)
> + ret = self.driver.get_numvfs()
> + assert ret == num
> +
> + return ret
> +
> + def remove_vfs(self) -> int:
> + """Disable all existing VFs.
> + Re-enable SRIOV drivers autoprobe.
> + """
> + logger.info("[%s] Disable VFs", self.pci_info.bdf)
> + self.driver.set_numvfs(0)
> + ret = self.driver.get_numvfs()
> + if ret != 0:
> + raise exceptions.HostError('VFs not disabled after 0 write')
> +
> + logger.debug("[%s] Enable drivers autoprobe", self.pci_info.bdf)
> + self.set_drivers_autoprobe(True)
> +
> + return ret
> +
> + def bind_driver(self) -> None:
> + self.driver.bind(self.pci_info.bdf)
> +
> + def unbind_driver(self) -> None:
> + self.driver.unbind(self.pci_info.bdf)
> +
> + def override_vf_driver(self, vf_num: int) -> str:
> + """Set VFIO as VF driver."""
> + pci_devices_path = Path('/sys/bus/pci/devices/')
> + vfio_driver = f'{self.driver.get_name()}-vfio-pci'
> + if not Path(f'/sys/bus/pci/drivers/{vfio_driver}').exists():
> + vfio_driver = 'vfio-pci'
> +
> + # virtfnN is a symlink - get the last part of the absolute path, i.e. VF BDF like 00:12:00.1
> + # TODO: replace by Path.readlink() when Python 3.9 supported
> + pass_vf_bdf = (pci_devices_path / self.pci_info.bdf / f'virtfn{vf_num - 1}').resolve().name
> + override_path = pci_devices_path / pass_vf_bdf / 'driver_override'
> + override_path.write_text(vfio_driver, encoding='utf-8')
> + logger.debug("VF%s VFIO driver: %s", vf_num, override_path.read_text())
> +
> + return pass_vf_bdf
> +
> + @LogDecorators.parse_kmsg
> + def get_vf_bdf(self, vf_num: int) -> str:
> + """Provide BDF of VF prepared for pass to VM - with VFIO driver override and probe."""
> + pass_vf_bdf = self.override_vf_driver(vf_num)
> +
> + drivers_probe = Path('/sys/bus/pci/drivers_probe')
> + drivers_probe.write_text(pass_vf_bdf, encoding='utf-8')
> +
> + logger.info("[%s] VF%s ready for pass to VM", pass_vf_bdf, vf_num)
> + return pass_vf_bdf
> +
> + def get_vfs_bdf(self, *args: int) -> List[str]:
> + vf_list = list(set(args))
> + bdf_list = [self.get_vf_bdf(vf) for vf in vf_list]
> + return bdf_list
> +
> + def provision(self, profile: VgpuProfile) -> None:
> + logger.info("[%s] Provision VFs - set vGPU profile for %s VFs", self.pci_info.bdf, profile.num_vfs)
> +
> + num_vfs = profile.num_vfs
> + num_gts = self.get_num_gts() # Number of tiles (GTs)
> + gt_nums = [0] if num_gts == 1 else [0, 1] # Tile (GT) numbers/indexes
> +
> + for gt_num in gt_nums:
> + self.driver.set_pf_policy_sched_if_idle(gt_num, int(profile.scheduler.scheduleIfIdle))
> + self.driver.set_pf_policy_reset_engine(gt_num, int(profile.security.reset_after_vf_switch))
> + self.driver.set_exec_quantum_ms(0, gt_num, profile.scheduler.pfExecutionQuanta)
> + self.driver.set_preempt_timeout_us(0, gt_num, profile.scheduler.pfPreemptionTimeout)
> + self.driver.set_doorbells_quota(0, gt_num, profile.resources.pfDoorbells)
> + # PF contexts are currently assigned by the driver and cannot be reprovisioned from sysfs
> +
> + for vf_num in range(1, num_vfs + 1):
> + if num_gts > 1 and num_vfs > 1:
> + # Multi-tile device Mode 2|3 - odd VFs on GT0, even on GT1
> + gt_nums = [0] if vf_num % 2 else [1]
> +
> + for gt_num in gt_nums:
> + self.driver.set_lmem_quota(vf_num, gt_num, profile.resources.vfLmem)
> + self.driver.set_ggtt_quota(vf_num, gt_num, profile.resources.vfGgtt)
> + self.driver.set_contexts_quota(vf_num, gt_num, profile.resources.vfContexts)
> + self.driver.set_doorbells_quota(vf_num, gt_num, profile.resources.vfDoorbells)
> + self.driver.set_exec_quantum_ms(vf_num, gt_num, profile.scheduler.vfExecutionQuanta)
> + self.driver.set_preempt_timeout_us(vf_num, gt_num, profile.scheduler.vfPreemptionTimeout)
> +
> + # fn_num = 0 for PF, 1..n for VF
> + def set_scheduling(self, fn_num: int, gt_num: int, scheduling_config: VgpuSchedulerConfig) -> None:
> + logger.info("[%s] Provision scheduling config for PCI Function %s", self.pci_info.bdf, fn_num)
> + if fn_num == 0:
> + self.driver.set_pf_policy_sched_if_idle(gt_num, int(scheduling_config.scheduleIfIdle))
> + self.driver.set_exec_quantum_ms(0, gt_num, scheduling_config.pfExecutionQuanta)
> + self.driver.set_preempt_timeout_us(0, gt_num, scheduling_config.pfPreemptionTimeout)
> + else:
> + self.driver.set_exec_quantum_ms(fn_num, gt_num, scheduling_config.vfExecutionQuanta)
> + self.driver.set_preempt_timeout_us(fn_num, gt_num, scheduling_config.vfPreemptionTimeout)
> +
> + def set_resources(self, fn_num: int, gt_num: int, resources_config: VgpuResourcesConfig) -> None:
> + logger.info("[%s] Provision resources config for PCI Function %s", self.pci_info.bdf, fn_num)
> + if fn_num == 0:
> + self.driver.set_pf_ggtt_spare(gt_num, resources_config.pfGgtt)
> + self.driver.set_pf_lmem_spare(gt_num, resources_config.pfLmem)
> + self.driver.set_pf_contexts_spare(gt_num, resources_config.pfContexts)
> + self.driver.set_pf_doorbells_spare(gt_num, resources_config.pfDoorbells)
> + else:
> + self.driver.set_ggtt_quota(fn_num, gt_num, resources_config.vfGgtt)
> + self.driver.set_lmem_quota(fn_num, gt_num, resources_config.vfLmem)
> + self.driver.set_contexts_quota(fn_num, gt_num, resources_config.vfContexts)
> + self.driver.set_doorbells_quota(fn_num, gt_num, resources_config.vfDoorbells)
> +
> + def reset_provisioning(self, num_vfs: int) -> None:
> + """Clear provisioning config for a requested number of VFs.
> + Function calls the sysfs control interface to clear VF provisioning settings
> + and restores the auto provisioning mode.
> + """
> + logger.info("[%s] Reset %s VFs provisioning configuraton", self.pci_info.bdf, num_vfs)
> + for gt_num in range(self.get_num_gts()):
> + if self.get_scheduling_priority(gt_num) != SchedulingPriority.LOW:
> + self.set_scheduling_priority(gt_num, SchedulingPriority.LOW)
> + self.driver.set_pf_policy_sched_if_idle(gt_num, 0)
> + self.driver.set_pf_policy_reset_engine(gt_num, 0)
> + self.driver.set_exec_quantum_ms(0, gt_num, 0)
> + self.driver.set_preempt_timeout_us(0, gt_num, 0)
> + self.driver.set_doorbells_quota(0, gt_num, 0)
> + # PF contexts cannot be set from sysfs
> +
> + for vf_num in range(1, num_vfs + 1):
> + self.driver.set_contexts_quota(vf_num, gt_num, 0)
> + self.driver.set_doorbells_quota(vf_num, gt_num, 0)
> + self.driver.set_ggtt_quota(vf_num, gt_num, 0)
> + self.driver.set_lmem_quota(vf_num, gt_num, 0)
> +
> + def cancel_work(self) -> None:
> + """Drop and reset remaining GPU execution at exit."""
> + self.driver.cancel_work()
> +
> + def get_scheduling_priority(self, gt_num: int) -> SchedulingPriority:
> + return self.driver.get_pf_sched_priority(gt_num)
> +
> + def set_scheduling_priority(self, gt_num: int, val: SchedulingPriority) -> None:
> + # In order to set scheduling priority, strict scheduling policy needs to be default
> + # self.drm_driver.set_pf_policy_sched_if_idle(gt_num, 0)
> + self.driver.set_pf_sched_priority(gt_num, val)
> diff --git a/vmtb/bench/machines/virtual/__init__.py b/vmtb/bench/machines/virtual/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/machines/virtual/backends/__init__.py b/vmtb/bench/machines/virtual/backends/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/bench/machines/virtual/backends/backend_interface.py b/vmtb/bench/machines/virtual/backends/backend_interface.py
> new file mode 100644
> index 000000000..dfa29cc01
> --- /dev/null
> +++ b/vmtb/bench/machines/virtual/backends/backend_interface.py
> @@ -0,0 +1,40 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import abc
> +import typing
> +
> +
> +class BackendInterface(metaclass=abc.ABCMeta):
> +
> + @abc.abstractmethod
> + def sync(self, idnum: int) -> typing.Optional[typing.Dict]:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def ping(self) -> typing.Optional[typing.Dict]:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def execute(self, command: str, args: typing.List[str]) -> typing.Optional[typing.Dict]:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def execute_status(self, pid: int) -> typing.Optional[typing.Dict]:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def suspend_disk(self) -> None:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def suspend_ram(self) -> None:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def reboot(self) -> None:
> + raise NotImplementedError
> +
> + @abc.abstractmethod
> + def poweroff(self) -> None:
> + raise NotImplementedError
> diff --git a/vmtb/bench/machines/virtual/backends/guestagent.py b/vmtb/bench/machines/virtual/backends/guestagent.py
> new file mode 100644
> index 000000000..6ac366b99
> --- /dev/null
> +++ b/vmtb/bench/machines/virtual/backends/guestagent.py
> @@ -0,0 +1,99 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import json
> +import logging
> +import socket
> +import typing
> +
> +from bench import exceptions
> +from bench.machines.virtual.backends.backend_interface import BackendInterface
> +
> +logger = logging.getLogger('GuestAgent')
> +
> +
> +class GuestAgentBackend(BackendInterface):
> + def __init__(self, socket_path: str, socket_timeout: int) -> None:
> + self.sockpath = socket_path
> + self.timeout = socket_timeout
> + self.sock: socket.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
> + self.sock.connect(self.sockpath)
> + self.sockf: typing.TextIO = self.sock.makefile(mode='rw', errors='strict')
> +
> + def __send(self, command: str, arguments: typing.Optional[typing.Dict] = None) -> typing.Dict:
> + if arguments is None:
> + arguments = {}
> +
> + data = {'execute': command, 'arguments': arguments}
> + json.dump(data, self.sockf)
> + self.sockf.flush()
> + try:
> + out: typing.Optional[str] = self.sockf.readline()
> + except socket.timeout as soc_to_exc:
> + logger.error('Socket readline timeout on command %s', command)
> + self.sock.close()
> + self.sockf.close()
> + raise exceptions.GuestAgentError(f'Socket timed out on {command}') from soc_to_exc
> + if out is None:
> + logger.error('Command %s, args %s returned with no output')
> + raise exceptions.GuestAgentError(f'Command {command} did not retunrned output')
> + # Only logging errors for now
> + ret: typing.Dict = json.loads(out)
> + if 'error' in ret.keys():
> + logger.error('Command: %s got error %s', command, ret)
> +
> + return ret
> +
> + def sync(self, idnum: int) -> typing.Dict:
> + return self.__send('guest-sync', {'id': idnum})
> +
> + def ping(self) -> typing.Optional[typing.Dict]:
> + return self.__send('guest-ping')
> +
> + def execute(self, command: str, args: typing.Optional[typing.List[str]] = None) -> typing.Dict:
> + if args is None:
> + args = []
> + arguments = {'path': command, 'arg': args, 'capture-output': True}
> + return self.__send('guest-exec', arguments)
> +
> + def execute_status(self, pid: int) -> typing.Dict:
> + return self.__send('guest-exec-status', {'pid': pid})
> +
> + # TODO add qmp-query mechanism for all powerstate changes
> + def suspend_disk(self) -> None:
> + # self.__send('guest-suspend-disk')
> + raise NotImplementedError
> +
> + def suspend_ram(self) -> None:
> + self.ping()
> + # guest-suspend-ram does not return anything, that's why no __send
> + data = {'execute': 'guest-suspend-ram'}
> + json.dump(data, self.sockf)
> + self.sockf.flush()
> +
> + def reboot(self) -> None:
> + self.ping()
> + # guest-shutdown does not return anything, that's why no __send
> + data = {'execute': 'guest-shutdown', 'arguments': {'mode': 'reboot'}}
> + json.dump(data, self.sockf)
> + self.sockf.flush()
> +
> + def poweroff(self) -> None:
> + self.ping()
> + # guest-shutdown does not return anything, that's why no __send
> + data = {'execute': 'guest-shutdown', 'arguments': {'mode': 'powerdown'}}
> + json.dump(data, self.sockf)
> + self.sockf.flush()
> + # self.sockf.readline()
> +
> + def guest_file_open(self, path: str, mode: str) -> typing.Dict:
> + return self.__send('guest-file-open', {'path': path, 'mode': mode})
> +
> + def guest_file_close(self, handle: int) -> typing.Dict:
> + return self.__send('guest-file-close', {'handle': handle})
> +
> + def guest_file_write(self, handle: int, content: str) -> typing.Dict:
> + return self.__send('guest-file-write', {'handle': handle, 'buf-b64': content})
> +
> + def guest_file_read(self, handle: int) -> typing.Dict:
> + return self.__send('guest-file-read', {'handle': handle})
> diff --git a/vmtb/bench/machines/virtual/backends/qmp_monitor.py b/vmtb/bench/machines/virtual/backends/qmp_monitor.py
> new file mode 100644
> index 000000000..7d2645abe
> --- /dev/null
> +++ b/vmtb/bench/machines/virtual/backends/qmp_monitor.py
> @@ -0,0 +1,161 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import json
> +import logging
> +import queue
> +import socket
> +import threading
> +import time
> +import typing
> +
> +logger = logging.getLogger('QmpMonitor')
> +
> +
> +class QmpMonitor():
> + def __init__(self, socket_path: str, socket_timeout: int) -> None:
> + self.sockpath = socket_path
> + self.timeout = socket_timeout
> + self.sock: socket.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
> + self.sock.connect(self.sockpath)
> + self.sockf: typing.TextIO = self.sock.makefile(mode='rw', errors='strict')
> + self.qmp_queue: queue.Queue = queue.Queue()
> + self.monitor_thread: threading.Thread = threading.Thread(target=self.__queue_qmp_output,
> + args=(self.sockf, self.qmp_queue),
> + daemon=True)
> + self.monitor_thread.start()
> + # It is required to enable capabilities before using QMP
> + self.__enable_qmp_capabilities()
> +
> + def __enable_qmp_capabilities(self) -> None:
> + json.dump({'execute': 'qmp_capabilities'}, self.sockf)
> + self.sockf.flush()
> +
> + def __queue_qmp_output(self, out: typing.TextIO, q: queue.Queue) -> None:
> + for line in iter(out.readline, ''):
> + logger.debug('[QMP RSP] <- %s', line)
> + qmp_msg = json.loads(line)
> + q.put(qmp_msg)
> +
> + @property
> + def monitor_queue(self) -> queue.Queue:
> + return self.qmp_queue
> +
> + def query_status(self) -> str:
> + json.dump({'execute': 'query-status'}, self.sockf)
> + self.sockf.flush()
> +
> + ret: typing.Dict = {}
> + while 'status' not in ret:
> + qmp_msg = self.qmp_queue.get()
> + if 'return' in qmp_msg:
> + ret = qmp_msg.get('return')
> +
> + status: str = ret['status']
> + logger.debug('Machine status: %s', status)
> + return status
> +
> + def query_jobs(self, requested_type: str) -> typing.Tuple[str, str]:
> + json.dump({'execute': 'query-jobs'}, self.sockf)
> + self.sockf.flush()
> +
> + job_type: str = ''
> + job_status: str = ''
> + job_error: str = ''
> + ret: typing.Dict = {}
> +
> + qmp_msg = self.qmp_queue.get()
> + # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
> + if 'return' in qmp_msg:
> + ret = qmp_msg.get('return')
> + for param in ret:
> + job_type = param.get('type')
> + job_status = param.get('status')
> + job_error = param.get('error')
> +
> + if job_type == requested_type:
> + break
> +
> + return (job_status, job_error)
> +
> + def get_qmp_event(self) -> str:
> + qmp_msg = self.qmp_queue.get()
> + # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
> + event: str = qmp_msg.get('event', '')
> + return event
> +
> + def get_qmp_event_job(self) -> str:
> + qmp_msg = self.qmp_queue.get()
> + # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
> +
> + status: str = ''
> + if qmp_msg.get('event') == 'JOB_STATUS_CHANGE':
> + status = qmp_msg.get('data', {}).get('status', '')
> +
> + return status
> +
> + def system_reset(self) -> None:
> + json.dump({'execute': 'system_reset'}, self.sockf)
> + self.sockf.flush()
> +
> + def system_wakeup(self) -> None:
> + json.dump({'execute': 'system_wakeup'}, self.sockf)
> + self.sockf.flush()
> +
> + def stop(self) -> None:
> + json.dump({'execute': 'stop'}, self.sockf)
> + self.sockf.flush()
> +
> + def cont(self) -> None:
> + json.dump({'execute': 'cont'}, self.sockf)
> + self.sockf.flush()
> +
> + def quit(self) -> None:
> + json.dump({'execute': 'quit'}, self.sockf)
> + self.sockf.flush()
> +
> + def __query_snapshot(self) -> typing.Tuple[str, str]:
> + json.dump({'execute': 'query-named-block-nodes'}, self.sockf)
> + self.sockf.flush()
> +
> + node_name: str = ''
> + snapshot_tag: str = ''
> + ret: typing.Dict = {}
> +
> + qmp_msg = self.qmp_queue.get()
> + # logger.debug('[QMP RSP Queue] -> %s', qmp_msg)
> + if 'return' in qmp_msg:
> + ret = qmp_msg.get('return')
> + for block in ret:
> + if block.get('drv') == 'qcow2':
> + node_name = block.get('node-name')
> + # Get the most recent state snapshot from the snapshots list:
> + snapshots = block.get('image').get('snapshots')
> + if snapshots:
> + snapshot_tag = snapshots[-1].get('name')
> + break
> +
> + return (node_name, snapshot_tag)
> +
> + def save_snapshot(self) -> None:
> + job_id: str = f'savevm_{time.time()}'
> + snapshot_tag = f'vm_state_{time.time()}'
> + node_name, _ = self.__query_snapshot()
> + logger.debug('[QMP snapshot-save] snapshot_tag: %s, block device node: %s', snapshot_tag, node_name)
> +
> + # Note: command 'snapshot-save' is supported since QEMU 6.0
> + json.dump({'execute': 'snapshot-save',
> + 'arguments': {'job-id': job_id, 'tag': snapshot_tag, 'vmstate': node_name, 'devices': [node_name]}},
> + self.sockf)
> + self.sockf.flush()
> +
> + def load_snapshot(self) -> None:
> + job_id: str = f'loadvm_{time.time()}'
> + node_name, snapshot_tag = self.__query_snapshot()
> + logger.debug('[QMP snapshot-load] snapshot_tag: %s, block device node: %s', snapshot_tag, node_name)
> +
> + # Note: command 'snapshot-load' is supported since QEMU 6.0
> + json.dump({'execute': 'snapshot-load',
> + 'arguments': {'job-id': job_id, 'tag': snapshot_tag, 'vmstate': node_name, 'devices': [node_name]}},
> + self.sockf)
> + self.sockf.flush()
> diff --git a/vmtb/bench/machines/virtual/vm.py b/vmtb/bench/machines/virtual/vm.py
> new file mode 100644
> index 000000000..ca1f1346f
> --- /dev/null
> +++ b/vmtb/bench/machines/virtual/vm.py
> @@ -0,0 +1,604 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import base64
> +import json
> +import logging
> +import os
> +import posixpath
> +import shlex
> +import signal
> +import subprocess
> +import threading
> +import time
> +import typing
> +from types import FrameType
> +
> +from bench import exceptions
> +from bench.configurators.vmtb_config import VmtbIgtConfig
> +from bench.machines.machine_interface import (DEFAULT_TIMEOUT,
> + MachineInterface, ProcessResult,
> + SuspendMode)
> +from bench.machines.virtual.backends.guestagent import GuestAgentBackend
> +from bench.machines.virtual.backends.qmp_monitor import QmpMonitor
> +
logger = logging.getLogger('VirtualMachine')


class VirtualMachine(MachineInterface):
    """QEMU virtual machine driven through its guest-agent and QMP sockets.

    Each instance creates its own qcow2 overlay on top of the backing image,
    launches ``qemu-system-x86_64`` on power-on and talks to the guest via
    ``GuestAgentBackend`` (``self.ga``) and to QEMU via ``QmpMonitor``
    (``self.qm``). Both backends exist only after a successful ``poweron``.
    """

    class Decorators():
        """Decorators used by the VirtualMachine methods."""

        @staticmethod
        def alarm_handler(sig: signal.Signals, tb: FrameType) -> typing.Any:
            """SIGALRM handler: convert the alarm into AlarmTimeoutError."""
            raise exceptions.AlarmTimeoutError('Alarm timeout occured')

        @classmethod
        def timeout_signal(cls, func: typing.Callable) -> typing.Callable:
            """Run *func* under a SIGALRM-based timeout.

            The timeout is the integer ``timeout`` argument of the call,
            whether given positionally or by keyword; otherwise
            DEFAULT_TIMEOUT is used. On expiry AlarmTimeoutError is raised
            inside *func*, logged here and re-raised. The alarm is always
            cancelled when the call ends.
            """
            # Local imports: not provided by the module's import block.
            import functools
            import inspect

            signature = inspect.signature(func)

            @functools.wraps(func)
            def timeout_wrapper(*args: typing.Any, **kwargs: typing.Optional[typing.Any]) -> typing.Any:
                timeout: int = DEFAULT_TIMEOUT
                try:
                    requested = signature.bind(*args, **kwargs).arguments.get('timeout')
                except TypeError:
                    # Call does not match the signature: let func raise the real error below
                    requested = None
                if isinstance(requested, int):
                    timeout = requested

                # mypy: silence the following problem in signal.signal() call:
                # error: Argument 2 to "signal" has incompatible type "Callable[[Signals, FrameType], Any]";
                # expected "Union[Callable[[int, Optional[FrameType]], Any], int, Handlers, None]"  [arg-type]
                signal.signal(signal.SIGALRM, cls.alarm_handler)  # type: ignore[arg-type]
                signal.alarm(timeout)
                try:
                    return func(*args, **kwargs)
                except exceptions.AlarmTimeoutError:
                    logger.warning('Timeout (%ss) on %s', timeout, func.__name__)
                    raise
                finally:
                    signal.alarm(0)  # Cancel alarm

            return timeout_wrapper

    def __init__(self, vm_number: int, backing_image: str, driver: str, igt_config: VmtbIgtConfig) -> None:
        """Prepare a VM description and create its qcow2 overlay image.

        Raises:
            exceptions.GuestError: *backing_image* does not exist, or the
                overlay image could not be created.
        """
        self.vf_bdf: typing.Optional[str] = None
        self.process: typing.Optional[subprocess.Popen] = None
        self.vmnum: int = vm_number
        self.card_num: int = 0
        self.sysfs_prefix_path = posixpath.join('/sys/class/drm/', f'card{self.card_num}')
        self.questagent_sockpath = posixpath.join('/tmp', f'qga{self.vmnum}.sock')
        self.qmp_sockpath = posixpath.join('/tmp', f'mon{self.vmnum}.sock')
        self.drm_driver_name: str = driver
        self.igt_config: VmtbIgtConfig = igt_config

        if not posixpath.exists(backing_image):
            logger.error('No image for VM%s', self.vmnum)
            raise exceptions.GuestError(f'No image for VM{self.vmnum}')
        self.image: str = self.__create_qemu_image(backing_image)
        self.migrate_source_image: typing.Optional[str] = None
        self.migrate_destination_vm: bool = False

        # Resources provisioned to the VF/VM:
        self._lmem_size: typing.Optional[int] = None
        self._ggtt_size: typing.Optional[int] = None
        self._contexts: typing.Optional[int] = None
        self._doorbells: typing.Optional[int] = None

        # GT number and tile is relevant mainly for multi-tile devices
        # List of all GTs used by a given VF:
        # - for single-tile: only root [0]
        # - for multi-tile Mode 2/3: either root [0] or remote [1]
        # - for multi-tile Mode 1: spans on both tiles [0, 1]
        self._gt_nums: typing.List[int] = []
        self._tile_mask: typing.Optional[int] = None

    def __str__(self) -> str:
        return f'VM{self.vmnum}_{self.vf_bdf}'

    def __del__(self) -> None:
        """Terminate a QEMU process that is still running at destruction."""
        if not self.is_running():
            return

        # printing and not logging because loggers have some issues
        # in late deinitialization
        print(f'VM{self.vmnum} was not powered off')
        if not self.process:
            return
        self.process.terminate()
        # Let's wait and make sure that qemu shut down
        try:
            self.process.communicate(timeout=30)
        except subprocess.TimeoutExpired:
            print('QEMU did not terminate, killing it')
            self.process.kill()
            self.process.wait()  # reap the killed process

    def __get_backing_file_format(self, backing_file: str) -> typing.Any:
        """Get the format of the backing image file using qemu-img info.

        Raises:
            exceptions.GuestError: qemu-img failed, or its output is not
                JSON carrying a ``format`` key.
        """
        command = ['qemu-img', 'info', '--output=json', backing_file]
        try:
            result = subprocess.run(command, capture_output=True, check=True)
            return json.loads(result.stdout)['format']
        except subprocess.CalledProcessError as exc:
            logger.error("Error executing qemu-img info: %s", exc.stderr)
            raise exceptions.GuestError('Error executing qemu-img info') from exc
        except json.JSONDecodeError as exc:
            logger.error("Invalid JSON output from qemu-img info: %s", exc)
            raise exceptions.GuestError('Invalid JSON output from qemu-img info') from exc
        except KeyError as exc:
            logger.error("Missing 'format' in qemu-img info output")
            raise exceptions.GuestError('Invalid JSON output from qemu-img info') from exc

    def __create_qemu_image(self, backing_file: str) -> str:
        """Create a new qcow2 image with the specified backing file.

        The image is created in the current working directory; its path is
        returned. Raises exceptions.GuestError when qemu-img fails.
        """
        output_image = f'./vm{self.vmnum}_{time.time()}_image.qcow2'
        backing_format = self.__get_backing_file_format(backing_file)

        command = ['qemu-img', 'create',
                   '-f', 'qcow2', '-b', str(backing_file), '-F', str(backing_format), output_image]
        try:
            subprocess.run(command, check=True)
            logger.debug("[VM%s] Created image %s (backing file: %s, format: %s)",
                         self.vmnum, output_image, backing_file, backing_format)
        except subprocess.CalledProcessError as exc:
            logger.error('[VM%s] Error creating qcow2 image: %s', self.vmnum, exc)
            raise exceptions.GuestError('Error creating qcow2 image') from exc

        return output_image

    def __log_qemu_output(self, out: typing.TextIO) -> None:
        """Thread body: forward every line read from *out* to the VM log."""
        stdoutlog = logging.getLogger(f'VM{self.vmnum}-kmsg')
        for line in iter(out.readline, ''):
            stdoutlog.debug(line.strip())

    def __sockets_exists(self) -> bool:
        return os.path.exists(self.questagent_sockpath) and os.path.exists(self.qmp_sockpath)

    def __get_popen_command(self) -> typing.List[str]:
        """Build the QEMU command line for the current VM configuration."""
        # A migration destination boots from the source VM's image
        drive_image = self.migrate_source_image if self.migrate_destination_vm else self.image
        command = ['qemu-system-x86_64',
                   '-vnc', f':{self.vmnum}',
                   '-serial', 'stdio',
                   '-m', '4096',
                   '-drive', f'file={drive_image}',
                   '-chardev', f'socket,path={self.questagent_sockpath},server=on,wait=off,id=qga{self.vmnum}',
                   '-device', 'virtio-serial',
                   '-device', f'virtserialport,chardev=qga{self.vmnum},name=org.qemu.guest_agent.0',
                   '-chardev', f'socket,id=mon{self.vmnum},path={self.qmp_sockpath},server=on,wait=off',
                   '-mon', f'chardev=mon{self.vmnum},mode=control']

        if self.vf_bdf:
            command.extend(['-enable-kvm', '-cpu', 'host'])
            command.extend(['-device', f'vfio-pci,host={self.vf_bdf},enable-migration=on'])

        if self.migrate_destination_vm:
            # If VM is migration destination - run in stopped/prelaunch state (explicit resume required)
            command.extend(['-S'])

        logger.debug('QEMU command: %s', ' '.join(command))
        return command

    def __get_key(self, base: typing.Dict, path: typing.List[str]) -> typing.Any:
        """Walk nested dict *base* along *path*; ValueError if a key is missing."""
        cur = base
        for key in path:
            if cur is None or key not in cur:
                raise ValueError(f'The key {path} does not exist, aborting!')
            cur = cur[key]
        return cur

    def __wait_for_event(self, expected: str, count: int = 1) -> None:
        """Consume QMP messages until *expected* event was seen *count* times."""
        remaining = count
        while remaining > 0:
            if self.qm.get_qmp_event() == expected:
                remaining -= 1

    def __require_status(self, expected: str) -> None:
        """Query the VM status; terminate QEMU and raise GuestError on mismatch."""
        vm_status = self.qm.query_status()
        if vm_status == expected:
            return

        if self.process:
            self.process.terminate()
        logger.error('VM%s status not "%s", instead: %s', self.vmnum, expected, vm_status)
        raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')

    def __wait_for_job(self, job_type: str) -> typing.Optional[str]:
        """Wait for a 'concluded' job event, then return the job error (or None)."""
        while self.qm.get_qmp_event_job() != 'concluded':
            pass

        job_status, job_error = self.qm.query_jobs(job_type)
        if job_status == 'concluded' and job_error is not None:
            return job_error
        return None

    def __wait_for_qemu_exit(self, grace_period: typing.Optional[int]) -> None:
        """Wait for QEMU to exit; kill it if *grace_period* seconds elapse.

        With ``grace_period=None`` the wait is unbounded.
        """
        assert self.process
        try:
            self.process.communicate(timeout=grace_period)
        except subprocess.TimeoutExpired:
            logger.warning('VM%s QEMU did not terminate, killing it', self.vmnum)
            self.process.kill()
            self.process.communicate()

    def __guest_path_test(self, flag: str, path: str) -> bool:
        """Run ``[ <flag> <path> ]`` in the guest shell; True on exit code 0."""
        # Quote twice: once for the guest shell, once for shlex.split() in execute()
        script = f'[ {flag} {shlex.quote(path)} ]'
        pid = self.execute(f'/bin/sh -c {shlex.quote(script)}')
        status = self.execute_wait(pid)
        return not status.exit_code

    @property
    def get_vm_num(self) -> int:
        return self.vmnum

    def assign_vf(self, vf_bdf: str) -> None:
        """Set the BDF of the VF passed to QEMU on the next power-on."""
        self.vf_bdf = vf_bdf

    def set_migration_source(self, src_image: str) -> None:
        """Mark this VM as a migration destination booting from *src_image*."""
        self.migrate_source_image = src_image
        self.migrate_destination_vm = True

    @property
    def lmem_size(self) -> typing.Optional[int]:
        if self._lmem_size is None:
            self.helper_get_debugfs_selfconfig()

        return self._lmem_size

    @property
    def ggtt_size(self) -> typing.Optional[int]:
        if self._ggtt_size is None:
            self.helper_get_debugfs_selfconfig()

        return self._ggtt_size

    @property
    def contexts(self) -> typing.Optional[int]:
        if self._contexts is None:
            self.helper_get_debugfs_selfconfig()

        return self._contexts

    @property
    def doorbells(self) -> typing.Optional[int]:
        if self._doorbells is None:
            self.helper_get_debugfs_selfconfig()

        return self._doorbells

    @property
    def tile_mask(self) -> typing.Optional[int]:
        if self._tile_mask is None:
            self.helper_get_debugfs_selfconfig()

        return self._tile_mask

    @property
    def gt_nums(self) -> typing.List[int]:
        """GT indices found in the guest sysfs; [0] when none is found."""
        self._gt_nums = self.get_gt_num_from_sysfs()
        if not self._gt_nums:
            logger.warning("VM sysfs: missing GT index")
            self._gt_nums = [0]

        return self._gt_nums

    def get_gt_num_from_sysfs(self) -> typing.List[int]:
        """Get GT numbers of the VF passed to a VM, based on existing sysfs paths."""
        return [gt for gt in (0, 1)
                if self.dir_exists(posixpath.join(self.sysfs_prefix_path, f'gt/gt{gt}'))]

    def get_drm_driver_name(self) -> str:
        return self.drm_driver_name

    def get_igt_config(self) -> VmtbIgtConfig:
        return self.igt_config

    @Decorators.timeout_signal
    def poweron(self) -> None:
        """Start QEMU and connect the guest-agent and QMP backends.

        Does nothing (with a warning) when the VM is already running.

        Raises:
            exceptions.GuestError: QEMU did not start or exited early, a
                backend could not be set up, or the VM status is not
                'running' (not checked for a migration destination).
        """
        logger.debug('Powering on VM%s', self.vmnum)
        if self.is_running():
            logger.warning('VM%s already running', self.vmnum)
            return

        command = self.__get_popen_command()
        # We don't want to kill the process created here (like 'with' would do) so disable the following linter issue:
        # R1732: consider-using-with (Consider using 'with' for resource-allocating operations)
        # pylint: disable=R1732
        self.process = subprocess.Popen(
            args=command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)

        # One logging thread per QEMU output stream
        for stream in (self.process.stdout, self.process.stderr):
            threading.Thread(target=self.__log_qemu_output, args=(stream,), daemon=True).start()

        if not self.is_running():
            logger.error('VM%s did not boot', self.vmnum)
            raise exceptions.GuestError(f'VM{self.vmnum} did not start')

        try:
            while not self.__sockets_exists():
                # Stop waiting when QEMU is gone: the sockets will never appear
                if not self.is_running():
                    raise exceptions.GuestError(f'VM{self.vmnum} QEMU exited before creating sockets')
                logger.info('waiting for socket')
                time.sleep(1)
            # Passing five minutes timeout for every command
            self.ga = GuestAgentBackend(self.questagent_sockpath, 300)
            self.qm = QmpMonitor(self.qmp_sockpath, 300)
            vm_status = self.qm.query_status()

            if not self.migrate_destination_vm and vm_status != 'running':
                self.process.terminate()
                logger.error('VM%s status not "running", instead: %s', self.vmnum, vm_status)
                raise exceptions.GuestError(f'VM{self.vmnum} status {vm_status}')
        except Exception as exc:
            logger.error('Error while booting VM%s: %s', self.vmnum, exc)
            self.process.terminate()
            raise exceptions.GuestError(f'VM{self.vmnum} crashed with {exc}') from exc

    def is_running(self) -> bool:
        """Return True while the QEMU process exists and has not exited."""
        return self.process is not None and self.process.poll() is None

    @Decorators.timeout_signal
    def poweroff(self) -> None:
        """Shut the guest down and wait for QEMU to exit.

        On timeout QEMU is terminated (and killed if it ignores that).
        Does nothing (with a warning) when the VM is not running.

        Raises:
            exceptions.GuestError: the sockets still exist after QEMU
                exited; they are removed before raising.
        """
        logger.debug('Powering off VM%s', self.vmnum)
        if not self.is_running():
            logger.warning('VM%s not running', self.vmnum)
            return
        assert self.process  # implied by is_running(); narrows the type for mypy

        grace_period: typing.Optional[int] = None
        try:
            self.ga.poweroff()
            # Wait for shutdown event
            self.__wait_for_event('SHUTDOWN')
        except exceptions.AlarmTimeoutError:
            logger.warning('VM%s hanged on poweroff. Initiating forced termination', self.vmnum)
            self.process.terminate()
            # Bound the wait after a forced termination
            grace_period = 30
        finally:
            # Wait and make sure that qemu shut down
            self.__wait_for_qemu_exit(grace_period)

        if self.__sockets_exists():
            # Remove leftovers and notify about unclear qemu shutdown
            os.remove(self.questagent_sockpath)
            os.remove(self.qmp_sockpath)
            raise exceptions.GuestError(f'VM{self.vmnum} was not gracefully powered off - sockets exist')

    def reboot(self) -> None:
        """Reboot VM via the Guest-Agent guest-shutdown(reboot) command."""
        logger.debug('Rebooting VM%s', self.vmnum)
        self.ga.reboot()

        # Wait for 2x RESET event (guest-reset)
        self.__wait_for_event('RESET', count=2)

    def reset(self) -> None:
        """Reset VM via the QMP system_reset command."""
        logger.debug('Resetting VM%s', self.vmnum)
        self.qm.system_reset()

        # Wait for 2x RESET event (host-qmp-system-reset, guest-reset)
        self.__wait_for_event('RESET', count=2)

    def pause(self) -> None:
        """Stop the VM via QMP; GuestError unless the status becomes 'paused'."""
        logger.debug('Pausing VM%s', self.vmnum)
        self.qm.stop()
        self.__require_status('paused')

    def resume(self) -> None:
        """Continue the VM via QMP; GuestError unless the status becomes 'running'."""
        logger.debug('Resuming VM%s', self.vmnum)
        self.qm.cont()
        self.__require_status('running')

    def quit(self) -> None:
        """Send QMP quit and wait for the SHUTDOWN event."""
        logger.debug('Quitting VM%s', self.vmnum)
        self.qm.quit()
        self.__wait_for_event('SHUTDOWN')

    def _enable_suspend(self) -> None:
        """Unmask the systemd suspend/sleep targets when suspend.target is a symlink."""
        if self.link_exists('/etc/systemd/system/suspend.target'):
            logger.debug('Enable (unmask) systemd suspend/sleep')
            self.execute('systemctl unmask suspend.target sleep.target')

    def suspend(self, mode: SuspendMode = SuspendMode.ACPI_S3) -> None:
        """Suspend the guest and verify the 'suspended' status.

        Only ACPI S3 is implemented.

        Raises:
            exceptions.GuestError: unsupported *mode*, or the VM status is
                not 'suspended' after the SUSPEND event.
        """
        logger.debug('Suspending VM%s (mode: %s)', self.vmnum, mode)
        self._enable_suspend()
        if mode == SuspendMode.ACPI_S3:
            self.ga.suspend_ram()
        elif mode == SuspendMode.ACPI_S4:
            raise exceptions.GuestError('Guest S4 support not implemented')
        else:
            raise exceptions.GuestError('Unknown suspend mode')

        self.__wait_for_event('SUSPEND')
        self.__require_status('suspended')

    def wakeup(self) -> None:
        """Wake the guest via QMP and verify the 'running' status."""
        logger.debug('Waking up VM%s', self.vmnum)
        self.qm.system_wakeup()

        self.__wait_for_event('WAKEUP')
        self.__require_status('running')

    # {"execute": "guest-exec", "arguments":{"path": "/some/path", "arg": [], "capture-output": true}}
    # {"error": {"class": "GenericError", "desc": "Guest... "}}
    def execute(self, command: str) -> int:
        """Start *command* in the guest and return the PID from the reply.

        *command* is split with shlex; the first token is the program.

        Raises:
            exceptions.GuestError: *command* is empty or the guest agent
                reply carries no ``return`` payload.
        """
        arr_cmd = shlex.split(command)
        if not arr_cmd:
            raise exceptions.GuestError('Empty command')

        execout: typing.Dict = self.ga.execute(arr_cmd[0], arr_cmd[1:])
        ret = execout.get('return')
        if ret:
            pid: int = ret.get('pid')
            logger.debug('Running %s on VM%s with pid %s', command, self.vmnum, pid)
            return pid

        logger.error('Command %s did not return pid', command)
        raise exceptions.GuestError(f'No pid returned: {execout}')

    # {'error': {'class': 'GenericError', 'desc': "Invalid parameter 'pid'"}}
    def execute_status(self, pid: int) -> ProcessResult:
        """Return the current state and base64-decoded output of guest process *pid*."""
        out = self.ga.execute_status(pid)
        status = out.get('return')
        if not status:
            raise exceptions.GuestError(f'Not output from guest agent: {out}')

        stdout = base64.b64decode(status.get('out-data', '')).decode('utf-8')
        stderr = base64.b64decode(status.get('err-data', '')).decode('utf-8')

        return ProcessResult(status.get('exited'), status.get('exitcode', None), stdout, stderr)

    @Decorators.timeout_signal
    def execute_wait(self, pid: int, timeout: int = DEFAULT_TIMEOUT) -> ProcessResult:
        """Poll guest process *pid* once per second until it has exited.

        *timeout* is not read here; the timeout decorator picks it up and
        raises AlarmTimeoutError when it elapses.
        """
        while True:
            exec_status = self.execute_status(pid)
            if exec_status.exited:
                # Return right away: no extra sleep after completion
                return exec_status
            time.sleep(1)

    def execute_signal(self, pid: int, sig: signal.Signals) -> None:
        """Send signal *sig* to guest process *pid* using ``kill``."""
        signum = int(sig)
        killpid = self.execute(f'kill -{signum} {pid}')
        self.execute_wait(killpid)

    def read_file_content(self, path: str) -> str:
        """Read the guest file *path* and return its content as text.

        Chunks are collected as bytes and decoded once, so a multi-byte
        UTF-8 character split across two reads decodes correctly.
        """
        out = self.ga.guest_file_open(path, 'r')
        handle = out.get('return')
        if not handle:
            raise exceptions.GuestError('Could not open file on guest')

        chunks: typing.List[bytes] = []
        try:
            eof: bool = False
            while not eof:
                ret = self.ga.guest_file_read(handle)
                eof = self.__get_key(ret, ['return', 'eof'])
                b64buf: str = self.__get_key(ret, ['return', 'buf-b64'])
                chunks.append(base64.b64decode(b64buf))
        finally:
            self.ga.guest_file_close(handle)

        return b''.join(chunks).decode('utf-8')

    def write_file_content(self, path: str, content: str) -> int:
        """Write *content* to the guest file *path*; return the reported count."""
        out: typing.Dict = self.ga.guest_file_open(path, 'w')
        handle = out.get('return')
        if not handle:
            raise exceptions.GuestError('Could not open file on guest')

        b64buf: bytes = base64.b64encode(content.encode())

        try:
            ret = self.ga.guest_file_write(handle, b64buf.decode('utf-8'))
            count: int = self.__get_key(ret, ['return', 'count'])
        finally:
            self.ga.guest_file_close(handle)

        return count

    def dir_exists(self, path: str) -> bool:
        """Return True when *path* is a directory in the guest."""
        return self.__guest_path_test('-d', path)

    def link_exists(self, path: str) -> bool:
        """Return True when *path* is a symbolic link in the guest."""
        return self.__guest_path_test('-h', path)

    @Decorators.timeout_signal
    def ping(self, timeout: int = DEFAULT_TIMEOUT) -> bool:
        """Ping guest and return true if responding, false otherwise."""
        logger.debug('Ping VM%s', self.vmnum)
        try:
            self.ga.ping()
        except exceptions.AlarmTimeoutError:
            logger.warning('VM%s not responded to ping', self.vmnum)
            return False

        return True

    @Decorators.timeout_signal
    def save_state(self) -> None:
        """Save a VM snapshot; raise GuestError when the job reports an error."""
        logger.debug('Saving VM%s state (snapshot)', self.vmnum)
        self.qm.save_snapshot()

        job_error = self.__wait_for_job('snapshot-save')
        if job_error is not None:
            raise exceptions.GuestError(f'VM{self.vmnum} state save error: {job_error}')

        logger.debug('VM%s state save finished successfully', self.vmnum)

    @Decorators.timeout_signal
    def load_state(self) -> None:
        """Load the VM snapshot; raise GuestError when the job reports an error."""
        logger.debug('Loading VM state (snapshot)')
        self.qm.load_snapshot()

        job_error = self.__wait_for_job('snapshot-load')
        if job_error is not None:
            raise exceptions.GuestError(f'VM{self.vmnum} state load error: {job_error}')

        logger.debug('VM state load finished successfully')

    # TODO: function perhaps could be moved to some new utils module
    def helper_convert_units_to_bytes(self, size_str: str) -> int:
        """Convert a size with an optional unit suffix to bytes.

        Accepted forms (case-insensitive, surrounding whitespace ignored):
        a plain number, or a number followed by B, K, M, G, optionally
        written as KB/MB/GB or KiB/MiB/GiB. Multipliers are powers of 1024.

        Returns 0 for an unrecognised unit letter; a non-numeric value
        raises ValueError.
        """
        multipliers = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
        text = size_str.strip().upper()

        # Drop the byte marker so that 'K', 'KB' and 'KIB' are equivalent
        if text.endswith('IB'):
            text = text[:-2]
        elif text.endswith('B'):
            text = text[:-1]
        text = text.rstrip()

        multiplier = 1
        if text and text[-1].isalpha():
            if text[-1] not in multipliers:
                return 0
            multiplier = multipliers[text[-1]]
            text = text[:-1]

        return int(text) * multiplier

    def helper_get_debugfs_selfconfig(self, card: int = 0, gt_num: int = 0) -> None:
        """Read resources allocated to the VF from guest debugfs.

        Parses /sys/kernel/debug/dri/<card>/gt<gt_num>/iov/self_config and
        caches GGTT size, LMEM size, contexts, doorbells and tile mask.
        Lines without a ':' separator and unknown parameters are ignored.

        Args:
            card: card number
            gt_num: GT instance number
        """
        path = f'/sys/kernel/debug/dri/{card}/gt{gt_num}/iov/self_config'
        out = self.read_file_content(path)

        for line in out.splitlines():
            param, separator, value = line.partition(':')
            if not separator:
                continue
            param = param.strip()
            value = value.strip()

            if param == 'GGTT size':
                self._ggtt_size = self.helper_convert_units_to_bytes(value)
            elif param == 'LMEM size':
                self._lmem_size = self.helper_convert_units_to_bytes(value)
            elif param == 'contexts':
                self._contexts = int(value)
            elif param == 'doorbells':
                self._doorbells = int(value)
            elif param == 'tile mask':
                self._tile_mask = int(value, base=16)
> diff --git a/vmtb/dev-requirements.txt b/vmtb/dev-requirements.txt
> new file mode 100644
> index 000000000..66a7c21e4
> --- /dev/null
> +++ b/vmtb/dev-requirements.txt
> @@ -0,0 +1,5 @@
> +# Testing
> +pytest
> +
> +# Building
> +build
> diff --git a/vmtb/pyproject.toml b/vmtb/pyproject.toml
> new file mode 100644
> index 000000000..7b8a63da2
> --- /dev/null
> +++ b/vmtb/pyproject.toml
> @@ -0,0 +1,25 @@
> +[build-system]
> +requires = ["setuptools >= 70.0"]
> +build-backend = "setuptools.build_meta"
> +
> +[project]
> +name = "vmtb"
> +version = "1.0.0"
> +description = "SR-IOV VM-level test tool"
> +readme = "README.md"
> +requires-python = ">=3.11"
> +
> +authors = [
> + {name = "Intel Corporation"}
> +]
> +classifiers = [
> + "Programming Language :: Python :: 3",
> + "License :: OSI Approved :: MIT License",
> +]
> +dependencies = [
> + "pytest",
> +]
> +
> +[tool.setuptools.packages.find]
> +where = ["."]
> +include = ["*"]
> diff --git a/vmtb/pytest.ini b/vmtb/pytest.ini
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/requirements.txt b/vmtb/requirements.txt
> new file mode 100644
> index 000000000..5d80ceeab
> --- /dev/null
> +++ b/vmtb/requirements.txt
> @@ -0,0 +1,2 @@
> +# Used for running tests
> +pytest
> diff --git a/vmtb/vmm_flows/__init__.py b/vmtb/vmm_flows/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/vmtb/vmm_flows/conftest.py b/vmtb/vmm_flows/conftest.py
> new file mode 100644
> index 000000000..474fcdb98
> --- /dev/null
> +++ b/vmtb/vmm_flows/conftest.py
> @@ -0,0 +1,307 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import json
> +import logging
> +import re
> +import typing
> +
> +from dataclasses import dataclass
> +from pathlib import Path
> +
> +import pytest
> +
> +from bench import exceptions
> +from bench.helpers.helpers import (modprobe_driver, modprobe_driver_check)
> +from bench.helpers.log import HOST_DMESG_FILE
> +from bench.configurators.vgpu_profile_config import VgpuProfileConfigurator, VfSchedulingMode
> +from bench.configurators.vgpu_profile import VgpuProfile
> +from bench.configurators.vmtb_config import VmtbConfigurator
> +from bench.machines.host import Host, Device
> +from bench.machines.virtual.vm import VirtualMachine
> +
> +
# Module-level logger for this conftest module.
logger = logging.getLogger('Conftest')
> +
> +
def pytest_addoption(parser):
    """Register the ``--vm-image`` and ``--card`` command-line options."""
    options = (
        ('--vm-image', 'OS image to boot on VM'),
        ('--card', 'Device card index for test execution'),
    )
    for flag, description in options:
        parser.addoption(flag, action='store', help=description)
> +
> +
@dataclass
class VmmTestingConfig:
    """Structure represents test configuration used by a setup fixture.

    Available settings:
    - num_vfs: requested number of VFs to enable
    - max_num_vms: maximal number of VMs (the value can be different than enabled number of VFs)
    - scheduling_mode: requested vGPU scheduling profile (infinite maps to default 0's)
    - auto_poweron_vm: assign VFs and power on VMs automatically in setup fixture
    - auto_probe_vm_driver: probe guest DRM driver in setup fixture (VM must be powered on)
    - unload_host_drivers_on_teardown: unload host DRM drivers in teardown fixture
    - wa_reduce_vf_lmem: workaround to reduce VF LMEM (for save-restore/migration tests speed-up)
    """
    num_vfs: int = 1
    max_num_vms: int = 2
    scheduling_mode: VfSchedulingMode = VfSchedulingMode.INFINITE

    auto_poweron_vm: bool = True
    auto_probe_vm_driver: bool = True
    unload_host_drivers_on_teardown: bool = False
    # Temporary W/A: reduce size of LMEM assigned to VFs to speed up a VF state save-restore process
    wa_reduce_vf_lmem: bool = False

    def __str__(self) -> str:
        """Short identifier: number of VFs followed by 'VF'."""
        return f'{self.num_vfs}VF'

    def __repr__(self) -> str:
        """Multi-line dump of every setting."""
        return (f'\nVmmTestingConfig:'
                f'\nNum VFs = {self.num_vfs} / max num VMs = {self.max_num_vms}'
                f'\nVF scheduling mode = {self.scheduling_mode}'
                f'\nSetup flags:'
                f'\n\tVM - auto power-on = {self.auto_poweron_vm}'
                f'\n\tVM - auto DRM driver probe = {self.auto_probe_vm_driver}'
                f'\n\tHost - unload drivers on teardown = {self.unload_host_drivers_on_teardown}'
                f'\n\tW/A - reduce VF LMEM (improves migration time) = {self.wa_reduce_vf_lmem}')
> +
> +
class VmmTestingSetup:
    """Test environment: the host, the device under test and the VMs.

    Construction loads the host drivers, discovers devices, resolves the
    vGPU profile and creates (but does not power on) the VM objects.
    """

    def __init__(self, vmtb_config: VmtbConfigurator, cmdline_config: typing.Dict,
                 host: Host, testing_config: VmmTestingConfig) -> None:
        """Build the setup from the VMTB config and command-line overrides.

        A non-None ``card_index`` / ``vm_image`` entry in *cmdline_config*
        takes precedence over the corresponding VMTB config value.
        """
        self.testing_config: VmmTestingConfig = testing_config
        self.host: Host = host

        cmdline_card = cmdline_config['card_index']
        self.dut_index = vmtb_config.get_host_config().card_index if cmdline_card is None else int(cmdline_card)
        cmdline_image = cmdline_config['vm_image']
        self.guest_os_image = vmtb_config.get_guest_config().os_image_path if cmdline_image is None \
            else cmdline_image

        self.vgpu_profiles_dir = vmtb_config.vmtb_config_file.parent / vmtb_config.config.vgpu_profiles_path

        self.host.dut_index = self.dut_index
        self.host.drm_driver_name = vmtb_config.get_host_config().driver
        self.host.igt_config = vmtb_config.get_host_config().igt_config

        self.host.load_drivers()
        self.host.discover_devices()

        dut = self.get_dut()
        logger.info("\nDUT info:"
                    "\n\tCard index: %s"
                    "\n\tPCI BDF: %s "
                    "\n\tDevice ID: %s (%s)"
                    "\n\tHost DRM driver: %s",
                    self.host.dut_index,
                    dut.pci_info.bdf,
                    dut.pci_info.devid, dut.gpu_model,
                    dut.driver.get_name())

        self.vgpu_profile: VgpuProfile = self.get_vgpu_profile()

        # Start maximum requested number of VMs, but not more than VFs supported by the given vGPU profile
        num_vms = min(self.vgpu_profile.num_vfs, self.testing_config.max_num_vms)
        self.vms: typing.List[VirtualMachine] = [
            VirtualMachine(vm_idx, self.guest_os_image,
                           vmtb_config.get_guest_config().driver,
                           vmtb_config.get_guest_config().igt_config)
            for vm_idx in range(num_vms)]

    def get_vgpu_profile(self) -> VgpuProfile:
        """Look up the vGPU profile for the requested VF count and scheduling mode.

        Raises:
            exceptions.VgpuProfileError: no suitable profile was found.
        """
        configurator = VgpuProfileConfigurator(self.vgpu_profiles_dir, self.get_dut().gpu_model)
        try:
            vgpu_profile = configurator.get_vgpu_profile(self.testing_config.num_vfs,
                                                         self.testing_config.scheduling_mode)
        except exceptions.VgpuProfileError as exc:
            logger.error("Suitable vGPU profile not found: %s", exc)
            raise exceptions.VgpuProfileError('Invalid test setup - vGPU profile not found!') from exc

        vgpu_profile.print_parameters()

        return vgpu_profile

    def get_dut(self) -> Device:
        """Return the host GPU device at ``dut_index``.

        Raises:
            exceptions.VmtbConfigError: the index is outside the list of
                discovered devices.
        """
        try:
            return self.host.gpu_devices[self.dut_index]
        except IndexError as exc:
            logger.error("Invalid VMTB config - device card index = %s not available", self.dut_index)
            raise exceptions.VmtbConfigError(f'Device card index = {self.dut_index} not available') from exc

    @property
    def get_vm(self) -> typing.List[VirtualMachine]:
        """List of all VM objects of this setup."""
        return self.vms

    def get_num_vms(self) -> int:
        return len(self.vms)

    def poweron_vms(self) -> None:
        """Power on every VM in order."""
        for vm in self.vms:
            vm.poweron()

    def poweroff_vms(self) -> None:
        """Power off every running VM.

        A failure on one VM is logged and the remaining VMs are still
        processed; it also sets ``unload_host_drivers_on_teardown``.

        Raises:
            exceptions.GuestError: ``unload_host_drivers_on_teardown`` is
                set once all VMs were processed.
        """
        for vm in self.vms:
            if not vm.is_running():
                continue
            try:
                vm.poweroff()
            except Exception as exc:
                self.testing_config.unload_host_drivers_on_teardown = True
                logger.warning("Error on VM%s poweroff (%s)", vm.vmnum, exc)

        if self.testing_config.unload_host_drivers_on_teardown:
            raise exceptions.GuestError('VM poweroff issue - cleanup on test teardown')

    def teardown(self) -> None:
        """Power off the VMs, then remove the VFs and reset provisioning.

        Power-off errors are logged, not propagated; the device cleanup
        always runs. Host drivers are unloaded when
        ``unload_host_drivers_on_teardown`` is set.
        """
        try:
            self.poweroff_vms()
        except Exception as exc:
            logger.error("Error on test teardown (%s)", exc)
        finally:
            dut = self.get_dut()
            # Read the VF count before removing the VFs
            num_vfs = dut.get_current_vfs()
            dut.remove_vfs()
            dut.reset_provisioning(num_vfs)
            dut.cancel_work()

            if self.testing_config.unload_host_drivers_on_teardown:
                self.host.unload_drivers()
> +
> +
> +@pytest.fixture(scope='session', name='get_vmtb_config')
> +def fixture_get_vmtb_config(create_host_log, pytestconfig):
> + VMTB_CONFIG_FILE = 'vmtb_config.json'
> + # Pytest Config.rootpath points to the VMTB base directory
> + vmtb_config_file_path: Path = pytestconfig.rootpath / VMTB_CONFIG_FILE
> + return VmtbConfigurator(vmtb_config_file_path)
> +
> +
> +@pytest.fixture(scope='session', name='create_host_log')
> +def fixture_create_host_log():
> + if HOST_DMESG_FILE.exists():
> + HOST_DMESG_FILE.unlink()
> + HOST_DMESG_FILE.touch()
> +
> +
> +@pytest.fixture(scope='session', name='get_cmdline_config')
> +def fixture_get_cmdline_config(request):
> + cmdline_params = {}
> + cmdline_params['vm_image'] = request.config.getoption('--vm-image')
> + cmdline_params['card_index'] = request.config.getoption('--card')
> + return cmdline_params
> +
> +
> +@pytest.fixture(scope='session', name='get_host')
> +def fixture_get_host():
> + return Host()
> +
> +
> +@pytest.fixture(scope='class', name='setup_vms')
> +def fixture_setup_vms(get_vmtb_config, get_cmdline_config, get_host, request):
> + """Arrange VM environment for the VMM Flows test execution.
> +
> + VM setup steps follow the configuration provided as VmmTestingConfig parameter, including:
> + host drivers probe (DRM and VFIO), provision and enable VFs, boot VMs and load guest DRM driver.
> + Tear-down phase covers test environment cleanup:
> + shutdown VMs, reset provisioning, disable VFs and optional host drivers unload.
> +
> + The fixture is designed for test parametrization, as the input to the following test class decorator:
> + @pytest.mark.parametrize('setup_vms', set_test_config(max_vms=N), ids=idfn_test_config, indirect=['setup_vms'])
> + where 'set_test_config' provides request parameter with a VmmTestingConfig (usually list of configs).
> + """
> + tc: VmmTestingConfig = request.param
> + logger.debug(repr(tc))
> +
> + host: Host = get_host
> + ts: VmmTestingSetup = VmmTestingSetup(get_vmtb_config, get_cmdline_config, host, tc)
> +
> + device: Device = ts.get_dut()
> + num_vfs = ts.vgpu_profile.num_vfs
> + num_vms = ts.get_num_vms()
> +
> + logger.info('[Test setup: %sVF-%sVM]', num_vfs, num_vms)
> +
> + # XXX: VF migration on discrete devices (with LMEM) is currently quite slow.
> + # As a temporary workaround, reduce size of LMEM assigned to VFs to speed up a state save/load process.
> + if tc.wa_reduce_vf_lmem and device.has_lmem():
> + logger.debug("W/A: reduce VFs LMEM quota to accelerate state save/restore")
> + org_vgpu_profile_vfLmem = ts.vgpu_profile.resources.vfLmem
> + # Assign max 512 MB to VF
> + ts.vgpu_profile.resources.vfLmem = min(ts.vgpu_profile.resources.vfLmem // 2, 536870912)
> +
> + device.provision(ts.vgpu_profile)
> +
> + assert device.create_vf(num_vfs) == num_vfs
> +
> + if tc.auto_poweron_vm:
> + bdf_list = [device.get_vf_bdf(vf) for vf in range(1, num_vms + 1)]
> + for vm, bdf in zip(ts.get_vm, bdf_list):
> + vm.assign_vf(bdf)
> +
> + ts.poweron_vms()
> +
> + if tc.auto_probe_vm_driver:
> + modprobe_cmds = [modprobe_driver(vm) for vm in ts.get_vm]
> + for i, cmd in enumerate(modprobe_cmds):
> + assert modprobe_driver_check(ts.get_vm[i], cmd), f'modprobe failed on VM{i}'
> +
> + logger.info('[Test execution: %sVF-%sVM]', num_vfs, num_vms)
> + yield ts
> +
> + logger.info('[Test teardown: %sVF-%sVM]', num_vfs, num_vms)
> + # XXX: cleanup counterpart for VFs LMEM quota workaround - restore original value
> + if tc.wa_reduce_vf_lmem and device.has_lmem():
> + ts.vgpu_profile.resources.vfLmem = org_vgpu_profile_vfLmem
> +
> + ts.teardown()
> +
> +
> +def idfn_test_config(test_config: VmmTestingConfig):
> + """Provide test config ID in parametrized tests (e.g. test_something[V4]).
> + Usage: @pytest.mark.parametrize([...], ids=idfn_test_config, [...])
> + """
> + return str(test_config)
> +
> +
> +RESULTS_FILE = Path() / "results.json"
> +results = {
> + "results_version": 10,
> + "name": "results",
> + "tests": {},
> +}
> +
> +
> +@pytest.hookimpl(hookwrapper=True)
> +def pytest_report_teststatus(report):
> + yield
> + with open(HOST_DMESG_FILE, 'r+', encoding='utf-8') as dmesg_file:
> + dmesg = dmesg_file.read()
> + test_string = re.findall('[A-Za-z_.]*::.*', report.nodeid)[0]
> + results["name"] = f"vmtb_{test_string}"
> + test_name = f"vmtb@{test_string}"
> + if report.when == 'call':
> + out = report.capstdout
> + if report.passed:
> + result = "pass"
> + out = f"{test_name} passed"
> + elif report.failed:
> + result = "fail"
> + else:
> + result = "skip"
> + result = {"out": out, "result": result, "time": {"start": 0, "end": report.duration},
> + "err": report.longreprtext, "dmesg": dmesg}
> + results["tests"][test_name] = result
> + dmesg_file.truncate(0)
> + elif report.when == 'setup' and report.failed:
> + result = {"out": report.capstdout, "result": "crash", "time": {"start": 0, "end": report.duration},
> + "err": report.longreprtext, "dmesg": dmesg}
> + results["tests"][test_name] = result
> + dmesg_file.truncate(0)
> +
> +
> +@pytest.hookimpl()
> +def pytest_sessionfinish():
> + if RESULTS_FILE.exists():
> + RESULTS_FILE.unlink()
> + RESULTS_FILE.touch()
> + jsonString = json.dumps(results, indent=2)
> + with open(str(RESULTS_FILE), 'w', encoding='utf-8') as f:
> + f.write(jsonString)
> diff --git a/vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json b/vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json
> new file mode 100644
> index 000000000..ff1fa7e20
> --- /dev/null
> +++ b/vmtb/vmm_flows/resources/vgpu_profiles/Flex170.json
> @@ -0,0 +1,113 @@
> +{
> + "version": "1.1",
> + "PFResources": {
> + "Default": "MinimumPFResources",
> + "Profile": {
> + "MinimumPFResources": {
> + "LocalMemoryEccOn": 402653184,
> + "LocalMemoryEccOff": 402653184,
> + "Contexts": 1024,
> + "Doorbells": 16,
> + "GGTTSize": 268435456
> + }
> + }
> + },
> + "vGPUResources": {
> + "Default": null,
> + "Profile": {
> + "Flex170_16": {
> + "VFCount": 1,
> + "LocalMemoryEccOff": 16777216000,
> + "LocalMemoryEccOn": 2147483648,
> + "Contexts": 1024,
> + "Doorbells": 240,
> + "GGTTSize": 4026531840
> + },
> + "Flex170_8": {
> + "VFCount": 2,
> + "LocalMemoryEccOff": 8388608000,
> + "LocalMemoryEccOn": 2147483648,
> + "Contexts": 1024,
> + "Doorbells": 120,
> + "GGTTSize": 2013265920
> + },
> + "Flex170_4": {
> + "VFCount": 4,
> + "LocalMemoryEccOff": 4194304000,
> + "LocalMemoryEccOn": 2147483648,
> + "Contexts": 1024,
> + "Doorbells": 60,
> + "GGTTSize": 1006632960
> + },
> + "Flex170_2": {
> + "VFCount": 8,
> + "LocalMemoryEccOff": 2097152000,
> + "LocalMemoryEccOn": 1073741824,
> + "Contexts": 1024,
> + "Doorbells": 30,
> + "GGTTSize": 503316480
> + },
> + "Flex170_1": {
> + "VFCount": 16,
> + "LocalMemoryEccOff": 1048576000,
> + "LocalMemoryEccOn": 536870912,
> + "Contexts": 1024,
> + "Doorbells": 15,
> + "GGTTSize": 251658240
> + }
> + }
> + },
> + "vGPUScheduler": {
> + "Default": "Flexible_30fps_GPUTimeSlicing",
> + "Profile": {
> + "Flexible_30fps_GPUTimeSlicing": {
> + "GPUTimeSlicing": {
> + "ScheduleIfIdle": false,
> + "PFExecutionQuantum": 20,
> + "PFPreemptionTimeout": 20000,
> + "VFAttributes": {
> + "VFExecutionQuantum": "lambda VFCount : max( 32 // VFCount, 1)",
> + "VFPreemptionTimeout": "lambda VFCount : 128000 if (VFCount == 1) else max( 64000 // VFCount, 16000)"
> + }
> + }
> + },
> + "Fixed_30fps_GPUTimeSlicing": {
> + "GPUTimeSlicing": {
> + "ScheduleIfIdle": true,
> + "PFExecutionQuantum": 20,
> + "PFPreemptionTimeout": 20000,
> + "VFAttributes": {
> + "VFExecutionQuantum": "lambda VFCount : max( 32 // VFCount, 1)",
> + "VFPreemptionTimeout": "lambda VFCount : 128000 if (VFCount == 1) else max( 64000 // VFCount, 16000)"
> + }
> + }
> + },
> + "Flexible_BurstableQoS_GPUTimeSlicing": {
> + "GPUTimeSlicing": {
> + "ScheduleIfIdle": false,
> + "PFExecutionQuantum": 20,
> + "PFPreemptionTimeout": 20000,
> + "VFAttributes": {
> + "VFExecutionQuantum": "lambda VFCount : min((2000 // max(VFCount-1,1)*0.5, 50))",
> + "VFPreemptionTimeout": "lambda VFCount : (2000 // max(VFCount-1,1) - min((2000 // max(VFCount-1,1))*0.5, 50))*1000"
> + }
> + }
> + }
> + }
> + },
> + "vGPUSecurity": {
> + "Default": "Disabled",
> + "Profile": {
> + "Disabled": {
> + "ResetAfterVfSwitch": false,
> + "GuCSamplingPeriod": 0,
> + "GuCThresholdCATError": 0,
> + "GuCThresholdPageFault": 0,
> + "GuCThresholdH2GStorm": 0,
> + "GuCThresholdDbStorm": 0,
> + "GuCThresholdGTIrqStorm": 0,
> + "GuCThresholdEngineReset": 0
> + }
> + }
> + }
> +}
> \ No newline at end of file
> diff --git a/vmtb/vmm_flows/test_basic.py b/vmtb/vmm_flows/test_basic.py
> new file mode 100644
> index 000000000..b8155c610
> --- /dev/null
> +++ b/vmtb/vmm_flows/test_basic.py
> @@ -0,0 +1,160 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +import time
> +from typing import List, Tuple
> +
> +import pytest
> +
> +from bench.configurators.vgpu_profile_config import VfSchedulingMode
> +from bench.executors.gem_wsim import (ONE_CYCLE_DURATION_MS,
> + PREEMPT_10MS_WORKLOAD, GemWsim,
> + GemWsimResult,
> + gem_wsim_parallel_exec_and_check)
> +from bench.executors.igt import IgtExecutor, IgtType
> +from bench.helpers.helpers import (driver_check, igt_check, igt_run_check,
> + modprobe_driver_run_check)
> +from vmm_flows.conftest import (VmmTestingConfig, VmmTestingSetup,
> + idfn_test_config)
> +
> +logger = logging.getLogger(__name__)
> +
> +WL_ITERATIONS_10S = 1000
> +WL_ITERATIONS_30S = 3000
> +MS_IN_SEC = 1000
> +DELAY_FOR_WORKLOAD_SEC = 2 # Waiting gem_wsim to be running [seconds]
> +DELAY_FOR_RELOAD_SEC = 3 # Waiting before driver reloading [seconds]
> +
> +
> +def set_test_config(test_variants: List[Tuple[int, VfSchedulingMode]],
> + max_vms: int = 2, vf_driver_load: bool = True) -> List[VmmTestingConfig]:
> + """Helper function to provide a parametrized test with a list of test configuration variants."""
> + logger.debug("Init test variants: %s", test_variants)
> + test_configs: List[VmmTestingConfig] = []
> +
> + for config in test_variants:
> + (num_vfs, scheduling_mode) = config
> + test_configs.append(VmmTestingConfig(num_vfs, max_vms, scheduling_mode, auto_probe_vm_driver=vf_driver_load))
> +
> + return test_configs
> +
> +
> +test_variants_1 = [(1, VfSchedulingMode.DEFAULT_PROFILE), (2, VfSchedulingMode.DEFAULT_PROFILE)]
> +
> +@pytest.mark.parametrize('setup_vms', set_test_config(test_variants_1), ids=idfn_test_config, indirect=['setup_vms'])
> +class TestVmSetup:
> + """Verify basic virtualization setup:
> + - probe PF and VFIO drivers (host)
> + - enable and provision VFs (automatic or manual with vGPU profile)
> + - power on VMs with assigned VFs
> + - probe VF driver (guest)
> + - shutdown VMs, reset provisioning and disable VFs
> + """
> + def test_vm_boot(self, setup_vms):
> + logger.info("Test VM boot: power on VM and probe VF driver")
> + ts: VmmTestingSetup = setup_vms
> +
> + for vm in ts.vms:
> + logger.info("[%s] Verify VF DRM driver is loaded in a guest OS", vm)
> + assert driver_check(vm)
> +
> +
> +test_variants_2 = [(1, VfSchedulingMode.DEFAULT_PROFILE), (2, VfSchedulingMode.DEFAULT_PROFILE),
> + (4, VfSchedulingMode.DEFAULT_PROFILE)]
> +
> +@pytest.mark.parametrize('setup_vms', set_test_config(test_variants_2), ids=idfn_test_config, indirect=['setup_vms'])
> +class TestVmWorkload:
> + """Verify basic IGT workload execution on VM(s):
> + - exec_store: basic store submissions on single/multiple VMs
> + - gem_wsim: workload simulator running in parallel on multiple VMs
> + """
> + def test_store(self, setup_vms):
> + logger.info("Test VM execution: exec_store")
> + ts: VmmTestingSetup = setup_vms
> + igt_worklads: List[IgtExecutor] = []
> +
> + for vm in ts.vms:
> + logger.info("[%s] Execute basic WL", vm)
> + igt_worklads.append(IgtExecutor(vm, IgtType.EXEC_STORE))
> +
> + for igt in igt_worklads:
> + logger.info("[%s] Verify result of basic WL", igt.target)
> + assert igt_check(igt)
> +
> + logger.info("[%s] Verify result of basic WL", ts.host)
> + igt_run_check(ts.host, IgtType.EXEC_STORE)
> +
> + def test_wsim(self, setup_vms):
> + logger.info("Test VM execution: gem_wsim")
> + ts: VmmTestingSetup = setup_vms
> +
> + if ts.get_num_vms() < 2:
> + pytest.skip("Test scenario not supported for 1xVM setup ")
> +
> + # Single workload takes 10ms GPU time, multiplied by 1000 iterations
> + # gives the expected 10s duration and 100 workloads/sec
> + expected = GemWsimResult(ONE_CYCLE_DURATION_MS * WL_ITERATIONS_10S * len(ts.vms) / MS_IN_SEC,
> + MS_IN_SEC/ONE_CYCLE_DURATION_MS / len(ts.vms))
> +
> + # Check preemptable workload
> + result = gem_wsim_parallel_exec_and_check(ts.vms, PREEMPT_10MS_WORKLOAD, WL_ITERATIONS_10S, expected)
> + logger.info("Execute wsim parallel on VMs - results: %s", result)
> +
> +
> +test_variants_3 = [(2, VfSchedulingMode.DEFAULT_PROFILE), (4, VfSchedulingMode.DEFAULT_PROFILE)]
> +
> +@pytest.mark.parametrize('setup_vms', set_test_config(test_variants=test_variants_3, max_vms=4, vf_driver_load=False),
> + ids = idfn_test_config, indirect=['setup_vms'])
> +class TestVfDriverLoadRemove:
> + """Verify VF (guest) driver load or remove doesn't affect execution on the other VM:
> + - probe VF driver on the last VM while the first VM is running workload
> + - remove VF driver on the first VM while the last VM is running workload
> + - reload previously removed VF driver on the same VM
> + """
> + def test_load(self, setup_vms):
> + logger.info("Test VM driver load: VF driver probe while other VM executes workload")
> + ts: VmmTestingSetup = setup_vms
> +
> + vm_first = ts.vms[0]
> + vm_last = ts.vms[-1]
> +
> + logger.info("[%s] Load VF driver and run basic WL - first VM", vm_first)
> + assert modprobe_driver_run_check(vm_first)
> +
> + expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
> + gem_wsim = GemWsim(vm_first, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
> + time.sleep(DELAY_FOR_WORKLOAD_SEC)
> + assert gem_wsim.is_running()
> +
> + logger.info("[%s] Load VF driver - last VM", vm_last)
> + assert modprobe_driver_run_check(vm_last)
> +
> + result = gem_wsim.wait_results()
> + assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
> +
> + def test_reload(self, setup_vms):
> + logger.info("Test VM driver reload: VF driver remove is followed by probe while other VM executes workload")
> + ts: VmmTestingSetup = setup_vms
> +
> + vm_first = ts.vms[0]
> + vm_last = ts.vms[-1]
> +
> + logger.info("[%s] Run basic WL - last VM", vm_last)
> + expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
> + gem_wsim = GemWsim(vm_last, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
> + time.sleep(DELAY_FOR_WORKLOAD_SEC)
> + assert gem_wsim.is_running()
> +
> + logger.info("[%s] Remove VF driver - first VM", vm_first)
> + rmmod_pid = vm_first.execute(f'modprobe -rf {vm_first.get_drm_driver_name()}')
> + assert vm_first.execute_wait(rmmod_pid).exit_code == 0
> +
> + time.sleep(DELAY_FOR_RELOAD_SEC)
> +
> + logger.info("[%s] Reload VF driver and run basic WL - first VM", vm_first)
> + assert modprobe_driver_run_check(vm_first)
> + assert igt_run_check(vm_first, IgtType.EXEC_STORE)
> +
> + result = gem_wsim.wait_results()
> + assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
> diff --git a/vmtb/vmtb_config.json b/vmtb/vmtb_config.json
> new file mode 100644
> index 000000000..640a64123
> --- /dev/null
> +++ b/vmtb/vmtb_config.json
> @@ -0,0 +1,31 @@
> +{
> + "host": {
> + "card_index": 0,
> + "driver": "xe",
> + "igt": {
> + "test_dir": "/usr/local/libexec/igt-gpu-tools/",
> + "tool_dir": "/usr/local/bin/",
> + "lib_dir": "/usr/local/lib/x86_64-linux-gnu",
> + "result_dir": "/usr/local/results",
> + "options": "--piglit-style-dmesg --dmesg-warn-level=4 --abort-on-monitored-error=taint --overwrite"
> + }
> + },
> + "guest": {
> + "os_image": "guest_os.img",
> + "driver": "xe",
> + "igt": {
> + "test_dir": "/usr/local/libexec/igt-gpu-tools/",
> + "tool_dir": "/usr/local/bin/",
> + "lib_dir": "/usr/local/lib/x86_64-linux-gnu",
> + "result_dir": "/usr/local/results",
> + "options": "--piglit-style-dmesg --dmesg-warn-level=4 --abort-on-monitored-error=taint --overwrite"
> + }
> + },
> + "resources": {
> + "vgpu_profiles_path": "vmm_flows/resources/vgpu_profiles",
> + "guc_ver_path": "vmm_flows/resources/guc"
> + },
> + "ci": {
> + "host_dmesg_file": "/tmp/vm-test-bench-host_dmesg.log.tmp"
> + }
> +}
> --
> 2.39.1
>
More information about the igt-dev
mailing list