[PATCH i-g-t v3 2/4] tools/vmtb: Basic SR-IOV tests
Bernatowicz, Marcin
marcin.bernatowicz at linux.intel.com
Thu Dec 5 10:01:53 UTC 2024
On 11/27/2024 11:22 AM, Adam Miszczak wrote:
> Provide basic SR-IOV test cases:
> - enable VFs, pass it to VMs and boot guest OS
> - submit basic workloads on a guest with virtualized GPU
> - exercise VF driver probe and remove
>
> Tests are created to run with a pytest framework.
> Basic tests are located in vmm_flows/test_basic.py,
> as test classes with subtests implemented as methods.
> Test setup is arranged by fixtures located in vmm_flows/conftest.py.
> Tests are usually executed in multiple configuration variants
> (e.g. various number of VFs enabled, different provisioning settings).
>
> Initially only fundamental test scenarios are provided,
> but generally, the tool targets also complex test cases, like:
> - VF save/restore (VM migration)
> - VF provisioning
> - VF scheduling
> - VM power states
> - VF FLR
> - VM crash
> - GuC FW versioning
>
> Signed-off-by: Adam Miszczak <adam.miszczak at linux.intel.com>
> ---
> tools/vmtb/vmm_flows/__init__.py | 0
> tools/vmtb/vmm_flows/conftest.py | 307 +++++++++++++++++++++++++++++
> tools/vmtb/vmm_flows/test_basic.py | 160 +++++++++++++++
> 3 files changed, 467 insertions(+)
> create mode 100644 tools/vmtb/vmm_flows/__init__.py
> create mode 100644 tools/vmtb/vmm_flows/conftest.py
> create mode 100644 tools/vmtb/vmm_flows/test_basic.py
>
> diff --git a/tools/vmtb/vmm_flows/__init__.py b/tools/vmtb/vmm_flows/__init__.py
> new file mode 100644
> index 000000000..e69de29bb
> diff --git a/tools/vmtb/vmm_flows/conftest.py b/tools/vmtb/vmm_flows/conftest.py
> new file mode 100644
> index 000000000..474fcdb98
> --- /dev/null
> +++ b/tools/vmtb/vmm_flows/conftest.py
> @@ -0,0 +1,307 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import json
> +import logging
> +import re
> +import typing
> +
> +from dataclasses import dataclass
> +from pathlib import Path
> +
> +import pytest
> +
> +from bench import exceptions
> +from bench.helpers.helpers import (modprobe_driver, modprobe_driver_check)
> +from bench.helpers.log import HOST_DMESG_FILE
> +from bench.configurators.vgpu_profile_config import VgpuProfileConfigurator, VfSchedulingMode
> +from bench.configurators.vgpu_profile import VgpuProfile
> +from bench.configurators.vmtb_config import VmtbConfigurator
> +from bench.machines.host import Host, Device
> +from bench.machines.virtual.vm import VirtualMachine
> +
> +
> +logger = logging.getLogger('Conftest')
> +
> +
> +def pytest_addoption(parser):
> + parser.addoption('--vm-image',
> + action='store',
> + help='OS image to boot on VM')
> + parser.addoption('--card',
> + action='store',
> + help='Device card index for test execution')
> +
> +
> + at dataclass
> +class VmmTestingConfig:
> + """Structure represents test configuration used by a setup fixture.
> +
> + Available settings:
> + - num_vfs: requested number of VFs to enable
> + - max_num_vms: maximal number of VMs (the value can be different than enabled number of VFs)
> + - scheduling_mode: requested vGPU scheduling profile (infinite maps to default 0's)
> + - auto_poweron_vm: assign VFs and power on VMs automatically in setup fixture
> + - auto_probe_vm_driver: probe guest DRM driver in setup fixture (VM must be powered on)
> + - unload_host_drivers_on_teardown: unload host DRM drivers in teardown fixture
> + - wa_reduce_vf_lmem: workaround to reduce VF LMEM (for save-restore/migration tests speed-up)
> + """
> + num_vfs: int = 1
> + max_num_vms: int = 2
> + scheduling_mode: VfSchedulingMode = VfSchedulingMode.INFINITE
> +
> + auto_poweron_vm: bool = True
> + auto_probe_vm_driver: bool = True
> + unload_host_drivers_on_teardown: bool = False
> + # Temporary W/A: reduce size of LMEM assigned to VFs to speed up a VF state save-restore process
> + wa_reduce_vf_lmem: bool = False
> +
> + def __str__(self) -> str:
> + return f'{self.num_vfs}VF'
> +
> + def __repr__(self) -> str:
> + return (f'\nVmmTestingConfig:'
> + f'\nNum VFs = {self.num_vfs} / max num VMs = {self.max_num_vms}'
> + f'\nVF scheduling mode = {self.scheduling_mode}'
> + f'\nSetup flags:'
> + f'\n\tVM - auto power-on = {self.auto_poweron_vm}'
> + f'\n\tVM - auto DRM driver probe = {self.auto_probe_vm_driver}'
> + f'\n\tHost - unload drivers on teardown = {self.unload_host_drivers_on_teardown}'
> + f'\n\tW/A - reduce VF LMEM (improves migration time) = {self.wa_reduce_vf_lmem}')
> +
> +
> +class VmmTestingSetup:
> + def __init__(self, vmtb_config: VmtbConfigurator, cmdline_config, host, testing_config):
> + self.testing_config: VmmTestingConfig = testing_config
> + self.host: Host = host
> +
> + self.dut_index = vmtb_config.get_host_config().card_index if cmdline_config['card_index'] is None \
> + else int(cmdline_config['card_index'])
> + self.guest_os_image = vmtb_config.get_guest_config().os_image_path if cmdline_config['vm_image'] is None \
> + else cmdline_config['vm_image']
> +
> + self.vgpu_profiles_dir = vmtb_config.vmtb_config_file.parent / vmtb_config.config.vgpu_profiles_path
> +
> + self.host.dut_index = self.dut_index
> + self.host.drm_driver_name = vmtb_config.get_host_config().driver
> + self.host.igt_config = vmtb_config.get_host_config().igt_config
> +
> + self.host.load_drivers()
> + self.host.discover_devices()
> +
> + logger.info("\nDUT info:"
> + "\n\tCard index: %s"
> + "\n\tPCI BDF: %s "
> + "\n\tDevice ID: %s (%s)"
> + "\n\tHost DRM driver: %s",
> + self.host.dut_index,
> + self.get_dut().pci_info.bdf,
> + self.get_dut().pci_info.devid, self.get_dut().gpu_model,
> + self.get_dut().driver.get_name())
> +
> + self.vgpu_profile: VgpuProfile = self.get_vgpu_profile()
> +
> + # Start maximum requested number of VMs, but not more than VFs supported by the given vGPU profile
> + self.vms: typing.List[VirtualMachine] = [
> + VirtualMachine(vm_idx, self.guest_os_image,
> + vmtb_config.get_guest_config().driver,
> + vmtb_config.get_guest_config().igt_config)
> + for vm_idx in range(min(self.vgpu_profile.num_vfs, self.testing_config.max_num_vms))]
> +
> + def get_vgpu_profile(self) -> VgpuProfile:
> + configurator = VgpuProfileConfigurator(self.vgpu_profiles_dir, self.get_dut().gpu_model)
> + try:
> + vgpu_profile = configurator.get_vgpu_profile(self.testing_config.num_vfs,
> + self.testing_config.scheduling_mode)
> + except exceptions.VgpuProfileError as exc:
> + logger.error("Suitable vGPU profile not found: %s", exc)
> + raise exceptions.VgpuProfileError('Invalid test setup - vGPU profile not found!')
> +
> + vgpu_profile.print_parameters()
> +
> + return vgpu_profile
> +
> + def get_dut(self) -> Device:
> + try:
> + return self.host.gpu_devices[self.dut_index]
> + except IndexError as exc:
> + logger.error("Invalid VMTB config - device card index = %s not available", self.dut_index)
> + raise exceptions.VmtbConfigError(f'Device card index = {self.dut_index} not available') from exc
> +
> + @property
> + def get_vm(self):
> + return self.vms
> +
> + def get_num_vms(self) -> int:
> + return len(self.vms)
> +
> + def poweron_vms(self):
> + for vm in self.vms:
> + vm.poweron()
> +
> + def poweroff_vms(self):
> + for vm in self.vms:
> + if vm.is_running():
> + try:
> + vm.poweroff()
> + except Exception as exc:
> + self.testing_config.unload_host_drivers_on_teardown = True
> + logger.warning("Error on VM%s poweroff (%s)", vm.vmnum, exc)
> +
> + if self.testing_config.unload_host_drivers_on_teardown:
> + raise exceptions.GuestError('VM poweroff issue - cleanup on test teardown')
> +
> + def teardown(self):
> + try:
> + self.poweroff_vms()
> + except Exception as exc:
> + logger.error("Error on test teardown (%s)", exc)
> + finally:
> + num_vfs = self.get_dut().get_current_vfs()
> + self.get_dut().remove_vfs()
> + self.get_dut().reset_provisioning(num_vfs)
> + self.get_dut().cancel_work()
> +
> + if self.testing_config.unload_host_drivers_on_teardown:
> + self.host.unload_drivers()
> +
> +
> + at pytest.fixture(scope='session', name='get_vmtb_config')
> +def fixture_get_vmtb_config(create_host_log, pytestconfig):
> + VMTB_CONFIG_FILE = 'vmtb_config.json'
> + # Pytest Config.rootpath points to the VMTB base directory
> + vmtb_config_file_path: Path = pytestconfig.rootpath / VMTB_CONFIG_FILE
> + return VmtbConfigurator(vmtb_config_file_path)
> +
> +
> + at pytest.fixture(scope='session', name='create_host_log')
> +def fixture_create_host_log():
> + if HOST_DMESG_FILE.exists():
> + HOST_DMESG_FILE.unlink()
> + HOST_DMESG_FILE.touch()
> +
> +
> + at pytest.fixture(scope='session', name='get_cmdline_config')
> +def fixture_get_cmdline_config(request):
> + cmdline_params = {}
> + cmdline_params['vm_image'] = request.config.getoption('--vm-image')
> + cmdline_params['card_index'] = request.config.getoption('--card')
> + return cmdline_params
> +
> +
> + at pytest.fixture(scope='session', name='get_host')
> +def fixture_get_host():
> + return Host()
> +
> +
> + at pytest.fixture(scope='class', name='setup_vms')
> +def fixture_setup_vms(get_vmtb_config, get_cmdline_config, get_host, request):
> + """Arrange VM environment for the VMM Flows test execution.
> +
> + VM setup steps follow the configuration provided as VmmTestingConfig parameter, including:
> + host drivers probe (DRM and VFIO), provision and enable VFs, boot VMs and load guest DRM driver.
> + Tear-down phase covers test environment cleanup:
> + shutdown VMs, reset provisioning, disable VMs and optional host drivers unload.
> +
> + The fixture is designed for test parametrization, as the input to the following test class decorator:
> + @pytest.mark.parametrize('setup_vms', set_test_config(max_vms=N), ids=idfn_test_config, indirect=['setup_vms'])
> + where 'set_test_config' provides request parameter with a VmmTestingConfig (usually list of configs).
> + """
> + tc: VmmTestingConfig = request.param
> + logger.debug(repr(tc))
> +
> + host: Host = get_host
> + ts: VmmTestingSetup = VmmTestingSetup(get_vmtb_config, get_cmdline_config, host, tc)
> +
> + device: Device = ts.get_dut()
> + num_vfs = ts.vgpu_profile.num_vfs
> + num_vms = ts.get_num_vms()
> +
> + logger.info('[Test setup: %sVF-%sVM]', num_vfs, num_vms)
> +
> + # XXX: VF migration on discrete devices (with LMEM) is currently quite slow.
> + # As a temporary workaround, reduce size of LMEM assigned to VFs to speed up a state save/load process.
> + if tc.wa_reduce_vf_lmem and device.has_lmem():
> + logger.debug("W/A: reduce VFs LMEM quota to accelerate state save/restore")
> + org_vgpu_profile_vfLmem = ts.vgpu_profile.resources.vfLmem
> + # Assign max 512 MB to VF
> + ts.vgpu_profile.resources.vfLmem = min(ts.vgpu_profile.resources.vfLmem // 2, 536870912)
> +
> + device.provision(ts.vgpu_profile)
> +
> + assert device.create_vf(num_vfs) == num_vfs
> +
> + if tc.auto_poweron_vm:
> + bdf_list = [device.get_vf_bdf(vf) for vf in range(1, num_vms + 1)]
> + for vm, bdf in zip(ts.get_vm, bdf_list):
> + vm.assign_vf(bdf)
> +
> + ts.poweron_vms()
> +
> + if tc.auto_probe_vm_driver:
> + modprobe_cmds = [modprobe_driver(vm) for vm in ts.get_vm]
> + for i, cmd in enumerate(modprobe_cmds):
> + assert modprobe_driver_check(ts.get_vm[i], cmd), f'modprobe failed on VM{i}'
> +
> + logger.info('[Test execution: %sVF-%sVM]', num_vfs, num_vms)
> + yield ts
> +
> + logger.info('[Test teardown: %sVF-%sVM]', num_vfs, num_vms)
> + # XXX: cleanup counterpart for VFs LMEM quota workaround - restore original value
> + if tc.wa_reduce_vf_lmem and device.has_lmem():
> + ts.vgpu_profile.resources.vfLmem = org_vgpu_profile_vfLmem
> +
> + ts.teardown()
> +
> +
> +def idfn_test_config(test_config: VmmTestingConfig):
> +    """Provide test config ID in parametrized tests (e.g. test_something[V4]).
> + Usage: @pytest.mark.parametrize([...], ids=idfn_test_config, [...])
> + """
> + return str(test_config)
> +
> +
> +RESULTS_FILE = Path() / "results.json"
> +results = {
> + "results_version": 10,
> + "name": "results",
> + "tests": {},
> +}
> +
> +
> + at pytest.hookimpl(hookwrapper=True)
> +def pytest_report_teststatus(report):
> + yield
> + with open(HOST_DMESG_FILE, 'r+', encoding='utf-8') as dmesg_file:
> + dmesg = dmesg_file.read()
> + test_string = re.findall('[A-Za-z_.]*::.*', report.nodeid)[0]
> + results["name"] = f"vmtb_{test_string}"
> + test_name = f"vmtb@{test_string}"
> + if report.when == 'call':
> + out = report.capstdout
> + if report.passed:
> + result = "pass"
> + out = f"{test_name} passed"
> + elif report.failed:
> + result = "fail"
> + else:
> + result = "skip"
> + result = {"out": out, "result": result, "time": {"start": 0, "end": report.duration},
> + "err": report.longreprtext, "dmesg": dmesg}
> + results["tests"][test_name] = result
> + dmesg_file.truncate(0)
> + elif report.when == 'setup' and report.failed:
> + result = {"out": report.capstdout, "result": "crash", "time": {"start": 0, "end": report.duration},
> + "err": report.longreprtext, "dmesg": dmesg}
> + results["tests"][test_name] = result
> + dmesg_file.truncate(0)
> +
> +
> + at pytest.hookimpl()
> +def pytest_sessionfinish():
> + if RESULTS_FILE.exists():
> + RESULTS_FILE.unlink()
> + RESULTS_FILE.touch()
> + jsonString = json.dumps(results, indent=2)
> + with open(str(RESULTS_FILE), 'w', encoding='utf-8') as f:
> + f.write(jsonString)
> diff --git a/tools/vmtb/vmm_flows/test_basic.py b/tools/vmtb/vmm_flows/test_basic.py
> new file mode 100644
> index 000000000..b8155c610
> --- /dev/null
> +++ b/tools/vmtb/vmm_flows/test_basic.py
> @@ -0,0 +1,160 @@
> +# SPDX-License-Identifier: MIT
> +# Copyright © 2024 Intel Corporation
> +
> +import logging
> +import time
> +from typing import List, Tuple
> +
> +import pytest
> +
> +from bench.configurators.vgpu_profile_config import VfSchedulingMode
> +from bench.executors.gem_wsim import (ONE_CYCLE_DURATION_MS,
> + PREEMPT_10MS_WORKLOAD, GemWsim,
> + GemWsimResult,
> + gem_wsim_parallel_exec_and_check)
> +from bench.executors.igt import IgtExecutor, IgtType
> +from bench.helpers.helpers import (driver_check, igt_check, igt_run_check,
> + modprobe_driver_run_check)
> +from vmm_flows.conftest import (VmmTestingConfig, VmmTestingSetup,
> + idfn_test_config)
> +
> +logger = logging.getLogger(__name__)
> +
> +WL_ITERATIONS_10S = 1000
> +WL_ITERATIONS_30S = 3000
> +MS_IN_SEC = 1000
> +DELAY_FOR_WORKLOAD_SEC = 2 # Waiting gem_wsim to be running [seconds]
> +DELAY_FOR_RELOAD_SEC = 3 # Waiting before driver reloading [seconds]
> +
> +
> +def set_test_config(test_variants: List[Tuple[int, VfSchedulingMode]],
> + max_vms: int = 2, vf_driver_load: bool = True) -> List[VmmTestingConfig]:
> + """Helper function to provide a parametrized test with a list of test configuration variants."""
> + logger.debug("Init test variants: %s", test_variants)
> + test_configs: List[VmmTestingConfig] = []
> +
> + for config in test_variants:
> + (num_vfs, scheduling_mode) = config
> + test_configs.append(VmmTestingConfig(num_vfs, max_vms, scheduling_mode, auto_probe_vm_driver=vf_driver_load))
> +
> + return test_configs
> +
> +
> +test_variants_1 = [(1, VfSchedulingMode.DEFAULT_PROFILE), (2, VfSchedulingMode.DEFAULT_PROFILE)]
> +
> + at pytest.mark.parametrize('setup_vms', set_test_config(test_variants_1), ids=idfn_test_config, indirect=['setup_vms'])
> +class TestVmSetup:
> + """Verify basic virtualization setup:
> + - probe PF and VFIO drivers (host)
> + - enable and provision VFs (automatic or manual with vGPU profile)
> + - power on VMs with assigned VFs
> + - probe VF driver (guest)
> + - shutdown VMs, reset provisioning and disable VFs
> + """
> + def test_vm_boot(self, setup_vms):
> + logger.info("Test VM boot: power on VM and probe VF driver")
> + ts: VmmTestingSetup = setup_vms
> +
> + for vm in ts.vms:
> + logger.info("[%s] Verify VF DRM driver is loaded in a guest OS", vm)
> + assert driver_check(vm)
> +
> +
> +test_variants_2 = [(1, VfSchedulingMode.DEFAULT_PROFILE), (2, VfSchedulingMode.DEFAULT_PROFILE),
> + (4, VfSchedulingMode.DEFAULT_PROFILE)]
> +
> + at pytest.mark.parametrize('setup_vms', set_test_config(test_variants_2), ids=idfn_test_config, indirect=['setup_vms'])
> +class TestVmWorkload:
> +    """Verify basic IGT workload execution on a VM(s):
> + - exec_store: basic store submissions on single/multiple VMs
> + - gem_wsim: workload simulator running in parallel on multiple VMs
> + """
> + def test_store(self, setup_vms):
> + logger.info("Test VM execution: exec_store")
> + ts: VmmTestingSetup = setup_vms
> + igt_worklads: List[IgtExecutor] = []
> +
> + for vm in ts.vms:
> + logger.info("[%s] Execute basic WL", vm)
> + igt_worklads.append(IgtExecutor(vm, IgtType.EXEC_STORE))
> +
> + for igt in igt_worklads:
> + logger.info("[%s] Verify result of basic WL", igt.target)
> + assert igt_check(igt)
> +
> + logger.info("[%s] Verify result of basic WL", ts.host)
> + igt_run_check(ts.host, IgtType.EXEC_STORE)
> +
> + def test_wsim(self, setup_vms):
> + logger.info("Test VM execution: gem_wsim")
> + ts: VmmTestingSetup = setup_vms
> +
> + if ts.get_num_vms() < 2:
> + pytest.skip("Test scenario not supported for 1xVM setup ")
> +
> + # Single workload takes 10ms GPU time, multiplied by 1000 iterations
> + # gives the expected 10s duration and 100 workloads/sec
> + expected = GemWsimResult(ONE_CYCLE_DURATION_MS * WL_ITERATIONS_10S * len(ts.vms) / MS_IN_SEC,
> + MS_IN_SEC/ONE_CYCLE_DURATION_MS / len(ts.vms))
> +
> + # Check preemptable workload
> + result = gem_wsim_parallel_exec_and_check(ts.vms, PREEMPT_10MS_WORKLOAD, WL_ITERATIONS_10S, expected)
> + logger.info("Execute wsim parallel on VMs - results: %s", result)
> +
> +
> +test_variants_3 = [(2, VfSchedulingMode.DEFAULT_PROFILE), (4, VfSchedulingMode.DEFAULT_PROFILE)]
> +
> + at pytest.mark.parametrize('setup_vms', set_test_config(test_variants=test_variants_3, max_vms=4, vf_driver_load=False),
> + ids = idfn_test_config, indirect=['setup_vms'])
> +class TestVfDriverLoadRemove:
> + """Verify VF (guest) driver load or remove doesn't affect execution on the other VM:
> + - probe VF driver on the last VM while the first VM is running workload
> + - remove VF driver on the first VM while the last VM is running workload
> +    - reload previously removed VF driver on the same VM
> + """
> + def test_load(self, setup_vms):
> + logger.info("Test VM driver load: VF driver probe while other VM executes workload")
> + ts: VmmTestingSetup = setup_vms
> +
> + vm_first = ts.vms[0]
> + vm_last = ts.vms[-1]
> +
> + logger.info("[%s] Load VF driver and run basic WL - first VM", vm_first)
> + assert modprobe_driver_run_check(vm_first)
> +
> + expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
> + gem_wsim = GemWsim(vm_first, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
> + time.sleep(DELAY_FOR_WORKLOAD_SEC)
> + assert gem_wsim.is_running()
> +
> + logger.info("[%s] Load VF driver - last VM", vm_last)
> + assert modprobe_driver_run_check(vm_last)
> +
> + result = gem_wsim.wait_results()
> + assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
> +
> + def test_reload(self, setup_vms):
> + logger.info("Test VM driver reload: VF driver remove is followed by probe while other VM executes workload")
> + ts: VmmTestingSetup = setup_vms
> +
> + vm_first = ts.vms[0]
> + vm_last = ts.vms[-1]
> +
> + logger.info("[%s] Run basic WL - last VM", vm_last)
> + expected_elapsed_sec = ONE_CYCLE_DURATION_MS * WL_ITERATIONS_30S / MS_IN_SEC
> + gem_wsim = GemWsim(vm_last, 1, WL_ITERATIONS_30S, PREEMPT_10MS_WORKLOAD)
> + time.sleep(DELAY_FOR_WORKLOAD_SEC)
> + assert gem_wsim.is_running()
> +
> + logger.info("[%s] Remove VF driver - first VM", vm_first)
> + rmmod_pid = vm_first.execute(f'modprobe -rf {vm_first.get_drm_driver_name()}')
> + assert vm_first.execute_wait(rmmod_pid).exit_code == 0
> +
> + time.sleep(DELAY_FOR_RELOAD_SEC)
> +
> + logger.info("[%s] Reload VF driver and run basic WL - first VM", vm_first)
> + assert modprobe_driver_run_check(vm_first)
> + assert igt_run_check(vm_first, IgtType.EXEC_STORE)
> +
> + result = gem_wsim.wait_results()
> + assert expected_elapsed_sec * 0.8 < result.elapsed_sec < expected_elapsed_sec * 1.2
LGTM,
Reviewed-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
More information about the igt-dev
mailing list