[RFC] [PATCH i-g-t] tests/intel/xe_fault_injection: Inject errors for VF provision

Michal Wajdeczko michal.wajdeczko at intel.com
Tue May 27 10:46:38 UTC 2025



On 26.05.2025 16:01, Satyanarayana K V P wrote:
> Use the kernel fault injection infrastructure to test error handling of xe

s/xe/the Xe driver

> during VF provisioning of various resources like GGTT, contexts, lmem (in

this is on VF side, so

s/VF provisioning/reading VF provisioning data

s/lmem/LMEM

> case of DGFX) and doorbells so that more code paths are tested, such as
> error handling and unwinding.
> 
> The test injects multiple errors into each resource and tests all possible
> ways of error handling.
> 
> Error can be injected using:
> igt@xe_fault_injection@probe-fail-vf-provision-xe_should_fail_vf_provisioning_ggtt
> igt@xe_fault_injection@probe-fail-vf-provision-xe_should_fail_vf_provisioning_ctxs
> igt@xe_fault_injection@probe-fail-vf-provision-xe_should_fail_vf_provisioning_dbs
> igt@xe_fault_injection@probe-fail-vf-provision-xe_should_fail_vf_provisioning_lmem

since we really don't care about the specific resource, it should be simpler
(and still sufficient in terms of coverage) to have a single test case:

	igt@xe_fault_injection@probe-fail-vf-provision

> 
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> ---
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> 
> Test-With: 20250523073638.24842-1-satyanarayana.k.v.p at intel.com
> ---
>  tests/intel/xe_fault_injection.c | 166 +++++++++++++++++++++++++++++++
>  1 file changed, 166 insertions(+)
> 
> diff --git a/tests/intel/xe_fault_injection.c b/tests/intel/xe_fault_injection.c
> index f9bd5c761..40b5b2e16 100644
> --- a/tests/intel/xe_fault_injection.c
> +++ b/tests/intel/xe_fault_injection.c
> @@ -29,6 +29,42 @@
>  #define BO_SIZE		(1024*1024)
>  #define INJECT_ITERATIONS	100
>  
> +enum {
> +	VF_PROVISION_MIN  = 0,
> +	VF_PROVISION_GGTT = VF_PROVISION_MIN,
> +	VF_PROVISION_LMEM,
> +	VF_PROVISION_CTXS,
> +	VF_PROVISION_DBS,
> +	VF_PROVISION_MAX,
> +};
> +
> +enum {
> +	VF_PROVISION_ERR_MIN,
> +	/* Force resource location invalid  */
> +	VF_PROVISION_ERR_EINVAL = VF_PROVISION_ERR_MIN,
> +	/* Force resource size bigger than HW limit */
> +	VF_PROVISION_ERR_ENOSPC,
> +	/* Force resource size to zero */
> +	VF_PROVISION_ERR_ENODATA,
> +	/* Force resource size larger than received with invalid base address. */
> +	VF_PROVISION_ERR_ESRMNT,
> +	/* Force resource size smaller than received */
> +	VF_PROVISION_ERR_EREMCHG,
> +	/* Force resource size larger than received */
> +	VF_PROVISION_ERR_EDQUOT,
> +	VF_PROVISION_ERR_MAX
> +};

I'm not sure that we need an enum to describe the list of potential error
codes we want to inject - all we need is just a list of those errors (like
below) that we will loop over

> +
> +int vf_provision_err_inject_list[VF_PROVISION_ERR_MAX] = {
> +	-22 /* VF_PROVISION_ERR_EINVAL */,
> +	-28 /* VF_PROVISION_ERR_ENOSPC */,
> +	-61 /*VF_PROVISION_ERR_ENODATA */,
> +	-69 /* VF_PROVISION_ERR_ESRMNT */,
> +	-78 /* VF_PROVISION_ERR_EREMCHG */,
> +	-122 /* VF_PROVISION_ERR_EDQUOT */
> +};
> +
> +
>  int32_t inject_iters_raw;
>  struct fault_injection_params {
>  	/* @probability: Likelihood of failure injection, in percent. */
> @@ -71,6 +107,11 @@ static bool function_is_part_of_guc(const char function_name[])
>  	       strstr(function_name, "_wopcm_") != NULL;
>  }
>  
> +static bool function_is_part_of_vf(const char function_name[])
> +{
> +	return strstr(function_name, "_vf_") != NULL;
> +}
> +
>  static void ignore_faults_in_dmesg(const char function_name[])
>  {
>  	/* Driver probe is expected to fail in all cases, so ignore in igt_runner */
> @@ -87,6 +128,19 @@ static void ignore_faults_in_dmesg(const char function_name[])
>  		strcat(regex, "|GT[0-9a-fA-F]*: GuC PC query task state failed:	-ENOMEM");
>  	}
>  
> +	/*
> +	 * If VF provisiong faults are injected, Guc and VF provision is
> +	 * expected fail. So, ignore failures in igt_runner.
> +	 */
> +	if (function_is_part_of_vf(function_name)) {
> +		strcat(regex, "|GT[0-9a-fA-F]*: GuC init failed with -ENOMEM");
> +		strcat(regex, "|GT[0-9a-fA-F]*: Failed to initialize uC .-ENOMEM");
> +		strcat(regex, "|GT[0-9a-fA-F]*: VF: Unexpected GGTT reassignment: [0-9] != [0-9]");
> +		strcat(regex, "|GT[0-9a-fA-F]*: VF: Unexpected CTXs reassignment: [0-9] != [0-9]");
> +		strcat(regex, "|GT[0-9a-fA-F]*: VF: Unexpected DBs reassignment: [0-9] != [0-9]");
> +		strcat(regex, "|GT[0-9a-fA-F]*: VF: Unexpected LMEM reassignment: [0-9] != [0-9]");
> +	}

this whole idea of filtering out specific error messages doesn't
make much sense IMO, as it is too fragile against any changes in the driver
code, or against runtime differences due to a different platform or config.

can't we just assume that once the test injects some faults into the
driver, any error messages generated by the driver are expected and all
of them should be ignored? I guess our only expectation from the driver
is that it will not crash; whether there will be zero, one or more
different error messages shouldn't really bother us, as in some cases
the driver could even fully recover after a single fault

> +
>  	igt_emit_ignore_dmesg_regex(regex);
>  }
>  
> @@ -278,6 +332,103 @@ static void probe_fail_guc(int fd, char pci_slot[], const char function_name[],
>  	}
>  }
>  
> +static void get_fault_params(int fault_type, int has_vram,
> +			     struct fault_injection_params *fault_params)
> +{
> +	igt_assert(fault_params);
> +
> +	igt_debug("has_vram = %d, fault_type = %d\n", has_vram, fault_type);
> +
> +	if (has_vram) {
> +		switch (fault_type) {
> +		case VF_PROVISION_GGTT:
> +			fault_params->space = 1;
> +			break;
> +		case VF_PROVISION_LMEM:
> +			fault_params->space = 2;
> +			break;
> +		case VF_PROVISION_CTXS:
> +			fault_params->space = 3;
> +			break;
> +		case VF_PROVISION_DBS:
> +			fault_params->space = 4;

we shouldn't make too many assumptions about the driver implementation
(here the ordering of functions called by the VF driver during probe) as
this may change at any time, without impacting VF functionality, but might
confuse the test code, up to reporting false regressions

> +			break;
> +		default:
> +			return;
> +		}
> +	} else {
> +		switch (fault_type) {
> +		case VF_PROVISION_GGTT:
> +			fault_params->space = 1;
> +			break;
> +		case VF_PROVISION_CTXS:
> +			fault_params->space = 2;
> +			break;
> +		case VF_PROVISION_DBS:
> +			fault_params->space = 3;
> +			break;
> +		default:
> +			return;
> +		}
> +	}
> +
> +	fault_params->times = 1;

as discussed offline, we will just loop over the permutations of:

	{ space } x { times } x { interval }

parameters to inject faults at different calls while collecting
provisioning data, to check the driver's reaction to altered or inconsistent
info about one or more resources

> +}
> +/**
> + * SUBTEST: probe-fail-vf-provision-%s
> + * Description: inject an error in the injectable function %arg[1] then reprobe driver
> + * Functionality: fault
> + *
> + * arg[1]:
> + * @xe_should_fail_vf_provisioning_ggtt:     Inject an error when provisoning ggtt.
> + * @xe_should_fail_vf_provisioning_lmem:     Inject an error when provisoning lmem.
> + * @xe_should_fail_vf_provisioning_ctxs:     Inject an error when provisoning ctxs.
> + * @xe_should_fail_vf_provisioning_dbs:     Inject an error when provisoning dbs.
> + */
> +static void probe_fail_vf_provision(int fd, char pci_slot[], const char function_name[],
> +               struct fault_injection_params *fault_params)
> +{
> +	int auto_probe_en = igt_sriov_is_driver_autoprobe_enabled(fd);
> +	const char *func_name = "xe_should_fail_vf_provisioning";

btw, avoid using similar names (here: function_name vs func_name)

> +	unsigned int totalvfs = igt_sriov_get_total_vfs(fd);
> +	int fault_type, i;
> +
> +	igt_skip_on(!totalvfs);
> +	igt_assert(fault_params);
> +
> +	if (!strcmp("xe_should_fail_vf_provisioning_ggtt", function_name))
> +		fault_type = VF_PROVISION_GGTT;
> +	else if (!strcmp("xe_should_fail_vf_provisioning_lmem", function_name))
> +		fault_type = VF_PROVISION_LMEM;
> +	else if (!strcmp("xe_should_fail_vf_provisioning_ctxs", function_name))
> +		fault_type = VF_PROVISION_CTXS;
> +	else if (!strcmp("xe_should_fail_vf_provisioning_dbs", function_name))
> +		fault_type = VF_PROVISION_DBS;
> +	else
> +		fault_type = VF_PROVISION_GGTT;
> +
> +	igt_skip_on(!strcmp("xe_should_fail_vf_provisioning_lmem", function_name) &&
> +		    !xe_has_vram(fd));
> +
> +	ignore_faults_in_dmesg(function_name);
> +	for (i = VF_PROVISION_ERR_MIN; i < VF_PROVISION_ERR_MAX; i++) {
> +		if (igt_sriov_get_enabled_vfs(fd))
> +			igt_sriov_disable_vfs(fd);
> +
> +		get_fault_params(fault_type, xe_has_vram(fd), fault_params);
> +		setup_injection_fault(fault_params);
> +
> +		injection_list_add(func_name);
> +		set_retval(func_name, vf_provision_err_inject_list[i]);
> +
> +		igt_sriov_enable_driver_autoprobe(fd);
> +		igt_sriov_enable_vfs(fd, totalvfs);
> +		igt_sriov_disable_vfs(fd);
> +		if (!auto_probe_en)
> +			igt_sriov_disable_driver_autoprobe(fd);
> +		injection_list_remove(func_name);
> +	}
> +}
>  /**
>   * SUBTEST: exec-queue-create-fail-%s
>   * Description: inject an error in function %arg[1] used in exec queue create IOCTL to make it fail
> @@ -551,6 +702,14 @@ igt_main_args("I:", NULL, help_str, opt_handler, NULL)
>  		{ }
>  	};
>  
> +	const struct section vf_proviosin_fail_functions[] = {
> +		{ "xe_should_fail_vf_provisioning_ggtt" },
> +		{ "xe_should_fail_vf_provisioning_lmem" },
> +		{ "xe_should_fail_vf_provisioning_ctxs" },
> +		{ "xe_should_fail_vf_provisioning_dbs" },

btw, this might give the impression that there are multiple fault injection
points, while we just have one

> +		{}
> +	};
> +
>  	igt_fixture {
>  		igt_require(fail_function_injection_enabled());
>  		fd = drm_open_driver(DRIVER_XE);
> @@ -586,6 +745,13 @@ igt_main_args("I:", NULL, help_str, opt_handler, NULL)
>  		igt_subtest_f("oa-add-config-fail-%s", s->name)
>  			oa_add_config_fail(fd, sysfs, devid, s->name);
>  
> +	for (const struct section *s = vf_proviosin_fail_functions; s->name; s++)
> +		igt_subtest_f("probe-fail-vf-provision-%s", s->name) {
> +			memcpy(&fault_params, &default_fault_params,
> +					sizeof(struct fault_injection_params));
> +			probe_fail_vf_provision(fd, pci_slot, s->name, &fault_params);
> +		}
> +
>  	igt_fixture {
>  		igt_kmod_unbind("xe", pci_slot);
>  	}



More information about the igt-dev mailing list