[PATCH v4 2/2] tests/intel/xe_fault_injection: Ignore all errors while injecting fault

Daniele Ceraolo Spurio daniele.ceraolospurio at intel.com
Mon Jun 9 18:22:17 UTC 2025



On 6/6/2025 11:35 AM, Jonathan Cavitt wrote:
> From: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
>
> Currently, numerous fault messages have been included in the dmesg
> ignore list, and this list continues to expand.  Each time a new fault
> injection point is introduced or a new feature is activated, additional
> fault messages appear, making it cumbersome to manage the dmesg ignore
> list.
>
> However, we can safely assert that all dmesg reports that contain
> *ERROR* in their message can be ignored, so add them to the dmesg ignore
> list.  This unfortunately does not include the device probe error
> itself, so that must be added separately.
>
> While we're here, we should also assert that any errors we see are only
> coming from the target PCI device.
>
> v2:
> - Only ignore error-level dmesg reports (or, at least, reports with
>    *ERROR* in them), and device probe failures
> - Add PCI data to regex (Michal)
>
> v3: (Michal)
> - Revert name change
> - Add change log
> - Remove fixes tag from commit
> - Rename ignore_faults_in_dmesg to igt_ignore_dmesg_errors_from_dut, and
>    move to lib/igt_core.c
> - Minor code fixes
>
> v4:
> - Return ignore_faults_in_dmesg to tests/intel/xe_fault_injection.c, but
>    keep it renamed to ignore_dmesg_errors_from_dut (Kamil)
>
> v5:
> - Pass preexisting pci name instead of attempting to regenerate it on
>    each run of ignore_dmesg_errors_from_dut (Daniele)
>
> Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
> Suggested-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Suggested-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Suggested-by: Lucas De Marchi <lucas.demarchi at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: John Harrison <john.c.harrison at intel.com>
> Cc: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> ---
>   tests/intel/xe_fault_injection.c | 60 ++++++++++++++------------------
>   1 file changed, 27 insertions(+), 33 deletions(-)
>
> diff --git a/tests/intel/xe_fault_injection.c b/tests/intel/xe_fault_injection.c
> index 7a14ad1729..3166d95f55 100644
> --- a/tests/intel/xe_fault_injection.c
> +++ b/tests/intel/xe_fault_injection.c
> @@ -64,28 +64,17 @@ static int fail_function_open(void)
>   	return debugfs_fail_function_dir_fd;
>   }
>   
> -static bool function_is_part_of_guc(const char function_name[])
> +static void ignore_dmesg_errors_from_dut(const char pci_slot[])
>   {
> -	return strstr(function_name, "_guc_") != NULL ||
> -	       strstr(function_name, "_uc_") != NULL ||
> -	       strstr(function_name, "_wopcm_") != NULL;
> -}
> -
> -static void ignore_faults_in_dmesg(const char function_name[])
> -{
> -	/* Driver probe is expected to fail in all cases, so ignore in igt_runner */
> -	char regex[1024] = "probe with driver xe failed with error -12";
> -
>   	/*
> -	 * If GuC module fault is injected, GuC is expected to fail,
> -	 * so also ignore GuC init failures in igt_runner.
> +	 * Driver probe is expected to fail in all cases, so ignore in igt_runner.
> +	 * Additionally, all error-level reports are expected, so ignore those as well.

I would remove the "all" in this sentence, to make it sound less like 
"we know all the errors that are happening and we're expecting them" and 
more like "some things are going to throw errors and that's acceptable".
With that:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

No need to re-spin just for this change; you can make the change while 
merging if no one else has any comments.

Daniele

>   	 */
> -	if (function_is_part_of_guc(function_name)) {
> -		strcat(regex, "|GT[0-9a-fA-F]*: GuC init failed with -ENOMEM");
> -		strcat(regex, "|GT[0-9a-fA-F]*: Failed to initialize uC .-ENOMEM");
> -		strcat(regex, "|GT[0-9a-fA-F]*: Failed to enable GuC CT .-ENOMEM");
> -		strcat(regex, "|GT[0-9a-fA-F]*: GuC PC query task state failed: -ENOMEM");
> -	}
> +	static const char *store = "probe with driver xe failed with error|\\*ERROR\\*";
> +	char regex[1024];
> +
> +	/* Only block dmesg reports that target the pci slot of the given fd */
> +	snprintf(regex, sizeof(regex), "%s:.*(%s)", pci_slot, store);
>   
>   	igt_emit_ignore_dmesg_regex(regex);
>   }
> @@ -234,7 +223,7 @@ inject_fault_probe(int fd, const char pci_slot[], const char function_name[])
>   	igt_info("Injecting error \"%s\" (%d) in function \"%s\"\n",
>   		 strerror(-INJECT_ERRNO), INJECT_ERRNO, function_name);
>   
> -	ignore_faults_in_dmesg(function_name);
> +	ignore_dmesg_errors_from_dut(pci_slot);
>   	injection_list_add(function_name);
>   	set_retval(function_name, INJECT_ERRNO);
>   
> @@ -291,7 +280,8 @@ static void probe_fail_guc(int fd, const char pci_slot[], const char function_na
>    */
>   static void
>   exec_queue_create_fail(int fd, struct drm_xe_engine_class_instance *instance,
> -		const char function_name[], unsigned int flags)
> +		       const char pci_slot[], const char function_name[],
> +		       unsigned int flags)
>   {
>   	uint32_t exec_queue_id;
>   	uint32_t vm = xe_vm_create(fd, flags, 0);
> @@ -299,7 +289,7 @@ exec_queue_create_fail(int fd, struct drm_xe_engine_class_instance *instance,
>   	igt_assert_eq(__xe_exec_queue_create(fd, vm, 1, 1, instance, 0, &exec_queue_id), 0);
>   	xe_exec_queue_destroy(fd, exec_queue_id);
>   
> -	ignore_faults_in_dmesg(function_name);
> +	ignore_dmesg_errors_from_dut(pci_slot);
>   	injection_list_add(function_name);
>   	set_retval(function_name, INJECT_ERRNO);
>   	igt_assert(__xe_exec_queue_create(fd, vm, 1, 1, instance, 0, &exec_queue_id) != 0);
> @@ -330,11 +320,12 @@ simple_vm_create(int fd, unsigned int flags)
>    * @xe_vm_create_scratch:	xe_vm_create_scratch
>    */
>   static void
> -vm_create_fail(int fd, const char function_name[], unsigned int flags)
> +vm_create_fail(int fd, const char pci_slot[],
> +	       const char function_name[], unsigned int flags)
>   {
>   	igt_assert_eq(simple_vm_create(fd, flags), 0);
>   
> -	ignore_faults_in_dmesg(function_name);
> +	ignore_dmesg_errors_from_dut(pci_slot);
>   	injection_list_add(function_name);
>   	set_retval(function_name, INJECT_ERRNO);
>   	igt_assert(simple_vm_create(fd, flags) != 0);
> @@ -391,13 +382,13 @@ simple_vm_bind(int fd, uint32_t vm)
>    * @xe_sync_entry_parse:		xe_sync_entry_parse
>    */
>   static void
> -vm_bind_fail(int fd, const char function_name[])
> +vm_bind_fail(int fd, const char pci_slot[], const char function_name[])
>   {
>   	uint32_t vm = xe_vm_create(fd, 0, 0);
>   
>   	igt_assert_eq(simple_vm_bind(fd, vm), 0);
>   
> -	ignore_faults_in_dmesg(function_name);
> +	ignore_dmesg_errors_from_dut(pci_slot);
>   	injection_list_add(function_name);
>   	set_retval(function_name, INJECT_ERRNO);
>   	igt_assert(simple_vm_bind(fd, vm) != 0);
> @@ -415,7 +406,8 @@ vm_bind_fail(int fd, const char function_name[])
>    * @xe_oa_alloc_regs:		xe_oa_alloc_regs
>    */
>   static void
> -oa_add_config_fail(int fd, int sysfs, int devid, const char function_name[])
> +oa_add_config_fail(int fd, int sysfs, int devid,
> +		   const char pci_slot[], const char function_name[])
>   {
>   	char path[512];
>   	uint64_t config_id;
> @@ -445,7 +437,7 @@ oa_add_config_fail(int fd, int sysfs, int devid, const char function_name[])
>   	igt_assert(igt_sysfs_scanf(sysfs, path, "%" PRIu64, &config_id) == 1);
>   	igt_assert_eq(intel_xe_perf_ioctl(fd, DRM_XE_OBSERVATION_OP_REMOVE_CONFIG, &config_id), 0);
>   
> -	ignore_faults_in_dmesg(function_name);
> +	ignore_dmesg_errors_from_dut(pci_slot);
>   	injection_list_add(function_name);
>   	set_retval(function_name, INJECT_ERRNO);
>   	igt_assert_lt(intel_xe_perf_ioctl(fd, DRM_XE_OBSERVATION_OP_ADD_CONFIG, &config), 0);
> @@ -564,27 +556,29 @@ igt_main_args("I:", NULL, help_str, opt_handler, NULL)
>   
>   	for (const struct section *s = vm_create_fail_functions; s->name; s++)
>   		igt_subtest_f("vm-create-fail-%s", s->name)
> -			vm_create_fail(fd, s->name, s->flags);
> +			vm_create_fail(fd, pci_slot, s->name, s->flags);
>   
>   	for (const struct section *s = vm_bind_fail_functions; s->name; s++)
>   		igt_subtest_f("vm-bind-fail-%s", s->name)
> -			vm_bind_fail(fd, s->name);
> +			vm_bind_fail(fd, pci_slot, s->name);
>   
>   	for (const struct section *s = exec_queue_create_fail_functions; s->name; s++)
>   		igt_subtest_f("exec-queue-create-fail-%s", s->name)
>   			xe_for_each_engine(fd, hwe)
>   				if (hwe->engine_class != DRM_XE_ENGINE_CLASS_VM_BIND)
> -					exec_queue_create_fail(fd, hwe, s->name, s->flags);
> +					exec_queue_create_fail(fd, hwe, pci_slot,
> +							       s->name, s->flags);
>   
>   	for (const struct section *s = exec_queue_create_vmbind_fail_functions; s->name; s++)
>   		igt_subtest_f("exec-queue-create-fail-%s", s->name)
>   			xe_for_each_engine(fd, hwe)
>   				if (hwe->engine_class == DRM_XE_ENGINE_CLASS_VM_BIND)
> -					exec_queue_create_fail(fd, hwe, s->name, s->flags);
> +					exec_queue_create_fail(fd, hwe, pci_slot,
> +							       s->name, s->flags);
>   
>   	for (const struct section *s = oa_add_config_fail_functions; s->name; s++)
>   		igt_subtest_f("oa-add-config-fail-%s", s->name)
> -			oa_add_config_fail(fd, sysfs, devid, s->name);
> +			oa_add_config_fail(fd, sysfs, devid, pci_slot, s->name);
>   
>   	igt_fixture {
>   		igt_kmod_unbind("xe", pci_slot);



More information about the igt-dev mailing list