[PATCH v4 2/2] tests/intel/xe_fault_injection: Ignore all errors while injecting fault

Kamil Konieczny kamil.konieczny at linux.intel.com
Tue Jun 10 18:45:12 UTC 2025


Hi Daniele,
On 2025-06-09 at 11:22:17 -0700, Daniele Ceraolo Spurio wrote:
> 
> 
> On 6/6/2025 11:35 AM, Jonathan Cavitt wrote:
> > From: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> > 
> > Currently, numerous fault messages have been included in the dmesg
> > ignore list, and this list continues to expand.  Each time a new fault
> > injection point is introduced or a new feature is activated, additional
> > fault messages appear, making it cumbersome to manage the dmesg ignore
> > list.
> > 
> > However, we can safely assert that all dmesg reports that contain
> > *ERROR* in their message can be ignored, so add them to the dmesg ignore
> > list.  This unfortunately does not include the device probe error
> > itself, so that must be added separately.
> > 
> > While we're here, we should also assert that any errors we see are only
> > coming from the target PCI device.
> > 
> > v2:
> > - Only ignore error-level dmesg reports (or, at least, reports with
> > >    *ERROR* in them), and device probe failures
> > - Add PCI data to regex (Michal)
> > 
> > v3: (Michal)
> > - Revert name change
> > - Add change log
> > - Remove fixes tag from commit
> > - Rename ignore_faults_in_dmesg to igt_ignore_dmesg_errors_from_dut, and
> >    move to lib/igt_core.c
> > - Minor code fixes
> > 
> > v4:
> > - Return ignore_faults_in_dmesg to tests/intel/xe_fault_injection.c, but
> >    keep it renamed to ignore_dmesg_errors_from_dut (Kamil)
> > 
> > v5:
> > - Pass preexisting pci name instead of attempting to regenerate it on
> >    each run of ignore_dmesg_errors_from_dut (Daniele)
> > 
> > Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> > Signed-off-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
> > Suggested-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
> > Suggested-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> > Suggested-by: Lucas De Marchi <lucas.demarchi at intel.com>
> > Cc: Francois Dugast <francois.dugast at intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > Cc: John Harrison <john.c.harrison at intel.com>
> > Cc: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> > ---
> >   tests/intel/xe_fault_injection.c | 60 ++++++++++++++------------------
> >   1 file changed, 27 insertions(+), 33 deletions(-)
> > 
> > diff --git a/tests/intel/xe_fault_injection.c b/tests/intel/xe_fault_injection.c
> > index 7a14ad1729..3166d95f55 100644
> > --- a/tests/intel/xe_fault_injection.c
> > +++ b/tests/intel/xe_fault_injection.c
> > @@ -64,28 +64,17 @@ static int fail_function_open(void)
> >   	return debugfs_fail_function_dir_fd;
> >   }
> > -static bool function_is_part_of_guc(const char function_name[])
> > +static void ignore_dmesg_errors_from_dut(const char pci_slot[])
> >   {
> > -	return strstr(function_name, "_guc_") != NULL ||
> > -	       strstr(function_name, "_uc_") != NULL ||
> > -	       strstr(function_name, "_wopcm_") != NULL;
> > -}
> > -
> > -static void ignore_faults_in_dmesg(const char function_name[])
> > -{
> > -	/* Driver probe is expected to fail in all cases, so ignore in igt_runner */
> > -	char regex[1024] = "probe with driver xe failed with error -12";
> > -
> >   	/*
> > -	 * If GuC module fault is injected, GuC is expected to fail,
> > -	 * so also ignore GuC init failures in igt_runner.
> > +	 * Driver probe is expected to fail in all cases, so ignore in igt_runner.
> > +	 * Additionally, all error-level reports are expected, so ignore those as well.
> 
> I would remove the "all" in this sentence, to make it sound less like "we
> know all the errors that are happening and we're expecting them" and more
> like "some things are going to throw errors and that's acceptable".
> With that:
> 
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> 
> No need to re-spin just for this change, you can do the change while merging
> if no one else has any comments.
> 
> Daniele
> 

Thank you all, I merged this with small edits.

Regards,
Kamil

> >   	 */
> > -	if (function_is_part_of_guc(function_name)) {
> > -		strcat(regex, "|GT[0-9a-fA-F]*: GuC init failed with -ENOMEM");
> > -		strcat(regex, "|GT[0-9a-fA-F]*: Failed to initialize uC .-ENOMEM");
> > -		strcat(regex, "|GT[0-9a-fA-F]*: Failed to enable GuC CT .-ENOMEM");
> > -		strcat(regex, "|GT[0-9a-fA-F]*: GuC PC query task state failed: -ENOMEM");
> > -	}
> > +	static const char *store = "probe with driver xe failed with error|\\*ERROR\\*";
> > +	char regex[1024];
> > +
> > +	/* Only block dmesg reports that target the pci slot of the given fd */
> > +	snprintf(regex, sizeof(regex), "%s:.*(%s)", pci_slot, store);
> >   	igt_emit_ignore_dmesg_regex(regex);
> >   }
> > @@ -234,7 +223,7 @@ inject_fault_probe(int fd, const char pci_slot[], const char function_name[])
> >   	igt_info("Injecting error \"%s\" (%d) in function \"%s\"\n",
> >   		 strerror(-INJECT_ERRNO), INJECT_ERRNO, function_name);
> > -	ignore_faults_in_dmesg(function_name);
> > +	ignore_dmesg_errors_from_dut(pci_slot);
> >   	injection_list_add(function_name);
> >   	set_retval(function_name, INJECT_ERRNO);
> > @@ -291,7 +280,8 @@ static void probe_fail_guc(int fd, const char pci_slot[], const char function_na
> >    */
> >   static void
> >   exec_queue_create_fail(int fd, struct drm_xe_engine_class_instance *instance,
> > -		const char function_name[], unsigned int flags)
> > +		       const char pci_slot[], const char function_name[],
> > +		       unsigned int flags)
> >   {
> >   	uint32_t exec_queue_id;
> >   	uint32_t vm = xe_vm_create(fd, flags, 0);
> > @@ -299,7 +289,7 @@ exec_queue_create_fail(int fd, struct drm_xe_engine_class_instance *instance,
> >   	igt_assert_eq(__xe_exec_queue_create(fd, vm, 1, 1, instance, 0, &exec_queue_id), 0);
> >   	xe_exec_queue_destroy(fd, exec_queue_id);
> > -	ignore_faults_in_dmesg(function_name);
> > +	ignore_dmesg_errors_from_dut(pci_slot);
> >   	injection_list_add(function_name);
> >   	set_retval(function_name, INJECT_ERRNO);
> >   	igt_assert(__xe_exec_queue_create(fd, vm, 1, 1, instance, 0, &exec_queue_id) != 0);
> > @@ -330,11 +320,12 @@ simple_vm_create(int fd, unsigned int flags)
> >    * @xe_vm_create_scratch:	xe_vm_create_scratch
> >    */
> >   static void
> > -vm_create_fail(int fd, const char function_name[], unsigned int flags)
> > +vm_create_fail(int fd, const char pci_slot[],
> > +	       const char function_name[], unsigned int flags)
> >   {
> >   	igt_assert_eq(simple_vm_create(fd, flags), 0);
> > -	ignore_faults_in_dmesg(function_name);
> > +	ignore_dmesg_errors_from_dut(pci_slot);
> >   	injection_list_add(function_name);
> >   	set_retval(function_name, INJECT_ERRNO);
> >   	igt_assert(simple_vm_create(fd, flags) != 0);
> > @@ -391,13 +382,13 @@ simple_vm_bind(int fd, uint32_t vm)
> >    * @xe_sync_entry_parse:		xe_sync_entry_parse
> >    */
> >   static void
> > -vm_bind_fail(int fd, const char function_name[])
> > +vm_bind_fail(int fd, const char pci_slot[], const char function_name[])
> >   {
> >   	uint32_t vm = xe_vm_create(fd, 0, 0);
> >   	igt_assert_eq(simple_vm_bind(fd, vm), 0);
> > -	ignore_faults_in_dmesg(function_name);
> > +	ignore_dmesg_errors_from_dut(pci_slot);
> >   	injection_list_add(function_name);
> >   	set_retval(function_name, INJECT_ERRNO);
> >   	igt_assert(simple_vm_bind(fd, vm) != 0);
> > @@ -415,7 +406,8 @@ vm_bind_fail(int fd, const char function_name[])
> >    * @xe_oa_alloc_regs:		xe_oa_alloc_regs
> >    */
> >   static void
> > -oa_add_config_fail(int fd, int sysfs, int devid, const char function_name[])
> > +oa_add_config_fail(int fd, int sysfs, int devid,
> > +		   const char pci_slot[], const char function_name[])
> >   {
> >   	char path[512];
> >   	uint64_t config_id;
> > @@ -445,7 +437,7 @@ oa_add_config_fail(int fd, int sysfs, int devid, const char function_name[])
> >   	igt_assert(igt_sysfs_scanf(sysfs, path, "%" PRIu64, &config_id) == 1);
> >   	igt_assert_eq(intel_xe_perf_ioctl(fd, DRM_XE_OBSERVATION_OP_REMOVE_CONFIG, &config_id), 0);
> > -	ignore_faults_in_dmesg(function_name);
> > +	ignore_dmesg_errors_from_dut(pci_slot);
> >   	injection_list_add(function_name);
> >   	set_retval(function_name, INJECT_ERRNO);
> >   	igt_assert_lt(intel_xe_perf_ioctl(fd, DRM_XE_OBSERVATION_OP_ADD_CONFIG, &config), 0);
> > @@ -564,27 +556,29 @@ igt_main_args("I:", NULL, help_str, opt_handler, NULL)
> >   	for (const struct section *s = vm_create_fail_functions; s->name; s++)
> >   		igt_subtest_f("vm-create-fail-%s", s->name)
> > -			vm_create_fail(fd, s->name, s->flags);
> > +			vm_create_fail(fd, pci_slot, s->name, s->flags);
> >   	for (const struct section *s = vm_bind_fail_functions; s->name; s++)
> >   		igt_subtest_f("vm-bind-fail-%s", s->name)
> > -			vm_bind_fail(fd, s->name);
> > +			vm_bind_fail(fd, pci_slot, s->name);
> >   	for (const struct section *s = exec_queue_create_fail_functions; s->name; s++)
> >   		igt_subtest_f("exec-queue-create-fail-%s", s->name)
> >   			xe_for_each_engine(fd, hwe)
> >   				if (hwe->engine_class != DRM_XE_ENGINE_CLASS_VM_BIND)
> > -					exec_queue_create_fail(fd, hwe, s->name, s->flags);
> > +					exec_queue_create_fail(fd, hwe, pci_slot,
> > +							       s->name, s->flags);
> >   	for (const struct section *s = exec_queue_create_vmbind_fail_functions; s->name; s++)
> >   		igt_subtest_f("exec-queue-create-fail-%s", s->name)
> >   			xe_for_each_engine(fd, hwe)
> >   				if (hwe->engine_class == DRM_XE_ENGINE_CLASS_VM_BIND)
> > -					exec_queue_create_fail(fd, hwe, s->name, s->flags);
> > +					exec_queue_create_fail(fd, hwe, pci_slot,
> > +							       s->name, s->flags);
> >   	for (const struct section *s = oa_add_config_fail_functions; s->name; s++)
> >   		igt_subtest_f("oa-add-config-fail-%s", s->name)
> > -			oa_add_config_fail(fd, sysfs, devid, s->name);
> > +			oa_add_config_fail(fd, sysfs, devid, pci_slot, s->name);
> >   	igt_fixture {
> >   		igt_kmod_unbind("xe", pci_slot);
> 


More information about the igt-dev mailing list