[i-g-t] tests/intel/xe_wedged: Add new test csc-wedged

Anirban, Sk sk.anirban at intel.com
Mon Jul 14 05:11:43 UTC 2025


Hi,

On 10-07-2025 12:53, Riana Tauro wrote:
> Hi Anirban
>
> On 7/8/2025 2:53 PM, Sk Anirban wrote:
>> Inject a CSC error through uevent to cause the Xe device to enter a 
>> wedged
>
> Add details about survivability mode. What is the expectation of the test
>
> Add a link to kernel patches
Sure, I will add this.
>
>> state. To return the device to a normal state, reload the driver, as
>> the wedged state can only be resolved by rebinding/reprobing the driver.
>>
>> Signed-off-by: Sk Anirban <sk.anirban at intel.com>
>> ---
>>   tests/intel/xe_wedged.c | 85 +++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 85 insertions(+)
>>
>> diff --git a/tests/intel/xe_wedged.c b/tests/intel/xe_wedged.c
>> index 7fc7ca9eb..b29e9bcb5 100644
>> --- a/tests/intel/xe_wedged.c
>> +++ b/tests/intel/xe_wedged.c
>> @@ -14,6 +14,7 @@
>>     #include <limits.h>
>>   #include <dirent.h>
>> +#include <libudev.h>
>>     #include "igt.h"
>>   #include "igt_device.h"
>> @@ -46,6 +47,46 @@ static void force_wedged(int fd)
>>       sleep(1);
>>   }
>>   +static void force_wedged_csc_error(int fd)
>> +{
>> +    igt_debugfs_write(fd, "inject_csc_hw_error/probability", "100");
>> +    igt_debugfs_write(fd, "inject_csc_hw_error/times", "1");
>> +
>> +    xe_force_gt_reset_sync(fd, 0);
>> +    sleep(1);
>> +}
>> +
>> +static char bus_addr[NAME_MAX];
>> +
>> +static int check_survivability_mode(int fd)
>> +{
>> +    struct pci_device *pci_dev;
>> +    char path[PATH_MAX];
>> +    int dirfd;
>> +
>> +    pci_dev = igt_device_get_pci_device(fd);
>> +    snprintf(bus_addr, sizeof(bus_addr), "%04x:%02x:%02x.%01x",
>> +         pci_dev->domain, pci_dev->bus, pci_dev->dev, pci_dev->func);
>> +    snprintf(path, PATH_MAX, 
>> "/sys/bus/pci/devices/%s/survivability_mode", bus_addr);
>> +    dirfd = open(path, O_RDONLY);
>> +
>> +    return dirfd;
>> +}
>> +
>> +static void intercept_udev_events(struct udev_device *device)
>> +{
>> +    const char *dev_path = udev_device_get_property_value(device, 
>> "DEVPATH");
>> +    const char *wedged = udev_device_get_property_value(device, 
>> "WEDGED");
>> +
>> +    igt_assert_f(wedged && !strcmp(wedged, "vendor-specific"),
>> +             "Expected WEDGED property to be 'vendor-specific', got 
>> '%s'",
>> +             wedged);
>> +
>> +    igt_assert_f(dev_path && strstr(dev_path, bus_addr),
>> +             "Expected bus address '%s' to be part of DEVPATH '%s'",
>> +             bus_addr, dev_path);
>> +}
>> +
>>   static int simple_ioctl(int fd)
>>   {
>>       int ret;
>> @@ -208,6 +249,11 @@ simple_hang(int fd, struct drm_xe_sync *sync)
>>    * SUBTEST: basic-wedged-read
>>    * Description: Read wedged_mode debugfs
>>    */
>> +/**
>> + * SUBTEST: csc-wedged
>> + * Description: Force Xe device wedged after injecting a failure in CSC
>> + */
>> +
>>   igt_main
>>   {
>>       struct drm_xe_engine_class_instance *hwe;
>> @@ -300,12 +346,51 @@ igt_main
>>           igt_assert_f(str[0] != '\0', "Failed to read wedged_mode 
>> from debugfs!\n");
>>       }
>>   +    igt_subtest("csc-wedged") {
>> +        struct udev *udev = udev_new();
>> +        struct udev_monitor *monitor;
>
> We can use this instead
>
> struct udev_monitor *mon = igt_watch_uevents();
Sure, I will check this.
>
>
>> +        struct udev_device *device;
>> +
>> +        igt_require(igt_debugfs_exists(fd, 
>> "inject_csc_hw_error/probability",
>> +                           O_RDWR));
>> +
>> +        igt_assert_f(check_survivability_mode(fd) < 0,
>> +                 "survivability_mode sysfs available");
>
> why?
Just to check that the node is not available before the CSC error wedges the device.
>> +
>> +        igt_debugfs_write(fd, "inject_csc_hw_error/verbose", "1");
>> +        igt_assert_eq(simple_ioctl(fd), 0);
> Is this required?
As discussed offline, I will remove this.
>> +        ignore_wedged_in_dmesg();
>
> Ignoring the interrupt message and runtime survivability also might be 
> needed. Can check once kernel patches are merged
Sure, I’ll align this with the kernel patches once they are merged.
>> +
>> +        monitor = udev_monitor_new_from_netlink(udev, "kernel");
>> +        udev_monitor_enable_receiving(monitor);
>> +
>> +        force_wedged_csc_error(fd);
>> +
>> +        device = udev_monitor_receive_device(monitor);
>
>
>> +        intercept_udev_events(device);
>> +
>> +        igt_assert_f(check_survivability_mode(fd) >= 0,
>> +                 "survivability_mode sysfs not available");
>
> you can add both of this in a single function 
> (check_runtime_survivability_mode)
Sure, I will modify this.
>
> Thanks
> Riana

Thanks,
Anirban
>> +
>> +        drm_close_driver(fd);
>> +        igt_kmod_rebind("xe", pci_slot);
>> +        fd = drm_open_driver(DRIVER_XE);
>> +        igt_assert_eq(simple_ioctl(fd), 0);
>> +        xe_for_each_engine(fd, hwe)
>> +            simple_exec(fd, hwe);
>> +    }
>> +
>>       igt_fixture {
>>           if (igt_debugfs_exists(fd, "fail_gt_reset/probability", 
>> O_RDWR)) {
>>               igt_debugfs_write(fd, "fail_gt_reset/probability", "0");
>>               igt_debugfs_write(fd, "fail_gt_reset/times", "1");
>>           }
>>   +        if (igt_debugfs_exists(fd, 
>> "inject_csc_hw_error/probability", O_RDWR)) {
>> +            igt_debugfs_write(fd, "inject_csc_hw_error/probability", 
>> "0");
>> +            igt_debugfs_write(fd, "inject_csc_hw_error/times", "1");
>> +        }
>> +
>>           /* Tests might have failed, force a rebind before exiting */
>>           drm_close_driver(fd);
>>           igt_kmod_rebind("xe", pci_slot);
>



More information about the igt-dev mailing list