[Intel-xe] [RFC 1/1] drm/xe: Add a debugfs for faking gt reset failure.
Rodrigo Vivi
rodrigo.vivi at kernel.org
Mon May 22 15:51:00 UTC 2023
On Mon, May 22, 2023 at 02:28:17PM +0530, Himal Prasad Ghimiray wrote:
> In case of gt reset failure, KMD notifies userspace about failure
> via uevent. To validate this notification we need to ensure gt
> reset fails and there is no mechanism to cause failure from hardware.
> Hence added a debugfs which will cause fake reset failure.
>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_gt.c | 26 +++++++++++++++++++++++++-
> drivers/gpu/drm/xe/xe_gt_debugfs.c | 11 +++++++++++
> drivers/gpu/drm/xe/xe_gt_types.h | 3 +++
> 3 files changed, 39 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index d98761d9eeba..e05615ca13e5 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -301,6 +301,9 @@ int xe_gt_init_early(struct xe_gt *gt)
> {
> int err;
>
> + /* Reset is supported by default */
> + gt->reset_enabled = true;
> +
I'd prefer to name this variable as something more specific
fake_reset_failure_in_progress. In order to avoid future abuses
or misusage of it.
> xe_force_wake_init_gt(gt, gt_to_fw(gt));
>
> err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
> @@ -605,6 +608,16 @@ static void xe_uevent_gt_reset_failure(struct xe_device *xe, u8 id)
> kfree(reset_event[3]);
> }
>
> +static int reset_disabled(struct xe_gt *gt)
> +{
> + return !READ_ONCE(gt->reset_enabled);
> +}
> +
> +static void enable_reset(struct xe_gt *gt)
> +{
> + WRITE_ONCE(gt->reset_enabled, true);
> +}
Why are you using these write/read _once variants?
Do you need some mutex?
> +
> static int gt_reset(struct xe_gt *gt)
> {
> struct xe_device *xe = gt_to_xe(gt);
> @@ -617,8 +630,19 @@ static int gt_reset(struct xe_gt *gt)
> drm_info(&xe->drm, "GT reset started\n");
>
> xe_gt_sanitize(gt);
> -
> xe_device_mem_access_get(gt_to_xe(gt));
> +
> + err = reset_disabled(gt);
> + if (err) {
> + drm_info(&xe->drm, "GT reset is disabled\n");
> +
> + /*Enable GT reset for next call if disabled
> + * for fake reset failure.
> + */
> + enable_reset(gt);
> + goto err_msg;
> + }
> +
> err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> if (err)
> goto err_msg;
> diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> index c45486c2015a..1c3e673c8c60 100644
> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
> @@ -127,6 +127,16 @@ static int register_save_restore(struct seq_file *m, void *data)
> return 0;
> }
>
> +static int fake_reset_failure(struct seq_file *m, void *data)
> +{
> + struct xe_gt *gt = node_to_gt(m->private);
> +
> + WRITE_ONCE(gt->reset_enabled, false);
or you create functions for everything or you don't create
functions for any of them. But be consistent.
> + xe_gt_reset_async(gt);
> +
> + return 0;
> +}
> +
> static const struct drm_info_list debugfs_list[] = {
> {"hw_engines", hw_engines, 0},
> {"force_reset", force_reset, 0},
> @@ -135,6 +145,7 @@ static const struct drm_info_list debugfs_list[] = {
> {"steering", steering, 0},
> {"ggtt", ggtt, 0},
> {"register-save-restore", register_save_restore, 0},
> + {"fake_reset_failure", fake_reset_failure, 0},
> };
>
> void xe_gt_debugfs_register(struct xe_gt *gt)
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index 7c47d67aa8be..7ec19ad0365d 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -175,6 +175,9 @@ struct xe_gt {
> struct work_struct worker;
> } reset;
>
> + /** @reset_enabled: GT supports reset */
> + bool reset_enabled;
> +
> /** @tlb_invalidation: TLB invalidation state */
> struct {
> /** @seqno: TLB invalidation seqno, protected by CT lock */
> --
> 2.25.1
>
More information about the Intel-xe
mailing list