[PATCH 2/5] drm/amdgpu: Move to reset_schedule_work
Lijo Lazar
lijo.lazar at amd.com
Fri Aug 11 06:02:31 UTC 2023
Move the recovery handlers over to the reset_schedule_work interface. Make
use of the work pool in the reset domain and delete the individual
per-client work items.
Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
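Note for reviewers: amdgpu_reset_schedule_work() itself is introduced
earlier in this series, so its definition is not visible in this patch.
The sketch below is only an illustration of the contract the call sites
here appear to rely on -- copy the caller's on-stack reset context into a
work item drawn from the reset domain's pool, remember the handler, and
queue it, returning nonzero on success like the amdgpu_reset_domain_schedule()
helper it replaces. The names amdgpu_reset_work_item,
amdgpu_reset_get_work_item() and amdgpu_reset_put_work_item() are
assumptions; only struct amdgpu_reset_context, struct amdgpu_reset_domain
and reset_domain->wq exist in the current code.

	/* Illustrative sketch only, not the real implementation. */
	struct amdgpu_reset_work_item {
		struct work_struct base;
		/* private copy; outlives the caller's stack frame */
		struct amdgpu_reset_context context;
		void (*handler)(struct amdgpu_reset_context *reset_context);
	};

	static void amdgpu_reset_work_fn(struct work_struct *work)
	{
		struct amdgpu_reset_work_item *item =
			container_of(work, struct amdgpu_reset_work_item, base);

		item->handler(&item->context);
		amdgpu_reset_put_work_item(item); /* assumed: return to pool */
	}

	/* Assumed to return nonzero when queued, like queue_work(). */
	bool amdgpu_reset_schedule_work(struct amdgpu_device *adev,
					struct amdgpu_reset_context *reset_context,
					void (*handler)(struct amdgpu_reset_context *))
	{
		struct amdgpu_reset_work_item *item;

		/* assumed: take a free item from the reset domain's pool */
		item = amdgpu_reset_get_work_item(adev->reset_domain);
		if (!item)
			return false;

		item->context = *reset_context;
		item->handler = handler;
		INIT_WORK(&item->base, amdgpu_reset_work_fn);

		return queue_work(adev->reset_domain->wq, &item->base);
	}

Under that assumed contract, the converted handlers no longer need
container_of() plumbing or per-client INIT_WORK(), and the on-stack
reset_context in the interrupt paths (see the xgpu_*_mailbox_rcv_irq()
hunks) is safe because the helper takes its own copy before the caller
returns. The debugfs path also sets AMDGPU_RESET_SCHEDULE_NOW, presumably
added earlier in the series, in place of its old flush_work() call.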
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 32 +++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 -----
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 40 ++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 71 +++++++++++-----------
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 -
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 38 ++++++------
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 44 ++++++--------
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 33 +++++-----
10 files changed, 118 insertions(+), 159 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2e3c7c15cb8e..4186d8342a15 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1096,8 +1096,6 @@ struct amdgpu_device {
bool scpm_enabled;
uint32_t scpm_status;
- struct work_struct reset_work;
-
bool job_hang;
bool dc_enabled;
/* Mask of active clusters */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 629ca1ad75a8..e4c5e8f68843 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -120,21 +120,10 @@ static void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev,
}
}
-
-static void amdgpu_amdkfd_reset_work(struct work_struct *work)
+static void amdgpu_amdkfd_reset_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
- kfd.reset_work);
-
- struct amdgpu_reset_context reset_context;
-
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+ amdgpu_device_gpu_recover(reset_context->reset_req_dev, NULL,
+ reset_context);
}
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
@@ -200,7 +189,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size;
- INIT_WORK(&adev->kfd.reset_work, amdgpu_amdkfd_reset_work);
}
}
@@ -268,9 +256,17 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev)
void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
{
- if (amdgpu_device_should_recover_gpu(adev))
- amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->kfd.reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+ amdgpu_reset_schedule_work(adev, &reset_context,
+ amdgpu_amdkfd_reset_work);
+ }
}
int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b34418e3e006..c36501f9ae0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -102,7 +102,6 @@ struct amdgpu_kfd_dev {
int64_t vram_used[MAX_XCP];
uint64_t vram_used_aligned[MAX_XCP];
bool init_complete;
- struct work_struct reset_work;
/* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
struct dev_pagemap pgmap;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9061d79cd387..3e56ccb742bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5152,21 +5152,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
-#if defined(CONFIG_DEBUG_FS)
- if (!amdgpu_sriov_vf(adev))
- cancel_work(&adev->reset_work);
-#endif
-
- if (adev->kfd.dev)
- cancel_work(&adev->kfd.reset_work);
-
- if (amdgpu_sriov_vf(adev))
- cancel_work(&adev->virt.flr_work);
-
- if (con && adev->ras_enabled)
- cancel_work(&con->recovery_work);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index c694b41f6461..40786b135f4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -899,6 +899,14 @@ static int amdgpu_debugfs_fence_info_show(struct seq_file *m, void *unused)
return 0;
}
+static void
+amdgpu_debugfs_reset_work(struct amdgpu_reset_context *reset_context)
+{
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
+
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
+}
+
/*
* amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
*
@@ -908,6 +916,7 @@ static int gpu_recover_get(void *data, u64 *val)
{
struct amdgpu_device *adev = (struct amdgpu_device *)data;
struct drm_device *dev = adev_to_drm(adev);
+ struct amdgpu_reset_context reset_context;
int r;
r = pm_runtime_get_sync(dev->dev);
@@ -916,8 +925,14 @@ static int gpu_recover_get(void *data, u64 *val)
return 0;
}
- if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
- flush_work(&adev->reset_work);
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_RESET_SCHEDULE_NOW, &reset_context.flags);
+
+ amdgpu_reset_schedule_work(adev, &reset_context,
+ amdgpu_debugfs_reset_work);
*val = atomic_read(&adev->reset_domain->reset_res);
@@ -931,22 +946,6 @@ DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get, NULL,
"%lld\n");
-static void amdgpu_debugfs_reset_work(struct work_struct *work)
-{
- struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
- reset_work);
-
- struct amdgpu_reset_context reset_context;
-
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-}
-
#endif
void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
@@ -958,12 +957,9 @@ void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
debugfs_create_file("amdgpu_fence_info", 0444, root, adev,
&amdgpu_debugfs_fence_info_fops);
- if (!amdgpu_sriov_vf(adev)) {
-
- INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
+ if (!amdgpu_sriov_vf(adev))
debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
&amdgpu_debugfs_gpu_recover_fops);
- }
#endif
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7689395e44fd..9e8e904434f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2011,12 +2011,11 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
return ret;
}
-static void amdgpu_ras_do_recovery(struct work_struct *work)
+static void amdgpu_ras_do_recovery(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_ras *ras =
- container_of(work, struct amdgpu_ras, recovery_work);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct amdgpu_device *remote_adev = NULL;
- struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle = NULL;
if (!ras->disable_ras_err_cnt_harvest) {
@@ -2040,37 +2039,9 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_put_xgmi_hive(hive);
}
- if (amdgpu_device_should_recover_gpu(ras->adev)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
-
- /* Perform full reset in fatal error mode */
- if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- else {
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
- ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
- reset_context.method = AMD_RESET_METHOD_MODE2;
- }
-
- /* Fatal error occurs in poison mode, mode1 reset is used to
- * recover gpu.
- */
- if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
- ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- psp_fatal_error_recovery_quirk(&adev->psp);
- }
- }
+ if (amdgpu_device_should_recover_gpu(ras->adev))
+ amdgpu_device_gpu_recover(ras->adev, NULL, reset_context);
- amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
- }
atomic_set(&ras->in_recovery, 0);
}
@@ -2313,7 +2284,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
mutex_init(&con->recovery_lock);
- INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
@@ -3160,9 +3130,38 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+
+ /* Perform full reset in fatal error mode */
+ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) {
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ } else {
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+ ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ reset_context.method = AMD_RESET_METHOD_MODE2;
+ }
+
+ /* Fatal error occurs in poison mode, mode1 reset is used to
+ * recover gpu.
+ */
+ if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
+ ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ psp_fatal_error_recovery_quirk(&adev->psp);
+ }
+ }
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
- amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
+ amdgpu_reset_schedule_work(ras->adev, &reset_context,
+ amdgpu_ras_do_recovery);
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index fabb83e9d9ae..87e0a8b918df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -237,7 +237,6 @@ struct amdgpu_virt {
uint32_t reg_val_offs;
struct amdgpu_irq_src ack_irq;
struct amdgpu_irq_src rcv_irq;
- struct work_struct flr_work;
struct amdgpu_mm_table mm_table;
const struct amdgpu_virt_ops *ops;
struct amdgpu_vf_error_buffer vf_errors;
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 63725b2ebc03..53fdf6e70ad2 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -249,10 +249,9 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
return 0;
}
-static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
+static void xgpu_ai_mailbox_flr_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
- struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
@@ -281,18 +280,10 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
up_write(&adev->reset_domain->sem);
/* Trigger recovery for world switch failure if no TDR */
- if (amdgpu_device_should_recover_gpu(adev)
- && (!amdgpu_device_has_job_running(adev) ||
- adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ if (amdgpu_device_should_recover_gpu(adev) &&
+ (!amdgpu_device_has_job_running(adev) ||
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
}
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -314,14 +305,21 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
+ struct amdgpu_reset_context reset_context;
switch (event) {
case IDH_FLR_NOTIFICATION:
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
- WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ WARN_ONCE(!amdgpu_reset_schedule_work(
+ adev, &reset_context,
+ xgpu_ai_mailbox_flr_work),
+ "Failed to queue work! at %s", __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
@@ -388,8 +386,6 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6a68ee946f1c..171fe3e84ddf 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -271,10 +271,9 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
return 0;
}
-static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
+static void xgpu_nv_mailbox_flr_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
- struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
@@ -303,21 +302,13 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
up_write(&adev->reset_domain->sem);
/* Trigger recovery for world switch failure if no TDR */
- if (amdgpu_device_should_recover_gpu(adev)
- && (!amdgpu_device_has_job_running(adev) ||
- adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
- adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
- adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
- adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ if (amdgpu_device_should_recover_gpu(adev) &&
+ (!amdgpu_device_has_job_running(adev) ||
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+ adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
+ adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
+ adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
}
static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -342,14 +333,21 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
enum idh_event event = xgpu_nv_mailbox_peek_msg(adev);
+ struct amdgpu_reset_context reset_context;
switch (event) {
case IDH_FLR_NOTIFICATION:
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
- WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ WARN_ONCE(!amdgpu_reset_schedule_work(
+ adev, &reset_context,
+ xgpu_nv_mailbox_flr_work),
+ "Failed to queue work! at %s", __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
* it byfar since that polling thread will handle it,
@@ -413,8 +411,6 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 59f53c743362..a39805bc69c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -510,10 +510,9 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
return 0;
}
-static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
+static void xgpu_vi_mailbox_flr_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
- struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
/* wait until RCV_MSG become 3 */
if (xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL)) {
@@ -522,16 +521,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
}
/* Trigger recovery due to world switch failure */
- if (amdgpu_device_should_recover_gpu(adev)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ if (amdgpu_device_should_recover_gpu(adev))
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
}
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -553,18 +544,24 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
int r;
+ struct amdgpu_reset_context reset_context;
/* trigger gpu-reset by hypervisor only if TDR disabled */
if (!amdgpu_gpu_recovery) {
/* see what event we get */
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/* only handle FLR_NOTIFY now */
if (!r && !amdgpu_in_reset(adev))
- WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ WARN_ONCE(!amdgpu_reset_schedule_work(
+ adev, &reset_context,
+ xgpu_vi_mailbox_flr_work),
+ "Failed to queue work! at %s", __func__);
}
return 0;
@@ -618,8 +615,6 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
-
return 0;
}
--
2.25.1