[PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

Deng, Emily Emily.Deng at amd.com
Sun Apr 28 07:16:23 UTC 2024


[AMD Official Use Only - General]

Reviewed-by: Emily Deng <Emily.Deng at amd.com>

Emily Deng
Best Wishes



>-----Original Message-----
>From: Li, Yunxiang (Teddy) <Yunxiang.Li at amd.com>
>Sent: Saturday, April 27, 2024 2:27 AM
>To: amd-gfx at lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian
><Christian.Koenig at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Kuehling,
>Felix <Felix.Kuehling at amd.com>; Deng, Emily <Emily.Deng at amd.com>; Li,
>Yunxiang (Teddy) <Yunxiang.Li at amd.com>
>Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR
>
>There are other reset sources that pass NULL as the job pointer, such as
>amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the
>FLR comes from the host does not work.
>
>Add a flag in reset_context to explicitly mark a host-triggered reset, and set
>this flag when we receive a host reset notification.
>
>Signed-off-by: Yunxiang Li <Yunxiang.Li at amd.com>
>---
>v2: fix typo
>v3: pass reset_context directly
>v4: clear the flag in case we retry
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  1 +
> 5 files changed, 12 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 8befd10bf007..33c889c027a5 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct
>amdgpu_device *adev)
>  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
>  *
>  * @adev: amdgpu_device pointer
>- * @from_hypervisor: request from hypervisor
>+ * @reset_context: amdgpu reset context pointer
>  *
>  * do VF FLR and reinitialize Asic
>  * return 0 means succeeded otherwise failed
>  */
> static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>-                                   bool from_hypervisor)
>+                                   struct amdgpu_reset_context *reset_context)
> {
>       int r;
>       struct amdgpu_hive_info *hive = NULL;
>@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct
>amdgpu_device *adev,
> retry:
>       amdgpu_amdkfd_pre_reset(adev);
>
>-      if (from_hypervisor)
>+      if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
>+              clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
>               r = amdgpu_virt_request_full_gpu(adev, true);
>-      else
>+      } else {
>               r = amdgpu_virt_reset_gpu(adev);
>+      }
>       if (r)
>               return r;
>+
>       amdgpu_ras_set_fed(adev, false);
>       amdgpu_irq_gpu_reset_resume_helper(adev);
>
>@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct
>amdgpu_device *adev,
>       /* Actual ASIC resets if needed.*/
>       /* Host driver will handle XGMI hive reset for SRIOV */
>       if (amdgpu_sriov_vf(adev)) {
>-              r = amdgpu_device_reset_sriov(adev, job ? false : true);
>+              r = amdgpu_device_reset_sriov(adev, reset_context);
>               if (r)
>                       adev->asic_reset_res = r;
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>index b11d190ece53..5a9cc043b858 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
>       AMDGPU_NEED_FULL_RESET = 0,
>       AMDGPU_SKIP_HW_RESET = 1,
>       AMDGPU_SKIP_COREDUMP = 2,
>+      AMDGPU_HOST_FLR = 3,
> };
>
> struct amdgpu_reset_context {
>diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>index c5ba9c4757a8..f4c47492e0cd 100644
>--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>               reset_context.method = AMD_RESET_METHOD_NONE;
>               reset_context.reset_req_dev = adev;
>               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>+              set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
>
>               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>       }
>diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>index fa9d1b02f391..14cc7910e5cf 100644
>--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>               reset_context.method = AMD_RESET_METHOD_NONE;
>               reset_context.reset_req_dev = adev;
>               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>+              set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
>
>               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>       }
>diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>index 14a065516ae4..78cd07744ebe 100644
>--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct
>work_struct *work)
>               reset_context.method = AMD_RESET_METHOD_NONE;
>               reset_context.reset_req_dev = adev;
>               clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>+              set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
>
>               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>       }
>--
>2.34.1



More information about the amd-gfx mailing list