[PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

Luo, Zhigang Zhigang.Luo at amd.com
Tue Apr 30 18:28:13 UTC 2024


[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo <zhigang.luo at amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Yunxiang Li
Sent: Friday, April 26, 2024 2:27 PM
To: amd-gfx at lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Deng, Emily <Emily.Deng at amd.com>; Li, Yunxiang (Teddy) <Yunxiang.Li at amd.com>
Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

There are other reset sources that pass NULL as the job pointer, such as amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the FLR comes from the host does not work.

Add a flag in reset_context to explicitly mark a host-triggered reset, and set this flag when we receive a host reset notification.

Signed-off-by: Yunxiang Li <Yunxiang.Li at amd.com>
---
v2: fix typo
v3: pass reset_context directly
v4: clear the flag in case we retry

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  1 +
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8befd10bf007..33c889c027a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
  *
  * @adev: amdgpu_device pointer
- * @from_hypervisor: request from hypervisor
+ * @reset_context: amdgpu reset context pointer
  *
  * do VF FLR and reinitialize Asic
  * return 0 means succeeded otherwise failed
  */
 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-                                    bool from_hypervisor)
+                                    struct amdgpu_reset_context *reset_context)
 {
        int r;
        struct amdgpu_hive_info *hive = NULL;
@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 retry:
        amdgpu_amdkfd_pre_reset(adev);

-       if (from_hypervisor)
+       if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+               clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
                r = amdgpu_virt_request_full_gpu(adev, true);
-       else
+       } else {
                r = amdgpu_virt_reset_gpu(adev);
+       }
        if (r)
                return r;
+
        amdgpu_ras_set_fed(adev, false);
        amdgpu_irq_gpu_reset_resume_helper(adev);

@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* Actual ASIC resets if needed.*/
        /* Host driver will handle XGMI hive reset for SRIOV */
        if (amdgpu_sriov_vf(adev)) {
-               r = amdgpu_device_reset_sriov(adev, job ? false : true);
+               r = amdgpu_device_reset_sriov(adev, reset_context);
                if (r)
                        adev->asic_reset_res = r;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index b11d190ece53..5a9cc043b858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
        AMDGPU_NEED_FULL_RESET = 0,
        AMDGPU_SKIP_HW_RESET = 1,
        AMDGPU_SKIP_COREDUMP = 2,
+       AMDGPU_HOST_FLR = 3,
 };

 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index c5ba9c4757a8..f4c47492e0cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index fa9d1b02f391..14cc7910e5cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 14a065516ae4..78cd07744ebe 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
--
2.34.1



More information about the amd-gfx mailing list