Am 25.01.22 um 23:37 schrieb Andrey Grodzovsky:
Defined a reset_domain struct such that all the entities that go through reset together will be serialized one against another. Do it for both single device and XGMI hive cases.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com Suggested-by: Daniel Vetter daniel.vetter@ffwll.ch Suggested-by: Christian König ckoenig.leichtzumerken@gmail.com Reviewed-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++ 4 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9f017663ac50..b5ff76aae7e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -812,6 +812,11 @@ struct amd_powerplay {
#define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4
+struct amdgpu_reset_domain {
- struct workqueue_struct *wq;
+};
- struct amdgpu_device { struct device *dev; struct pci_dev *pdev;
@@ -1096,6 +1101,8 @@ struct amdgpu_device {
struct amdgpu_reset_control *reset_cntl; uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
- struct amdgpu_reset_domain reset_domain;
I'm a bit confused, shouldn't this be a pointer?
Regards, Christian.
};
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 90d22a376632..0f3e6c078f88 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2391,9 +2391,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed;
- if (adev->gmc.xgmi.num_physical_nodes > 1)
if (adev->gmc.xgmi.num_physical_nodes > 1) {
struct amdgpu_hive_info *hive;
amdgpu_xgmi_add_device(adev);
hive = amdgpu_get_xgmi_hive(adev);
if (!hive || !hive->reset_domain.wq) {
DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
r = -EINVAL;
goto init_failed;
}
adev->reset_domain.wq = hive->reset_domain.wq;
} else {
adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
if (!adev->reset_domain.wq) {
r = -ENOMEM;
goto init_failed;
}
}
/* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 567df2db23ac..a858e3457c5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -392,6 +392,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; }
- hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
- if (!hive->reset_domain.wq) {
dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
kfree(hive);
hive = NULL;
goto pro_end;
- }
- hive->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node);
@@ -401,6 +409,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->hi_req_gpu = NULL;
- /*
- hive pstate on boot is high in vega20 so we have to go to low
- pstate on after boot.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index d2189bf7d428..6121aaa292cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -42,6 +42,8 @@ struct amdgpu_hive_info { AMDGPU_XGMI_PSTATE_MAX_VEGA20, AMDGPU_XGMI_PSTATE_UNKNOWN } pstate;
struct amdgpu_reset_domain reset_domain; };
struct amdgpu_pcs_ras_field {