[RFC v4] drm/amdgpu: Rework reset domain to be refcounted.
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Tue Feb 8 16:19:50 UTC 2022
On 2022-02-08 06:25, Lazar, Lijo wrote:
>
>
> On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:
>> The reset domain contains register access semaphor
>> now and so needs to be present as long as each device
>> in a hive needs it and so it cannot be binded to XGMI
>> hive life cycle.
>> Adress this by making reset domain refcounted and pointed
>> by each member of the hive and the hive itself.
>>
>> v4:
>> Fix crash on boot with XGMI hive by adding type to reset_domain.
>> XGMI will only create a new reset_domain if prevoius was of single
>> device type meaning it's first boot. Otherwsie it will take a
>> refocunt to exsiting reset_domain from the amdgou device.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 +--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +++++++++++++---------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 38 +++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 18 +++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 29 +++++++++++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +-
>> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 +-
>> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 +-
>> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 4 +-
>> 9 files changed, 118 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 8e96b9a14452..f2ba460bfd59 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -813,9 +813,7 @@ struct amd_powerplay {
>> #define AMDGPU_RESET_MAGIC_NUM 64
>> #define AMDGPU_MAX_DF_PERFMONS 4
>> -struct amdgpu_reset_domain {
>> - struct workqueue_struct *wq;
>> -};
>> +struct amdgpu_reset_domain;
>> struct amdgpu_device {
>> struct device *dev;
>> @@ -1102,7 +1100,7 @@ struct amdgpu_device {
>> struct amdgpu_reset_control *reset_cntl;
>> uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>> - struct amdgpu_reset_domain reset_domain;
>> + struct amdgpu_reset_domain *reset_domain;
>> };
>> static inline struct amdgpu_device *drm_to_adev(struct drm_device
>> *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index fef952ca8db5..cd1b7af69c35 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct
>> amdgpu_device *adev)
>> r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>> ring->num_hw_submission, amdgpu_job_hang_limit,
>> - timeout, adev->reset_domain.wq,
>> ring->sched_score, ring->name);
>> + timeout, adev->reset_domain->wq,
>> ring->sched_score, ring->name);
>> if (r) {
>> DRM_ERROR("Failed to create scheduler on ring %s.\n",
>> ring->name);
>> @@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct
>> amdgpu_device *adev)
>> if (r)
>> goto init_failed;
>> + /**
>> + * In case of XGMI grab extra reference for reset domain for
>> this device
>> + */
>> if (adev->gmc.xgmi.num_physical_nodes > 1) {
>> - struct amdgpu_hive_info *hive;
>> -
>> - amdgpu_xgmi_add_device(adev);
>> + if (amdgpu_xgmi_add_device(adev) == 0) {
>> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
>> - hive = amdgpu_get_xgmi_hive(adev);
>> - if (!hive || !hive->reset_domain.wq) {
>> - DRM_ERROR("Failed to obtain reset domain info for XGMI
>> hive:%llx", hive->hive_id);
>> - r = -EINVAL;
>> - goto init_failed;
>> - }
>> + if (!hive->reset_domain ||
>> + !kref_get_unless_zero(&hive->reset_domain->refcount)) {
>> + r = -ENOENT;
>> + goto init_failed;
>> + }
>> - adev->reset_domain.wq = hive->reset_domain.wq;
>> - } else {
>> - adev->reset_domain.wq =
>> alloc_ordered_workqueue("amdgpu-reset-dev", 0);
>> - if (!adev->reset_domain.wq) {
>> - r = -ENOMEM;
>> - goto init_failed;
>> + /* Drop the early temporary reset domain we created for
>> device */
>> + kref_put(&adev->reset_domain->refcount,
>> amdgpu_reset_destroy_reset_domain);
>> + adev->reset_domain = hive->reset_domain;
>> }
>> }
>> @@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device
>> *adev,
>> return r;
>> }
>> + /*
>> + * Reset domain needs to be present early, before XGMI hive
>> discovered
>> + * (if any) and intitialized to use reset sem and in_gpu reset flag
>> + * early on during init.
>> + */
>> + adev->reset_domain =
>> amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");
>> + if (!adev->reset_domain)
>> + return -ENOMEM;
>> +
>> /* early init functions */
>> r = amdgpu_device_ip_early_init(adev);
>> if (r)
>> @@ -3949,6 +3956,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device
>> *adev)
>> if (adev->mman.discovery_bin)
>> amdgpu_discovery_fini(adev);
>> + kref_put(&adev->reset_domain->refcount,
>> amdgpu_reset_destroy_reset_domain);
>> + adev->reset_domain = NULL;
>> +
>> kfree(adev->pci_state);
>> }
>> @@ -5186,7 +5196,7 @@ int amdgpu_device_gpu_recover(struct
>> amdgpu_device *adev,
>> INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
>> - if (!queue_work(adev->reset_domain.wq, &work.base))
>> + if (!queue_work(adev->reset_domain->wq, &work.base))
>> return -EAGAIN;
>> flush_work(&work.base);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 02afd4115675..14f003d5ebe8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -96,3 +96,41 @@ int amdgpu_reset_perform_reset(struct
>> amdgpu_device *adev,
>> return reset_handler->restore_hwcontext(adev->reset_cntl,
>> reset_context);
>> }
>> +
>> +
>> +void amdgpu_reset_destroy_reset_domain(struct kref *ref)
>> +{
>> + struct amdgpu_reset_domain *reset_domain = container_of(ref,
>> + struct amdgpu_reset_domain,
>> + refcount);
>> + destroy_workqueue(reset_domain->wq);
>> + kvfree(reset_domain);
>> +}
>> +
>> +struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum
>> amdgpu_reset_domain_type type,
>> + char *wq_name)
>> +{
>> + struct amdgpu_reset_domain *reset_domain;
>> +
>> + reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain),
>> GFP_KERNEL);
>> + if (!reset_domain) {
>> + DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
>> + return NULL;
>> + }
>> +
>> + reset_domain->type = type;
>> + kref_init(&reset_domain->refcount);
>> +
>> + reset_domain->wq = create_singlethread_workqueue(wq_name);
>> + if (!reset_domain->wq) {
>> + DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
>> + kref_put(&reset_domain->refcount,
>> amdgpu_reset_destroy_reset_domain);
>> + return NULL;
>> +
>> + }
>> +
>> + return reset_domain;
>> +}
>> +
>> +
>> +
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index e00d38d9160a..0b310cd963d9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -70,6 +70,19 @@ struct amdgpu_reset_control {
>> void (*async_reset)(struct work_struct *work);
>> };
>> +
>> +enum amdgpu_reset_domain_type {
>> + SINGLE_DEVICE,
>> + XGMI_HIVE
>> +};
>> +
>> +struct amdgpu_reset_domain {
>> + struct kref refcount;
>> + struct workqueue_struct *wq;
>> + enum amdgpu_reset_domain_type type;
>> +};
>> +
>> +
>> int amdgpu_reset_init(struct amdgpu_device *adev);
>> int amdgpu_reset_fini(struct amdgpu_device *adev);
>> @@ -82,4 +95,9 @@ int amdgpu_reset_perform_reset(struct
>> amdgpu_device *adev,
>> int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
>> struct amdgpu_reset_handler *handler);
>> +struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum
>> amdgpu_reset_domain_type type,
>> + char *wq_name);
>> +
>> +void amdgpu_reset_destroy_reset_domain(struct kref *ref);
>> +
>> #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 9ad742039ac9..a216e88a7b54 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -32,6 +32,8 @@
>> #include "wafl/wafl2_4_0_0_smn.h"
>> #include "wafl/wafl2_4_0_0_sh_mask.h"
>> +#include "amdgpu_reset.h"
>> +
>> #define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
>> #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
>> #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
>> @@ -226,6 +228,9 @@ static void amdgpu_xgmi_hive_release(struct
>> kobject *kobj)
>> struct amdgpu_hive_info *hive = container_of(
>> kobj, struct amdgpu_hive_info, kobj);
>> + kref_put(&hive->reset_domain->refcount,
>> amdgpu_reset_destroy_reset_domain);
>> + hive->reset_domain = NULL;
>> +
>> mutex_destroy(&hive->hive_lock);
>> kfree(hive);
>> }
>> @@ -392,12 +397,24 @@ struct amdgpu_hive_info
>> *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
>> goto pro_end;
>> }
>> - hive->reset_domain.wq =
>> alloc_ordered_workqueue("amdgpu-reset-hive", 0);
>> - if (!hive->reset_domain.wq) {
>> - dev_err(adev->dev, "XGMI: failed allocating wq for reset
>> domain!\n");
>> - kfree(hive);
>> - hive = NULL;
>> - goto pro_end;
>> + /**
>> + * Avoid recreating reset domain when hive is reconstructed for
>> the case
>> + * of reset the devices in the XGMI hive during probe for SRIOV
>> + * See https://www.spinics.net/lists/amd-gfx/msg58836.html
>> + */
>> + if (adev->reset_domain->type != XGMI_HIVE) {
>> + hive->reset_domain =
>> amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
>> + if (!hive->reset_domain) {
>> + dev_err(adev->dev, "XGMI: failed initializing reset
>> domain for xgmi hive\n");
>> + ret = -ENOMEM;
>> + kobject_put(&hive->kobj);
>> + kfree(hive);
>> + hive = NULL;
>> + goto pro_end;
>> + }
>> + } else {
>> + kref_get_unless_zero(&adev->reset_domain->refcount);
>> + hive->reset_domain = adev->reset_domain;
>
> Suggest to wrap this like -
> amdgpu_reset_domain_get(reset_domain)
>
> and another like
> amdgpu_reset_domain_put(reset_domain)
I already use kref_put, kref_get on reset domain so this will be
confusing to use same naming for
reset domain creation a bit I think to use it for creation.
I can do those wrappers around the direct usage of kref_put/kref_get
>
> Even smaller wrappers like
> amdgpu_reset_domain_schedule(reset_domain, reset_work)
This really would be a one line syntactic wrapper but sure
>
> Other than that, looks good to me (need to combine with earlier series
> as this misses a few other members in reset domain).
It's all because i added this and later patches on top of existing old
patches since i had a lot of merge conflicts if i put this changes in
original patches.
See patches 9 and 10 for moving the other members into reset domain.
Andrey
>
> Thanks,
> Lijo
>
>> }
>> hive->hive_id = adev->gmc.xgmi.hive_id;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> index 2f2ce53645a5..dcaad22be492 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> @@ -42,7 +42,7 @@ struct amdgpu_hive_info {
>> AMDGPU_XGMI_PSTATE_UNKNOWN
>> } pstate;
>> - struct amdgpu_reset_domain reset_domain;
>> + struct amdgpu_reset_domain *reset_domain;
>> };
>> struct amdgpu_pcs_ras_field {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> index b2b40e169342..05e98af30b48 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> @@ -32,6 +32,8 @@
>> #include "soc15_common.h"
>> #include "mxgpu_ai.h"
>> +#include "amdgpu_reset.h"
>> +
>> static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
>> {
>> WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
>> @@ -302,7 +304,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct
>> amdgpu_device *adev,
>> switch (event) {
>> case IDH_FLR_NOTIFICATION:
>> if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
>> - WARN_ONCE(!queue_work(adev->reset_domain.wq,
>> + WARN_ONCE(!queue_work(adev->reset_domain->wq,
>> &adev->virt.flr_work),
>> "Failed to queue work! at %s",
>> __func__);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> index 08411924150d..6e12055f3f4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> @@ -31,6 +31,8 @@
>> #include "soc15_common.h"
>> #include "mxgpu_nv.h"
>> +#include "amdgpu_reset.h"
>> +
>> static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev)
>> {
>> WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
>> @@ -337,7 +339,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct
>> amdgpu_device *adev,
>> switch (event) {
>> case IDH_FLR_NOTIFICATION:
>> if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
>> - WARN_ONCE(!queue_work(adev->reset_domain.wq,
>> + WARN_ONCE(!queue_work(adev->reset_domain->wq,
>> &adev->virt.flr_work),
>> "Failed to queue work! at %s",
>> __func__);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> index 02290febfcf4..fe1570c2146e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>> @@ -42,6 +42,8 @@
>> #include "smu/smu_7_1_3_d.h"
>> #include "mxgpu_vi.h"
>> +#include "amdgpu_reset.h"
>> +
>> /* VI golden setting */
>> static const u32 xgpu_fiji_mgcg_cgcg_init[] = {
>> mmRLC_CGTT_MGCG_OVERRIDE, 0xffffffff, 0xffffffff,
>> @@ -551,7 +553,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct
>> amdgpu_device *adev,
>> /* only handle FLR_NOTIFY now */
>> if (!r && !amdgpu_in_reset(adev))
>> - WARN_ONCE(!queue_work(adev->reset_domain.wq,
>> + WARN_ONCE(!queue_work(adev->reset_domain->wq,
>> &adev->virt.flr_work),
>> "Failed to queue work! at %s",
>> __func__);
>>
More information about the amd-gfx
mailing list