[Patch v2 3/4] drm/amdkfd: refactor runtime pm for baco
Bhardwaj, Rajneesh
rajneesh.bhardwaj at amd.com
Wed Feb 5 00:57:15 UTC 2020
Hi Felix,
Thanks for the review feedback!
On 2/4/2020 4:28 PM, Felix Kuehling wrote:
> On 2020-01-31 10:37 p.m., Rajneesh Bhardwaj wrote:
>> So far the kfd driver implemented same routines for runtime and system
>> wide suspend and resume (s2idle or mem). During system wide suspend the
>> kfd aquires an atomic lock that prevents any more user processes to
>> create queues and interact with kfd driver and amd gpu. This mechanism
>> created problem when amdgpu device is runtime suspended with BACO
>> enabled. Any application that relies on kfd driver fails to load because
>> the driver reports a locked kfd device since gpu is runtime suspended.
>>
>> However, in an ideal case, when gpu is runtime suspended the kfd driver
>> should be able to:
>>
>> - auto resume amdgpu driver whenever a client requests compute service
>> - prevent runtime suspend for amdgpu while kfd is in use
>>
>> This change refactors the amdgpu and amdkfd drivers to support BACO and
>> runtime power management.
>>
>> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>
> One small comment inline. Other than that patches 1-3 are
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
> Also, I believe patch 1 is unchanged from v1 and already got a
> Reviewed-by from Alex. Please remember to add that tag before you submit.
You mean https://www.spinics.net/lists/amd-gfx/msg44689.html? I have
already added Alex's RB tag.
>
>
> The last patch that enabled runtime PM by default, I'd leave the
> decision to submit that up to Alex. There may be other considerations
> than just KFD.
Sure, but its needed along with the series otherwise we can't test/use it.
>
>
> See inline ...
>
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 +++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 8 ++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +-
>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 29 +++++++++------
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 43 ++++++++++++++++++++--
>> 6 files changed, 70 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 8609287620ea..314c4a2a0354 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -178,18 +178,18 @@ void amdgpu_amdkfd_interrupt(struct
>> amdgpu_device *adev,
>> kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
>> }
>> -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev)
>> +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
>> {
>> if (adev->kfd.dev)
>> - kgd2kfd_suspend(adev->kfd.dev);
>> + kgd2kfd_suspend(adev->kfd.dev, run_pm);
>> }
>> -int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
>> +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
>> {
>> int r = 0;
>> if (adev->kfd.dev)
>> - r = kgd2kfd_resume(adev->kfd.dev);
>> + r = kgd2kfd_resume(adev->kfd.dev, run_pm);
>> return r;
>> }
>> @@ -713,11 +713,11 @@ void kgd2kfd_exit(void)
>> {
>> }
>> -void kgd2kfd_suspend(struct kfd_dev *kfd)
>> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>> {
>> }
>> -int kgd2kfd_resume(struct kfd_dev *kfd)
>> +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>> {
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 47b0f2957d1f..9e8db702d878 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -122,8 +122,8 @@ struct amdkfd_process_info {
>> int amdgpu_amdkfd_init(void);
>> void amdgpu_amdkfd_fini(void);
>> -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
>> -int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
>> +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
>> +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
>> void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>> const void *ih_ring_entry);
>> void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
>> @@ -249,8 +249,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>> struct drm_device *ddev,
>> const struct kgd2kfd_shared_resources *gpu_resources);
>> void kgd2kfd_device_exit(struct kfd_dev *kfd);
>> -void kgd2kfd_suspend(struct kfd_dev *kfd);
>> -int kgd2kfd_resume(struct kfd_dev *kfd);
>> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>> +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
>> int kgd2kfd_pre_reset(struct kfd_dev *kfd);
>> int kgd2kfd_post_reset(struct kfd_dev *kfd);
>> void kgd2kfd_interrupt(struct kfd_dev *kfd, const void
>> *ih_ring_entry);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5030a09babb8..43843e6c4bcd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3311,7 +3311,7 @@ int amdgpu_device_suspend(struct drm_device
>> *dev, bool fbcon)
>> }
>> }
>> - amdgpu_amdkfd_suspend(adev);
>> + amdgpu_amdkfd_suspend(adev, !fbcon);
>> amdgpu_ras_suspend(adev);
>> @@ -3395,7 +3395,7 @@ int amdgpu_device_resume(struct drm_device
>> *dev, bool fbcon)
>> }
>> }
>> }
>> - r = amdgpu_amdkfd_resume(adev);
>> + r = amdgpu_amdkfd_resume(adev, !fbcon);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 798ad1c8f799..42ee9ea5c45a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -732,7 +732,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>> void kgd2kfd_device_exit(struct kfd_dev *kfd)
>> {
>> if (kfd->init_complete) {
>> - kgd2kfd_suspend(kfd);
>> + kgd2kfd_suspend(kfd, false);
>> device_queue_manager_uninit(kfd->dqm);
>> kfd_interrupt_exit(kfd);
>> kfd_topology_remove_device(kfd);
>> @@ -753,7 +753,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>> kfd->dqm->ops.pre_reset(kfd->dqm);
>> - kgd2kfd_suspend(kfd);
>> + kgd2kfd_suspend(kfd, false);
>> kfd_signal_reset_event(kfd);
>> return 0;
>> @@ -787,21 +787,23 @@ bool kfd_is_locked(void)
>> return (atomic_read(&kfd_locked) > 0);
>> }
>> -void kgd2kfd_suspend(struct kfd_dev *kfd)
>> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>> {
>> if (!kfd->init_complete)
>> return;
>> - /* For first KFD device suspend all the KFD processes */
>> - if (atomic_inc_return(&kfd_locked) == 1)
>> - kfd_suspend_all_processes();
>> + /* for runtime suspend, skip locking kfd */
>> + if (!run_pm) {
>> + /* For first KFD device suspend all the KFD processes */
>> + if (atomic_inc_return(&kfd_locked) == 1)
>> + kfd_suspend_all_processes();
>> + }
>> kfd->dqm->ops.stop(kfd->dqm);
>> -
>> kfd_iommu_suspend(kfd);
>> }
>> -int kgd2kfd_resume(struct kfd_dev *kfd)
>> +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>> {
>> int ret, count;
>> @@ -812,10 +814,13 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
>> if (ret)
>> return ret;
>> - count = atomic_dec_return(&kfd_locked);
>> - WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
>> - if (count == 0)
>> - ret = kfd_resume_all_processes();
>> + /* for runtime resume, skip unlocking kfd */
>> + if (!run_pm) {
>> + count = atomic_dec_return(&kfd_locked);
>> + WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
>> + if (count == 0)
>> + ret = kfd_resume_all_processes();
>> + }
>> return ret;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index c0b0defc8f7a..20dd4747250d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -647,6 +647,7 @@ struct kfd_process_device {
>> * function.
>> */
>> bool already_dequeued;
>> + bool runtime_inuse;
>> /* Is this process/pasid bound to this device?
>> (amd_iommu_bind_pasid) */
>> enum kfd_pdd_bound bound;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 25b90f70aecd..6907a5a2cbc8 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -31,6 +31,7 @@
>> #include <linux/compat.h>
>> #include <linux/mman.h>
>> #include <linux/file.h>
>> +#include <linux/pm_runtime.h>
>> #include "amdgpu_amdkfd.h"
>> #include "amdgpu.h"
>> @@ -440,6 +441,16 @@ static void kfd_process_destroy_pdds(struct
>> kfd_process *p)
>> kfree(pdd->qpd.doorbell_bitmap);
>> idr_destroy(&pdd->alloc_idr);
>> + /*
>> + * before destroying pdd, make sure to report availability
>> + * for auto suspend
>> + */
>> + if (pdd->runtime_inuse) {
>> + pm_runtime_mark_last_busy(pdd->dev->ddev->dev);
>> + pm_runtime_put_autosuspend(pdd->dev->ddev->dev);
>> + pdd->runtime_inuse = false;
>> + }
>> +
>> kfree(pdd);
>> }
>> }
>> @@ -754,6 +765,7 @@ struct kfd_process_device
>> *kfd_create_process_device_data(struct kfd_dev *dev,
>> pdd->process = p;
>> pdd->bound = PDD_UNBOUND;
>> pdd->already_dequeued = false;
>> + pdd->runtime_inuse = false;
>> list_add(&pdd->per_device_list, &p->per_device_data);
>> /* Init idr used for memory handle translation */
>> @@ -843,15 +855,40 @@ struct kfd_process_device
>> *kfd_bind_process_to_device(struct kfd_dev *dev,
>> return ERR_PTR(-ENOMEM);
>> }
>> + /*
>> + * signal runtime-pm system to auto resume and prevent
>> + * further runtime suspend once device pdd is created until
>> + * pdd is destroyed.
>> + */
>> + if (!pdd->runtime_inuse) {
>> + err = pm_runtime_get_sync(dev->ddev->dev);
>> + if (err < 0)
>> + return ERR_PTR(err);
>> + }
>> +
>> err = kfd_iommu_bind_process_to_device(pdd);
>> if (err)
>> - return ERR_PTR(err);
>> + goto out;
>> err = kfd_process_device_init_vm(pdd, NULL);
>> if (err)
>> - return ERR_PTR(err);
>> + goto out;
>> - return pdd;
>> + if (!err) {
>> + /*
>> + * make sure that runtime_usage counter is incremented
>> + * just once per pdd
>> + */
>> + if (!pdd->runtime_inuse)
>> + pdd->runtime_inuse = true;
>
> The "if" is redundant here. You can just set pdd->runtime_inuse = true
> unconditionally.
> Regards,
> Felix
>
Thanks
Rajneesh
>> +
>> + return pdd;
>> + }
>> +out:
>> + /* balance runpm reference count and exit with error */
>> + pm_runtime_mark_last_busy(dev->ddev->dev);
>> + pm_runtime_put_autosuspend(dev->ddev->dev);
>> + return ERR_PTR(err);
>> }
>> struct kfd_process_device *kfd_get_first_process_device_data(
More information about the amd-gfx
mailing list