[Patch v2 3/4] drm/amdkfd: refactor runtime pm for baco

Felix Kuehling felix.kuehling at amd.com
Wed Feb 5 01:59:15 UTC 2020


On 2020-02-04 7:57 p.m., Bhardwaj, Rajneesh wrote:
> Hi Felix,
>
> Thanks for the review feedback!
>
> On 2/4/2020 4:28 PM, Felix Kuehling wrote:
>> On 2020-01-31 10:37 p.m., Rajneesh Bhardwaj wrote:
>>> So far the kfd driver implemented same routines for runtime and system
>>> wide suspend and resume (s2idle or mem). During system wide suspend the
>>> kfd aquires an atomic lock that prevents any more user processes to
>>> create queues and interact with kfd driver and amd gpu. This mechanism
>>> created problem when amdgpu device is runtime suspended with BACO
>>> enabled. Any application that relies on kfd driver fails to load 
>>> because
>>> the driver reports a locked kfd device since gpu is runtime suspended.
>>>
>>> However, in an ideal case, when gpu is runtime  suspended the kfd 
>>> driver
>>> should be able to:
>>>
>>>   - auto resume amdgpu driver whenever a client requests compute 
>>> service
>>>   - prevent runtime suspend for amdgpu  while kfd is in use
>>>
>>> This change refactors the amdgpu and amdkfd drivers to support BACO and
>>> runtime power management.
>>>
>>> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>>
>> One small comment inline. Other than that patches 1-3 are
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>
>> Also, I believe patch 1 is unchanged from v1 and already got a 
>> Reviewed-by from Alex. Please remember to add that tag before you 
>> submit.
>
>
> You mean https://www.spinics.net/lists/amd-gfx/msg44689.html? I have 
> already added Alex's RB tag.

Right, I missed it. I'm not used to seeing a Reviewed-by tag before the 
Signed-off-by tag.

Thanks,
   Felix


>
>
>>
>>
>> The last patch that enabled runtime PM by default, I'd leave the 
>> decision to submit that up to Alex. There may be other considerations 
>> than just KFD.
>
>
> Sure, but its needed along with the series otherwise we can't test/use 
> it.
>
>
>>
>>
>> See inline ...
>>
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 +++---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  8 ++--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +-
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 29 +++++++++------
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  1 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 43 
>>> ++++++++++++++++++++--
>>>   6 files changed, 70 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> index 8609287620ea..314c4a2a0354 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>>> @@ -178,18 +178,18 @@ void amdgpu_amdkfd_interrupt(struct 
>>> amdgpu_device *adev,
>>>           kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
>>>   }
>>>   -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev)
>>> +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
>>>   {
>>>       if (adev->kfd.dev)
>>> -        kgd2kfd_suspend(adev->kfd.dev);
>>> +        kgd2kfd_suspend(adev->kfd.dev, run_pm);
>>>   }
>>>   -int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
>>> +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
>>>   {
>>>       int r = 0;
>>>         if (adev->kfd.dev)
>>> -        r = kgd2kfd_resume(adev->kfd.dev);
>>> +        r = kgd2kfd_resume(adev->kfd.dev, run_pm);
>>>         return r;
>>>   }
>>> @@ -713,11 +713,11 @@ void kgd2kfd_exit(void)
>>>   {
>>>   }
>>>   -void kgd2kfd_suspend(struct kfd_dev *kfd)
>>> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>>>   {
>>>   }
>>>   -int kgd2kfd_resume(struct kfd_dev *kfd)
>>> +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>>>   {
>>>       return 0;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> index 47b0f2957d1f..9e8db702d878 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> @@ -122,8 +122,8 @@ struct amdkfd_process_info {
>>>   int amdgpu_amdkfd_init(void);
>>>   void amdgpu_amdkfd_fini(void);
>>>   -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
>>> -int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
>>> +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
>>> +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
>>>   void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>>>               const void *ih_ring_entry);
>>>   void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
>>> @@ -249,8 +249,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>                struct drm_device *ddev,
>>>                const struct kgd2kfd_shared_resources *gpu_resources);
>>>   void kgd2kfd_device_exit(struct kfd_dev *kfd);
>>> -void kgd2kfd_suspend(struct kfd_dev *kfd);
>>> -int kgd2kfd_resume(struct kfd_dev *kfd);
>>> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>>> +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
>>>   int kgd2kfd_pre_reset(struct kfd_dev *kfd);
>>>   int kgd2kfd_post_reset(struct kfd_dev *kfd);
>>>   void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
>>> *ih_ring_entry);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 5030a09babb8..43843e6c4bcd 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3311,7 +3311,7 @@ int amdgpu_device_suspend(struct drm_device 
>>> *dev, bool fbcon)
>>>           }
>>>       }
>>>   -    amdgpu_amdkfd_suspend(adev);
>>> +    amdgpu_amdkfd_suspend(adev, !fbcon);
>>>         amdgpu_ras_suspend(adev);
>>>   @@ -3395,7 +3395,7 @@ int amdgpu_device_resume(struct drm_device 
>>> *dev, bool fbcon)
>>>               }
>>>           }
>>>       }
>>> -    r = amdgpu_amdkfd_resume(adev);
>>> +    r = amdgpu_amdkfd_resume(adev, !fbcon);
>>>       if (r)
>>>           return r;
>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 798ad1c8f799..42ee9ea5c45a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -732,7 +732,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>>   void kgd2kfd_device_exit(struct kfd_dev *kfd)
>>>   {
>>>       if (kfd->init_complete) {
>>> -        kgd2kfd_suspend(kfd);
>>> +        kgd2kfd_suspend(kfd, false);
>>>           device_queue_manager_uninit(kfd->dqm);
>>>           kfd_interrupt_exit(kfd);
>>>           kfd_topology_remove_device(kfd);
>>> @@ -753,7 +753,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>>         kfd->dqm->ops.pre_reset(kfd->dqm);
>>>   -    kgd2kfd_suspend(kfd);
>>> +    kgd2kfd_suspend(kfd, false);
>>>         kfd_signal_reset_event(kfd);
>>>       return 0;
>>> @@ -787,21 +787,23 @@ bool kfd_is_locked(void)
>>>       return  (atomic_read(&kfd_locked) > 0);
>>>   }
>>>   -void kgd2kfd_suspend(struct kfd_dev *kfd)
>>> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>>>   {
>>>       if (!kfd->init_complete)
>>>           return;
>>>   -    /* For first KFD device suspend all the KFD processes */
>>> -    if (atomic_inc_return(&kfd_locked) == 1)
>>> -        kfd_suspend_all_processes();
>>> +    /* for runtime suspend, skip locking kfd */
>>> +    if (!run_pm) {
>>> +        /* For first KFD device suspend all the KFD processes */
>>> +        if (atomic_inc_return(&kfd_locked) == 1)
>>> +            kfd_suspend_all_processes();
>>> +    }
>>>         kfd->dqm->ops.stop(kfd->dqm);
>>> -
>>>       kfd_iommu_suspend(kfd);
>>>   }
>>>   -int kgd2kfd_resume(struct kfd_dev *kfd)
>>> +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>>>   {
>>>       int ret, count;
>>>   @@ -812,10 +814,13 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
>>>       if (ret)
>>>           return ret;
>>>   -    count = atomic_dec_return(&kfd_locked);
>>> -    WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
>>> -    if (count == 0)
>>> -        ret = kfd_resume_all_processes();
>>> +    /* for runtime resume, skip unlocking kfd */
>>> +    if (!run_pm) {
>>> +        count = atomic_dec_return(&kfd_locked);
>>> +        WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
>>> +        if (count == 0)
>>> +            ret = kfd_resume_all_processes();
>>> +    }
>>>         return ret;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index c0b0defc8f7a..20dd4747250d 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -647,6 +647,7 @@ struct kfd_process_device {
>>>        * function.
>>>        */
>>>       bool already_dequeued;
>>> +    bool runtime_inuse;
>>>         /* Is this process/pasid bound to this device? 
>>> (amd_iommu_bind_pasid) */
>>>       enum kfd_pdd_bound bound;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 25b90f70aecd..6907a5a2cbc8 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -31,6 +31,7 @@
>>>   #include <linux/compat.h>
>>>   #include <linux/mman.h>
>>>   #include <linux/file.h>
>>> +#include <linux/pm_runtime.h>
>>>   #include "amdgpu_amdkfd.h"
>>>   #include "amdgpu.h"
>>>   @@ -440,6 +441,16 @@ static void kfd_process_destroy_pdds(struct 
>>> kfd_process *p)
>>>           kfree(pdd->qpd.doorbell_bitmap);
>>>           idr_destroy(&pdd->alloc_idr);
>>>   +        /*
>>> +         * before destroying pdd, make sure to report availability
>>> +         * for auto suspend
>>> +         */
>>> +        if (pdd->runtime_inuse) {
>>> + pm_runtime_mark_last_busy(pdd->dev->ddev->dev);
>>> + pm_runtime_put_autosuspend(pdd->dev->ddev->dev);
>>> +            pdd->runtime_inuse = false;
>>> +        }
>>> +
>>>           kfree(pdd);
>>>       }
>>>   }
>>> @@ -754,6 +765,7 @@ struct kfd_process_device 
>>> *kfd_create_process_device_data(struct kfd_dev *dev,
>>>       pdd->process = p;
>>>       pdd->bound = PDD_UNBOUND;
>>>       pdd->already_dequeued = false;
>>> +    pdd->runtime_inuse = false;
>>>       list_add(&pdd->per_device_list, &p->per_device_data);
>>>         /* Init idr used for memory handle translation */
>>> @@ -843,15 +855,40 @@ struct kfd_process_device 
>>> *kfd_bind_process_to_device(struct kfd_dev *dev,
>>>           return ERR_PTR(-ENOMEM);
>>>       }
>>>   +    /*
>>> +     * signal runtime-pm system to auto resume and prevent
>>> +     * further runtime suspend once device pdd is created until
>>> +     * pdd is destroyed.
>>> +     */
>>> +    if (!pdd->runtime_inuse) {
>>> +        err = pm_runtime_get_sync(dev->ddev->dev);
>>> +        if (err < 0)
>>> +            return ERR_PTR(err);
>>> +    }
>>> +
>>>       err = kfd_iommu_bind_process_to_device(pdd);
>>>       if (err)
>>> -        return ERR_PTR(err);
>>> +        goto out;
>>>         err = kfd_process_device_init_vm(pdd, NULL);
>>>       if (err)
>>> -        return ERR_PTR(err);
>>> +        goto out;
>>>   -    return pdd;
>>> +    if (!err) {
>>> +        /*
>>> +         * make sure that runtime_usage counter is incremented
>>> +         * just once per pdd
>>> +         */
>>> +        if (!pdd->runtime_inuse)
>>> +            pdd->runtime_inuse = true;
>>
>> The "if" is redundant here. You can just set pdd->runtime_inuse = 
>> true unconditionally.
>> Regards,
>>   Felix
>>
> Thanks
>
> Rajneesh
>
>>> +
>>> +        return pdd;
>>> +    }
>>> +out:
>>> +    /* balance runpm reference count and exit with error */
>>> +    pm_runtime_mark_last_busy(dev->ddev->dev);
>>> +    pm_runtime_put_autosuspend(dev->ddev->dev);
>>> +    return ERR_PTR(err);
>>>   }
>>>     struct kfd_process_device *kfd_get_first_process_device_data(


More information about the amd-gfx mailing list