[Patch v2 3/4] drm/amdkfd: refactor runtime pm for baco

Alex Deucher alexdeucher at gmail.com
Thu Feb 6 14:27:11 UTC 2020


On Tue, Feb 4, 2020 at 11:46 PM Alex Deucher <alexdeucher at gmail.com> wrote:
>
> On Tue, Feb 4, 2020 at 4:28 PM Felix Kuehling <felix.kuehling at amd.com> wrote:
> >
> > On 2020-01-31 10:37 p.m., Rajneesh Bhardwaj wrote:
> > > So far the kfd driver implemented same routines for runtime and system
> > > wide suspend and resume (s2idle or mem). During system wide suspend the
> > > kfd aquires an atomic lock that prevents any more user processes to
> > > create queues and interact with kfd driver and amd gpu. This mechanism
> > > created problem when amdgpu device is runtime suspended with BACO
> > > enabled. Any application that relies on kfd driver fails to load because
> > > the driver reports a locked kfd device since gpu is runtime suspended.
> > >
> > > However, in an ideal case, when gpu is runtime  suspended the kfd driver
> > > should be able to:
> > >
> > >   - auto resume amdgpu driver whenever a client requests compute service
> > >   - prevent runtime suspend for amdgpu  while kfd is in use
> > >
> > > This change refactors the amdgpu and amdkfd drivers to support BACO and
> > > runtime power management.
> > >
> > > Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> >
> > One small comment inline. Other than that patches 1-3 are
> >
> > Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> >
> > Also, I believe patch 1 is unchanged from v1 and already got a
> > Reviewed-by from Alex. Please remember to add that tag before you submit.
> >
> > The last patch that enabled runtime PM by default, I'd leave the
> > decision to submit that up to Alex. There may be other considerations
> > than just KFD.
>
> KFD was the only thing left.  I've been running with runpm forced on
> for a while now with no problems across a wide variety of hardware.

Actually, we need to prevent runtime pm if xgmi is active.  One more
thing to check.

Alex


>
> Alex
>
> >
> > See inline ...
> >
> >
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 12 +++---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  8 ++--
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +-
> > >   drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 29 +++++++++------
> > >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  1 +
> > >   drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 43 ++++++++++++++++++++--
> > >   6 files changed, 70 insertions(+), 27 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > index 8609287620ea..314c4a2a0354 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > @@ -178,18 +178,18 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
> > >               kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
> > >   }
> > >
> > > -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev)
> > > +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
> > >   {
> > >       if (adev->kfd.dev)
> > > -             kgd2kfd_suspend(adev->kfd.dev);
> > > +             kgd2kfd_suspend(adev->kfd.dev, run_pm);
> > >   }
> > >
> > > -int amdgpu_amdkfd_resume(struct amdgpu_device *adev)
> > > +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
> > >   {
> > >       int r = 0;
> > >
> > >       if (adev->kfd.dev)
> > > -             r = kgd2kfd_resume(adev->kfd.dev);
> > > +             r = kgd2kfd_resume(adev->kfd.dev, run_pm);
> > >
> > >       return r;
> > >   }
> > > @@ -713,11 +713,11 @@ void kgd2kfd_exit(void)
> > >   {
> > >   }
> > >
> > > -void kgd2kfd_suspend(struct kfd_dev *kfd)
> > > +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> > >   {
> > >   }
> > >
> > > -int kgd2kfd_resume(struct kfd_dev *kfd)
> > > +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> > >   {
> > >       return 0;
> > >   }
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > > index 47b0f2957d1f..9e8db702d878 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> > > @@ -122,8 +122,8 @@ struct amdkfd_process_info {
> > >   int amdgpu_amdkfd_init(void);
> > >   void amdgpu_amdkfd_fini(void);
> > >
> > > -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev);
> > > -int amdgpu_amdkfd_resume(struct amdgpu_device *adev);
> > > +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
> > > +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
> > >   void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
> > >                       const void *ih_ring_entry);
> > >   void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
> > > @@ -249,8 +249,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> > >                        struct drm_device *ddev,
> > >                        const struct kgd2kfd_shared_resources *gpu_resources);
> > >   void kgd2kfd_device_exit(struct kfd_dev *kfd);
> > > -void kgd2kfd_suspend(struct kfd_dev *kfd);
> > > -int kgd2kfd_resume(struct kfd_dev *kfd);
> > > +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
> > > +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> > >   int kgd2kfd_pre_reset(struct kfd_dev *kfd);
> > >   int kgd2kfd_post_reset(struct kfd_dev *kfd);
> > >   void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 5030a09babb8..43843e6c4bcd 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -3311,7 +3311,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
> > >               }
> > >       }
> > >
> > > -     amdgpu_amdkfd_suspend(adev);
> > > +     amdgpu_amdkfd_suspend(adev, !fbcon);
> > >
> > >       amdgpu_ras_suspend(adev);
> > >
> > > @@ -3395,7 +3395,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
> > >                       }
> > >               }
> > >       }
> > > -     r = amdgpu_amdkfd_resume(adev);
> > > +     r = amdgpu_amdkfd_resume(adev, !fbcon);
> > >       if (r)
> > >               return r;
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > > index 798ad1c8f799..42ee9ea5c45a 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > > @@ -732,7 +732,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> > >   void kgd2kfd_device_exit(struct kfd_dev *kfd)
> > >   {
> > >       if (kfd->init_complete) {
> > > -             kgd2kfd_suspend(kfd);
> > > +             kgd2kfd_suspend(kfd, false);
> > >               device_queue_manager_uninit(kfd->dqm);
> > >               kfd_interrupt_exit(kfd);
> > >               kfd_topology_remove_device(kfd);
> > > @@ -753,7 +753,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> > >
> > >       kfd->dqm->ops.pre_reset(kfd->dqm);
> > >
> > > -     kgd2kfd_suspend(kfd);
> > > +     kgd2kfd_suspend(kfd, false);
> > >
> > >       kfd_signal_reset_event(kfd);
> > >       return 0;
> > > @@ -787,21 +787,23 @@ bool kfd_is_locked(void)
> > >       return  (atomic_read(&kfd_locked) > 0);
> > >   }
> > >
> > > -void kgd2kfd_suspend(struct kfd_dev *kfd)
> > > +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> > >   {
> > >       if (!kfd->init_complete)
> > >               return;
> > >
> > > -     /* For first KFD device suspend all the KFD processes */
> > > -     if (atomic_inc_return(&kfd_locked) == 1)
> > > -             kfd_suspend_all_processes();
> > > +     /* for runtime suspend, skip locking kfd */
> > > +     if (!run_pm) {
> > > +             /* For first KFD device suspend all the KFD processes */
> > > +             if (atomic_inc_return(&kfd_locked) == 1)
> > > +                     kfd_suspend_all_processes();
> > > +     }
> > >
> > >       kfd->dqm->ops.stop(kfd->dqm);
> > > -
> > >       kfd_iommu_suspend(kfd);
> > >   }
> > >
> > > -int kgd2kfd_resume(struct kfd_dev *kfd)
> > > +int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> > >   {
> > >       int ret, count;
> > >
> > > @@ -812,10 +814,13 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
> > >       if (ret)
> > >               return ret;
> > >
> > > -     count = atomic_dec_return(&kfd_locked);
> > > -     WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
> > > -     if (count == 0)
> > > -             ret = kfd_resume_all_processes();
> > > +     /* for runtime resume, skip unlocking kfd */
> > > +     if (!run_pm) {
> > > +             count = atomic_dec_return(&kfd_locked);
> > > +             WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
> > > +             if (count == 0)
> > > +                     ret = kfd_resume_all_processes();
> > > +     }
> > >
> > >       return ret;
> > >   }
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > > index c0b0defc8f7a..20dd4747250d 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > > @@ -647,6 +647,7 @@ struct kfd_process_device {
> > >        * function.
> > >        */
> > >       bool already_dequeued;
> > > +     bool runtime_inuse;
> > >
> > >       /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
> > >       enum kfd_pdd_bound bound;
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > > index 25b90f70aecd..6907a5a2cbc8 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > > @@ -31,6 +31,7 @@
> > >   #include <linux/compat.h>
> > >   #include <linux/mman.h>
> > >   #include <linux/file.h>
> > > +#include <linux/pm_runtime.h>
> > >   #include "amdgpu_amdkfd.h"
> > >   #include "amdgpu.h"
> > >
> > > @@ -440,6 +441,16 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
> > >               kfree(pdd->qpd.doorbell_bitmap);
> > >               idr_destroy(&pdd->alloc_idr);
> > >
> > > +             /*
> > > +              * before destroying pdd, make sure to report availability
> > > +              * for auto suspend
> > > +              */
> > > +             if (pdd->runtime_inuse) {
> > > +                     pm_runtime_mark_last_busy(pdd->dev->ddev->dev);
> > > +                     pm_runtime_put_autosuspend(pdd->dev->ddev->dev);
> > > +                     pdd->runtime_inuse = false;
> > > +             }
> > > +
> > >               kfree(pdd);
> > >       }
> > >   }
> > > @@ -754,6 +765,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
> > >       pdd->process = p;
> > >       pdd->bound = PDD_UNBOUND;
> > >       pdd->already_dequeued = false;
> > > +     pdd->runtime_inuse = false;
> > >       list_add(&pdd->per_device_list, &p->per_device_data);
> > >
> > >       /* Init idr used for memory handle translation */
> > > @@ -843,15 +855,40 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
> > >               return ERR_PTR(-ENOMEM);
> > >       }
> > >
> > > +     /*
> > > +      * signal runtime-pm system to auto resume and prevent
> > > +      * further runtime suspend once device pdd is created until
> > > +      * pdd is destroyed.
> > > +      */
> > > +     if (!pdd->runtime_inuse) {
> > > +             err = pm_runtime_get_sync(dev->ddev->dev);
> > > +             if (err < 0)
> > > +                     return ERR_PTR(err);
> > > +     }
> > > +
> > >       err = kfd_iommu_bind_process_to_device(pdd);
> > >       if (err)
> > > -             return ERR_PTR(err);
> > > +             goto out;
> > >
> > >       err = kfd_process_device_init_vm(pdd, NULL);
> > >       if (err)
> > > -             return ERR_PTR(err);
> > > +             goto out;
> > >
> > > -     return pdd;
> > > +     if (!err) {
> > > +             /*
> > > +              * make sure that runtime_usage counter is incremented
> > > +              * just once per pdd
> > > +              */
> > > +             if (!pdd->runtime_inuse)
> > > +                     pdd->runtime_inuse = true;
> >
> > The "if" is redundant here. You can just set pdd->runtime_inuse = true
> > unconditionally.
> >
> > Regards,
> >    Felix
> >
> > > +
> > > +             return pdd;
> > > +     }
> > > +out:
> > > +     /* balance runpm reference count and exit with error */
> > > +     pm_runtime_mark_last_busy(dev->ddev->dev);
> > > +     pm_runtime_put_autosuspend(dev->ddev->dev);
> > > +     return ERR_PTR(err);
> > >   }
> > >
> > >   struct kfd_process_device *kfd_get_first_process_device_data(
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list