[PATCH 2/3] drm/amdkfd: Update queue unmap after VM fault with MES

Wed Aug 14 15:35:40 UTC 2024

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>
> Sent: Wednesday, August 14, 2024 11:17 AM
> To: Joshi, Mukul <Mukul.Joshi at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Joshi, Mukul <Mukul.Joshi at amd.com>
> Subject: RE: [PATCH 2/3] drm/amdkfd: Update queue unmap after VM fault
> with MES
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Mukul
> Joshi
> Sent: Tuesday, August 13, 2024 2:57 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Joshi, Mukul <Mukul.Joshi at amd.com>
> Subject: [PATCH 2/3] drm/amdkfd: Update queue unmap after VM fault with
> MES
>
> MEC FW expects MES to unmap all queues when a VM fault is observed on a
> queue and then resumed once the affected process is terminated.
> Use the MES Suspend and Resume APIs to achieve this.
>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 75
> ++++++++++++++++++-
>  1 file changed, 74 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f6e211070299..e16b17e301d9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -319,6 +319,56 @@ static int remove_all_queues_mes(struct
> device_queue_manager *dqm)
>         return retval;
>  }
>
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm) {
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev-
> >adev;
> +       int r;
> +       struct mes_suspend_gang_input queue_input;
> +
> +       if (dqm->is_hws_hang)
> +               return -EIO;
> +
> +       memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input));
> +       queue_input.suspend_all_gangs = 1;
> +
> +       amdgpu_mes_lock(&adev->mes);
> +       r = adev->mes.funcs->suspend_gang(&adev->mes, &queue_input);
> +       amdgpu_mes_unlock(&adev->mes);
> +
> +       if (r) {
> +               dev_err(adev->dev, "failed to suspend gangs from MES\n");
> +               dev_err(adev->dev, "MES might be in unrecoverable state, issue a
> GPU reset\n");
> +               kfd_hws_hang(dqm);
> +       }
> +
> +       return r;
> +}
> +
> +static int resume_all_queues_mes(struct device_queue_manager *dqm) {
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev-
> >adev;
> +       int r;
> +       struct mes_resume_gang_input queue_input;
> +
> +       if (dqm->is_hws_hang)
> +               return -EIO;
> +
> +       memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input));
> +       queue_input.resume_all_gangs = 1;
> +
> +       amdgpu_mes_lock(&adev->mes);
> +       r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input);
> +       amdgpu_mes_unlock(&adev->mes);
> +
> +       if (r) {
> +               dev_err(adev->dev, "failed to resume gangs from MES\n");
> +               dev_err(adev->dev, "MES might be in unrecoverable state, issue a
> GPU reset\n");
> +               kfd_hws_hang(dqm);
> +       }
> +
> +       return r;
> +}
> +
>  static void increment_queue_count(struct device_queue_manager *dqm,
>                                   struct qcm_process_device *qpd,
>                                   struct queue *q) @@ -2839,14 +2889,37 @@ int
> kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)  {
>         struct kfd_process_device *pdd;
>         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +       struct device *dev = dqm->dev->adev->dev;
>         int ret = 0;
>
>         if (!p)
>                 return -EINVAL;
>         WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>         pdd = kfd_get_process_device_data(dqm->dev, p);
> -       if (pdd)
> +       if (pdd) {
> +               if (dqm->dev->kfd->shared_resources.enable_mes) {
> +                       ret = suspend_all_queues_mes(dqm);
> +                       if (ret) {
> +                               dev_err(dev, "Suspending all queues failed");
> +                               goto out;
> +                       }
> +               }
>                 ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> +               if (ret) {
> +                       dev_err(dev, "Evicting process queues failed");
> +                       goto out;
> +               }
> +
> evict_process_queues() function also has if(enable_mes) branching. Would it
> make sense to just call two different functions here one for MES and one for
> HWS?
>
I think it should be fine to keep it like the way it is. Keeping 2 separate functions one
for MES and other for HWS will probably lead to duplication of the code. We have the
if (enable_mes) branching at a lot of places in the code and it was done just to avoid
duplication.

> Also, if the process is already evicted then there is no need to do suspend_all
> and resume_all.
Yes that's a good point. I can put a check to check if the process is already evicted. If it is, then we
avoid the suspend and resume.

Thanks,
Mukul

>
>
> +               if (dqm->dev->kfd->shared_resources.enable_mes) {
> +                       ret = resume_all_queues_mes(dqm);
> +                       if (ret) {
> +                               dev_err(dev, "Resuming all queues failed");
> +                               goto out;
> +                       }
> +               }
> +       }
> +
> +out:
>         kfd_unref_process(p);
>
>         return ret;
> --
> 2.35.1
>