[PATCH 02/16] drm/amdkfd: Don't dereference kfd_process.mm

Oded Gabbay oded.gabbay at gmail.com
Wed Oct 25 06:45:31 UTC 2017


On Sat, Oct 21, 2017 at 3:23 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> The kfd_process doesn't own a reference to the mm_struct, so it can
> disappear without warning even while the kfd_process still exists.
> In fact, the delayed kfd_process teardown is triggered by an MMU
> notifier when the mm_struct is destroyed. Permanently holding a
> reference to the mm_struct would prevent this from happening.
>
> Therefore, avoid dereferencing the kfd_process.mm pointer and make
> it opaque. Use get_task_mm to get a temporary reference to the mm
> when it's needed.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c  | 19 +++++++++++++++----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  7 ++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c |  6 +++++-
>  3 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 944abfa..61ce547 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -24,8 +24,8 @@
>  #include <linux/slab.h>
>  #include <linux/types.h>
>  #include <linux/sched/signal.h>
> +#include <linux/sched/mm.h>
>  #include <linux/uaccess.h>
> -#include <linux/mm.h>
>  #include <linux/mman.h>
>  #include <linux/memory.h>
>  #include "kfd_priv.h"
> @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
>          * running so the lookup function returns a locked process.
>          */
>         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +       struct mm_struct *mm;
>
>         if (!p)
>                 return; /* Presumably process exited. */
>
> +       /* Take a safe reference to the mm_struct, which may otherwise
> +        * disappear even while the kfd_process is still referenced.
> +        */
> +       mm = get_task_mm(p->lead_thread);
> +       if (!mm) {
> +               mutex_unlock(&p->mutex);
> +               return; /* Process is exiting */
> +       }
> +
>         memset(&memory_exception_data, 0, sizeof(memory_exception_data));
>
> -       down_read(&p->mm->mmap_sem);
> -       vma = find_vma(p->mm, address);
> +       down_read(&mm->mmap_sem);
> +       vma = find_vma(mm, address);
>
>         memory_exception_data.gpu_id = dev->id;
>         memory_exception_data.va = address;
> @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
>                 }
>         }
>
> -       up_read(&p->mm->mmap_sem);
> +       up_read(&mm->mmap_sem);
> +       mmput(mm);
>
>         mutex_lock(&p->event_mutex);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 7d86ec9..1a483a7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -494,7 +494,12 @@ struct kfd_process {
>          */
>         struct hlist_node kfd_processes;
>
> -       struct mm_struct *mm;
> +       /*
> +        * Opaque pointer to mm_struct. We don't hold a reference to
> +        * it so it should never be dereferenced from here. This is
> +        * only used for looking up processes by their mm.
> +        */
> +       void *mm;
>
>         struct mutex mutex;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 3ccb3b5..21d27e5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -200,7 +200,11 @@ static void kfd_process_destroy_delayed(struct rcu_head *rcu)
>         struct kfd_process *p;
>
>         p = container_of(rcu, struct kfd_process, rcu);
> -       WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
> +       /*
> +        * This cast should be safe here because we grabbed a
> +        * reference to the mm in kfd_process_notifier_release
> +        */
> +       WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <= 0);
>
>         mmdrop(p->mm);
>
> --
> 2.7.4
>
This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay at gmail.com>


More information about the amd-gfx mailing list