[PATCH 02/16] drm/amdkfd: Don't dereference kfd_process.mm
Felix Kuehling
felix.kuehling at amd.com
Thu Oct 26 16:47:19 UTC 2017
On 2017-10-26 03:33 AM, Christian König wrote:
> Am 21.10.2017 um 02:23 schrieb Felix Kuehling:
>> The kfd_process doesn't own a reference to the mm_struct, so it can
>> disappear without warning even while the kfd_process still exists.
>> In fact, the delayed kfd_process teardown is triggered by an MMU
>> notifier when the mm_struct is destroyed. Permanently holding a
>> reference to the mm_struct would prevent this from happening.
>>
>> Therefore, avoid dereferencing the kfd_process.mm pointer and make
>> it opaque. Use get_task_mm to get a temporary reference to the mm
>> when it's needed.
>
> Actually that patch is unnecessary.
>
> Process tear down (and calling the MMU release callback) is triggered
> when mm_struct->mm_users reaches zero.
>
> The mm_struct is freed up when mm_struct->mm_count becomes zero.
>
> So what you can do is grab a reference to the mm_struct with mmgrab()
> and still be notified by process tear down.
Hmm, the mm_struct has two different reference counters. So if I grab
the right type of reference it would work. Either way, one of two
changes is needed:
* Take a permanent reference to the mm-struct, or
* Don't dereference the mm pointer because I'm not holding a reference
IMO, KFD doesn't need to hold a reference to the mm_struct permanently.
So I believe my change still makes sense. I should probably just remove
the comment "Permanently holding a reference to the mm_struct would
prevent ...". Like you say, that's not accurate.
Regards,
Felix
>
> Regards,
> Christian.
>
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 +++++++++++++++----
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 ++++++-
>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++++-
>> 3 files changed, 26 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> index 944abfa..61ce547 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> @@ -24,8 +24,8 @@
>> #include <linux/slab.h>
>> #include <linux/types.h>
>> #include <linux/sched/signal.h>
>> +#include <linux/sched/mm.h>
>> #include <linux/uaccess.h>
>> -#include <linux/mm.h>
>> #include <linux/mman.h>
>> #include <linux/memory.h>
>> #include "kfd_priv.h"
>> @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev
>> *dev, unsigned int pasid,
>> * running so the lookup function returns a locked process.
>> */
>> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> + struct mm_struct *mm;
>> if (!p)
>> return; /* Presumably process exited. */
>> + /* Take a safe reference to the mm_struct, which may otherwise
>> + * disappear even while the kfd_process is still referenced.
>> + */
>> + mm = get_task_mm(p->lead_thread);
>> + if (!mm) {
>> + mutex_unlock(&p->mutex);
>> + return; /* Process is exiting */
>> + }
>> +
>> memset(&memory_exception_data, 0, sizeof(memory_exception_data));
>> - down_read(&p->mm->mmap_sem);
>> - vma = find_vma(p->mm, address);
>> + down_read(&mm->mmap_sem);
>> + vma = find_vma(mm, address);
>> memory_exception_data.gpu_id = dev->id;
>> memory_exception_data.va = address;
>> @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev,
>> unsigned int pasid,
>> }
>> }
>> - up_read(&p->mm->mmap_sem);
>> + up_read(&mm->mmap_sem);
>> + mmput(mm);
>> mutex_lock(&p->event_mutex);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 7d86ec9..1a483a7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -494,7 +494,12 @@ struct kfd_process {
>> */
>> struct hlist_node kfd_processes;
>> - struct mm_struct *mm;
>> + /*
>> + * Opaque pointer to mm_struct. We don't hold a reference to
>> + * it so it should never be dereferenced from here. This is
>> + * only used for looking up processes by their mm.
>> + */
>> + void *mm;
>> struct mutex mutex;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 3ccb3b5..21d27e5 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -200,7 +200,11 @@ static void kfd_process_destroy_delayed(struct
>> rcu_head *rcu)
>> struct kfd_process *p;
>> p = container_of(rcu, struct kfd_process, rcu);
>> - WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
>> + /*
>> + * This cast should be safe here because we grabbed a
>> + * reference to the mm in kfd_process_notifier_release
>> + */
>> + WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <= 0);
>> mmdrop(p->mm);
>>
>
>
More information about the amd-gfx
mailing list