[PATCH 02/16] drm/amdkfd: Don't dereference kfd_process.mm

Felix Kuehling felix.kuehling at amd.com
Fri Oct 27 19:09:52 UTC 2017


On 2017-10-27 03:41 AM, Christian König wrote:
> Am 27.10.2017 um 09:22 schrieb Christian König:
>> Am 26.10.2017 um 20:54 schrieb Felix Kuehling:
>>> On 2017-10-26 02:11 PM, Christian König wrote:
>>>> But now reading the patch there is something else which I stumbled
>>>> over:
>>>>> - WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
>>>>> +    /*
>>>>> +     * This cast should be safe here because we grabbed a
>>>>> +     * reference to the mm in kfd_process_notifier_release
>>>>> +     */
>>>>> +    WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count)
>>>>> <= 0);
>>>>>           mmdrop(p->mm);
>>>> Well that isn't good coding style. You shouldn't obfuscate what
>>>> pointer it is by changing it to "void*", but rather set it to NULL as
>>>> soon as you know that it is stale.
>>>>
>>>> Additional to that it is certainly not job of the driver to warn on a
>>>> run over mm_count.
>>> Yeah. We don't have this in our current staging branch. The whole
>>> process teardown has changed quite a bit. I just fixed this up to make
>>> it work with current upstream.
>>>
>>> If you prefer, I could just remove the WARN_ON.
>>
>> That sounds like a good idea to me, yes.
>
> Wait a second, now that I though once more about it what do you mean
> with this comment:
>> +    /*
>> +     * Opaque pointer to mm_struct. We don't hold a reference to
>> +     * it so it should never be dereferenced from here. This is
>> +     * only used for looking up processes by their mm.
>> +     */ 
> E.g. what means looking up the processes by their mm?
>
> Do you use a hashtable or something like this and then check if you
> got the correct mm structure by comparing the pointers?
>
> If that is the case then you should certainly set p->mm to NULL after
> mmdrop(p->mm).
>
> Otherwise it can happen that a new mm structure is allocated at the
> same location as the old one and you run into problems because the
> kernel driver accidentally uses the wrong kfd_process structure.

The processes are in an SRCU-protected hashtable that's read by
find_process_by_mm. Before this function (kfd_process_destroy_delayed)
runs, the process is already removed from the hash table and
synchronize_srcu is used to ensure there are no more readers active.
That means the process being destroyed can't be found any more, so there
is no need to reset p->mm to NULL at that point.

Regards,
  Felix

>
> Regards,
> Christian.
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>>    Felix
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Regards,
>>>>>     Felix
>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_events.c  | 19
>>>>>>> +++++++++++++++----
>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  7 ++++++-
>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_process.c |  6 +++++-
>>>>>>>     3 files changed, 26 insertions(+), 6 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>>>>>>> index 944abfa..61ce547 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>>>>>>> @@ -24,8 +24,8 @@
>>>>>>>     #include <linux/slab.h>
>>>>>>>     #include <linux/types.h>
>>>>>>>     #include <linux/sched/signal.h>
>>>>>>> +#include <linux/sched/mm.h>
>>>>>>>     #include <linux/uaccess.h>
>>>>>>> -#include <linux/mm.h>
>>>>>>>     #include <linux/mman.h>
>>>>>>>     #include <linux/memory.h>
>>>>>>>     #include "kfd_priv.h"
>>>>>>> @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev
>>>>>>> *dev, unsigned int pasid,
>>>>>>>          * running so the lookup function returns a locked process.
>>>>>>>          */
>>>>>>>         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>>>>>>> +    struct mm_struct *mm;
>>>>>>>           if (!p)
>>>>>>>             return; /* Presumably process exited. */
>>>>>>>     +    /* Take a safe reference to the mm_struct, which may
>>>>>>> otherwise
>>>>>>> +     * disappear even while the kfd_process is still referenced.
>>>>>>> +     */
>>>>>>> +    mm = get_task_mm(p->lead_thread);
>>>>>>> +    if (!mm) {
>>>>>>> +        mutex_unlock(&p->mutex);
>>>>>>> +        return; /* Process is exiting */
>>>>>>> +    }
>>>>>>> +
>>>>>>>         memset(&memory_exception_data, 0,
>>>>>>> sizeof(memory_exception_data));
>>>>>>>     -    down_read(&p->mm->mmap_sem);
>>>>>>> -    vma = find_vma(p->mm, address);
>>>>>>> +    down_read(&mm->mmap_sem);
>>>>>>> +    vma = find_vma(mm, address);
>>>>>>>           memory_exception_data.gpu_id = dev->id;
>>>>>>>         memory_exception_data.va = address;
>>>>>>> @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev
>>>>>>> *dev,
>>>>>>> unsigned int pasid,
>>>>>>>             }
>>>>>>>         }
>>>>>>>     -    up_read(&p->mm->mmap_sem);
>>>>>>> +    up_read(&mm->mmap_sem);
>>>>>>> +    mmput(mm);
>>>>>>>           mutex_lock(&p->event_mutex);
>>>>>>>     diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> index 7d86ec9..1a483a7 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>>> @@ -494,7 +494,12 @@ struct kfd_process {
>>>>>>>          */
>>>>>>>         struct hlist_node kfd_processes;
>>>>>>>     -    struct mm_struct *mm;
>>>>>>> +    /*
>>>>>>> +     * Opaque pointer to mm_struct. We don't hold a reference to
>>>>>>> +     * it so it should never be dereferenced from here. This is
>>>>>>> +     * only used for looking up processes by their mm.
>>>>>>> +     */
>>>>>>> +    void *mm;
>>>>>>>           struct mutex mutex;
>>>>>>>     diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> index 3ccb3b5..21d27e5 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>>> @@ -200,7 +200,11 @@ static void kfd_process_destroy_delayed(struct
>>>>>>> rcu_head *rcu)
>>>>>>>         struct kfd_process *p;
>>>>>>>           p = container_of(rcu, struct kfd_process, rcu);
>>>>>>> -    WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
>>>>>>> +    /*
>>>>>>> +     * This cast should be safe here because we grabbed a
>>>>>>> +     * reference to the mm in kfd_process_notifier_release
>>>>>>> +     */
>>>>>>> +    WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <=
>>>>>>> 0);
>>>>>>>           mmdrop(p->mm);
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>>
>



More information about the amd-gfx mailing list