[PATCH hmm] drm/amdkfd: fix a use after free race with mmu_notififer unregister
Kuehling, Felix
Felix.Kuehling at amd.com
Sun Aug 4 21:28:36 UTC 2019
On 2019-08-02 16:07, Jason Gunthorpe wrote:
> When using mmu_notififer_unregister_no_release() the caller must ensure
> there is a SRCU synchronize before the mn memory is freed, otherwise use
> after free races are possible, for instance:
>
> CPU0 CPU1
> invalidate_range_start
> hlist_for_each_entry_rcu(..)
> mmu_notifier_unregister_no_release(&p->mn)
> kfree(mn)
> if (mn->ops->invalidate_range_end)
>
> The error unwind in amdkfd misses the SRCU synchronization.
>
> amdkfd keeps the kfd_process around until the mm is released, so split the
> flow to fully initialize the kfd_process and register it for find_process,
> and with the notifier. Past this point the kfd_process does not need to be
> cleaned up as it is fully ready.
>
> The final failable step does a vm_mmap() and does not seem to impact the
> kfd_process global state. Since it also cannot be undone (and already has
> problems with undo if it internally fails), it has to be last.
>
> This way we don't have to try to unwind the mmu_notifier_register() and
> avoid the problem with the SRCU.
>
> Along the way this also fixes various other error unwind bugs in the flow.
>
> Fixes: 45102048f77e ("amdkfd: Add process queue manager module")
> Signed-off-by: Jason Gunthorpe <jgg at mellanox.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 74 +++++++++++-------------
> 1 file changed, 35 insertions(+), 39 deletions(-)
>
> amdkfd folks, this little bug is blocking some rework I have for the
> mmu notifiers (ie mm/mmu_notifiers: remove unregister_no_release)
>
> Can I get your help to review and if needed polish this change? I'd
> like to send this patch through the hmm tree along with the rework,
> thanks
Thanks. That's a nice cleanup of the error handling during KFD process
creation. One nit-pick inline, otherwise this looks good to me.
>
> You can see the larger series here:
>
> https://github.com/jgunthorpe/linux/commits/mmu_notifier
>
> Jason
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8f1076c0c88a25..81e3ee3f1813bf 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -62,8 +62,8 @@ static struct workqueue_struct *kfd_restore_wq;
>
> static struct kfd_process *find_process(const struct task_struct *thread);
> static void kfd_process_ref_release(struct kref *ref);
> -static struct kfd_process *create_process(const struct task_struct *thread,
> - struct file *filep);
> +static struct kfd_process *create_process(const struct task_struct *thread);
> +static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
>
> static void evict_process_worker(struct work_struct *work);
> static void restore_process_worker(struct work_struct *work);
> @@ -289,7 +289,15 @@ struct kfd_process *kfd_create_process(struct file *filep)
> if (process) {
> pr_debug("Process already found\n");
> } else {
> - process = create_process(thread, filep);
> + process = create_process(thread);
> + if (IS_ERR(process))
> + goto out;
> +
> + ret = kfd_process_init_cwsr_apu(process, filep);
> + if (ret) {
> + process = ERR_PTR(ret);
> + goto out;
> + }
>
> if (!procfs.kobj)
> goto out;
> @@ -609,64 +617,56 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> return 0;
> }
>
> -static struct kfd_process *create_process(const struct task_struct *thread,
> - struct file *filep)
> +/*
> + * On return the kfd_process is fully operational and will be freed when the
> + * mm is released
> + */
> +static struct kfd_process *create_process(const struct task_struct *thread)
> {
> struct kfd_process *process;
> int err = -ENOMEM;
>
> process = kzalloc(sizeof(*process), GFP_KERNEL);
> -
> if (!process)
> goto err_alloc_process;
>
> - process->pasid = kfd_pasid_alloc();
> - if (process->pasid == 0)
> - goto err_alloc_pasid;
> -
> - if (kfd_alloc_process_doorbells(process) < 0)
> - goto err_alloc_doorbells;
> -
> kref_init(&process->ref);
> -
> mutex_init(&process->mutex);
> -
> process->mm = thread->mm;
> -
> - /* register notifier */
> - process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
> - err = mmu_notifier_register(&process->mmu_notifier, process->mm);
> - if (err)
> - goto err_mmu_notifier;
> -
> - hash_add_rcu(kfd_processes_table, &process->kfd_processes,
> - (uintptr_t)process->mm);
> -
> process->lead_thread = thread->group_leader;
> - get_task_struct(process->lead_thread);
> -
> INIT_LIST_HEAD(&process->per_device_data);
> -
> + INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
> + INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
> + process->last_restore_timestamp = get_jiffies_64();
> kfd_event_init_process(process);
> + process->is_32bit_user_mode = in_compat_syscall();
> +
> + process->pasid = kfd_pasid_alloc();
> + if (process->pasid == 0)
> + goto err_alloc_pasid;
> +
> + if (kfd_alloc_process_doorbells(process) < 0)
> + goto err_alloc_doorbells;
>
> err = pqm_init(&process->pqm, process);
> if (err != 0)
> goto err_process_pqm_init;
>
> /* init process apertures*/
> - process->is_32bit_user_mode = in_compat_syscall();
> err = kfd_init_apertures(process);
> if (err != 0)
> goto err_init_apertures;
>
> - INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
> - INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
> - process->last_restore_timestamp = get_jiffies_64();
> -
> - err = kfd_process_init_cwsr_apu(process, filep);
> + /* Must be last, have to use release destruction after this */
> + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
> + err = mmu_notifier_register(&process->mmu_notifier, process->mm);
> if (err)
> goto err_init_cwsr;
This label should be renamed to something like err_mmu_notifier. With
that fixed this patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
> + get_task_struct(process->lead_thread);
> + hash_add_rcu(kfd_processes_table, &process->kfd_processes,
> + (uintptr_t)process->mm);
> +
> return process;
>
> err_init_cwsr:
> @@ -675,15 +675,11 @@ static struct kfd_process *create_process(const struct task_struct *thread,
> err_init_apertures:
> pqm_uninit(&process->pqm);
> err_process_pqm_init:
> - hash_del_rcu(&process->kfd_processes);
> - synchronize_rcu();
> - mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
> -err_mmu_notifier:
> - mutex_destroy(&process->mutex);
> kfd_free_process_doorbells(process);
> err_alloc_doorbells:
> kfd_pasid_free(process->pasid);
> err_alloc_pasid:
> + mutex_destroy(&process->mutex);
> kfree(process);
> err_alloc_process:
> return ERR_PTR(err);
More information about the amd-gfx
mailing list