[PATCH 1/7] mm/memory.c: Fix race when faulting a device private page

Alistair Popple apopple at nvidia.com
Thu Sep 29 01:40:32 UTC 2022


Michael Ellerman <mpe at ellerman.id.au> writes:

> Alistair Popple <apopple at nvidia.com> writes:
>> When the CPU tries to access a device private page the migrate_to_ram()
>> callback associated with the pgmap for the page is called. However no
>> reference is taken on the faulting page. Therefore a concurrent
>> migration of the device private page can free the page and possibly the
>> underlying pgmap. This results in a race which can crash the kernel due
>> to the migrate_to_ram() function pointer becoming invalid. It also means
>> drivers can't reliably read the zone_device_data field because the page
>> may have been freed with memunmap_pages().
>>
>> Close the race by getting a reference on the page while holding the ptl
>> to ensure it has not been freed. Unfortunately the elevated reference
>> count will cause the migration required to handle the fault to fail. To
>> avoid this failure pass the faulting page into the migrate_vma functions
>> so that if an elevated reference count is found it can be checked to see
>> if it's expected or not.
>>
>> Signed-off-by: Alistair Popple <apopple at nvidia.com>
>> ---
>>  arch/powerpc/kvm/book3s_hv_uvmem.c       | 15 ++++++-----
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++------
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 11 +++++---
>>  include/linux/migrate.h                  |  8 ++++++-
>>  lib/test_hmm.c                           |  7 ++---
>>  mm/memory.c                              | 16 +++++++++++-
>>  mm/migrate.c                             | 34 ++++++++++++++-----------
>>  mm/migrate_device.c                      | 18 +++++++++----
>>  9 files changed, 87 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> index 5980063..d4eacf4 100644
>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>>  static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>>  		unsigned long start,
>>  		unsigned long end, unsigned long page_shift,
>> -		struct kvm *kvm, unsigned long gpa)
>> +		struct kvm *kvm, unsigned long gpa, struct page *fault_page)
>>  {
>>  	unsigned long src_pfn, dst_pfn = 0;
>> -	struct migrate_vma mig;
>> +	struct migrate_vma mig = { 0 };
>>  	struct page *dpage, *spage;
>>  	struct kvmppc_uvmem_page_pvt *pvt;
>>  	unsigned long pfn;
>> @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>>  	mig.dst = &dst_pfn;
>>  	mig.pgmap_owner = &kvmppc_uvmem_pgmap;
>>  	mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
>> +	mig.fault_page = fault_page;
>>
>>  	/* The requested page is already paged-out, nothing to do */
>>  	if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
>> @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>>  static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
>>  				      unsigned long start, unsigned long end,
>>  				      unsigned long page_shift,
>> -				      struct kvm *kvm, unsigned long gpa)
>> +				      struct kvm *kvm, unsigned long gpa,
>> +				      struct page *fault_page)
>>  {
>>  	int ret;
>>
>>  	mutex_lock(&kvm->arch.uvmem_lock);
>> -	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
>> +	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
>> +				fault_page);
>>  	mutex_unlock(&kvm->arch.uvmem_lock);
>>
>>  	return ret;
>> @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>>  		bool pagein)
>>  {
>>  	unsigned long src_pfn, dst_pfn = 0;
>> -	struct migrate_vma mig;
>> +	struct migrate_vma mig = { 0 };
>>  	struct page *spage;
>>  	unsigned long pfn;
>>  	struct page *dpage;
>> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
>>
>>  	if (kvmppc_svm_page_out(vmf->vma, vmf->address,
>>  				vmf->address + PAGE_SIZE, PAGE_SHIFT,
>> -				pvt->kvm, pvt->gpa))
>> +				pvt->kvm, pvt->gpa, vmf->page))
>>  		return VM_FAULT_SIGBUS;
>>  	else
>>  		return 0;
>
> I don't have a UV test system, but as-is it doesn't even compile :)

Ugh, thanks. I did get as far as installing a PPC cross-compiler and
building a kernel. Apparently I did not get as far as enabling
CONFIG_PPC_UV :)

> kvmppc_svm_page_out() is called via some paths other than the
> migrate_to_ram callback.
>
> I think it's correct to just pass fault_page = NULL when it's not called
> from the migrate_to_ram callback?
>
> Incremental diff below.
>
> cheers
>
>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index d4eacf410956..965c9e9e500b 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -637,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
>  			pvt->remove_gfn = true;
>
>  			if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
> -						  PAGE_SHIFT, kvm, pvt->gpa))
> +						  PAGE_SHIFT, kvm, pvt->gpa, NULL))
>  				pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
>  				       pvt->gpa, addr);
>  		} else {
> @@ -1068,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
>  	if (!vma || vma->vm_start > start || vma->vm_end < end)
>  		goto out;
>
> -	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
> +	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
>  		ret = H_SUCCESS;
>  out:
>  	mmap_read_unlock(kvm->mm);


More information about the dri-devel mailing list