[PATCH v2 1/8] mm/memory.c: Fix race when faulting a device private page

Felix Kuehling felix.kuehling at amd.com
Thu Sep 29 18:30:54 UTC 2022


On 2022-09-28 08:01, Alistair Popple wrote:
> When the CPU tries to access a device private page the migrate_to_ram()
> callback associated with the pgmap for the page is called. However no
> reference is taken on the faulting page. Therefore a concurrent
> migration of the device private page can free the page and possibly the
> underlying pgmap. This results in a race which can crash the kernel due
> to the migrate_to_ram() function pointer becoming invalid. It also means
> drivers can't reliably read the zone_device_data field because the page
> may have been freed with memunmap_pages().
>
> Close the race by getting a reference on the page while holding the ptl
> to ensure it has not been freed. Unfortunately the elevated reference
> count will cause the migration required to handle the fault to fail. To
> avoid this failure pass the faulting page into the migrate_vma functions
> so that if an elevated reference count is found it can be checked to see
> if it's expected or not.

Do we really have to drag the fault_page all the way into the migrate 
structure? Is the elevated refcount still needed at that time? Maybe it 
would be easier to drop the refcount early in the ops->migrage_to_ram 
callbacks, so we won't have to deal with it in all the migration code.

Regards,
   Felix


>
> Signed-off-by: Alistair Popple <apopple at nvidia.com>
> Cc: Jason Gunthorpe <jgg at nvidia.com>
> Cc: John Hubbard <jhubbard at nvidia.com>
> Cc: Ralph Campbell <rcampbell at nvidia.com>
> Cc: Michael Ellerman <mpe at ellerman.id.au>
> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
> Cc: Lyude Paul <lyude at redhat.com>
> ---
>   arch/powerpc/kvm/book3s_hv_uvmem.c       | 15 ++++++-----
>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++------
>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 11 +++++---
>   include/linux/migrate.h                  |  8 ++++++-
>   lib/test_hmm.c                           |  7 ++---
>   mm/memory.c                              | 16 +++++++++++-
>   mm/migrate.c                             | 34 ++++++++++++++-----------
>   mm/migrate_device.c                      | 18 +++++++++----
>   9 files changed, 87 insertions(+), 41 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 5980063..d4eacf4 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>   static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>   		unsigned long start,
>   		unsigned long end, unsigned long page_shift,
> -		struct kvm *kvm, unsigned long gpa)
> +		struct kvm *kvm, unsigned long gpa, struct page *fault_page)
>   {
>   	unsigned long src_pfn, dst_pfn = 0;
> -	struct migrate_vma mig;
> +	struct migrate_vma mig = { 0 };
>   	struct page *dpage, *spage;
>   	struct kvmppc_uvmem_page_pvt *pvt;
>   	unsigned long pfn;
> @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>   	mig.dst = &dst_pfn;
>   	mig.pgmap_owner = &kvmppc_uvmem_pgmap;
>   	mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
> +	mig.fault_page = fault_page;
>   
>   	/* The requested page is already paged-out, nothing to do */
>   	if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
> @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>   static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
>   				      unsigned long start, unsigned long end,
>   				      unsigned long page_shift,
> -				      struct kvm *kvm, unsigned long gpa)
> +				      struct kvm *kvm, unsigned long gpa,
> +				      struct page *fault_page)
>   {
>   	int ret;
>   
>   	mutex_lock(&kvm->arch.uvmem_lock);
> -	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
> +	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
> +				fault_page);
>   	mutex_unlock(&kvm->arch.uvmem_lock);
>   
>   	return ret;
> @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>   		bool pagein)
>   {
>   	unsigned long src_pfn, dst_pfn = 0;
> -	struct migrate_vma mig;
> +	struct migrate_vma mig = { 0 };
>   	struct page *spage;
>   	unsigned long pfn;
>   	struct page *dpage;
> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
>   
>   	if (kvmppc_svm_page_out(vmf->vma, vmf->address,
>   				vmf->address + PAGE_SIZE, PAGE_SHIFT,
> -				pvt->kvm, pvt->gpa))
> +				pvt->kvm, pvt->gpa, vmf->page))
>   		return VM_FAULT_SIGBUS;
>   	else
>   		return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index b059a77..776448b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
>   	uint64_t npages = (end - start) >> PAGE_SHIFT;
>   	struct kfd_process_device *pdd;
>   	struct dma_fence *mfence = NULL;
> -	struct migrate_vma migrate;
> +	struct migrate_vma migrate = { 0 };
>   	unsigned long cpages = 0;
>   	dma_addr_t *scratch;
>   	void *buf;
> @@ -668,7 +668,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   static long
>   svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   		       struct vm_area_struct *vma, uint64_t start, uint64_t end,
> -		       uint32_t trigger)
> +		       uint32_t trigger, struct page *fault_page)
>   {
>   	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
>   	uint64_t npages = (end - start) >> PAGE_SHIFT;
> @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   	unsigned long cpages = 0;
>   	struct kfd_process_device *pdd;
>   	struct dma_fence *mfence = NULL;
> -	struct migrate_vma migrate;
> +	struct migrate_vma migrate = { 0 };
>   	dma_addr_t *scratch;
>   	void *buf;
>   	int r = -ENOMEM;
> @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>   
>   	migrate.src = buf;
>   	migrate.dst = migrate.src + npages;
> +	migrate.fault_page = fault_page;
>   	scratch = (dma_addr_t *)(migrate.dst + npages);
>   
>   	kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid,
> @@ -766,7 +767,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>    * 0 - OK, otherwise error code
>    */
>   int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
> -			    uint32_t trigger)
> +			    uint32_t trigger, struct page *fault_page)
>   {
>   	struct amdgpu_device *adev;
>   	struct vm_area_struct *vma;
> @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
>   		}
>   
>   		next = min(vma->vm_end, end);
> -		r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger);
> +		r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger,
> +			fault_page);
>   		if (r < 0) {
>   			pr_debug("failed %ld to migrate prange %p\n", r, prange);
>   			break;
> @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
>   	pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
>   
>   	do {
> -		r = svm_migrate_vram_to_ram(prange, mm, trigger);
> +		r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
>   		if (r)
>   			return r;
>   	} while (prange->actual_loc && --retries);
> @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
>   		goto out_unlock_prange;
>   	}
>   
> -	r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU);
> +	r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
> +				vmf->page);
>   	if (r)
>   		pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r,
>   			 prange, prange->start, prange->last);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
> index b3f0754..a5d7e6d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
> @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR {
>   int svm_migrate_to_vram(struct svm_range *prange,  uint32_t best_loc,
>   			struct mm_struct *mm, uint32_t trigger);
>   int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
> -			    uint32_t trigger);
> +			    uint32_t trigger, struct page *fault_page);
>   unsigned long
>   svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 11074cc..9139e5a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2913,13 +2913,15 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   				 */
>   				if (prange->actual_loc)
>   					r = svm_migrate_vram_to_ram(prange, mm,
> -					   KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
> +					   KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
> +					   NULL);
>   				else
>   					r = 0;
>   			}
>   		} else {
>   			r = svm_migrate_vram_to_ram(prange, mm,
> -					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
> +					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
> +					NULL);
>   		}
>   		if (r) {
>   			pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
> @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
>   		return 0;
>   
>   	if (!best_loc) {
> -		r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
> +		r = svm_migrate_vram_to_ram(prange, mm,
> +					KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
>   		*migrated = !r;
>   		return r;
>   	}
> @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
>   		mutex_lock(&prange->migrate_mutex);
>   		do {
>   			r = svm_migrate_vram_to_ram(prange, mm,
> -						KFD_MIGRATE_TRIGGER_TTM_EVICTION);
> +					KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
>   		} while (!r && prange->actual_loc && --retries);
>   
>   		if (!r && prange->actual_loc)
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 22c0a0c..82ffa47 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
>   #ifdef CONFIG_MIGRATION
>   
>   extern void putback_movable_pages(struct list_head *l);
> +int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
> +		struct folio *src, enum migrate_mode mode, int extra_count);
>   int migrate_folio(struct address_space *mapping, struct folio *dst,
>   		struct folio *src, enum migrate_mode mode);
>   extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
> @@ -212,6 +214,12 @@ struct migrate_vma {
>   	 */
>   	void			*pgmap_owner;
>   	unsigned long		flags;
> +
> +	/*
> +	 * Set to vmf->page if this is being called to migrate a page as part of
> +	 * a migrate_to_ram() callback.
> +	 */
> +	struct page		*fault_page;
>   };
>   
>   int migrate_vma_setup(struct migrate_vma *args);
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index e3965ca..89463ff 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
>   	struct vm_area_struct *vma;
>   	unsigned long src_pfns[64] = { 0 };
>   	unsigned long dst_pfns[64] = { 0 };
> -	struct migrate_vma args;
> +	struct migrate_vma args = { 0 };
>   	unsigned long next;
>   	int ret;
>   
> @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
>   	unsigned long src_pfns[64] = { 0 };
>   	unsigned long dst_pfns[64] = { 0 };
>   	struct dmirror_bounce bounce;
> -	struct migrate_vma args;
> +	struct migrate_vma args = { 0 };
>   	unsigned long next;
>   	int ret;
>   
> @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page)
>   
>   static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>   {
> -	struct migrate_vma args;
> +	struct migrate_vma args = { 0 };
>   	unsigned long src_pfns = 0;
>   	unsigned long dst_pfns = 0;
>   	struct page *rpage;
> @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>   	args.dst = &dst_pfns;
>   	args.pgmap_owner = dmirror->mdevice;
>   	args.flags = dmirror_select_device(dmirror);
> +	args.fault_page = vmf->page;
>   
>   	if (migrate_vma_setup(&args))
>   		return VM_FAULT_SIGBUS;
> diff --git a/mm/memory.c b/mm/memory.c
> index b994784..65d3977 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3742,7 +3742,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   			ret = remove_device_exclusive_entry(vmf);
>   		} else if (is_device_private_entry(entry)) {
>   			vmf->page = pfn_swap_entry_to_page(entry);
> -			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> +			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> +					vmf->address, &vmf->ptl);
> +			if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
> +				spin_unlock(vmf->ptl);
> +				goto out;
> +			}
> +
> +			/*
> +			 * Get a page reference while we know the page can't be
> +			 * freed.
> +			 */
> +			get_page(vmf->page);
> +			pte_unmap_unlock(vmf->pte, vmf->ptl);
> +			vmf->page->pgmap->ops->migrate_to_ram(vmf);
> +			put_page(vmf->page);
>   		} else if (is_hwpoison_entry(entry)) {
>   			ret = VM_FAULT_HWPOISON;
>   		} else if (is_swapin_error_entry(entry)) {
> diff --git a/mm/migrate.c b/mm/migrate.c
> index ce6a58f..e3f78a7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -620,6 +620,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
>    *                    Migration functions
>    ***********************************************************/
>   
> +int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
> +		struct folio *src, enum migrate_mode mode, int extra_count)
> +{
> +	int rc;
> +
> +	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
> +
> +	rc = folio_migrate_mapping(mapping, dst, src, extra_count);
> +
> +	if (rc != MIGRATEPAGE_SUCCESS)
> +		return rc;
> +
> +	if (mode != MIGRATE_SYNC_NO_COPY)
> +		folio_migrate_copy(dst, src);
> +	else
> +		folio_migrate_flags(dst, src);
> +	return MIGRATEPAGE_SUCCESS;
> +}
> +
>   /**
>    * migrate_folio() - Simple folio migration.
>    * @mapping: The address_space containing the folio.
> @@ -635,20 +654,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
>   int migrate_folio(struct address_space *mapping, struct folio *dst,
>   		struct folio *src, enum migrate_mode mode)
>   {
> -	int rc;
> -
> -	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
> -
> -	rc = folio_migrate_mapping(mapping, dst, src, 0);
> -
> -	if (rc != MIGRATEPAGE_SUCCESS)
> -		return rc;
> -
> -	if (mode != MIGRATE_SYNC_NO_COPY)
> -		folio_migrate_copy(dst, src);
> -	else
> -		folio_migrate_flags(dst, src);
> -	return MIGRATEPAGE_SUCCESS;
> +	return migrate_folio_extra(mapping, dst, src, mode, 0);
>   }
>   EXPORT_SYMBOL(migrate_folio);
>   
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 7235424..f756c00 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -313,14 +313,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
>    * folio_migrate_mapping(), except that here we allow migration of a
>    * ZONE_DEVICE page.
>    */
> -static bool migrate_vma_check_page(struct page *page)
> +static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
>   {
>   	/*
>   	 * One extra ref because caller holds an extra reference, either from
>   	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
>   	 * a device page.
>   	 */
> -	int extra = 1;
> +	int extra = 1 + (page == fault_page);
>   
>   	/*
>   	 * FIXME support THP (transparent huge page), it is bit more complex to
> @@ -393,7 +393,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
>   		if (folio_mapped(folio))
>   			try_to_migrate(folio, 0);
>   
> -		if (page_mapped(page) || !migrate_vma_check_page(page)) {
> +		if (page_mapped(page) ||
> +		    !migrate_vma_check_page(page, migrate->fault_page)) {
>   			if (!is_zone_device_page(page)) {
>   				get_page(page);
>   				putback_lru_page(page);
> @@ -505,6 +506,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>   		return -EINVAL;
>   	if (!args->src || !args->dst)
>   		return -EINVAL;
> +	if (args->fault_page && !is_device_private_page(args->fault_page))
> +		return -EINVAL;
>   
>   	memset(args->src, 0, sizeof(*args->src) * nr_pages);
>   	args->cpages = 0;
> @@ -735,8 +738,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>   			continue;
>   		}
>   
> -		r = migrate_folio(mapping, page_folio(newpage),
> -				page_folio(page), MIGRATE_SYNC_NO_COPY);
> +		if (migrate->fault_page == page)
> +			r = migrate_folio_extra(mapping, page_folio(newpage),
> +						page_folio(page),
> +						MIGRATE_SYNC_NO_COPY, 1);
> +		else
> +			r = migrate_folio(mapping, page_folio(newpage),
> +					page_folio(page), MIGRATE_SYNC_NO_COPY);
>   		if (r != MIGRATEPAGE_SUCCESS)
>   			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>   	}


More information about the dri-devel mailing list