[PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

Felix Kuehling felix.kuehling at amd.com
Wed Apr 22 17:52:32 UTC 2020


[+Philip Yang]

Am 2020-04-21 um 8:21 p.m. schrieb Jason Gunthorpe:
> From: Jason Gunthorpe <jgg at mellanox.com>
>
> Presumably the intent here was that hmm_range_fault() could put the data
> into some HW specific format and thus avoid some work. However, nothing
> actually does that, and it isn't clear how anything actually could do that
> as hmm_range_fault() provides CPU addresses which must be DMA mapped.
>
> Perhaps there is some special HW that does not need DMA mapping, but we
> don't have any examples of this, and the theoretical performance win of
> avoiding an extra scan over the pfns array doesn't seem worth the
> complexity. Plus pfns needs to be scanned anyhow to sort out any
> DEVICE_PRIVATE pages.
>
> This version replaces the uint64_t with an usigned long containing a pfn
> and fix flags. On input flags is filled with the HMM_PFN_REQ_* values, on
> successful output it is filled with HMM_PFN_* values, describing the state
> of the pages.
>
> amdgpu is simple to convert, it doesn't use snapshot and doesn't use
> per-page flags.
>
> nouveau uses only 16 hmm_pte entries at most (ie fits in a few cache
> lines), and it sweeps over its pfns array a couple of times anyhow.
>
> Signed-off-by: Jason Gunthorpe <jgg at mellanox.com>
> Signed-off-by: Christoph Hellwig <hch at lst.de>

Hi Jason,

I pointed out a typo in the documentation inline. Other than that, the
series is

Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>

I'll try to build it and run some basic tests later.


> ---
>  Documentation/vm/hmm.rst                |  26 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  35 ++----
>  drivers/gpu/drm/nouveau/nouveau_dmem.c  |  60 +++++++--
>  drivers/gpu/drm/nouveau/nouveau_dmem.h  |   4 +-
>  drivers/gpu/drm/nouveau/nouveau_svm.c   |  52 ++++----
>  include/linux/hmm.h                     |  99 ++++++---------
>  mm/hmm.c                                | 160 +++++++++++-------------
>  7 files changed, 204 insertions(+), 232 deletions(-)
>
> diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
> index 9924f2caa0184c..73a9b8c858e5d9 100644
> --- a/Documentation/vm/hmm.rst
> +++ b/Documentation/vm/hmm.rst
> @@ -185,9 +185,6 @@ The usage pattern is::
>        range.start = ...;
>        range.end = ...;
>        range.pfns = ...;
> -      range.flags = ...;
> -      range.values = ...;
> -      range.pfn_shift = ...;
>  
>        if (!mmget_not_zero(interval_sub->notifier.mm))
>            return -EFAULT;
> @@ -229,15 +226,10 @@ The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specif
>  fault or snapshot policy for the whole range instead of having to set them
>  for each entry in the pfns array.
>  
> -For instance, if the device flags for range.flags are::
> +For instance if the device driver wants pages for a range with at least read
> +permission, it sets::
>  
> -    range.flags[HMM_PFN_VALID] = (1 << 63);
> -    range.flags[HMM_PFN_WRITE] = (1 << 62);
> -
> -and the device driver wants pages for a range with at least read permission,
> -it sets::
> -
> -    range->default_flags = (1 << 63);
> +    range->default_flags = HMM_PFN_REQ_VALID;

This should be HMM_PFN_REQ_FAULT.


>      range->pfn_flags_mask = 0;
>  
>  and calls hmm_range_fault() as described above. This will fill fault all pages
> @@ -246,18 +238,18 @@ in the range with at least read permission.
>  Now let's say the driver wants to do the same except for one page in the range for
>  which it wants to have write permission. Now driver set::
>  
> -    range->default_flags = (1 << 63);
> -    range->pfn_flags_mask = (1 << 62);
> -    range->pfns[index_of_write] = (1 << 62);
> +    range->default_flags = HMM_PFN_REQ_VALID;

HMM_PFN_REQ_FAULT

Regards,
  Felix


> +    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
> +    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
>  
>  With this, HMM will fault in all pages with at least read (i.e., valid) and for the
>  address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
>  write permission i.e., if the CPU pte does not have write permission set then HMM
>  will call handle_mm_fault().
>  
> -Note that HMM will populate the pfns array with write permission for any page
> -that is mapped with CPU write permission no matter what values are set
> -in default_flags or pfn_flags_mask.
> +After hmm_range_fault completes the flag bits are set to the current state of
> +the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
> +writable.
>  
>  
>  Represent and manage device memory from core kernel point of view
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 449083f9f8a2bf..bcfa8c26647d5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -766,17 +766,6 @@ struct amdgpu_ttm_tt {
>  };
>  
>  #ifdef CONFIG_DRM_AMDGPU_USERPTR
> -/* flags used by HMM internal, not related to CPU/GPU PTE flags */
> -static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
> -	(1 << 0), /* HMM_PFN_VALID */
> -	(1 << 1), /* HMM_PFN_WRITE */
> -};
> -
> -static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
> -	0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
> -	0, /* HMM_PFN_NONE */
> -};
> -
>  /**
>   * amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
>   * memory and start HMM tracking CPU page table update
> @@ -815,18 +804,15 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
>  		goto out;
>  	}
>  	range->notifier = &bo->notifier;
> -	range->flags = hmm_range_flags;
> -	range->values = hmm_range_values;
> -	range->pfn_shift = PAGE_SHIFT;
>  	range->start = bo->notifier.interval_tree.start;
>  	range->end = bo->notifier.interval_tree.last + 1;
> -	range->default_flags = hmm_range_flags[HMM_PFN_VALID];
> +	range->default_flags = HMM_PFN_REQ_FAULT;
>  	if (!amdgpu_ttm_tt_is_readonly(ttm))
> -		range->default_flags |= range->flags[HMM_PFN_WRITE];
> +		range->default_flags |= HMM_PFN_REQ_WRITE;
>  
> -	range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
> -				     GFP_KERNEL);
> -	if (unlikely(!range->pfns)) {
> +	range->hmm_pfns = kvmalloc_array(ttm->num_pages,
> +					 sizeof(*range->hmm_pfns), GFP_KERNEL);
> +	if (unlikely(!range->hmm_pfns)) {
>  		r = -ENOMEM;
>  		goto out_free_ranges;
>  	}
> @@ -867,7 +853,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
>  	 * the notifier_lock, and mmu_interval_read_retry() must be done first.
>  	 */
>  	for (i = 0; i < ttm->num_pages; i++)
> -		pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
> +		pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]);
>  
>  	gtt->range = range;
>  	mmput(mm);
> @@ -877,7 +863,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
>  out_unlock:
>  	up_read(&mm->mmap_sem);
>  out_free_pfns:
> -	kvfree(range->pfns);
> +	kvfree(range->hmm_pfns);
>  out_free_ranges:
>  	kfree(range);
>  out:
> @@ -902,7 +888,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>  	DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%lx\n",
>  		gtt->userptr, ttm->num_pages);
>  
> -	WARN_ONCE(!gtt->range || !gtt->range->pfns,
> +	WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
>  		"No user pages to check\n");
>  
>  	if (gtt->range) {
> @@ -912,7 +898,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>  		 */
>  		r = mmu_interval_read_retry(gtt->range->notifier,
>  					 gtt->range->notifier_seq);
> -		kvfree(gtt->range->pfns);
> +		kvfree(gtt->range->hmm_pfns);
>  		kfree(gtt->range);
>  		gtt->range = NULL;
>  	}
> @@ -1003,8 +989,7 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
>  
>  		for (i = 0; i < ttm->num_pages; i++) {
>  			if (ttm->pages[i] !=
> -				hmm_device_entry_to_page(gtt->range,
> -					      gtt->range->pfns[i]))
> +			    hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
>  				break;
>  		}
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index ad89e09a0be39a..07876fb0e1d665 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -672,27 +672,61 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
>  	return ret;
>  }
>  
> -void
> -nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> -			 struct hmm_range *range)
> +void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
> +			     u64 *ioctl_addr)
>  {
>  	unsigned long i, npages;
>  
> +	/*
> +	 * The ioctl_addr prepared here is passed through nvif_object_ioctl()
> +	 * to an eventual DMA map on some call chain like:
> +	 *    nouveau_svm_fault():
> +	 *      args.i.m.method = NVIF_VMM_V0_PFNMAP
> +	 *      nouveau_range_fault()
> +	 *       nvif_object_ioctl()
> +	 *        client->driver->ioctl()
> +	 *           struct nvif_driver nvif_driver_nvkm:
> +	 *             .ioctl = nvkm_client_ioctl
> +	 *            nvkm_ioctl()
> +	 *             nvkm_ioctl_path()
> +	 *               nvkm_ioctl_v0[type].func(..)
> +	 *               nvkm_ioctl_mthd()
> +	 *                nvkm_object_mthd()
> +	 *                   struct nvkm_object_func nvkm_uvmm:
> +	 *                     .mthd = nvkm_uvmm_mthd
> +	 *                    nvkm_uvmm_mthd()
> +	 *                     nvkm_uvmm_mthd_pfnmap()
> +	 *                      nvkm_vmm_pfn_map()
> +	 *                       nvkm_vmm_ptes_get_map()
> +	 *                        func == gp100_vmm_pgt_pfn
> +	 *                         struct nvkm_vmm_desc_func gp100_vmm_desc_spt:
> +	 *                           .pfn = gp100_vmm_pgt_pfn
> +	 *                          nvkm_vmm_iter()
> +	 *                           REF_PTES == func == gp100_vmm_pgt_pfn()
> +	 *			      dma_map_page()
> +	 *
> +	 * This is all just encoding the internal hmm reprensetation into a
> +	 * different nouveau internal representation.
> +	 */
>  	npages = (range->end - range->start) >> PAGE_SHIFT;
>  	for (i = 0; i < npages; ++i) {
>  		struct page *page;
> -		uint64_t addr;
>  
> -		page = hmm_device_entry_to_page(range, range->pfns[i]);
> -		if (page == NULL)
> -			continue;
> -
> -		if (!is_device_private_page(page))
> +		if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
> +			ioctl_addr[i] = 0;
>  			continue;
> +		}
>  
> -		addr = nouveau_dmem_page_addr(page);
> -		range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
> -		range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
> -		range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
> +		page = hmm_pfn_to_page(range->hmm_pfns[i]);
> +		if (is_device_private_page(page))
> +			ioctl_addr[i] = nouveau_dmem_page_addr(page) |
> +					NVIF_VMM_PFNMAP_V0_V |
> +					NVIF_VMM_PFNMAP_V0_VRAM;
> +		else
> +			ioctl_addr[i] = page_to_phys(page) |
> +					NVIF_VMM_PFNMAP_V0_V |
> +					NVIF_VMM_PFNMAP_V0_HOST;
> +		if (range->hmm_pfns[i] & HMM_PFN_WRITE)
> +			ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W;
>  	}
>  }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.h b/drivers/gpu/drm/nouveau/nouveau_dmem.h
> index 92394be5d64923..4607c0be7dd062 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.h
> @@ -38,8 +38,8 @@ int nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
>  			     unsigned long start,
>  			     unsigned long end);
>  
> -void nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> -			      struct hmm_range *range);
> +void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
> +			     u64 *ioctl_addr);
>  #else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
>  static inline void nouveau_dmem_init(struct nouveau_drm *drm) {}
>  static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
> index cf0d9bd61bebf9..d1059bce091192 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
> @@ -369,18 +369,6 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
>  	return ret;
>  }
>  
> -static const u64
> -nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
> -	[HMM_PFN_VALID         ] = NVIF_VMM_PFNMAP_V0_V,
> -	[HMM_PFN_WRITE         ] = NVIF_VMM_PFNMAP_V0_W,
> -};
> -
> -static const u64
> -nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = {
> -	[HMM_PFN_ERROR  ] = ~NVIF_VMM_PFNMAP_V0_V,
> -	[HMM_PFN_NONE   ] =  NVIF_VMM_PFNMAP_V0_NONE,
> -};
> -
>  /* Issue fault replay for GPU to retry accesses that faulted previously. */
>  static void
>  nouveau_svm_fault_replay(struct nouveau_svm *svm)
> @@ -520,7 +508,8 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
>  
>  static int nouveau_range_fault(struct nouveau_svmm *svmm,
>  			       struct nouveau_drm *drm, void *data, u32 size,
> -			       u64 *pfns, struct svm_notifier *notifier)
> +			       unsigned long hmm_pfns[], u64 *ioctl_addr,
> +			       struct svm_notifier *notifier)
>  {
>  	unsigned long timeout =
>  		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> @@ -529,10 +518,8 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
>  		.notifier = &notifier->notifier,
>  		.start = notifier->notifier.interval_tree.start,
>  		.end = notifier->notifier.interval_tree.last + 1,
> -		.pfns = pfns,
> -		.flags = nouveau_svm_pfn_flags,
> -		.values = nouveau_svm_pfn_values,
> -		.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT,
> +		.pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
> +		.hmm_pfns = hmm_pfns,
>  	};
>  	struct mm_struct *mm = notifier->notifier.mm;
>  	int ret;
> @@ -542,12 +529,15 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
>  			return -EBUSY;
>  
>  		range.notifier_seq = mmu_interval_read_begin(range.notifier);
> -		range.default_flags = 0;
> -		range.pfn_flags_mask = -1UL;
>  		down_read(&mm->mmap_sem);
>  		ret = hmm_range_fault(&range);
>  		up_read(&mm->mmap_sem);
>  		if (ret) {
> +			/*
> +			 * FIXME: the input PFN_REQ flags are destroyed on
> +			 * -EBUSY, we need to regenerate them, also for the
> +			 * other continue below
> +			 */
>  			if (ret == -EBUSY)
>  				continue;
>  			return ret;
> @@ -562,7 +552,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
>  		break;
>  	}
>  
> -	nouveau_dmem_convert_pfn(drm, &range);
> +	nouveau_hmm_convert_pfn(drm, &range, ioctl_addr);
>  
>  	svmm->vmm->vmm.object.client->super = true;
>  	ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
> @@ -589,6 +579,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
>  		} i;
>  		u64 phys[16];
>  	} args;
> +	unsigned long hmm_pfns[ARRAY_SIZE(args.phys)];
>  	struct vm_area_struct *vma;
>  	u64 inst, start, limit;
>  	int fi, fn, pi, fill;
> @@ -704,12 +695,17 @@ nouveau_svm_fault(struct nvif_notify *notify)
>  			 * access flags.
>  			 *XXX: atomic?
>  			 */
> -			if (buffer->fault[fn]->access != 0 /* READ. */ &&
> -			    buffer->fault[fn]->access != 3 /* PREFETCH. */) {
> -				args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V |
> -						  NVIF_VMM_PFNMAP_V0_W;
> -			} else {
> -				args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V;
> +			switch (buffer->fault[fn]->access) {
> +			case 0: /* READ. */
> +				hmm_pfns[pi++] = HMM_PFN_REQ_FAULT;
> +				break;
> +			case 3: /* PREFETCH. */
> +				hmm_pfns[pi++] = 0;
> +				break;
> +			default:
> +				hmm_pfns[pi++] = HMM_PFN_REQ_FAULT |
> +						 HMM_PFN_REQ_WRITE;
> +				break;
>  			}
>  			args.i.p.size = pi << PAGE_SHIFT;
>  
> @@ -737,7 +733,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
>  			fill = (buffer->fault[fn    ]->addr -
>  				buffer->fault[fn - 1]->addr) >> PAGE_SHIFT;
>  			while (--fill)
> -				args.phys[pi++] = NVIF_VMM_PFNMAP_V0_NONE;
> +				hmm_pfns[pi++] = 0;
>  		}
>  
>  		SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)",
> @@ -753,7 +749,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
>  			ret = nouveau_range_fault(
>  				svmm, svm->drm, &args,
>  				sizeof(args.i) + pi * sizeof(args.phys[0]),
> -				args.phys, &notifier);
> +				hmm_pfns, args.phys, &notifier);
>  			mmu_interval_notifier_remove(&notifier.notifier);
>  		}
>  		mmput(mm);
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 81c302c884c0e3..e72529786f16f8 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -19,45 +19,45 @@
>  #include <linux/mmu_notifier.h>
>  
>  /*
> - * hmm_pfn_flag_e - HMM flag enums
> + * On output:
> + * 0             - The page is faultable and a future call with 
> + *                 HMM_PFN_REQ_FAULT could succeed.
> + * HMM_PFN_VALID - the pfn field points to a valid PFN. This PFN is at
> + *                 least readable. If dev_private_owner is !NULL then this could
> + *                 point at a DEVICE_PRIVATE page.
> + * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
> + * HMM_PFN_ERROR - accessing the pfn is impossible and the device should
> + *                 fail. ie poisoned memory, special pages, no vma, etc
>   *
> - * Flags:
> - * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
> - * HMM_PFN_WRITE: CPU page table has write permission set
> - *
> - * The driver provides a flags array for mapping page protections to device
> - * PTE bits. If the driver valid bit for an entry is bit 3,
> - * i.e., (entry & (1 << 3)), then the driver must provide
> - * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
> - * Same logic apply to all flags. This is the same idea as vm_page_prot in vma
> - * except that this is per device driver rather than per architecture.
> + * On input:
> + * 0                 - Return the current state of the page, do not fault it.
> + * HMM_PFN_REQ_FAULT - The output must have HMM_PFN_VALID or hmm_range_fault()
> + *                     will fail
> + * HMM_PFN_REQ_WRITE - The output must have HMM_PFN_WRITE or hmm_range_fault()
> + *                     will fail. Must be combined with HMM_PFN_REQ_FAULT.
>   */
> -enum hmm_pfn_flag_e {
> -	HMM_PFN_VALID = 0,
> -	HMM_PFN_WRITE,
> -	HMM_PFN_FLAG_MAX
> +enum hmm_pfn_flags {
> +	HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
> +	HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
> +	HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
> +
> +	HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
> +	HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
> +
> +	HMM_PFN_FLAGS = HMM_PFN_VALID | HMM_PFN_WRITE | HMM_PFN_ERROR,
>  };
>  
>  /*
> - * hmm_pfn_value_e - HMM pfn special value
> - *
> - * Flags:
> - * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
> - * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
> + * hmm_pfn_to_page() - return struct page pointed to by a device entry
>   *
> - * Driver provides values for none entry, error entry, and special entry.
> - * Driver can alias (i.e., use same value) error and special, but
> - * it should not alias none with error or special.
> - *
> - * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
> - * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
> - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
> + * This must be called under the caller 'user_lock' after a successful
> + * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID
> + * already.
>   */
> -enum hmm_pfn_value_e {
> -	HMM_PFN_ERROR,
> -	HMM_PFN_NONE,
> -	HMM_PFN_VALUE_MAX
> -};
> +static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn)
> +{
> +	return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS);
> +}
>  
>  /*
>   * struct hmm_range - track invalidation lock on virtual address range
> @@ -66,12 +66,9 @@ enum hmm_pfn_value_e {
>   * @notifier_seq: result of mmu_interval_read_begin()
>   * @start: range virtual start address (inclusive)
>   * @end: range virtual end address (exclusive)
> - * @pfns: array of pfns (big enough for the range)
> - * @flags: pfn flags to match device driver page table
> - * @values: pfn value for some special case (none, special, error, ...)
> + * @hmm_pfns: array of pfns (big enough for the range)
>   * @default_flags: default flags for the range (write, read, ... see hmm doc)
>   * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
> - * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
>   * @dev_private_owner: owner of device private pages
>   */
>  struct hmm_range {
> @@ -79,36 +76,12 @@ struct hmm_range {
>  	unsigned long		notifier_seq;
>  	unsigned long		start;
>  	unsigned long		end;
> -	uint64_t		*pfns;
> -	const uint64_t		*flags;
> -	const uint64_t		*values;
> -	uint64_t		default_flags;
> -	uint64_t		pfn_flags_mask;
> -	uint8_t			pfn_shift;
> +	unsigned long		*hmm_pfns;
> +	unsigned long		default_flags;
> +	unsigned long		pfn_flags_mask;
>  	void			*dev_private_owner;
>  };
>  
> -/*
> - * hmm_device_entry_to_page() - return struct page pointed to by a device entry
> - * @range: range use to decode device entry value
> - * @entry: device entry value to get corresponding struct page from
> - * Return: struct page pointer if entry is a valid, NULL otherwise
> - *
> - * If the device entry is valid (ie valid flag set) then return the struct page
> - * matching the entry value. Otherwise return NULL.
> - */
> -static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
> -						    uint64_t entry)
> -{
> -	if (entry == range->values[HMM_PFN_NONE])
> -		return NULL;
> -	if (entry == range->values[HMM_PFN_ERROR])
> -		return NULL;
> -	if (!(entry & range->flags[HMM_PFN_VALID]))
> -		return NULL;
> -	return pfn_to_page(entry >> range->pfn_shift);
> -}
> -
>  /*
>   * Please see Documentation/vm/hmm.rst for how to use the range API.
>   */
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 2693393dc14b30..c1c96d4cc0554c 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -37,28 +37,13 @@ enum {
>  	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
>  };
>  
> -/*
> - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
> - * @range: range use to encode HMM pfn value
> - * @pfn: pfn value for which to create the device entry
> - * Return: valid device entry for the pfn
> - */
> -static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
> -					  unsigned long pfn)
> -{
> -	return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
> -}
> -
>  static int hmm_pfns_fill(unsigned long addr, unsigned long end,
> -		struct hmm_range *range, enum hmm_pfn_value_e value)
> +			 struct hmm_range *range, unsigned long cpu_flags)
>  {
> -	uint64_t *pfns = range->pfns;
> -	unsigned long i;
> +	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
>  
> -	i = (addr - range->start) >> PAGE_SHIFT;
>  	for (; addr < end; addr += PAGE_SIZE, i++)
> -		pfns[i] = range->values[value];
> -
> +		range->hmm_pfns[i] = cpu_flags;
>  	return 0;
>  }
>  
> @@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
>  }
>  
>  static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
> -				       uint64_t pfns, uint64_t cpu_flags)
> +				       unsigned long pfn_req_flags,
> +				       unsigned long cpu_flags)
>  {
>  	struct hmm_range *range = hmm_vma_walk->range;
>  
> @@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
>  	 * waste to have the user pre-fill the pfn arrays with a default
>  	 * flags value.
>  	 */
> -	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
> +	pfn_req_flags &= range->pfn_flags_mask;
> +	pfn_req_flags |= range->default_flags;
>  
>  	/* We aren't ask to do anything ... */
> -	if (!(pfns & range->flags[HMM_PFN_VALID]))
> +	if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
>  		return 0;
>  
>  	/* Need to write fault ? */
> -	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
> -	    !(cpu_flags & range->flags[HMM_PFN_WRITE]))
> +	if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
> +	    !(cpu_flags & HMM_PFN_WRITE))
>  		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
>  
>  	/* If CPU page table is not valid then we need to fault */
> -	if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
> +	if (!(cpu_flags & HMM_PFN_VALID))
>  		return HMM_NEED_FAULT;
>  	return 0;
>  }
>  
>  static unsigned int
>  hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
> -		     const uint64_t *pfns, unsigned long npages,
> -		     uint64_t cpu_flags)
> +		     const unsigned long hmm_pfns[], unsigned long npages,
> +		     unsigned long cpu_flags)
>  {
>  	struct hmm_range *range = hmm_vma_walk->range;
>  	unsigned int required_fault = 0;
> @@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
>  	 * hmm_pte_need_fault() will always return 0.
>  	 */
>  	if (!((range->default_flags | range->pfn_flags_mask) &
> -	      range->flags[HMM_PFN_VALID]))
> +	      HMM_PFN_REQ_FAULT))
>  		return 0;
>  
>  	for (i = 0; i < npages; ++i) {
> -		required_fault |=
> -			hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
> +		required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
> +						     cpu_flags);
>  		if (required_fault == HMM_NEED_ALL_BITS)
>  			return required_fault;
>  	}
> @@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
>  	struct hmm_range *range = hmm_vma_walk->range;
>  	unsigned int required_fault;
>  	unsigned long i, npages;
> -	uint64_t *pfns;
> +	unsigned long *hmm_pfns;
>  
>  	i = (addr - range->start) >> PAGE_SHIFT;
>  	npages = (end - addr) >> PAGE_SHIFT;
> -	pfns = &range->pfns[i];
> -	required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
> +	hmm_pfns = &range->hmm_pfns[i];
> +	required_fault =
> +		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
>  	if (!walk->vma) {
>  		if (required_fault)
>  			return -EFAULT;
> @@ -174,44 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
>  	}
>  	if (required_fault)
>  		return hmm_vma_fault(addr, end, required_fault, walk);
> -	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
> +	return hmm_pfns_fill(addr, end, range, 0);
>  }
>  
> -static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
> +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
> +						 pmd_t pmd)
>  {
>  	if (pmd_protnone(pmd))
>  		return 0;
> -	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
> -				range->flags[HMM_PFN_WRITE] :
> -				range->flags[HMM_PFN_VALID];
> +	return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
>  }
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
> -		unsigned long end, uint64_t *pfns, pmd_t pmd)
> +			      unsigned long end, unsigned long hmm_pfns[],
> +			      pmd_t pmd)
>  {
>  	struct hmm_vma_walk *hmm_vma_walk = walk->private;
>  	struct hmm_range *range = hmm_vma_walk->range;
>  	unsigned long pfn, npages, i;
>  	unsigned int required_fault;
> -	uint64_t cpu_flags;
> +	unsigned long cpu_flags;
>  
>  	npages = (end - addr) >> PAGE_SHIFT;
>  	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
>  	required_fault =
> -		hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
> +		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
>  	if (required_fault)
>  		return hmm_vma_fault(addr, end, required_fault, walk);
>  
>  	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
>  	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
> -		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
> +		hmm_pfns[i] = pfn | cpu_flags;
>  	return 0;
>  }
>  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
>  /* stub to allow the code below to compile */
>  int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
> -		unsigned long end, uint64_t *pfns, pmd_t pmd);
> +		unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  static inline bool hmm_is_device_private_entry(struct hmm_range *range,
> @@ -222,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
>  		range->dev_private_owner;
>  }
>  
> -static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
> +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
> +						 pte_t pte)
>  {
>  	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
>  		return 0;
> -	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
> -				range->flags[HMM_PFN_WRITE] :
> -				range->flags[HMM_PFN_VALID];
> +	return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
>  }
>  
>  static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
> -			      uint64_t *pfn)
> +			      unsigned long *hmm_pfn)
>  {
>  	struct hmm_vma_walk *hmm_vma_walk = walk->private;
>  	struct hmm_range *range = hmm_vma_walk->range;
>  	unsigned int required_fault;
> -	uint64_t cpu_flags;
> +	unsigned long cpu_flags;
>  	pte_t pte = *ptep;
> -	uint64_t orig_pfn = *pfn;
> +	uint64_t pfn_req_flags = *hmm_pfn;
>  
>  	if (pte_none(pte)) {
> -		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
> +		required_fault =
> +			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
>  		if (required_fault)
>  			goto fault;
> -		*pfn = range->values[HMM_PFN_NONE];
> +		*hmm_pfn = 0;
>  		return 0;
>  	}
>  
> @@ -258,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  		 * the PFN even if not present.
>  		 */
>  		if (hmm_is_device_private_entry(range, entry)) {
> -			*pfn = hmm_device_entry_from_pfn(range,
> -				device_private_entry_to_pfn(entry));
> -			*pfn |= range->flags[HMM_PFN_VALID];
> +			cpu_flags = HMM_PFN_VALID;
>  			if (is_write_device_private_entry(entry))
> -				*pfn |= range->flags[HMM_PFN_WRITE];
> +				cpu_flags |= HMM_PFN_WRITE;
> +			*hmm_pfn = device_private_entry_to_pfn(entry) |
> +					cpu_flags;
>  			return 0;
>  		}
>  
> -		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
> +		required_fault =
> +			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
>  		if (!required_fault) {
> -			*pfn = range->values[HMM_PFN_NONE];
> +			*hmm_pfn = 0;
>  			return 0;
>  		}
>  
> @@ -288,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  	}
>  
>  	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
> -	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
> +	required_fault =
> +		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
>  	if (required_fault)
>  		goto fault;
>  
> @@ -297,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  	 * fall through and treat it like a normal page.
>  	 */
>  	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
> -		if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
> +		if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
>  			pte_unmap(ptep);
>  			return -EFAULT;
>  		}
> -		*pfn = range->values[HMM_PFN_ERROR];
> +		*hmm_pfn = HMM_PFN_ERROR;
>  		return 0;
>  	}
>  
> -	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
> +	*hmm_pfn = pte_pfn(pte) | cpu_flags;
>  	return 0;
>  
>  fault:
> @@ -321,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  {
>  	struct hmm_vma_walk *hmm_vma_walk = walk->private;
>  	struct hmm_range *range = hmm_vma_walk->range;
> -	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
> +	unsigned long *hmm_pfns =
> +		&range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
>  	unsigned long npages = (end - start) >> PAGE_SHIFT;
>  	unsigned long addr = start;
>  	pte_t *ptep;
> @@ -333,16 +324,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  		return hmm_vma_walk_hole(start, end, -1, walk);
>  
>  	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
> -		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
> +		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
>  			hmm_vma_walk->last = addr;
>  			pmd_migration_entry_wait(walk->mm, pmdp);
>  			return -EBUSY;
>  		}
> -		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
> +		return hmm_pfns_fill(start, end, range, 0);
>  	}
>  
>  	if (!pmd_present(pmd)) {
> -		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
> +		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>  			return -EFAULT;
>  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>  	}
> @@ -362,7 +353,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
>  			goto again;
>  
> -		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
> +		return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
>  	}
>  
>  	/*
> @@ -372,16 +363,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  	 * recover.
>  	 */
>  	if (pmd_bad(pmd)) {
> -		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
> +		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
>  			return -EFAULT;
>  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>  	}
>  
>  	ptep = pte_offset_map(pmdp, addr);
> -	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
> +	for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
>  		int r;
>  
> -		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
> +		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
>  		if (r) {
>  			/* hmm_vma_handle_pte() did pte_unmap() */
>  			return r;
> @@ -393,13 +384,12 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  
>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
>      defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
> -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
> +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
> +						 pud_t pud)
>  {
>  	if (!pud_present(pud))
>  		return 0;
> -	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
> -				range->flags[HMM_PFN_WRITE] :
> -				range->flags[HMM_PFN_VALID];
> +	return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
>  }
>  
>  static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
> @@ -427,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
>  	if (pud_huge(pud) && pud_devmap(pud)) {
>  		unsigned long i, npages, pfn;
>  		unsigned int required_fault;
> -		uint64_t *pfns, cpu_flags;
> +		unsigned long *hmm_pfns;
> +		unsigned long cpu_flags;
>  
>  		if (!pud_present(pud)) {
>  			spin_unlock(ptl);
> @@ -436,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
>  
>  		i = (addr - range->start) >> PAGE_SHIFT;
>  		npages = (end - addr) >> PAGE_SHIFT;
> -		pfns = &range->pfns[i];
> +		hmm_pfns = &range->hmm_pfns[i];
>  
>  		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
> -		required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
> +		required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
>  						      npages, cpu_flags);
>  		if (required_fault) {
>  			spin_unlock(ptl);
> @@ -448,8 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
>  
>  		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
>  		for (i = 0; i < npages; ++i, ++pfn)
> -			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
> -				  cpu_flags;
> +			hmm_pfns[i] = pfn | cpu_flags;
>  		goto out_unlock;
>  	}
>  
> @@ -473,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>  	struct hmm_vma_walk *hmm_vma_walk = walk->private;
>  	struct hmm_range *range = hmm_vma_walk->range;
>  	struct vm_area_struct *vma = walk->vma;
> -	uint64_t orig_pfn, cpu_flags;
>  	unsigned int required_fault;
> +	unsigned long pfn_req_flags;
> +	unsigned long cpu_flags;
>  	spinlock_t *ptl;
>  	pte_t entry;
>  
> @@ -482,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>  	entry = huge_ptep_get(pte);
>  
>  	i = (start - range->start) >> PAGE_SHIFT;
> -	orig_pfn = range->pfns[i];
> +	pfn_req_flags = range->hmm_pfns[i];
>  	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
> -	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
> +	required_fault =
> +		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
>  	if (required_fault) {
>  		spin_unlock(ptl);
>  		return hmm_vma_fault(addr, end, required_fault, walk);
> @@ -492,8 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>  
>  	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
>  	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
> -		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
> -				 cpu_flags;
> +		range->hmm_pfns[i] = pfn | cpu_flags;
> +
>  	spin_unlock(ptl);
>  	return 0;
>  }
> @@ -524,7 +516,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
>  	 * failure.
>  	 */
>  	if (hmm_range_need_fault(hmm_vma_walk,
> -				 range->pfns +
> +				 range->hmm_pfns +
>  					 ((start - range->start) >> PAGE_SHIFT),
>  				 (end - start) >> PAGE_SHIFT, 0))
>  		return -EFAULT;


More information about the dri-devel mailing list