[PATCH] mm: Remove double faults when writing a device pfn

Christian König christian.koenig at amd.com
Mon Jan 22 09:18:01 UTC 2024


On 22.01.24 at 04:32, Xianrong Zhou wrote:
> The vmf_insert_pfn_prot could cause an unnecessary double fault
> on a device pfn, because currently vmf_insert_pfn_prot does not
> make the pfn writable, so the pte entry is normally installed
> read-only, i.e. write-protected as for dirty tracking.

What? How did you get to this conclusion?

> The first fault only sets up a write-protected pte entry. When
> the CPU re-executes the store instruction, it immediately takes
> a second fault because the pte installed by the first fault is
> read-only.

It could be that this is done to work around some hw behavior, but not 
because of dirty tracking.
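
For illustration, the sequence being described is roughly the
following. This is a hypothetical driver fault handler (the names
example_fault and example_resolve_pfn are illustrative, not from
this patch), using the pre-patch vmf_insert_pfn_prot signature:

/* Hypothetical fault handler, for illustration only. */
static vm_fault_t example_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	/* Illustrative helper: resolve the device pfn backing vmf->address. */
	unsigned long pfn = example_resolve_pfn(vmf->address);

	/*
	 * Pre-patch, insert_pfn() is reached with mkwrite == false, so
	 * the pte is installed without the write bit even on a write
	 * fault.  The CPU then retries the store and faults again; the
	 * second fault only exists to set the write and dirty bits.
	 */
	return vmf_insert_pfn_prot(vma, vmf->address, pfn,
				   vma->vm_page_prot);
}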

> Normally, if a driver calls vmf_insert_pfn_prot and also supplies
> a 'pfn_mkwrite' callback in its vm_operations_struct, which
> requires the pte to be write-protected for dirty tracking, then
> the read-only insert and the resulting double fault are
> reasonable. It is not a problem.

Well, as far as I can see that behavior absolutely doesn't make sense.

When pfn_mkwrite is requested, the driver should use PAGE_COPY, 
which is exactly what VMWGFX (the only driver using dirty tracking) 
does.

Everybody else uses PAGE_SHARED, which should make the pte writable 
immediately.
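
A minimal sketch of the two cases (the dirty_tracking flag and the
function name are illustrative; compare the vmwgfx hunk further down
in the quoted patch):

/* Illustrative: choose the fault prot depending on whether
 * write-protect based dirty tracking is in use. */
static pgprot_t example_fault_prot(struct vm_area_struct *vma,
				   bool dirty_tracking)
{
	if (dirty_tracking)
		/* PAGE_COPY-like: read-only, so the first store traps */
		return vm_get_page_prot(vma->vm_flags & ~VM_SHARED);

	/* PAGE_SHARED: the pte can be made writable immediately */
	return vm_get_page_prot(vma->vm_flags);
}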

Regards,
Christian.

>
> However, most of the drivers calling vmf_insert_pfn_prot do not
> supply a 'pfn_mkwrite' callback, so the second fault is
> unnecessary.
>
> So, just like the vmf_insert_mixed and vmf_insert_mixed_mkwrite
> pair, we should also supply vmf_insert_pfn_mkwrite for drivers.
>
> Signed-off-by: Xianrong Zhou <Xianrong.Zhou at amd.com>
> ---
>   arch/x86/entry/vdso/vma.c                  |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 +-
>   drivers/gpu/drm/i915/gem/i915_gem_ttm.c    |  2 +-
>   drivers/gpu/drm/nouveau/nouveau_gem.c      |  2 +-
>   drivers/gpu/drm/radeon/radeon_gem.c        |  2 +-
>   drivers/gpu/drm/ttm/ttm_bo_vm.c            |  8 +++++---
>   drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c |  8 +++++---
>   include/drm/ttm/ttm_bo.h                   |  3 ++-
>   include/linux/mm.h                         |  2 +-
>   mm/memory.c                                | 14 +++++++++++---
>   10 files changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
> index 7645730dc228..dd2431c2975f 100644
> --- a/arch/x86/entry/vdso/vma.c
> +++ b/arch/x86/entry/vdso/vma.c
> @@ -185,7 +185,8 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
>   		if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
>   			return vmf_insert_pfn_prot(vma, vmf->address,
>   					__pa(pvti) >> PAGE_SHIFT,
> -					pgprot_decrypted(vma->vm_page_prot));
> +					pgprot_decrypted(vma->vm_page_prot),
> +					true);
>   		}
>   	} else if (sym_offset == image->sym_hvclock_page) {
>   		pfn = hv_get_tsc_pfn();
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 49a5f1c73b3e..adcb20d9e624 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -64,7 +64,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
>   		}
>   
>   		ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> -					       TTM_BO_VM_NUM_PREFAULT);
> +					       TTM_BO_VM_NUM_PREFAULT, true);
>   
>   		drm_dev_exit(idx);
>   	} else {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 9227f8146a58..c6f13ae6c308 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1114,7 +1114,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>   
>   	if (drm_dev_enter(dev, &idx)) {
>   		ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> -					       TTM_BO_VM_NUM_PREFAULT);
> +					       TTM_BO_VM_NUM_PREFAULT, true);
>   		drm_dev_exit(idx);
>   	} else {
>   		ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 49c2bcbef129..7e1453762ec9 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault *vmf)
>   
>   	nouveau_bo_del_io_reserve_lru(bo);
>   	prot = vm_get_page_prot(vma->vm_flags);
> -	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
> +	ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, true);
>   	nouveau_bo_add_io_reserve_lru(bo);
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>   		return ret;
> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
> index 3fec3acdaf28..b21cf00ae162 100644
> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> @@ -62,7 +62,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault *vmf)
>   		goto unlock_resv;
>   
>   	ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> -				       TTM_BO_VM_NUM_PREFAULT);
> +				       TTM_BO_VM_NUM_PREFAULT, true);
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>   		goto unlock_mclk;
>   
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index 4212b8c91dd4..7d14a7d267aa 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -167,6 +167,7 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
>    * @num_prefault: Maximum number of prefault pages. The caller may want to
>    * specify this based on madvice settings and the size of the GPU object
>    * backed by the memory.
> + * @mkwrite: make the pfn or page writable
>    *
>    * This function inserts one or more page table entries pointing to the
>    * memory backing the buffer object, and then returns a return code
> @@ -180,7 +181,8 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
>    */
>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>   				    pgprot_t prot,
> -				    pgoff_t num_prefault)
> +				    pgoff_t num_prefault,
> +				    bool mkwrite)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
>   	struct ttm_buffer_object *bo = vma->vm_private_data;
> @@ -263,7 +265,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>   		 * at arbitrary times while the data is mmap'ed.
>   		 * See vmf_insert_pfn_prot() for a discussion.
>   		 */
> -		ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> +		ret = vmf_insert_pfn_prot(vma, address, pfn, prot, mkwrite);
>   
>   		/* Never error on prefaulted PTEs */
>   		if (unlikely((ret & VM_FAULT_ERROR))) {
> @@ -312,7 +314,7 @@ vm_fault_t ttm_bo_vm_dummy_page(struct vm_fault *vmf, pgprot_t prot)
>   	/* Prefault the entire VMA range right away to avoid further faults */
>   	for (address = vma->vm_start; address < vma->vm_end;
>   	     address += PAGE_SIZE)
> -		ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> +		ret = vmf_insert_pfn_prot(vma, address, pfn, prot, true);
>   
>   	return ret;
>   }
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> index 74ff2812d66a..bb8e4b641681 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> @@ -452,12 +452,14 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
>   	 * sure the page protection is write-enabled so we don't get
>   	 * a lot of unnecessary write faults.
>   	 */
> -	if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> +	if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE) {
>   		prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
> -	else
> +		ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, false);
> +	} else {
>   		prot = vm_get_page_prot(vma->vm_flags);
> +		ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, true);
> +	}
>   
> -	ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
>   	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
>   		return ret;
>   
> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> index 0223a41a64b2..66e293db69ee 100644
> --- a/include/drm/ttm/ttm_bo.h
> +++ b/include/drm/ttm/ttm_bo.h
> @@ -386,7 +386,8 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
>   			     struct vm_fault *vmf);
>   vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
>   				    pgprot_t prot,
> -				    pgoff_t num_prefault);
> +				    pgoff_t num_prefault,
> +				    bool mkwrite);
>   vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
>   void ttm_bo_vm_open(struct vm_area_struct *vma);
>   void ttm_bo_vm_close(struct vm_area_struct *vma);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..f8868e28ea04 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3553,7 +3553,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>   vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>   			unsigned long pfn);
>   vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> -			unsigned long pfn, pgprot_t pgprot);
> +			unsigned long pfn, pgprot_t pgprot, bool mkwrite);
>   vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
>   			pfn_t pfn);
>   vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
> diff --git a/mm/memory.c b/mm/memory.c
> index 7e1f4849463a..2c28f1a349ff 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2195,6 +2195,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>    * @addr: target user address of this page
>    * @pfn: source kernel pfn
>    * @pgprot: pgprot flags for the inserted page
> + * @mkwrite: make the pfn writable
>    *
>    * This is exactly like vmf_insert_pfn(), except that it allows drivers
>    * to override pgprot on a per-page basis.
> @@ -2223,7 +2224,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>    * Return: vm_fault_t value.
>    */
>   vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> -			unsigned long pfn, pgprot_t pgprot)
> +			unsigned long pfn, pgprot_t pgprot, bool mkwrite)
>   {
>   	/*
>   	 * Technically, architectures with pte_special can avoid all these
> @@ -2246,7 +2247,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
>   	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
>   
>   	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
> -			false);
> +			mkwrite);
>   }
>   EXPORT_SYMBOL(vmf_insert_pfn_prot);
>   
> @@ -2273,10 +2274,17 @@ EXPORT_SYMBOL(vmf_insert_pfn_prot);
>   vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>   			unsigned long pfn)
>   {
> -	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
> +	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, false);
>   }
>   EXPORT_SYMBOL(vmf_insert_pfn);
>   
> +vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma, unsigned long addr,
> +			unsigned long pfn)
> +{
> +	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, true);
> +}
> +EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
> +
>   static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
>   {
>   	/* these checks mirror the abort conditions in vm_normal_page */
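
For completeness, a minimal sketch of how a caller might use the new
helper once the patch is applied (example_resolve_pfn is an
illustrative placeholder, not part of the patch):

static vm_fault_t example_write_fault(struct vm_fault *vmf)
{
	/* Illustrative helper: resolve the device pfn backing vmf->address. */
	unsigned long pfn = example_resolve_pfn(vmf->address);

	/*
	 * Installs the pte with the write bit already set, so the
	 * retried store does not take a second fault.
	 */
	return vmf_insert_pfn_mkwrite(vmf->vma, vmf->address, pfn);
}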


