[PATCH 3/9] mm: Add write-protect and clean utilities for address space ranges

Ralph Campbell rcampbell at nvidia.com
Fri Apr 12 18:52:36 UTC 2019


On 4/12/19 9:04 AM, Thomas Hellstrom wrote:
> Add two utilities to a) write-protect and b) clean all ptes pointing into
> a range of an address space

A period at the end, please.

> The utilities are intended to aid in tracking dirty pages (either
> driver-allocated system memory or pci device memory).
> The write-protect utility should be used in conjunction with
> page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
> accesses. Typically one would want to use this on sparse accesses into
> large memory regions. The clean utility should be used to utilize
> hardware dirtying functionality and avoid the overhead of page-faults,
> typically on large accesses into small memory regions.
> 
> The added file "apply_as_range.c" is initially listed as maintained by
> VMware under our DRM driver. If somebody would like it elsewhere,
> that's of course no problem.
> 
> Notable changes since RFC:
> - Added comments to help avoid the usage of these function for VMAs
>    it's not intended for. We also do advisory checks on the vm_flags and
>    warn on illegal usage.
> - Perform the pte modifications the same way softdirty does.
> - Add mmu_notifier range invalidation calls.
> - Add a config option so that this code is not unconditionally included.
> - Tell the mmu_gather code about pending tlb flushes.
> 
> Cc: Andrew Morton <akpm at linux-foundation.org>
> Cc: Matthew Wilcox <willy at infradead.org>
> Cc: Will Deacon <will.deacon at arm.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Rik van Riel <riel at surriel.com>
> Cc: Minchan Kim <minchan at kernel.org>
> Cc: Michal Hocko <mhocko at suse.com>
> Cc: Huang Ying <ying.huang at intel.com>
> Cc: Souptick Joarder <jrdr.linux at gmail.com>
> Cc: "Jérôme Glisse" <jglisse at redhat.com>
> Cc: linux-mm at kvack.org
> Cc: linux-kernel at vger.kernel.org
> Signed-off-by: Thomas Hellstrom <thellstrom at vmware.com>

Reviewed-by: Ralph Campbell <rcampbell at nvidia.com>

> ---
>   MAINTAINERS         |   1 +
>   include/linux/mm.h  |   9 +-
>   mm/Kconfig          |   3 +
>   mm/Makefile         |   3 +-
>   mm/apply_as_range.c | 295 ++++++++++++++++++++++++++++++++++++++++++++
>   5 files changed, 309 insertions(+), 2 deletions(-)
>   create mode 100644 mm/apply_as_range.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 35e6357f9d30..bc243ffcb840 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -4971,6 +4971,7 @@ T:	git git://people.freedesktop.org/~thomash/linux
>   S:	Supported
>   F:	drivers/gpu/drm/vmwgfx/
>   F:	include/uapi/drm/vmwgfx_drm.h
> +F:	mm/apply_as_range.c
>   
>   DRM DRIVERS
>   M:	David Airlie <airlied at linux.ie>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b7dd4ddd6efb..62f24dd0bfa0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2642,7 +2642,14 @@ struct pfn_range_apply {
>   };
>   extern int apply_to_pfn_range(struct pfn_range_apply *closure,
>   			      unsigned long address, unsigned long size);
> -
> +unsigned long apply_as_wrprotect(struct address_space *mapping,
> +				 pgoff_t first_index, pgoff_t nr);
> +unsigned long apply_as_clean(struct address_space *mapping,
> +			     pgoff_t first_index, pgoff_t nr,
> +			     pgoff_t bitmap_pgoff,
> +			     unsigned long *bitmap,
> +			     pgoff_t *start,
> +			     pgoff_t *end);
>   #ifdef CONFIG_PAGE_POISONING
>   extern bool page_poisoning_enabled(void);
>   extern void kernel_poison_pages(struct page *page, int numpages, int enable);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 25c71eb8a7db..80e41cdbb4ae 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -758,4 +758,7 @@ config GUP_BENCHMARK
>   config ARCH_HAS_PTE_SPECIAL
>   	bool
>   
> +config AS_DIRTY_HELPERS
> +        bool
> +
>   endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index d210cc9d6f80..b295717be856 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -39,7 +39,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
>   			   mm_init.o mmu_context.o percpu.o slab_common.o \
>   			   compaction.o vmacache.o \
>   			   interval_tree.o list_lru.o workingset.o \
> -			   debug.o $(mmu-y)
> +			   debug.o apply_as_range.o $(mmu-y)
>   
>   obj-y += init-mm.o
>   obj-y += memblock.o
> @@ -99,3 +99,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
>   obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
>   obj-$(CONFIG_HMM) += hmm.o
>   obj-$(CONFIG_MEMFD_CREATE) += memfd.o
> +obj-$(CONFIG_AS_DIRTY_HELPERS) += apply_as_range.o
> diff --git a/mm/apply_as_range.c b/mm/apply_as_range.c
> new file mode 100644
> index 000000000000..32d28619aec5
> --- /dev/null
> +++ b/mm/apply_as_range.c
> @@ -0,0 +1,295 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/mm.h>
> +#include <linux/mm_types.h>
> +#include <linux/hugetlb.h>
> +#include <linux/bitops.h>
> +#include <linux/mmu_notifier.h>
> +#include <asm/cacheflush.h>
> +#include <asm/tlbflush.h>
> +
> +/**
> + * struct apply_as - Closure structure for apply_as_range
> + * @base: struct pfn_range_apply we derive from
> + * @start: Address of first modified pte
> + * @end: Address of last modified pte + 1
> + * @total: Total number of modified ptes
> + * @vma: Pointer to the struct vm_area_struct we're currently operating on
> + */
> +struct apply_as {
> +	struct pfn_range_apply base;
> +	unsigned long start, end;

One variable defined per line, please.

> +	unsigned long total;
> +	const struct vm_area_struct *vma;
> +};
> +
> +/**
> + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
> + * @pte: Pointer to the pte
> + * @token: Page table token, see apply_to_pfn_range()
> + * @addr: The virtual page address
> + * @closure: Pointer to a struct pfn_range_apply embedded in a
> + * struct apply_as
> + *
> + * The function write-protects a pte and records the range in
> + * virtual address space of touched ptes for efficient range TLB flushes.
> + *
> + * Return: Always zero.
> + */
> +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
> +			      unsigned long addr,
> +			      struct pfn_range_apply *closure)
> +{
> +	struct apply_as *aas = container_of(closure, typeof(*aas), base);
> +	pte_t ptent = *pte;
> +
> +	if (pte_write(ptent)) {
> +		ptent = ptep_modify_prot_start(closure->mm, addr, pte);
> +		ptent = pte_wrprotect(ptent);
> +		ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
> +		aas->total++;
> +		aas->start = min(aas->start, addr);
> +		aas->end = max(aas->end, addr + PAGE_SIZE);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * struct apply_as_clean - Closure structure for apply_as_clean
> + * @base: struct apply_as we derive from
> + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
> + * @bitmap: Bitmap with one bit for each page offset in the address_space range
> + * covered.
> + * @start: Address_space page offset of first modified pte relative
> + * to @bitmap_pgoff
> + * @end: Address_space page offset of last modified pte relative
> + * to @bitmap_pgoff
> + */
> +struct apply_as_clean {
> +	struct apply_as base;
> +	pgoff_t bitmap_pgoff;
> +	unsigned long *bitmap;
> +	pgoff_t start, end;

One variable defined per line, please.

> +};
> +
> +/**
> + * apply_pt_clean - Leaf pte callback to clean a pte
> + * @pte: Pointer to the pte
> + * @token: Page table token, see apply_to_pfn_range()
> + * @addr: The virtual page address
> + * @closure: Pointer to a struct pfn_range_apply embedded in a
> + * struct apply_as_clean
> + *
> + * The function cleans a pte and records the range in
> + * virtual address space of touched ptes for efficient TLB flushes.
> + * It also records dirty ptes in a bitmap representing page offsets
> + * in the address_space, as well as the first and last of the bits
> + * touched.
> + *
> + * Return: Always zero.
> + */
> +static int apply_pt_clean(pte_t *pte, pgtable_t token,
> +			  unsigned long addr,
> +			  struct pfn_range_apply *closure)
> +{
> +	struct apply_as *aas = container_of(closure, typeof(*aas), base);
> +	struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
> +	pte_t ptent = *pte;
> +
> +	if (pte_dirty(ptent)) {
> +		pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
> +			aas->vma->vm_pgoff - clean->bitmap_pgoff;
> +
> +		ptent = ptep_modify_prot_start(closure->mm, addr, pte);
> +		ptent = pte_mkclean(ptent);
> +		ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
> +
> +		aas->total++;
> +		aas->start = min(aas->start, addr);
> +		aas->end = max(aas->end, addr + PAGE_SIZE);
> +
> +		__set_bit(pgoff, clean->bitmap);
> +		clean->start = min(clean->start, pgoff);
> +		clean->end = max(clean->end, pgoff + 1);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * apply_as_range - Apply a pte callback to all PTEs pointing into a range
> + * of an address_space.
> + * @mapping: Pointer to the struct address_space
> + * @aas: Closure structure
> + * @first_index: First page offset in the address_space
> + * @nr: Number of incremental page offsets to cover
> + *
> + * Return: Number of ptes touched. Note that this number might be larger
> + * than @nr if there are overlapping vmas
> + */
> +static unsigned long apply_as_range(struct address_space *mapping,
> +				    struct apply_as *aas,
> +				    pgoff_t first_index, pgoff_t nr)
> +{
> +	struct vm_area_struct *vma;
> +	pgoff_t vba, vea, cba, cea;
> +	unsigned long start_addr, end_addr;
> +	struct mmu_notifier_range range;
> +
> +	i_mmap_lock_read(mapping);
> +	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
> +				  first_index + nr - 1) {
> +		unsigned long vm_flags = READ_ONCE(vma->vm_flags);
> +
> +		/*
> +		 * We can only do advisory flag tests below, since we can't
> +		 * require the vm's mmap_sem to be held to protect the flags.
> +		 * Therefore, callers that strictly depend on specific mmap
> +		 * flags to remain constant throughout the operation must
> +		 * either ensure those flags are immutable for all relevant
> +		 * vmas or can't use this function. Fixing this properly would
> +		 * require the vma::vm_flags to be protected by a separate
> +		 * lock taken after the i_mmap_lock
> +		 */
> +
> +		/* Skip non-applicable VMAs */
> +		if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
> +		    (VM_SHARED | VM_WRITE))
> +			continue;
> +
> +		/* Warn on and skip VMAs whose flags indicate illegal usage */
> +		if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
> +			continue;
> +
> +		/* Clip to the vma */
> +		vba = vma->vm_pgoff;
> +		vea = vba + vma_pages(vma);
> +		cba = first_index;
> +		cba = max(cba, vba);
> +		cea = first_index + nr;
> +		cea = min(cea, vea);
> +
> +		/* Translate to virtual address */
> +		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
> +		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
> +		if (start_addr >= end_addr)
> +			continue;
> +
> +		aas->base.mm = vma->vm_mm;
> +		aas->vma = vma;
> +		aas->start = end_addr;
> +		aas->end = start_addr;
> +
> +		mmu_notifier_range_init(&range, vma->vm_mm,
> +					start_addr, end_addr);
> +		mmu_notifier_invalidate_range_start(&range);
> +
> +		/* Needed when we only change protection? */
> +		flush_cache_range(vma, start_addr, end_addr);
> +
> +		/*
> +		 * We're not using tlb_gather_mmu() since typically
> +		 * only a small subrange of PTEs are affected.
> +		 */
> +		inc_tlb_flush_pending(vma->vm_mm);
> +
> +		/* Should not error since aas->base.alloc == 0 */
> +		WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
> +					   end_addr - start_addr));
> +		if (aas->end > aas->start)
> +			flush_tlb_range(vma, aas->start, aas->end);
> +
> +		mmu_notifier_invalidate_range_end(&range);
> +		dec_tlb_flush_pending(vma->vm_mm);
> +	}
> +	i_mmap_unlock_read(mapping);
> +
> +	return aas->total;
> +}
> +
> +/**
> + * apply_as_wrprotect - Write-protect all ptes in an address_space range
> + * @mapping: The address_space we want to write protect
> + * @first_index: The first page offset in the range
> + * @nr: Number of incremental page offsets to cover
> + *
> + * WARNING: This function should only be used for address spaces that
> + * completely own the pages / memory the page table points to. Typically a
> + * device file.
> + *
> + * Return: The number of ptes actually write-protected. Note that
> + * already write-protected ptes are not counted.
> + */
> +unsigned long apply_as_wrprotect(struct address_space *mapping,
> +				 pgoff_t first_index, pgoff_t nr)
> +{
> +	struct apply_as aas = {
> +		.base = {
> +			.alloc = 0,
> +			.ptefn = apply_pt_wrprotect,
> +		},
> +		.total = 0,
> +	};
> +
> +	return apply_as_range(mapping, &aas, first_index, nr);
> +}
> +EXPORT_SYMBOL(apply_as_wrprotect);
> +
> +/**
> + * apply_as_clean - Clean all ptes in an address_space range
> + * @mapping: The address_space we want to clean
> + * @first_index: The first page offset in the range
> + * @nr: Number of incremental page offsets to cover
> + * @bitmap_pgoff: The page offset of the first bit in @bitmap
> + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
> + * cover the whole range @first_index.. at first_index + @nr.
> + * @start: Pointer to number of the first set bit in @bitmap.
> + * is modified as new bits are set by the function.
> + * @end: Pointer to the number of the last set bit in @bitmap.
> + * none set. The value is modified as new bets are set by the function.

s/bets/bits/

> + *
> + * Note: When this function returns there is no guarantee that a CPU has
> + * not already dirtied new ptes. However it will not clean any ptes not
> + * reported in the bitmap.
> + *
> + * If a caller needs to make sure all dirty ptes are picked up and none
> + * additional are added, it first needs to write-protect the address-space
> + * range and make sure new writers are blocked in page_mkwrite() or
> + * pfn_mkwrite(). And then after a TLB flush following the write-protection
> + * pick upp all dirty bits.

s/upp/up/

> + *
> + * WARNING: This function should only be used for address spaces that
> + * completely own the pages / memory the page table points to. Typically a
> + * device file.
> + *
> + * Return: The number of dirty ptes actually cleaned.
> + */
> +unsigned long apply_as_clean(struct address_space *mapping,
> +			     pgoff_t first_index, pgoff_t nr,
> +			     pgoff_t bitmap_pgoff,
> +			     unsigned long *bitmap,
> +			     pgoff_t *start,
> +			     pgoff_t *end)
> +{
> +	bool none_set = (*start >= *end);
> +	struct apply_as_clean clean = {
> +		.base = {
> +			.base = {
> +				.alloc = 0,
> +				.ptefn = apply_pt_clean,
> +			},
> +			.total = 0,
> +		},
> +		.bitmap_pgoff = bitmap_pgoff,
> +		.bitmap = bitmap,
> +		.start = none_set ? nr : *start,
> +		.end = none_set ? 0 : *end,
> +	};
> +	unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
> +					   nr);
> +
> +	*start = clean.start;
> +	*end = clean.end;
> +	return ret;
> +}
> +EXPORT_SYMBOL(apply_as_clean);
> 


More information about the dri-devel mailing list