[PATCH 3/9] mm: Add write-protect and clean utilities for address space ranges
Ralph Campbell
rcampbell at nvidia.com
Fri Apr 12 18:52:36 UTC 2019
On 4/12/19 9:04 AM, Thomas Hellstrom wrote:
> Add two utilities to a) write-protect and b) clean all ptes pointing into
> a range of an address space
A period at the end, please.
> The utilities are intended to aid in tracking dirty pages (either
> driver-allocated system memory or pci device memory).
> The write-protect utility should be used in conjunction with
> page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
> accesses. Typically one would want to use this on sparse accesses into
> large memory regions. The clean utility should be used to utilize
> hardware dirtying functionality and avoid the overhead of page-faults,
> typically on large accesses into small memory regions.
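For readers trying to picture the intended flow: here is a rough
driver-side sketch of how I read the write-protect half being used. The
handler and the dirty bookkeeping below are hypothetical, not part of
this patch:

	/* Hypothetical pfn_mkwrite() handler for a device file. */
	static vm_fault_t my_pfn_mkwrite(struct vm_fault *vmf)
	{
		struct my_obj *obj = vmf->vma->vm_private_data;

		/* Note that this page is about to be dirtied (again). */
		set_bit(vmf->pgoff - obj->first_pgoff, obj->dirty_bitmap);
		return 0;	/* the core mm then restores write access */
	}

	/* Later, to arm the next round of write faults: */
	apply_as_wrprotect(obj->mapping, obj->first_pgoff, obj->nr_pages);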
>
> The added file "apply_as_range.c" is initially listed as maintained by
> VMware under our DRM driver. If somebody would like it elsewhere,
> that's of course no problem.
>
> Notable changes since RFC:
> - Added comments to help avoid the usage of these function for VMAs
> it's not intended for. We also do advisory checks on the vm_flags and
> warn on illegal usage.
> - Perform the pte modifications the same way softdirty does.
> - Add mmu_notifier range invalidation calls.
> - Add a config option so that this code is not unconditionally included.
> - Tell the mmu_gather code about pending tlb flushes.
>
> Cc: Andrew Morton <akpm at linux-foundation.org>
> Cc: Matthew Wilcox <willy at infradead.org>
> Cc: Will Deacon <will.deacon at arm.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Rik van Riel <riel at surriel.com>
> Cc: Minchan Kim <minchan at kernel.org>
> Cc: Michal Hocko <mhocko at suse.com>
> Cc: Huang Ying <ying.huang at intel.com>
> Cc: Souptick Joarder <jrdr.linux at gmail.com>
> Cc: "Jérôme Glisse" <jglisse at redhat.com>
> Cc: linux-mm at kvack.org
> Cc: linux-kernel at vger.kernel.org
> Signed-off-by: Thomas Hellstrom <thellstrom at vmware.com>
Reviewed-by: Ralph Campbell <rcampbell at nvidia.com>
> ---
> MAINTAINERS | 1 +
> include/linux/mm.h | 9 +-
> mm/Kconfig | 3 +
> mm/Makefile | 3 +-
> mm/apply_as_range.c | 295 ++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 309 insertions(+), 2 deletions(-)
> create mode 100644 mm/apply_as_range.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 35e6357f9d30..bc243ffcb840 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -4971,6 +4971,7 @@ T: git git://people.freedesktop.org/~thomash/linux
> S: Supported
> F: drivers/gpu/drm/vmwgfx/
> F: include/uapi/drm/vmwgfx_drm.h
> +F: mm/apply_as_range.c
>
> DRM DRIVERS
> M: David Airlie <airlied at linux.ie>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b7dd4ddd6efb..62f24dd0bfa0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2642,7 +2642,14 @@ struct pfn_range_apply {
> };
> extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> unsigned long address, unsigned long size);
> -
> +unsigned long apply_as_wrprotect(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr);
> +unsigned long apply_as_clean(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr,
> + pgoff_t bitmap_pgoff,
> + unsigned long *bitmap,
> + pgoff_t *start,
> + pgoff_t *end);
> #ifdef CONFIG_PAGE_POISONING
> extern bool page_poisoning_enabled(void);
> extern void kernel_poison_pages(struct page *page, int numpages, int enable);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 25c71eb8a7db..80e41cdbb4ae 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -758,4 +758,7 @@ config GUP_BENCHMARK
> config ARCH_HAS_PTE_SPECIAL
> bool
>
> +config AS_DIRTY_HELPERS
> + bool
> +
> endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index d210cc9d6f80..b295717be856 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
> mm_init.o mmu_context.o percpu.o slab_common.o \
> compaction.o vmacache.o \
> interval_tree.o list_lru.o workingset.o \
> - debug.o $(mmu-y)
> + debug.o apply_as_range.o $(mmu-y)
>
> obj-y += init-mm.o
> obj-y += memblock.o
> @@ -99,3 +99,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
> obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
> obj-$(CONFIG_HMM) += hmm.o
> obj-$(CONFIG_MEMFD_CREATE) += memfd.o
> +obj-$(CONFIG_AS_DIRTY_HELPERS) += apply_as_range.o
apply_as_range.o is also added unconditionally to obj-y a few lines up,
which makes this conditional line a no-op and contradicts the changelog's
"not unconditionally included". I assume the obj-y hunk is a leftover
from before the config option was added?
> diff --git a/mm/apply_as_range.c b/mm/apply_as_range.c
> new file mode 100644
> index 000000000000..32d28619aec5
> --- /dev/null
> +++ b/mm/apply_as_range.c
> @@ -0,0 +1,295 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/mm.h>
> +#include <linux/mm_types.h>
> +#include <linux/hugetlb.h>
> +#include <linux/bitops.h>
> +#include <linux/mmu_notifier.h>
> +#include <asm/cacheflush.h>
> +#include <asm/tlbflush.h>
> +
> +/**
> + * struct apply_as - Closure structure for apply_as_range
> + * @base: struct pfn_range_apply we derive from
> + * @start: Address of first modified pte
> + * @end: Address of last modified pte + 1
> + * @total: Total number of modified ptes
> + * @vma: Pointer to the struct vm_area_struct we're currently operating on
> + */
> +struct apply_as {
> + struct pfn_range_apply base;
> + unsigned long start, end;
One variable defined per line, please.
> + unsigned long total;
> + const struct vm_area_struct *vma;
> +};
> +
> +/**
> + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
> + * @pte: Pointer to the pte
> + * @token: Page table token, see apply_to_pfn_range()
> + * @addr: The virtual page address
> + * @closure: Pointer to a struct pfn_range_apply embedded in a
> + * struct apply_as
> + *
> + * The function write-protects a pte and records the range in
> + * virtual address space of touched ptes for efficient range TLB flushes.
> + *
> + * Return: Always zero.
> + */
> +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *closure)
> +{
> + struct apply_as *aas = container_of(closure, typeof(*aas), base);
> + pte_t ptent = *pte;
> +
> + if (pte_write(ptent)) {
> + ptent = ptep_modify_prot_start(closure->mm, addr, pte);
> + ptent = pte_wrprotect(ptent);
> + ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
> + aas->total++;
> + aas->start = min(aas->start, addr);
> + aas->end = max(aas->end, addr + PAGE_SIZE);
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * struct apply_as_clean - Closure structure for apply_as_clean
> + * @base: struct apply_as we derive from
> + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
> + * @bitmap: Bitmap with one bit for each page offset in the address_space range
> + * covered.
> + * @start: Address_space page offset of first modified pte relative
> + * to @bitmap_pgoff
> + * @end: Address_space page offset of last modified pte relative
> + * to @bitmap_pgoff
> + */
> +struct apply_as_clean {
> + struct apply_as base;
> + pgoff_t bitmap_pgoff;
> + unsigned long *bitmap;
> + pgoff_t start, end;
One variable defined per line, please.
> +};
> +
> +/**
> + * apply_pt_clean - Leaf pte callback to clean a pte
> + * @pte: Pointer to the pte
> + * @token: Page table token, see apply_to_pfn_range()
> + * @addr: The virtual page address
> + * @closure: Pointer to a struct pfn_range_apply embedded in a
> + * struct apply_as_clean
> + *
> + * The function cleans a pte and records the range in
> + * virtual address space of touched ptes for efficient TLB flushes.
> + * It also records dirty ptes in a bitmap representing page offsets
> + * in the address_space, as well as the first and last of the bits
> + * touched.
> + *
> + * Return: Always zero.
> + */
> +static int apply_pt_clean(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *closure)
> +{
> + struct apply_as *aas = container_of(closure, typeof(*aas), base);
> + struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
> + pte_t ptent = *pte;
> +
> + if (pte_dirty(ptent)) {
> + pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
> + aas->vma->vm_pgoff - clean->bitmap_pgoff;
> +
> + ptent = ptep_modify_prot_start(closure->mm, addr, pte);
> + ptent = pte_mkclean(ptent);
> + ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
> +
> + aas->total++;
> + aas->start = min(aas->start, addr);
> + aas->end = max(aas->end, addr + PAGE_SIZE);
> +
> + __set_bit(pgoff, clean->bitmap);
> + clean->start = min(clean->start, pgoff);
> + clean->end = max(clean->end, pgoff + 1);
> + }
> +
> + return 0;
> +}
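The pgoff arithmetic here could use a comment in the code. A worked
example for other readers (numbers are mine, assuming 4K pages): with
vma->vm_start == 0x100000, vma->vm_pgoff == 16 and clean->bitmap_pgoff
== 16, a dirty pte at addr == 0x102000 gives
pgoff == (0x2000 >> PAGE_SHIFT) + 16 - 16 == 2, i.e. bit 2 of the
caller's bitmap.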
> +
> +/**
> + * apply_as_range - Apply a pte callback to all PTEs pointing into a range
> + * of an address_space.
> + * @mapping: Pointer to the struct address_space
> + * @aas: Closure structure
> + * @first_index: First page offset in the address_space
> + * @nr: Number of incremental page offsets to cover
> + *
> + * Return: Number of ptes touched. Note that this number might be larger
> + * than @nr if there are overlapping vmas
> + */
> +static unsigned long apply_as_range(struct address_space *mapping,
> + struct apply_as *aas,
> + pgoff_t first_index, pgoff_t nr)
> +{
> + struct vm_area_struct *vma;
> + pgoff_t vba, vea, cba, cea;
> + unsigned long start_addr, end_addr;
> + struct mmu_notifier_range range;
> +
> + i_mmap_lock_read(mapping);
> + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
> + first_index + nr - 1) {
> + unsigned long vm_flags = READ_ONCE(vma->vm_flags);
> +
> + /*
> + * We can only do advisory flag tests below, since we can't
> + * require the vm's mmap_sem to be held to protect the flags.
> + * Therefore, callers that strictly depend on specific mmap
> + * flags to remain constant throughout the operation must
> + * either ensure those flags are immutable for all relevant
> + * vmas or can't use this function. Fixing this properly would
> + * require the vma::vm_flags to be protected by a separate
> + * lock taken after the i_mmap_lock
> + */
> +
> + /* Skip non-applicable VMAs */
> + if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
> + (VM_SHARED | VM_WRITE))
> + continue;
> +
> + /* Warn on and skip VMAs whose flags indicate illegal usage */
> + if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
> + continue;
> +
> + /* Clip to the vma */
> + vba = vma->vm_pgoff;
> + vea = vba + vma_pages(vma);
> + cba = first_index;
> + cba = max(cba, vba);
> + cea = first_index + nr;
> + cea = min(cea, vea);
> +
> + /* Translate to virtual address */
> + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
> + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
> + if (start_addr >= end_addr)
> + continue;
> +
> + aas->base.mm = vma->vm_mm;
> + aas->vma = vma;
> + aas->start = end_addr;
> + aas->end = start_addr;
> +
> + mmu_notifier_range_init(&range, vma->vm_mm,
> + start_addr, end_addr);
> + mmu_notifier_invalidate_range_start(&range);
> +
> + /* Needed when we only change protection? */
> + flush_cache_range(vma, start_addr, end_addr);
> +
> + /*
> + * We're not using tlb_gather_mmu() since typically
> + * only a small subrange of PTEs are affected.
> + */
> + inc_tlb_flush_pending(vma->vm_mm);
> +
> + /* Should not error since aas->base.alloc == 0 */
> + WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
> + end_addr - start_addr));
> + if (aas->end > aas->start)
> + flush_tlb_range(vma, aas->start, aas->end);
> +
> + mmu_notifier_invalidate_range_end(&range);
> + dec_tlb_flush_pending(vma->vm_mm);
> + }
> + i_mmap_unlock_read(mapping);
> +
> + return aas->total;
> +}
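FWIW, the clipping and translation above check out for me. A worked
example (numbers are mine, 4K pages): a vma with vm_pgoff == 16,
vma_pages() == 16 and vm_start == 0x100000, walked with first_index == 20
and nr == 100, gives cba == 20 and cea == 32, hence start_addr ==
0x104000 and end_addr == 0x110000, i.e. exactly the 12 pages the vma and
the request have in common.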
> +
> +/**
> + * apply_as_wrprotect - Write-protect all ptes in an address_space range
> + * @mapping: The address_space we want to write protect
> + * @first_index: The first page offset in the range
> + * @nr: Number of incremental page offsets to cover
> + *
> + * WARNING: This function should only be used for address spaces that
> + * completely own the pages / memory the page table points to. Typically a
> + * device file.
> + *
> + * Return: The number of ptes actually write-protected. Note that
> + * already write-protected ptes are not counted.
> + */
> +unsigned long apply_as_wrprotect(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr)
> +{
> + struct apply_as aas = {
> + .base = {
> + .alloc = 0,
> + .ptefn = apply_pt_wrprotect,
> + },
> + .total = 0,
> + };
> +
> + return apply_as_range(mapping, &aas, first_index, nr);
> +}
> +EXPORT_SYMBOL(apply_as_wrprotect);
> +
> +/**
> + * apply_as_clean - Clean all ptes in an address_space range
> + * @mapping: The address_space we want to clean
> + * @first_index: The first page offset in the range
> + * @nr: Number of incremental page offsets to cover
> + * @bitmap_pgoff: The page offset of the first bit in @bitmap
> + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
> + * cover the whole range @first_index..@first_index + @nr.
> + * @start: Pointer to number of the first set bit in @bitmap.
> + * is modified as new bits are set by the function.
> + * @end: Pointer to the number of the last set bit in @bitmap.
> + * none set. The value is modified as new bets are set by the function.
s/bets/bits/
Also, the @start and @end descriptions above look like they lost some
text in editing ("@bitmap. is modified" and "@bitmap. none set."). Could
you reflow them into complete sentences?
> + *
> + * Note: When this function returns there is no guarantee that a CPU has
> + * not already dirtied new ptes. However it will not clean any ptes not
> + * reported in the bitmap.
> + *
> + * If a caller needs to make sure all dirty ptes are picked up and none
> + * additional are added, it first needs to write-protect the address-space
> + * range and make sure new writers are blocked in page_mkwrite() or
> + * pfn_mkwrite(). And then after a TLB flush following the write-protection
> + * pick upp all dirty bits.
s/upp/up/
> + *
> + * WARNING: This function should only be used for address spaces that
> + * completely own the pages / memory the page table points to. Typically a
> + * device file.
> + *
> + * Return: The number of dirty ptes actually cleaned.
> + */
> +unsigned long apply_as_clean(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr,
> + pgoff_t bitmap_pgoff,
> + unsigned long *bitmap,
> + pgoff_t *start,
> + pgoff_t *end)
> +{
> + bool none_set = (*start >= *end);
> + struct apply_as_clean clean = {
> + .base = {
> + .base = {
> + .alloc = 0,
> + .ptefn = apply_pt_clean,
> + },
> + .total = 0,
> + },
> + .bitmap_pgoff = bitmap_pgoff,
> + .bitmap = bitmap,
> + .start = none_set ? nr : *start,
> + .end = none_set ? 0 : *end,
> + };
> + unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
> + nr);
> +
> + *start = clean.start;
> + *end = clean.end;
> + return ret;
> +}
> +EXPORT_SYMBOL(apply_as_clean);
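To tie the two exports together, here is how I read the protocol the
comment above describes, from the caller's side. This is only a sketch;
mapping, first, NR and copy_out_page() are made up:

	unsigned long bitmap[BITS_TO_LONGS(NR)] = { 0 };
	pgoff_t start = 0, end = 0;	/* start >= end: no bits set yet */
	pgoff_t bit;

	/* 1) Force new writers to fault into pfn_mkwrite(). */
	apply_as_wrprotect(mapping, first, NR);

	/* 2) Snapshot and clear the hardware dirty bits. */
	if (apply_as_clean(mapping, first, NR, first, bitmap,
			   &start, &end)) {
		/* 3) Process what was dirtied since the last round. */
		for (bit = find_next_bit(bitmap, end, start); bit < end;
		     bit = find_next_bit(bitmap, end, bit + 1))
			copy_out_page(first + bit);	/* hypothetical */
	}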
>