[PATCH v5 02/13] mm: handling Non-LRU pages returned by vm_normal_pages
Alistair Popple
apopple at nvidia.com
Wed Jun 8 07:06:28 UTC 2022
I can't see any issues with this now so:
Reviewed-by: Alistair Popple <apopple at nvidia.com>
Alex Sierra <alex.sierra at amd.com> writes:
> With DEVICE_COHERENT, we'll soon have vm_normal_page() return
> device-managed anonymous pages that are not LRU pages. Although they
> behave like normal pages for purposes of mapping in CPU page tables and
> for COW, they do not support LRU lists, NUMA migration or THP.
>
> We also introduce a FOLL_LRU flag that makes follow_page() and related
> APIs reject such non-LRU (device-managed) pages, to allow callers to
> specify that they expect to put pages on an LRU list.
>
> Signed-off-by: Alex Sierra <alex.sierra at amd.com>
> Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> fs/proc/task_mmu.c | 2 +-
> include/linux/mm.h | 3 ++-
> mm/gup.c | 6 +++++-
> mm/huge_memory.c | 2 +-
> mm/khugepaged.c | 9 ++++++---
> mm/ksm.c | 6 +++---
> mm/madvise.c | 4 ++--
> mm/memory.c | 9 ++++++++-
> mm/mempolicy.c | 2 +-
> mm/migrate.c | 4 ++--
> mm/mlock.c | 2 +-
> mm/mprotect.c | 2 +-
> 12 files changed, 33 insertions(+), 18 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 2d04e3470d4c..2dd8c8a66924 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
> return NULL;
>
> page = vm_normal_page(vma, addr, pte);
> - if (!page)
> + if (!page || is_zone_device_page(page))
> return NULL;
>
> if (PageReserved(page))
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index bc8f326be0ce..d3f43908ff8d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -601,7 +601,7 @@ struct vm_operations_struct {
> #endif
> /*
> * Called by vm_normal_page() for special PTEs to find the
> - * page for @addr. This is useful if the default behavior
> + * page for @addr. This is useful if the default behavior
> * (using pte_page()) would not find the correct page.
> */
> struct page *(*find_special_page)(struct vm_area_struct *vma,
> @@ -2934,6 +2934,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
> #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
> #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
> #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
> +#define FOLL_LRU 0x1000 /* return only LRU (anon or page cache) */
> #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
> #define FOLL_COW 0x4000 /* internal GUP flag */
> #define FOLL_ANON 0x8000 /* don't do file mappings */
> diff --git a/mm/gup.c b/mm/gup.c
> index 551264407624..48b45bcc8501 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -532,7 +532,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
> }
>
> page = vm_normal_page(vma, address, pte);
> - if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
> + if ((flags & FOLL_LRU) && ((page && is_zone_device_page(page)) ||
> + (!page && pte_devmap(pte)))) {
> + page = ERR_PTR(-EEXIST);
> + goto out;
> + } else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
> /*
> * Only return device mapping pages in the FOLL_GET or FOLL_PIN
> * case since they are only valid while holding the pgmap
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index a77c78a2b6b5..48182c8fe151 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2906,7 +2906,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> }
>
> /* FOLL_DUMP to ignore special (like zero) pages */
> - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
> if (IS_ERR(page))
> continue;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 16be62d493cd..671ac7800e53 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> goto out;
> }
> page = vm_normal_page(vma, address, pteval);
> - if (unlikely(!page)) {
> + if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> result = SCAN_PAGE_NULL;
> goto out;
> }
> @@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> writable = true;
>
> page = vm_normal_page(vma, _address, pteval);
> - if (unlikely(!page)) {
> + if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> result = SCAN_PAGE_NULL;
> goto out_unmap;
> }
> @@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
> goto abort;
>
> page = vm_normal_page(vma, addr, *pte);
> -
> + if (WARN_ON_ONCE(page && is_zone_device_page(page)))
> + page = NULL;
> /*
> * Note that uprobe, debugger, or MAP_PRIVATE may change the
> * page table, but the new page will not be a subpage of hpage.
> @@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
> if (pte_none(*pte))
> continue;
> page = vm_normal_page(vma, addr, *pte);
> + if (WARN_ON_ONCE(page && is_zone_device_page(page)))
> + goto abort;
> page_remove_rmap(page, vma, false);
> }
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 54f78c9eecae..400790128102 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
> do {
> cond_resched();
> page = follow_page(vma, addr,
> - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
> if (IS_ERR_OR_NULL(page))
> break;
> if (PageKsm(page))
> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
> if (!vma)
> goto out;
>
> - page = follow_page(vma, addr, FOLL_GET);
> + page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
> if (IS_ERR_OR_NULL(page))
> goto out;
> if (PageAnon(page)) {
> @@ -2307,7 +2307,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
> while (ksm_scan.address < vma->vm_end) {
> if (ksm_test_exit(mm))
> break;
> - *page = follow_page(vma, ksm_scan.address, FOLL_GET);
> + *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
> if (IS_ERR_OR_NULL(*page)) {
> ksm_scan.address += PAGE_SIZE;
> cond_resched();
> diff --git a/mm/madvise.c b/mm/madvise.c
> index d7b4f2602949..e5637181de1b 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -421,7 +421,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> continue;
>
> page = vm_normal_page(vma, addr, ptent);
> - if (!page)
> + if (!page || is_zone_device_page(page))
> continue;
>
> /*
> @@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> }
>
> page = vm_normal_page(vma, addr, ptent);
> - if (!page)
> + if (!page || is_zone_device_page(page))
> continue;
>
> /*
> diff --git a/mm/memory.c b/mm/memory.c
> index 21dadf03f089..30ecbc715e60 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -624,6 +624,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> if (is_zero_pfn(pfn))
> return NULL;
> if (pte_devmap(pte))
> +/*
> * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
> + * refcounts incremented on their struct pages when they are inserted into
> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
> + */
> return NULL;
>
> print_bad_pte(vma, addr, pte, NULL);
> @@ -4685,7 +4692,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
> pte = pte_modify(old_pte, vma->vm_page_prot);
>
> page = vm_normal_page(vma, vmf->address, pte);
> - if (!page)
> + if (!page || is_zone_device_page(page))
> goto out_map;
>
> /* TODO: handle PTE-mapped THP */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index d39b01fd52fe..abc26890fc95 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -523,7 +523,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
> if (!pte_present(*pte))
> continue;
> page = vm_normal_page(vma, addr, *pte);
> - if (!page)
> + if (!page || is_zone_device_page(page))
> continue;
> /*
> * vm_normal_page() filters out zero pages, but there might
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e51588e95f57..f7d1b8312631 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1612,7 +1612,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
> goto out;
>
> /* FOLL_DUMP to ignore special (like zero) pages */
> - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
> err = PTR_ERR(page);
> if (IS_ERR(page))
> @@ -1803,7 +1803,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
> goto set_status;
>
> /* FOLL_DUMP to ignore special (like zero) pages */
> - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
> err = PTR_ERR(page);
> if (IS_ERR(page))
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 716caf851043..b14e929084cc 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
> if (!pte_present(*pte))
> continue;
> page = vm_normal_page(vma, addr, *pte);
> - if (!page)
> + if (!page || is_zone_device_page(page))
> continue;
> if (PageTransCompound(page))
> continue;
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index ba5592655ee3..e034aae2a98b 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -95,7 +95,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
> continue;
>
> page = vm_normal_page(vma, addr, oldpte);
> - if (!page || PageKsm(page))
> + if (!page || is_zone_device_page(page) || PageKsm(page))
> continue;
>
> /* Also skip shared copy-on-write pages */
More information about the dri-devel
mailing list