[PATCH 2/2] drm/amdgpu: replace get_user_pages with HMM address mirror helpers v2

Kuehling, Felix Felix.Kuehling at amd.com
Tue Dec 4 01:52:27 UTC 2018


See comments inline. I didn't review the amdgpu_cs and amdgpu_gem parts
as I don't know them very well.

On 2018-12-03 3:19 p.m., Yang, Philip wrote:
> Use HMM helper function hmm_vma_fault() to get physical pages backing
> userptr and start CPU page table update tracking of those pages. Then use
> hmm_vma_range_done() to check if those pages are updated before
> amdgpu_cs_submit for gfx or before user queues are resumed for kfd.
>
> If userptr pages are updated, amdgpu_cs_ioctl restarts from scratch for
> gfx, and the restore worker is rescheduled to retry for kfd.
>
> To avoid circular lock dependency, no nested locking between mmap_sem
> and bo::reserve. The locking order is:
> bo::reserve -> amdgpu_mn_lock(p->mn)
>
> HMM simplifies the concurrent CPU page table update check, so remove the
> guptasklock, mmu_invalidations and last_set_pages fields from the
> amdgpu_ttm_tt struct.
>
> HMM does not pin the page (increase page ref count), so remove related
> operations like release_pages(), put_page(), mark_page_dirty().
>
> v2:
> * Remove nested locking between mmap_sem and bo::reserve
> * Change locking order to bo::reserve -> amdgpu_mn_lock()
> * Use dynamic allocation to replace VLA in kernel stack
>
> Change-Id: Iffd5f855cc9ce402cdfca167f68f83fe39ac56f9
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 101 ++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c   |   2 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h   |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 188 +++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  14 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c        |  34 +++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h        |   7 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 164 ++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h       |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |   1 -
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  67 ++++---
>  11 files changed, 312 insertions(+), 272 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index f3129b912714..5ce6ba24fc72 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -614,8 +614,7 @@ static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
>  	amdgpu_bo_unreserve(bo);
>  
>  release_out:
> -	if (ret)
> -		release_pages(mem->user_pages, bo->tbo.ttm->num_pages);
> +	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>  free_out:
>  	kvfree(mem->user_pages);
>  	mem->user_pages = NULL;
> @@ -677,7 +676,6 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
>  	ctx->kfd_bo.priority = 0;
>  	ctx->kfd_bo.tv.bo = &bo->tbo;
>  	ctx->kfd_bo.tv.shared = true;
> -	ctx->kfd_bo.user_pages = NULL;
>  	list_add(&ctx->kfd_bo.tv.head, &ctx->list);
>  
>  	amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
> @@ -741,7 +739,6 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
>  	ctx->kfd_bo.priority = 0;
>  	ctx->kfd_bo.tv.bo = &bo->tbo;
>  	ctx->kfd_bo.tv.shared = true;
> -	ctx->kfd_bo.user_pages = NULL;
>  	list_add(&ctx->kfd_bo.tv.head, &ctx->list);
>  
>  	i = 0;
> @@ -1332,9 +1329,6 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>  	/* Free user pages if necessary */
>  	if (mem->user_pages) {
>  		pr_debug("%s: Freeing user_pages array\n", __func__);
> -		if (mem->user_pages[0])
> -			release_pages(mem->user_pages,
> -					mem->bo->tbo.ttm->num_pages);
>  		kvfree(mem->user_pages);
>  	}
>  
> @@ -1761,8 +1755,6 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
>  				       __func__);
>  				return -ENOMEM;
>  			}
> -		} else if (mem->user_pages[0]) {
> -			release_pages(mem->user_pages, bo->tbo.ttm->num_pages);
>  		}
>  
>  		/* Get updated user pages */
> @@ -1778,12 +1770,6 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
>  			 * stalled user mode queues.
>  			 */
>  		}
> -
> -		/* Mark the BO as valid unless it was invalidated
> -		 * again concurrently
> -		 */
> -		if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid)
> -			return -EAGAIN;
>  	}
>  
>  	return 0;
> @@ -1876,14 +1862,10 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
>  		}
>  
>  		/* Validate succeeded, now the BO owns the pages, free
> -		 * our copy of the pointer array. Put this BO back on
> -		 * the userptr_valid_list. If we need to revalidate
> -		 * it, we need to start from scratch.
> +		 * our copy of the pointer array.
>  		 */
>  		kvfree(mem->user_pages);
>  		mem->user_pages = NULL;
> -		list_move_tail(&mem->validate_list.head,
> -			       &process_info->userptr_valid_list);
>  
>  		/* Update mapping. If the BO was not validated
>  		 * (because we couldn't get user pages), this will
> @@ -1924,6 +1906,70 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
>  	return ret;
>  }
>  
> +/* user_pages_invalidated - if CPU page table is updated after getting user
> + * pages
> + *
> + * HMM mirror callback lock amn is hold to prevent the concurrent CPU
> + * page table update while resuming user queues.
> + *
> + * Returns: true if CPU page table is updated, false otherwise
> + */
> +static int user_pages_invalidated(struct mm_struct *mm,
> +			struct amdkfd_process_info *process_info,
> +			struct amdgpu_mn **amn)
> +{
> +	struct kgd_mem *mem, *tmp_mem;
> +	struct amdgpu_bo *bo;
> +	struct amdgpu_device *adev;
> +	int invalid, r = 0;
> +
> +	list_for_each_entry_safe(mem, tmp_mem,
> +				 &process_info->userptr_inval_list,
> +				 validate_list.head) {
> +
> +		invalid = atomic_read(&mem->invalid);
> +		if (!invalid)
> +			/* BO hasn't been invalidated since the last
> +			 * revalidation attempt. Keep its BO list.
> +			 */
> +			continue;
> +
> +		bo = mem->bo;
> +
> +		/* Get HMM mirror callback lock */
> +		if (!*amn) {
> +			adev = amdgpu_ttm_adev(bo->tbo.bdev);
> +			*amn = amdgpu_mn_get(mm, adev, AMDGPU_MN_TYPE_HSA);
> +			if (IS_ERR(*amn)) {
> +				r = true;
> +				*amn = NULL;
> +				goto out;
> +			}
> +
> +			amdgpu_mn_lock(*amn);

There can be multiple GPUs involved here. So you'd need more than one
amn lock. I think you're trying to use the amn lock to serialize page
table updates. But that's probably not the right lock. What we should be
using for this is the page directory reservation locks. We're already
taking them for all affected GPUs in validate_invalid_user_pages. And
that's where you should also be calling amdgpu_ttm_tt_get_user_pages_done.


> +		}
> +
> +		r |= amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);

This needs to be called while holding the "driver lock that serializes
device page table updates". I think in our case that's the page
directory reservation. You're not holding that here.


> +
> +		/* Put this BO back on the userptr_valid_list. If we need to
> +		 * revalidate it, we need to start from scratch.
> +		 */
> +		list_move_tail(&mem->validate_list.head,
> +			       &process_info->userptr_valid_list);

Doing this unconditionally here looks wrong. Also, this was already done
in validate_invalid_user_pages. So I'm wondering what's the point of
this function. In most cases the process_info->userptr_inval_list will
be empty here to begin with.

I think you should probably call amdgpu_ttm_tt_get_user_pages_done in
validate_invalid_user_pages while holding the PD reservation lock and
where you call amdgpu_ttm_tt_set_user_pages.
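
Roughly something like this, as an illustrative fragment only (variable
and label names are placeholders, not an exact drop-in for
validate_invalid_user_pages):

        amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages);

        /* The PD reservation is held here; that is the lock that
         * serializes our GPU page table updates, so this is the safe
         * place to stop HMM tracking and check for concurrent
         * invalidation.
         */
        if (amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm)) {
                ret = -EAGAIN;  /* force another restore pass */
                goto unreserve_out;
        }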


> +
> +		/* Mark the BO as valid unless it was invalidated
> +		 * again concurrently
> +		 */
> +		if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) {
> +			r = true;
> +			goto out;
> +		}
> +	}
> +
> +out:
> +	return r;
> +}
> +
>  /* Worker callback to restore evicted userptr BOs
>   *
>   * Tries to update and validate all userptr BOs. If successful and no
> @@ -1939,6 +1985,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>  	struct task_struct *usertask;
>  	struct mm_struct *mm;
>  	int evicted_bos;
> +	struct amdgpu_mn *amn = NULL;
>  
>  	evicted_bos = atomic_read(&process_info->evicted_bos);
>  	if (!evicted_bos)
> @@ -1977,13 +2024,27 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>  	if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
>  	    evicted_bos)
>  		goto unlock_out;
> +
> +	/* If CPU page table is updated again after getting user pages,
> +	 * schedule to restart the restore process again.
> +	 *
> +	 * amn is also locked to prevent CPU page table update while resuming
> +	 * user queues. amn is unlocked after user queues are resumed.
> +	 */
> +	if (user_pages_invalidated(mm, process_info, &amn))
> +		goto unlock_mn_out;
> +
>  	evicted_bos = 0;
> +
>  	if (kgd2kfd->resume_mm(mm)) {
>  		pr_err("%s: Failed to resume KFD\n", __func__);
>  		/* No recovery from this failure. Probably the CP is
>  		 * hanging. No point trying again.
>  		 */
>  	}
> +
> +unlock_mn_out:
> +	amdgpu_mn_unlock(amn);
>  unlock_out:
>  	mutex_unlock(&process_info->lock);
>  	mmput(mm);

[snip]

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
> index 56595b3d90d2..6b6becdb725b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
> @@ -229,8 +229,6 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
>  			true, false, MAX_SCHEDULE_TIMEOUT);
>  		if (r <= 0)
>  			DRM_ERROR("(%ld) failed to wait for user bo\n", r);
> -
> -		amdgpu_ttm_tt_mark_user_pages(bo->tbo.ttm);
>  	}
>  }
>  
> @@ -355,15 +353,16 @@ static struct hmm_mirror_ops amdgpu_hmm_mirror_ops[] = {
>  /**
>   * amdgpu_mn_get - create HMM mirror context
>   *
> + * @mm: the mm struct
>   * @adev: amdgpu device pointer
>   * @type: type of MMU notifier context
>   *
> - * Creates a HMM mirror context for current->mm.
> + * Creates a HMM mirror context for mm.
>   */
> -struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
> +struct amdgpu_mn *amdgpu_mn_get(struct mm_struct *mm,
> +				struct amdgpu_device *adev,
>  				enum amdgpu_mn_type type)
>  {
> -	struct mm_struct *mm = current->mm;
>  	struct amdgpu_mn *amn;
>  	unsigned long key = AMDGPU_MN_KEY(mm, type);
>  	int r;
> @@ -433,7 +432,7 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
>  	struct list_head bos;
>  	struct interval_tree_node *it;
>  
> -	amn = amdgpu_mn_get(adev, type);
> +	amn = amdgpu_mn_get(current->mm, adev, type);
>  	if (IS_ERR(amn))
>  		return PTR_ERR(amn);
>  
> @@ -515,3 +514,26 @@ void amdgpu_mn_unregister(struct amdgpu_bo *bo)
>  	mutex_unlock(&adev->mn_lock);
>  }
>  
> +/* flags used by HMM internal, not related to CPU/GPU PTE flags */
> +static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
> +		(1 << 0), /* HMM_PFN_VALID */
> +		(1 << 1), /* HMM_PFN_WRITE */
> +		0 /* HMM_PFN_DEVICE_PRIVATE */
> +};
> +
> +static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
> +		0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
> +		0, /* HMM_PFN_NONE */
> +		0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
> +};
> +
> +void amdgpu_hmm_init_range(struct hmm_range *range)
> +{
> +	if (range) {
> +		range->flags = hmm_range_flags;
> +		range->values = hmm_range_values;
> +		range->pfn_shift = PAGE_SHIFT;
> +		range->pfns = NULL;
> +		INIT_LIST_HEAD(&range->list);
> +	}
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
> index 0e2752646e6f..59ea30e149bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
> @@ -25,9 +25,10 @@
>  #define __AMDGPU_MN_H__
>  
>  /*
> - * MMU Notifier
> + * HMM mirror
>   */
>  struct amdgpu_mn;
> +struct hmm_range;
>  
>  enum amdgpu_mn_type {
>  	AMDGPU_MN_TYPE_GFX,
> @@ -37,10 +38,12 @@ enum amdgpu_mn_type {
>  #if defined(CONFIG_HMM)
>  void amdgpu_mn_lock(struct amdgpu_mn *mn);
>  void amdgpu_mn_unlock(struct amdgpu_mn *mn);
> -struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
> +struct amdgpu_mn *amdgpu_mn_get(struct mm_struct *mm,
> +				struct amdgpu_device *adev,
>  				enum amdgpu_mn_type type);
>  int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr);
>  void amdgpu_mn_unregister(struct amdgpu_bo *bo);
> +void amdgpu_hmm_init_range(struct hmm_range *range);
>  #else
>  static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {}
>  static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index c91ec3101d00..4a1cb4d15dfb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -43,6 +43,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/debugfs.h>
>  #include <linux/iommu.h>
> +#include <linux/hmm.h>
>  #include "amdgpu.h"
>  #include "amdgpu_object.h"
>  #include "amdgpu_trace.h"
> @@ -705,11 +706,6 @@ static unsigned long amdgpu_ttm_io_mem_pfn(struct ttm_buffer_object *bo,
>  /*
>   * TTM backend functions.
>   */
> -struct amdgpu_ttm_gup_task_list {
> -	struct list_head	list;
> -	struct task_struct	*task;
> -};
> -
>  struct amdgpu_ttm_tt {
>  	struct ttm_dma_tt	ttm;
>  	u64			offset;
> @@ -718,85 +714,91 @@ struct amdgpu_ttm_tt {
>  	uint32_t		userflags;
>  	spinlock_t              guptasklock;
>  	struct list_head        guptasks;
> -	atomic_t		mmu_invalidations;
> -	uint32_t		last_set_pages;
> +	struct hmm_range	range;
>  };
>  
>  /**
> - * amdgpu_ttm_tt_get_user_pages - Pin pages of memory pointed to by a USERPTR
> - * pointer to memory
> + * amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
> + * memory and start HMM tracking CPU page table update
>   *
> - * Called by amdgpu_gem_userptr_ioctl() and amdgpu_cs_parser_bos().
> - * This provides a wrapper around the get_user_pages() call to provide
> - * device accessible pages that back user memory.
> + * Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
> + * once afterwards to stop HMM tracking
>   */
>  int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
>  {
>  	struct amdgpu_ttm_tt *gtt = (void *)ttm;
>  	struct mm_struct *mm = gtt->usertask->mm;
> -	unsigned int flags = 0;
> -	unsigned pinned = 0;
> -	int r;
> +	unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
> +	struct hmm_range *range = &gtt->range;
> +	int r, i;
>  
>  	if (!mm) /* Happens during process shutdown */
>  		return -ESRCH;
>  
> -	if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY))
> -		flags |= FOLL_WRITE;
> +	amdgpu_hmm_init_range(range);
>  
>  	down_read(&mm->mmap_sem);
>  
> -	if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) {
> -		/*
> -		 * check that we only use anonymous memory to prevent problems
> -		 * with writeback
> -		 */
> -		unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
> -		struct vm_area_struct *vma;
> +	range->vma = find_vma(mm, gtt->userptr);
> +	if (!range->vma || range->vma->vm_file || range->vma->vm_end < end) {

I think this breaks file-backed mappings and also adds a new limitation
that the address range has to be a single VMA. Previously this was only
required if gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY was set.
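
To keep the old behaviour the check could stay behind the flag, roughly
like this (illustrative only; it still leaves open whether
hmm_vma_fault() can handle a range spanning multiple VMAs):

        range->vma = find_vma(mm, gtt->userptr);
        if (!range->vma) {
                r = -EFAULT;
                goto out;
        }

        /* Only enforce anonymous, single-VMA mappings when the user
         * asked for it, like the old get_user_pages path did.
         */
        if ((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
            (range->vma->vm_file || range->vma->vm_end < end)) {
                r = -EPERM;
                goto out;
        }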


> +		r = -EPERM;
> +		goto out;
> +	}
> +	range->pfns = kvmalloc_array(ttm->num_pages, sizeof(uint64_t),
> +				GFP_KERNEL | __GFP_ZERO);

Is __GFP_ZERO really needed here? The loop below initializes all pfns
explicitly.
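
If not, this should be enough, since every entry is written below anyway:

        range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
                                     GFP_KERNEL);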


> +	if (range->pfns == NULL) {
> +		r = -ENOMEM;
> +		goto out;
> +	}
> +	range->start = gtt->userptr;
> +	range->end = end;
>  
> -		vma = find_vma(mm, gtt->userptr);
> -		if (!vma || vma->vm_file || vma->vm_end < end) {
> -			up_read(&mm->mmap_sem);
> -			return -EPERM;
> -		}
> +	for (i = 0; i < ttm->num_pages; i++) {
> +		range->pfns[i] = range->flags[HMM_PFN_VALID];
> +		range->pfns[i] |= amdgpu_ttm_tt_is_readonly(ttm) ?
> +					0 : range->flags[HMM_PFN_WRITE];
>  	}
>  
> -	/* loop enough times using contiguous pages of memory */
> -	do {
> -		unsigned num_pages = ttm->num_pages - pinned;
> -		uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE;
> -		struct page **p = pages + pinned;
> -		struct amdgpu_ttm_gup_task_list guptask;
> +	/* This may triggles page table update */

Do you mean "trigger"?


> +	r = hmm_vma_fault(range, true);
> +	if (r)
> +		goto out_free_pfns;
>  
> -		guptask.task = current;
> -		spin_lock(&gtt->guptasklock);
> -		list_add(&guptask.list, &gtt->guptasks);
> -		spin_unlock(&gtt->guptasklock);
> +	for (i = 0; i < ttm->num_pages; i++)
> +		pages[i] = hmm_pfn_to_page(range, range->pfns[i]);
>  
> -		if (mm == current->mm)
> -			r = get_user_pages(userptr, num_pages, flags, p, NULL);
> -		else
> -			r = get_user_pages_remote(gtt->usertask,
> -					mm, userptr, num_pages,
> -					flags, p, NULL, NULL);
> +	up_read(&mm->mmap_sem);

You can probably up_read before the loop.

Also, why do you defer freeing of range->pfns to
amdgpu_ttm_tt_get_user_pages_done? The array isn't used for anything
else later, so you could free it here.

Or instead, don't free it and use it to replace the pages array. It
seems wasteful to maintain two pages arrays and constantly allocate and
deallocate one of them.
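
Something along these lines as a rough sketch of the tail of the
function (this assumes hmm_vma_range_done() and the invalidate callback
don't look at range->pfns; please double-check that against the HMM
code):

        r = hmm_vma_fault(range, true);

        /* Only the fault itself needs mmap_sem */
        up_read(&mm->mmap_sem);
        if (r)
                goto out_free_pfns;

        for (i = 0; i < ttm->num_pages; i++)
                pages[i] = hmm_pfn_to_page(range, range->pfns[i]);

out_free_pfns:
        /* Or keep the pfn array around and drop "pages" entirely */
        kvfree(range->pfns);
        range->pfns = NULL;
        return r;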


> +	return 0;
>  
> -		spin_lock(&gtt->guptasklock);
> -		list_del(&guptask.list);
> -		spin_unlock(&gtt->guptasklock);
> +out_free_pfns:
> +	kvfree(range->pfns);
> +	range->pfns = NULL;

Setting range->pfns = NULL is not needed. This is a local variable
that's about to go out of scope anyway.


> +out:
> +	up_read(&mm->mmap_sem);
> +	return r;
> +}
>  
> -		if (r < 0)
> -			goto release_pages;
> +/**
> + * amdgpu_ttm_tt_userptr_range_done - stop HMM track the CPU page table change
> + * Check if the pages backing this ttm range have been invalidated
> + *
> + * Returns: true if pages are invalidated since the last time they've been set
> + */
> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
> +{
> +	struct amdgpu_ttm_tt *gtt = (void *)ttm;
> +	int r;
>  
> -		pinned += r;
> +	if (gtt == NULL || !gtt->userptr)
> +		return false;
>  
> -	} while (pinned < ttm->num_pages);
> +	r = !hmm_vma_range_done(&gtt->range);
>  
> -	up_read(&mm->mmap_sem);
> -	return 0;
> +	if (gtt->range.pfns) {
> +		kvfree(gtt->range.pfns);
> +		gtt->range.pfns = NULL;
> +	}
>  
> -release_pages:
> -	release_pages(pages, pinned);
> -	up_read(&mm->mmap_sem);
>  	return r;
>  }
>  
> @@ -809,16 +811,10 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
>   */
>  void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct page **pages)

Does this function still make sense? It just copies a pages array. Could
it instead use gtt->range->pfns directly? And why does this still have
to be separate from amdgpu_ttm_tt_get_user_pages? I think originally it
was separate because we needed to keep track of the original pages for
unpinning them. But we're no longer pinning and unpinning pages.


>  {
> -	struct amdgpu_ttm_tt *gtt = (void *)ttm;
>  	unsigned i;
>  
> -	gtt->last_set_pages = atomic_read(&gtt->mmu_invalidations);
> -	for (i = 0; i < ttm->num_pages; ++i) {
> -		if (ttm->pages[i])
> -			put_page(ttm->pages[i]);
> -
> +	for (i = 0; i < ttm->num_pages; ++i)
>  		ttm->pages[i] = pages ? pages[i] : NULL;

If this is still needed, it could just be a memcpy now.
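
If it does stay, it boils down to something like this (keeping the
pages == NULL case for clearing the array):

void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct page **pages)
{
        if (pages)
                memcpy(ttm->pages, pages,
                       ttm->num_pages * sizeof(*ttm->pages));
        else
                memset(ttm->pages, 0,
                       ttm->num_pages * sizeof(*ttm->pages));
}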


> -	}
>  }
>  
>  /**
> @@ -903,9 +899,6 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
>  	/* unmap the pages mapped to the device */
>  	dma_unmap_sg(adev->dev, ttm->sg->sgl, ttm->sg->nents, direction);
>  
> -	/* mark the pages as dirty */
> -	amdgpu_ttm_tt_mark_user_pages(ttm);
> -
>  	sg_free_table(ttm->sg);
>  }
>  
> @@ -1258,8 +1251,6 @@ int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
>  
>  	spin_lock_init(&gtt->guptasklock);
>  	INIT_LIST_HEAD(&gtt->guptasks);
> -	atomic_set(&gtt->mmu_invalidations, 0);
> -	gtt->last_set_pages = 0;
>  
>  	return 0;
>  }
> @@ -1289,7 +1280,6 @@ bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
>  				  unsigned long end)
>  {
>  	struct amdgpu_ttm_tt *gtt = (void *)ttm;
> -	struct amdgpu_ttm_gup_task_list *entry;
>  	unsigned long size;
>  
>  	if (gtt == NULL || !gtt->userptr)
> @@ -1302,48 +1292,20 @@ bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
>  	if (gtt->userptr > end || gtt->userptr + size <= start)
>  		return false;
>  
> -	/* Search the lists of tasks that hold this mapping and see
> -	 * if current is one of them.  If it is return false.
> -	 */
> -	spin_lock(&gtt->guptasklock);
> -	list_for_each_entry(entry, &gtt->guptasks, list) {
> -		if (entry->task == current) {
> -			spin_unlock(&gtt->guptasklock);
> -			return false;
> -		}
> -	}
> -	spin_unlock(&gtt->guptasklock);
> -
> -	atomic_inc(&gtt->mmu_invalidations);
> -
>  	return true;
>  }
>  
>  /**
> - * amdgpu_ttm_tt_userptr_invalidated - Has the ttm_tt object been invalidated?
> + * amdgpu_ttm_tt_is_userptr - Have the pages backing by userptr?
>   */
> -bool amdgpu_ttm_tt_userptr_invalidated(struct ttm_tt *ttm,
> -				       int *last_invalidated)
> -{
> -	struct amdgpu_ttm_tt *gtt = (void *)ttm;
> -	int prev_invalidated = *last_invalidated;
> -
> -	*last_invalidated = atomic_read(&gtt->mmu_invalidations);
> -	return prev_invalidated != *last_invalidated;
> -}
> -
> -/**
> - * amdgpu_ttm_tt_userptr_needs_pages - Have the pages backing this ttm_tt object
> - * been invalidated since the last time they've been set?
> - */
> -bool amdgpu_ttm_tt_userptr_needs_pages(struct ttm_tt *ttm)
> +bool amdgpu_ttm_tt_is_userptr(struct ttm_tt *ttm)
>  {
>  	struct amdgpu_ttm_tt *gtt = (void *)ttm;
>  
>  	if (gtt == NULL || !gtt->userptr)
>  		return false;
>  
> -	return atomic_read(&gtt->mmu_invalidations) != gtt->last_set_pages;
> +	return true;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index b5b2d101f7db..8988c87fff9d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -102,6 +102,7 @@ int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
>  int amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
>  
>  int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
> +bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
>  void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct page **pages);
>  void amdgpu_ttm_tt_mark_user_pages(struct ttm_tt *ttm);
>  int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
> @@ -112,7 +113,7 @@ bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
>  				  unsigned long end);
>  bool amdgpu_ttm_tt_userptr_invalidated(struct ttm_tt *ttm,
>  				       int *last_invalidated);
> -bool amdgpu_ttm_tt_userptr_needs_pages(struct ttm_tt *ttm);
> +bool amdgpu_ttm_tt_is_userptr(struct ttm_tt *ttm);
>  bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm);
>  uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_mem_reg *mem);
>  uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 2acb9838913e..4a67a88acea1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -618,7 +618,6 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
>  	entry->priority = 0;
>  	entry->tv.bo = &vm->root.base.bo->tbo;
>  	entry->tv.shared = true;
> -	entry->user_pages = NULL;
>  	list_add(&entry->tv.head, validated);
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 8372556b52eb..fe120cc0930c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Please make everything below a separate commit.

Regards,
  Felix


> @@ -1158,6 +1158,33 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>  
>  	retval = 0;
>  
> +	/* Do init_mqd before dqm_lock(dqm) to avoid circular locking order:
> +	 * lock(dqm) -> bo::reserve
> +	 */
> +	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
> +			get_mqd_type_from_queue_type(q->properties.type));
> +
> +	if (!mqd_mgr) {
> +		retval = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Eviction state logic: we only mark active queues as evicted
> +	 * to avoid the overhead of restoring inactive queues later
> +	 */
> +	if (qpd->evicted)
> +		q->properties.is_evicted = (q->properties.queue_size > 0 &&
> +					    q->properties.queue_percent > 0 &&
> +					    q->properties.queue_address != 0);
> +	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> +	q->properties.tba_addr = qpd->tba_addr;
> +	q->properties.tma_addr = qpd->tma_addr;
> +	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> +				&q->gart_mqd_addr, &q->properties);
> +	if (retval)
> +		goto out;
> +
>  	dqm_lock(dqm);
>  
>  	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
> @@ -1181,30 +1208,6 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>  	if (retval)
>  		goto out_deallocate_sdma_queue;
>  
> -	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
> -			get_mqd_type_from_queue_type(q->properties.type));
> -
> -	if (!mqd_mgr) {
> -		retval = -ENOMEM;
> -		goto out_deallocate_doorbell;
> -	}
> -	/*
> -	 * Eviction state logic: we only mark active queues as evicted
> -	 * to avoid the overhead of restoring inactive queues later
> -	 */
> -	if (qpd->evicted)
> -		q->properties.is_evicted = (q->properties.queue_size > 0 &&
> -					    q->properties.queue_percent > 0 &&
> -					    q->properties.queue_address != 0);
> -
> -	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> -
> -	q->properties.tba_addr = qpd->tba_addr;
> -	q->properties.tma_addr = qpd->tma_addr;
> -	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> -				&q->gart_mqd_addr, &q->properties);
> -	if (retval)
> -		goto out_deallocate_doorbell;
>  
>  	list_add(&q->list, &qpd->queues_list);
>  	qpd->queue_count++;
> @@ -1228,14 +1231,12 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>  	dqm_unlock(dqm);
>  	return retval;
>  
> -out_deallocate_doorbell:
> -	deallocate_doorbell(qpd, q);
>  out_deallocate_sdma_queue:
>  	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
>  		deallocate_sdma_queue(dqm, q->sdma_id);
>  out_unlock:
>  	dqm_unlock(dqm);
> -
> +out:
>  	return retval;
>  }
>  
> @@ -1398,8 +1399,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  			qpd->reset_wavefronts = true;
>  	}
>  
> -	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -
>  	/*
>  	 * Unconditionally decrement this counter, regardless of the queue's
>  	 * type
> @@ -1410,6 +1409,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  
>  	dqm_unlock(dqm);
>  
> +	/* Do uninit_mqd after dqm_unlock(dqm) to avoid circular locking */
> +	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +
>  	return retval;
>  
>  failed:
> @@ -1631,7 +1633,11 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>  		qpd->reset_wavefronts = false;
>  	}
>  
> -	/* lastly, free mqd resources */
> +	dqm_unlock(dqm);
> +
> +	/* Lastly, free mqd resources.
> +	 * Do uninit_mqd() after dqm_unlock to avoid circular locking.
> +	 */
>  	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
>  		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
>  			get_mqd_type_from_queue_type(q->properties.type));
> @@ -1645,7 +1651,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>  	}
>  
>  out:
> -	dqm_unlock(dqm);
>  	return retval;
>  }
>  

