[PATCH 2/2] drm/amdkfd: handle stale retry fault
Felix Kuehling
felix.kuehling at amd.com
Mon Apr 19 18:52:24 UTC 2021
Am 2021-04-18 um 1:35 p.m. schrieb Philip Yang:
> Retry fault interrupt maybe pending in IH ring after GPU page table is
> updated to recover the vm fault, because each page of the range generate
> retry fault interrupt. There is race if application unmap range to
> remove and free the range first and then retry fault work restore_pages
> handle the retry fault interrupt, because range can not be found, this
> vm fault can not be recovered and report incorrect GPU vm fault to
> application.
>
> Before unmap to remove and free range, drain retry fault interrupt from
> IH ring1 to ensure no retry fault comes after the range is removed.
>
> Drain retry fault interrupt skip the range which is on deferred list to
> remove, or the range is child range, which is split by unmap, does not
> add to svms and have interval notifier.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
The series looks good to me. But the skip-recover changes and the
-EAGAIN handling in svm_range_restore_pages should be a separate patch.
Also, when we defer processing an interrupt (skip-recover or r ==
-EAGAIN) and wait for a subsequent retry interrupt, we may want to
remove that fault address from the gmc->fault_ring that's used by
amdgpu_gmc_filter_faults to filter out repeated page faults on the same
address. In the future we would also have to remove those addresses from
the IH CAM.
Regards,
Felix
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 75 +++++++++++++++++++++++++++-
> 1 file changed, 74 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 0e0b4ffd20ab..45dd055118eb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1402,11 +1402,13 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
> svm_range_lock(prange);
> if (!prange->actual_loc) {
> if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
> + pr_debug("hmm update the range, need validate again\n");
> r = -EAGAIN;
> goto unlock_out;
> }
> }
> if (!list_empty(&prange->child_list)) {
> + pr_debug("range split by unmap in parallel, validate again\n");
> r = -EAGAIN;
> goto unlock_out;
> }
> @@ -1828,6 +1830,28 @@ svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
> }
> }
>
> +static void svm_range_drain_retry_fault(struct svm_range_list *svms)
> +{
> + struct kfd_process_device *pdd;
> + struct amdgpu_device *adev;
> + struct kfd_process *p;
> + uint32_t i;
> +
> + p = container_of(svms, struct kfd_process, svms);
> +
> + for (i = 0; i < p->n_pdds; i++) {
> + pdd = p->pdds[i];
> + if (!pdd)
> + continue;
> +
> + pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
> + adev = (struct amdgpu_device *)pdd->dev->kgd;
> +
> + amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);
> + pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
> + }
> +}
> +
> static void svm_range_deferred_list_work(struct work_struct *work)
> {
> struct svm_range_list *svms;
> @@ -1845,6 +1869,10 @@ static void svm_range_deferred_list_work(struct work_struct *work)
> pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
> prange->start, prange->last, prange->work_item.op);
>
> + /* Make sure no stale retry fault coming after range is freed */
> + if (prange->work_item.op == SVM_OP_UNMAP_RANGE)
> + svm_range_drain_retry_fault(prange->svms);
> +
> mm = prange->work_item.mm;
> mmap_write_lock(mm);
> mutex_lock(&svms->lock);
> @@ -2152,6 +2180,44 @@ svm_range_best_restore_location(struct svm_range *prange,
> return -1;
> }
>
> +/* svm_range_skip_recover - decide if prange can be recovered
> + * @prange: svm range structure
> + *
> + * GPU vm retry fault handle skip recover the range for cases:
> + * 1. prange is on deferred list to be removed after unmap, it is stale fault,
> + * deferred list work will drain the stale fault before free the prange.
> + * 2. prange is on deferred list to add interval notifier after split, or
> + * 3. prange is child range, it is split from parent prange, recover later
> + * after interval notifier is added.
> + *
> + * Return: true to skip recover, false to recover
> + */
> +static bool svm_range_skip_recover(struct svm_range *prange)
> +{
> + struct svm_range_list *svms = prange->svms;
> +
> + spin_lock(&svms->deferred_list_lock);
> + if (list_empty(&prange->deferred_list) &&
> + list_empty(&prange->child_list)) {
> + spin_unlock(&svms->deferred_list_lock);
> + return false;
> + }
> + spin_unlock(&svms->deferred_list_lock);
> +
> + if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
> + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
> + svms, prange, prange->start, prange->last);
> + return true;
> + }
> + if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
> + prange->work_item.op == SVM_OP_ADD_RANGE) {
> + pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
> + svms, prange, prange->start, prange->last);
> + return true;
> + }
> + return false;
> +}
> +
> int
> svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> uint64_t addr)
> @@ -2187,7 +2253,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> mmap_read_lock(mm);
> mutex_lock(&svms->lock);
> prange = svm_range_from_addr(svms, addr, NULL);
> -
> if (!prange) {
> pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
> svms, addr);
> @@ -2196,6 +2261,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> }
>
> mutex_lock(&prange->migrate_mutex);
> +
> + if (svm_range_skip_recover(prange))
> + goto out_unlock_range;
> +
> timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
> /* skip duplicate vm fault on different pages of same range */
> if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
> @@ -2254,6 +2323,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> out:
> kfd_unref_process(p);
>
> + if (r == -EAGAIN) {
> + pr_debug("recover vm fault later\n");
> + r = 0;
> + }
> return r;
> }
>
More information about the amd-gfx
mailing list