[PATCH v2] drm/amdkfd: handle errors from svm validate and map

Chen, Xiaogang xiaogang.chen at amd.com
Fri Sep 15 21:06:07 UTC 2023


On 9/15/2023 8:28 AM, Philip Yang wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> If new range is splited to multiple pranges with max_svm_range_pages
> alignment and added to update_list, svm validate and map should keep
> going after error to make sure maps_to_gpu flag is up to date for the
> whole range.
>
> svm validate and map update set prange->mapped_to_gpu after mapping to
> GPUs successfully, otherwise clear prange->mapped_to_gpu flag (for
> update mapping case) instead of setting error flag, we can remove
> the redundant error flag to simpliy code.
>
> Update prange->mapped_to_gpu flag inside svm_range_lock, to guarant we
> always set prange invalid flag to evict queues or unmap from GPUs before
> the system memory is moved.
>
> After svm validate and map return error, the caller retry will update
> the mapping for the whole range.
>
> Fixes: c22b04407097 ("drm/amdkfd: flag added to handle errors from svm validate and map")
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 18 ++++++++----------
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 -
>   2 files changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 5d7ba7dbf6ce..26018b1d6138 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -818,7 +818,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
>                  }
>          }
>
> -       return !prange->is_error_flag;
> +       return true;
>   }
>
>   /**
> @@ -1724,20 +1724,17 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>                                            ctx->bitmap, wait, flush_tlb);
>
>   unlock_out:
> +               prange->mapped_to_gpu = !r;

I do not understand why set prange->mapped_to_gpu here? It handles one 
vma, not for all prange. If there are multiple vma and first vma handle 
is ok, and second vma handle fail at amdgpu_hmm_range_get_pages or 
svm_range_dma_map, you would get prange->mapped_to_gpu be true, but only 
part of pragne got mapped, right?


Regards

Xiaogang

>                  svm_range_unlock(prange);
>
>                  addr = next;
>          }
>
> -       if (addr == end) {
> +       if (addr == end)
>                  prange->validated_once = true;
> -               prange->mapped_to_gpu = true;
> -       }
>
>   unreserve_out:
>          svm_range_unreserve_bos(ctx);
> -
> -       prange->is_error_flag = !!r;
>          if (!r)
>                  prange->validate_timestamp = ktime_get_boottime();
>
> @@ -2106,7 +2103,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
>                  next = interval_tree_iter_next(node, start, last);
>                  next_start = min(node->last, last) + 1;
>
> -               if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
> +               if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
> +                   prange->mapped_to_gpu) {
>                          /* nothing to do */
>                  } else if (node->start < start || node->last > last) {
>                          /* node intersects the update range and its attributes
> @@ -3519,7 +3517,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>          struct svm_range *next;
>          bool update_mapping = false;
>          bool flush_tlb;
> -       int r = 0;
> +       int r, ret = 0;
>
>          pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
>                   p->pasid, &p->svms, start, start + size - 1, size);
> @@ -3607,7 +3605,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>   out_unlock_range:
>                  mutex_unlock(&prange->migrate_mutex);
>                  if (r)
> -                       break;
> +                       ret = r;
>          }
>
>          dynamic_svm_range_dump(svms);
> @@ -3620,7 +3618,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>          pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
>                   &p->svms, start, start + size - 1, r);
>
> -       return r;
> +       return ret ? ret : r;
>   }
>
>   static int
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 9e668eeefb32..91715bf3233c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -134,7 +134,6 @@ struct svm_range {
>          DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
>          bool                            validated_once;
>          bool                            mapped_to_gpu;
> -       bool                            is_error_flag;
>   };
>
>   static inline void svm_range_lock(struct svm_range *prange)
> --
> 2.35.1
>


More information about the amd-gfx mailing list