[PATCH v3] drm/amdkfd: Handle errors from svm validate and map
Felix Kuehling
felix.kuehling at amd.com
Tue Sep 19 21:15:37 UTC 2023
On 2023-09-19 10:21, Philip Yang wrote:
> If a new range is split into multiple pranges with max_svm_range_pages
> alignment and added to update_list, svm validate and map should keep
> going after an error to make sure the prange->mapped_to_gpu flag is up
> to date for the whole range.
>
> svm validate and map now sets prange->mapped_to_gpu after mapping to
> GPUs successfully, otherwise it clears the prange->mapped_to_gpu flag
> (for the update mapping case) instead of setting an error flag, so we
> can remove the redundant error flag to simplify the code.
>
> Refactor to remove the goto and update the prange->mapped_to_gpu flag
> inside svm_range_lock, to guarantee we always evict queues or unmap
> from GPUs if there are invalid ranges.
>
> After svm validate and map returns -EAGAIN, the caller's retry will
> update the mapping for the whole range again.
>
> Fixes: c22b04407097 ("drm/amdkfd: flag added to handle errors from svm validate and map")
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 78 +++++++++++++---------------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 -
> 2 files changed, 36 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 50c29fd844fb..4812f4ac5579 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -818,7 +818,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
> }
> }
>
> - return !prange->is_error_flag;
> + return true;
> }
>
> /**
> @@ -1671,7 +1671,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>
> start = prange->start << PAGE_SHIFT;
> end = (prange->last + 1) << PAGE_SHIFT;
> - for (addr = start; addr < end && !r; ) {
> + for (addr = start; !r && addr < end; ) {
> struct hmm_range *hmm_range;
> struct vm_area_struct *vma;
> unsigned long next;
> @@ -1680,62 +1680,55 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
> bool readonly;
>
> vma = vma_lookup(mm, addr);
> - if (!vma) {
> + if (vma) {
> + readonly = !(vma->vm_flags & VM_WRITE);
> +
> + next = min(vma->vm_end, end);
> + npages = (next - addr) >> PAGE_SHIFT;
> + WRITE_ONCE(p->svms.faulting_task, current);
> + r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
> + readonly, owner, NULL,
> + &hmm_range);
> + WRITE_ONCE(p->svms.faulting_task, NULL);
> + if (r) {
> + pr_debug("failed %d to get svm range pages\n", r);
> + if (r == -EBUSY)
> + r = -EAGAIN;
> + }
> + } else {
> r = -EFAULT;
> - goto unreserve_out;
> - }
> - readonly = !(vma->vm_flags & VM_WRITE);
> -
> - next = min(vma->vm_end, end);
> - npages = (next - addr) >> PAGE_SHIFT;
> - WRITE_ONCE(p->svms.faulting_task, current);
> - r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
> - readonly, owner, NULL,
> - &hmm_range);
> - WRITE_ONCE(p->svms.faulting_task, NULL);
> - if (r) {
> - pr_debug("failed %d to get svm range pages\n", r);
> - if (r == -EBUSY)
> - r = -EAGAIN;
> - goto unreserve_out;
> }
>
> - offset = (addr - start) >> PAGE_SHIFT;
> - r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
> - hmm_range->hmm_pfns);
> - if (r) {
> - pr_debug("failed %d to dma map range\n", r);
> - goto unreserve_out;
> + if (!r) {
> + offset = (addr - start) >> PAGE_SHIFT;
> + r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
> + hmm_range->hmm_pfns);
> + if (r)
> + pr_debug("failed %d to dma map range\n", r);
> }
>
> svm_range_lock(prange);
> - if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
> + if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
> pr_debug("hmm update the range, need validate again\n");
> r = -EAGAIN;
> - goto unlock_out;
> }
> - if (!list_empty(&prange->child_list)) {
> +
> + if (!r && !list_empty(&prange->child_list)) {
> pr_debug("range split by unmap in parallel, validate again\n");
> r = -EAGAIN;
> - goto unlock_out;
> }
>
> - r = svm_range_map_to_gpus(prange, offset, npages, readonly,
> - ctx->bitmap, wait, flush_tlb);
> + if (!r)
> + r = svm_range_map_to_gpus(prange, offset, npages, readonly,
> + ctx->bitmap, wait, flush_tlb);
>
> -unlock_out:
> + prange->mapped_to_gpu = !r;
I'm still concerned that this can update prange->mapped_to_gpu to "true"
before the entire range has been successfully mapped. This could cause
race conditions if someone looks at this variable while a
validate_and_map is in progress. This would avoid such race conditions:
    if (!r && next == end)
            prange->mapped_to_gpu = true;
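
For illustration, the tail of the per-VMA loop with that check folded into
the v3 hunk above could look roughly like this (just a sketch of the idea,
not a tested change; the else branch that clears the flag on error is my
reading of the commit message):

		if (!r)
			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
						  ctx->bitmap, wait, flush_tlb);

		/* Only claim the whole range is mapped once the last VMA
		 * chunk succeeded; clear the flag on any error so a retry
		 * after -EAGAIN redoes the mapping for the full range.
		 */
		if (!r && next == end)
			prange->mapped_to_gpu = true;
		else if (r)
			prange->mapped_to_gpu = false;

		svm_range_unlock(prange);

		addr = next;
	}

That way the flag only ever flips to true under svm_range_lock and only
after the final chunk, so concurrent readers never see a partially mapped
range reported as mapped.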
Regards,
Felix
> svm_range_unlock(prange);
>
> addr = next;
> }
>
> - if (addr == end)
> - prange->mapped_to_gpu = true;
> -
> -unreserve_out:
> svm_range_unreserve_bos(ctx);
> -
> - prange->is_error_flag = !!r;
> if (!r)
> prange->validate_timestamp = ktime_get_boottime();
>
> @@ -2104,7 +2097,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
> next = interval_tree_iter_next(node, start, last);
> next_start = min(node->last, last) + 1;
>
> - if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
> + if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
> + prange->mapped_to_gpu) {
> /* nothing to do */
> } else if (node->start < start || node->last > last) {
> /* node intersects the update range and its attributes
> @@ -3517,7 +3511,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
> struct svm_range *next;
> bool update_mapping = false;
> bool flush_tlb;
> - int r = 0;
> + int r, ret = 0;
>
> pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
> p->pasid, &p->svms, start, start + size - 1, size);
> @@ -3605,7 +3599,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
> out_unlock_range:
> mutex_unlock(&prange->migrate_mutex);
> if (r)
> - break;
> + ret = r;
> }
>
> dynamic_svm_range_dump(svms);
> @@ -3618,7 +3612,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
> pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
> &p->svms, start, start + size - 1, r);
>
> - return r;
> + return ret ? ret : r;
> }
>
> static int
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index c216c8dd13c6..25f711905738 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -133,7 +133,6 @@ struct svm_range {
> DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
> DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
> bool mapped_to_gpu;
> - bool is_error_flag;
> };
>
> static inline void svm_range_lock(struct svm_range *prange)