[PATCH v2 1/7] drm/amdkfd: Wait vm update fence after retry fault recovered

Wed Oct 11 20:13:27 UTC 2023

On 10/10/2023 9:40 AM, Philip Yang wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> If sdma is used to update the GPU page table, the kfd tlb flush does
> nothing if the vm update fence callback has not updated vm->tlb_seq.
> This works now because retry faults keep coming and are handled again
> after the AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING timeout, which updates
> the page table again and flushes the tlb.
>
> With the following patch, which adds the bitmap_mapped flag, retry fault
> recovery will update the GPU page table only once, so we need to wait
> for the vm update fence and then flush the tlb.
>
> There is no change if the CPU is used to update the GPU page table,
> because there is no vm update fence in that case.
>
> Remove wait parameter in svm_range_validate_and_map because it is
> always called with true now.

I think it would be better to add "sdma" to this patch title, e.g. "Wait
sdma vm update fence after retry fault recovered", since this patch only
applies when sdma is used to update the page table during page fault
recovery. It can improve performance in this case.

Waiting for the fence would delay the page fault recovery process; we may
need to look at how it affects IH sw ring overflow in the future.

Regards

Xiaogang

>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 15 +++++++--------
>   1 file changed, 7 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 2b4ccc0531e8..e195106a77cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1454,7 +1454,7 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
>   static int
>   svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
>                        unsigned long npages, bool readonly,
> -                     unsigned long *bitmap, bool wait, bool flush_tlb)
> +                     unsigned long *bitmap, bool flush_tlb)
>   {
>          struct kfd_process_device *pdd;
>          struct amdgpu_device *bo_adev = NULL;
> @@ -1487,8 +1487,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
>
>                  r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
>                                           prange->dma_addr[gpuidx],
> -                                        bo_adev, wait ? &fence : NULL,
> -                                        flush_tlb);
> +                                        bo_adev, &fence, flush_tlb);
>                  if (r)
>                          break;
>
> @@ -1612,7 +1611,7 @@ static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
>    */
>   static int svm_range_validate_and_map(struct mm_struct *mm,
>                                        struct svm_range *prange, int32_t gpuidx,
> -                                     bool intr, bool wait, bool flush_tlb)
> +                                     bool intr, bool flush_tlb)
>   {
>          struct svm_validate_context *ctx;
>          unsigned long start, end, addr;
> @@ -1741,7 +1740,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>
>                  if (!r)
>                          r = svm_range_map_to_gpus(prange, offset, npages, readonly,
> -                                                 ctx->bitmap, wait, flush_tlb);
> +                                                 ctx->bitmap, flush_tlb);
>
>                  if (!r && next == end)
>                          prange->mapped_to_gpu = true;
> @@ -1848,7 +1847,7 @@ static void svm_range_restore_work(struct work_struct *work)
>                  mutex_lock(&prange->migrate_mutex);
>
>                  r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
> -                                              false, true, false);
> +                                              false, false);
>                  if (r)
>                          pr_debug("failed %d to map 0x%lx to gpus\n", r,
>                                   prange->start);
> @@ -3094,7 +3093,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>                  }
>          }
>
> -       r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false);
> +       r = svm_range_validate_and_map(mm, prange, gpuidx, false, false);
>          if (r)
>                  pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
>                           r, svms, prange->start, prange->last);
> @@ -3643,7 +3642,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
>                  flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;
>
>                  r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
> -                                              true, true, flush_tlb);
> +                                              true, flush_tlb);
>                  if (r)
>                          pr_debug("failed %d to map svm range\n", r);
>
> --
> 2.35.1
>