[PATCH 6/8] drm/amdkfd: Add function to add/remove gws to kfd process

Kuehling, Felix Felix.Kuehling at amd.com
Fri May 10 20:17:00 UTC 2019


The subject for this one should start with drm/amdgpu: because it's a 
change in amdgpu, not KFD.

See two more comments inline.

Regards,
   Felix

On 2019-05-10 12:01 p.m., Zeng, Oak wrote:
> [CAUTION: External Email]
>
> GWS bo is shared between all kfd processes. Add function to add gws
> to kfd process's bo list so gws can be evicted from and restored
> for process.
>
> Change-Id: I75d74cfdadb7075ff8b2b68634e205deb73dc1ea
> Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       |  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 96 ++++++++++++++++++++++++
>   2 files changed, 98 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index c00c974..f968bf1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -155,6 +155,8 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
>   void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
>   int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size, void **mem_obj);
>   void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj);
> +int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem);
> +int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem);
>   uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
>                                        enum kgd_engine_type type);
>   void amdgpu_amdkfd_get_local_mem_info(struct kgd_dev *kgd,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index e1cae4a..322c1db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -457,6 +457,17 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
>          mutex_unlock(&process_info->lock);
>   }
>
> +static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
> +               struct amdkfd_process_info *process_info)
> +{
> +       struct ttm_validate_buffer *bo_list_entry;
> +
> +       bo_list_entry = &mem->validate_list;
> +       mutex_lock(&process_info->lock);
> +       list_del(&bo_list_entry->head);
> +       mutex_unlock(&process_info->lock);
> +}
> +
>   /* Initializes user pages. It registers the MMU notifier and validates
>    * the userptr BO in the GTT domain.
>    *
> @@ -1197,6 +1208,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>          return 0;
>
>   allocate_init_user_pages_failed:
> +       remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);

I think you meant to replace some code a few lines up:

             if (user_addr) {
                     ret = init_user_pages(*mem, current->mm, user_addr);
                     if (ret) {
    - mutex_lock(&avm->process_info->lock);
    - list_del(&(*mem)->validate_list.head);
    - mutex_unlock(&avm->process_info->lock);
                             goto allocate_init_user_pages_failed;
                     }
             }

But you're not removing that original code in this patch so you'd end up 
removing it from the list twice.


>          amdgpu_bo_unref(&bo);
>          /* Don't unreserve system mem limit twice */
>          goto err_reserve_limit;
> @@ -2104,3 +2116,87 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>          kfree(pd_bo_list);
>          return ret;
>   }
> +
> +int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem)
> +{
> +       struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
> +       struct amdgpu_bo *gws_bo = (struct amdgpu_bo *)gws;
> +       int ret;
> +
> +       if (!info || !gws)
> +               return -EINVAL;
> +
> +       *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
> +       if (!*mem)
> +               return -EINVAL;
> +
> +       mutex_init(&(*mem)->lock);
> +       (*mem)->bo = amdgpu_bo_ref(gws_bo);
> +       (*mem)->domain = AMDGPU_GEM_DOMAIN_GWS;
> +       (*mem)->process_info = process_info;
> +       add_kgd_mem_to_kfd_bo_list(*mem, process_info, false);
> +       amdgpu_sync_create(&(*mem)->sync);
> +
> +
> +       /* Validate gws bo the first time it is added to process */
> +       mutex_lock(&(*mem)->process_info->lock);
> +       ret = amdgpu_bo_reserve(gws_bo, false);
> +       if (unlikely(ret)) {
> +               pr_err("Reserve gws bo failed %d\n", ret);
> +               goto bo_reservation_failure;
> +       }
> +
> +       ret = amdgpu_amdkfd_bo_validate(gws_bo, AMDGPU_GEM_DOMAIN_GWS, false);
> +       if (ret) {
> +               pr_err("GWS BO validate failed %d\n", ret);
> +               goto bo_validation_failure;
> +       }
> +       /* GWS resource is shared b/t amdgpu and amdkfd
> +        * Add process eviction fence to bo so they can
> +        * evict each other.
> +        */
> +       amdgpu_bo_fence(gws_bo, &process_info->eviction_fence->base, true);
> +       amdgpu_bo_unreserve(gws_bo);
> +       mutex_unlock(&(*mem)->process_info->lock);
> +
> +       return ret;
> +
> +bo_validation_failure:
> +       amdgpu_bo_unreserve(gws_bo);
> +bo_reservation_failure:
> +       mutex_unlock(&(*mem)->process_info->lock);
> +       amdgpu_sync_free(&(*mem)->sync);
> +       remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
> +       amdgpu_bo_unref(&gws_bo);
> +       mutex_destroy(&(*mem)->lock);
> +       kfree(*mem);

Set *mem = NULL after kfree to avoid returning a dangling pointer.


> +       return ret;
> +}
> +
> +int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
> +{
> +       int ret;
> +       struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
> +       struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
> +       struct amdgpu_bo *gws_bo = kgd_mem->bo;
> +
> +       /* Remove BO from process's validate list so restore worker won't touch
> +        * it anymore
> +        */
> +       remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
> +
> +       ret = amdgpu_bo_reserve(gws_bo, false);
> +       if (unlikely(ret)) {
> +               pr_err("Reserve gws bo failed %d\n", ret);
> +               //TODO add BO back to validate_list?
> +               return ret;
> +       }
> +       amdgpu_amdkfd_remove_eviction_fence(gws_bo,
> +                       process_info->eviction_fence);
> +       amdgpu_bo_unreserve(gws_bo);
> +       amdgpu_sync_free(&kgd_mem->sync);
> +       amdgpu_bo_unref(&gws_bo);
> +       mutex_destroy(&kgd_mem->lock);
> +       kfree(mem);
> +       return 0;
> +}
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list