[PATCH] drm/amdkfd: To fix sdma page fault issue for GC 11.x

Felix Kuehling felix.kuehling at amd.com
Mon Feb 6 20:42:32 UTC 2023


On 2023-02-06 07:58, Ji, Ruili wrote:
> From: Ruili Ji <ruiliji2 at amd.com>
>
> For the MQD memory, KMD would always allocate 4K of memory,
> and the MES scheduler would write to the end of the MQD for the unmap flag.
>
> Signed-off-by: Ruili Ji <ruiliji2 at amd.com>
> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 +++++++++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 12 +++++++++--
>   2 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c06ada0844ba..d682e6921438 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -2244,10 +2244,22 @@ static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm)
>   	int retval;
>   	struct kfd_dev *dev = dqm->dev;
>   	struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
> -	uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
> -		get_num_all_sdma_engines(dqm) *
> -		dev->device_info.num_sdma_queues_per_engine +
> -		dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
> +	uint32_t size;
> +	/*
> +	 * MES writes to areas beyond the MQD size, so allocate
> +	 * one PAGE_SIZE of memory per MQD if MES is enabled.
> +	 */
> +	if (dev->shared_resources.enable_mes) {
> +		size = PAGE_SIZE *
> +			get_num_all_sdma_engines(dqm) *
> +			dev->device_info.num_sdma_queues_per_engine +
> +			dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
> +	} else {
> +		size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
> +			get_num_all_sdma_engines(dqm) *
> +			dev->device_info.num_sdma_queues_per_engine +
> +			dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
> +	}

This function is needed mostly as a workaround for a limitation of the
Arcturus firmware, which doesn't have enough SRAM to store 64-bit pointers
to all SDMA MQDs. When using MES, you can probably just use the generic
allocate_mqd/kfd_free_mqd_cp functions for SDMA MQDs. And you don't need
an HIQ MQD at all, as far as I know, so you could skip
allocate_hiq_sdma_mqd completely if MES is enabled.

Regards,
   Felix


>   
>   	retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size,
>   		&(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 623ccd227b7d..ea176a515898 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -66,15 +66,23 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
>   {
>   	struct kfd_mem_obj *mqd_mem_obj = NULL;
>   	uint64_t offset;
> +	uint32_t size;
>   
>   	mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
>   	if (!mqd_mem_obj)
>   		return NULL;
> +	/*
> +	 * MES writes to areas beyond the MQD size, so allocate
> +	 * one PAGE_SIZE of memory per MQD if MES is enabled.
> +	 */
> +	if (dev->shared_resources.enable_mes)
> +		size = PAGE_SIZE;
> +	else
> +		size = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
>   
>   	offset = (q->sdma_engine_id *
>   		dev->device_info.num_sdma_queues_per_engine +
> -		q->sdma_queue_id) *
> -		dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
> +		q->sdma_queue_id) * size;
>   
>   	offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
>   


More information about the amd-gfx mailing list