[PATCH v3] drm/amdgpu/mes: Add cleaner shader fence address handling in MES for GFX11

Christian König christian.koenig at amd.com
Fri Feb 7 13:55:58 UTC 2025


Am 07.02.25 um 11:09 schrieb Srinivasan Shanmugam:
> This commit introduces enhancements to the handling of the cleaner
> shader fence in the AMDGPU MES driver:
>
> - The MES (Microcode Execution Scheduler) now sends a PM4 packet to the
>    KIQ (Kernel Interface Queue) to request the cleaner shader, ensuring
>    that requests are handled in a controlled manner and avoiding the
>    race conditions.
> - The CP (Compute Processor) firmware has been updated to use a private
>    bus for accessing specific registers, avoiding unnecessary operations
>    that could lead to issues in VF (Virtual Function) mode.
> - The cleaner shader fence memory address is now set correctly in the
>    `mes_set_hw_res_pkt` structure, allowing for proper synchronization of
>    the cleaner shader execution. This is done by calculating the address
>    using the write-back memory base address and the cleaner fence offset.
>
> - **Memory Offset Retrieval**: The line `ret =
>    amdgpu_device_wb_get(adev, &cleaner_fence_offset);` retrieves the
>    offset for the cleaner shader fence from the write-back (WB) memory.
>    This is important for ensuring that the cleaner shader can synchronize
>    its execution properly, as the offset is necessary to calculate the
>    exact memory address where the fence will be located.
>
> - **Setting Cleaner Shader Fence Address**: The line
>    `mes_set_hw_res_pkt.cleaner_shader_fence_mc_addr = adev->wb.gpu_addr +
>    (cleaner_fence_offset * 4);` sets the memory address for the cleaner
>    shader fence in the `mes_set_hw_res_pkt` structure. This address is
>    calculated by adding the base GPU address of the write-back memory to
>    the calculated offset. By setting this address, the MES (Microcode
>    Execution Scheduler) knows where to check for synchronization related
>    to the cleaner shader, ensuring that it operates correctly and that
>    the GPU is in a stable state before executing new tasks.
>
> Cc: lin cao <lin.cao at amd.com>
> Cc: Jingwen Chen <Jingwen.Chen2 at amd.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Suggested-by: Shaoyun Liu <shaoyun.liu at amd.com>
> Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam at amd.com>
> ---
> v2: The checks for amdgpu_sriov_is_mes_info_enable were removed to
>      simplify the resource management logic in the MES initialization and
>      finalization functions, ensuring that the necessary resources are always
>      set up and cleaned up regardless of the SRIOV mode, thereby enhancing
>      consistency in cleaner shader operations.
>
> v3: amdgpu_device_wb_free(adev, cleaner_fence_offset); to release the WB
>      entry and prevent memory leaks. (Kevin)
>
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 37 +++++++++++++++++---------
>   1 file changed, 24 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index bf51f3dcc130..ad849ff2fa7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -745,14 +745,21 @@ static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes)
>   {
>   	int size = 128 * PAGE_SIZE;
>   	int ret = 0;
> +	u32 cleaner_fence_offset;
>   	struct amdgpu_device *adev = mes->adev;
>   	union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_pkt;
>   	memset(&mes_set_hw_res_pkt, 0, sizeof(mes_set_hw_res_pkt));
>   
> +	ret = amdgpu_device_wb_get(adev, &cleaner_fence_offset);
> +	if (ret)
> +		return ret;
> +
>   	mes_set_hw_res_pkt.header.type = MES_API_TYPE_SCHEDULER;
>   	mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1;
>   	mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
>   	mes_set_hw_res_pkt.enable_mes_info_ctx = 1;
> +	mes_set_hw_res_pkt.cleaner_shader_fence_mc_addr = adev->wb.gpu_addr +
> +							  (cleaner_fence_offset * 4);

That code here doesn't make any sense. The function 
amdgpu_device_wb_get() just allocates a temporary write back pointer.

That is never ever a correct value for the cleaner_shader_fence_mc_addr.

Regards,
Christian.

>   
>   	ret = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE,
>   				AMDGPU_GEM_DOMAIN_VRAM,
> @@ -766,9 +773,16 @@ static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes)
>   
>   	mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr;
>   	mes_set_hw_res_pkt.mes_info_ctx_size = mes->resource_1->tbo.base.size;
> -	return mes_v11_0_submit_pkt_and_poll_completion(mes,
> -			&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
> -			offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status));
> +
> +	ret = mes_v11_0_submit_pkt_and_poll_completion(mes,
> +						       &mes_set_hw_res_pkt,
> +						       sizeof(mes_set_hw_res_pkt),
> +						       offsetof(union MESAPI_SET_HW_RESOURCES_1,
> +								api_status));
> +
> +	amdgpu_device_wb_free(adev, cleaner_fence_offset);
> +
> +	return ret;
>   }
>   
>   static int mes_v11_0_reset_legacy_queue(struct amdgpu_mes *mes,
> @@ -1632,12 +1646,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block)
>   	if (r)
>   		goto failure;
>   
> -	if (amdgpu_sriov_is_mes_info_enable(adev)) {
> -		r = mes_v11_0_set_hw_resources_1(&adev->mes);
> -		if (r) {
> -			DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r);
> -			goto failure;
> -		}
> +	r = mes_v11_0_set_hw_resources_1(&adev->mes);
> +	if (r) {
> +		DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r);
> +		goto failure;
>   	}
>   
>   	r = mes_v11_0_query_sched_status(&adev->mes);
> @@ -1665,10 +1677,9 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block)
>   static int mes_v11_0_hw_fini(struct amdgpu_ip_block *ip_block)
>   {
>   	struct amdgpu_device *adev = ip_block->adev;
> -	if (amdgpu_sriov_is_mes_info_enable(adev)) {
> -		amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr,
> -					&adev->mes.resource_1_addr);
> -	}
> +
> +	amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr,
> +			      &adev->mes.resource_1_addr);
>   	return 0;
>   }
>   



More information about the amd-gfx mailing list