[PATCH v3] drm/amdgpu: skip xcp drm device allocation when out of drm resource

Felix Kuehling felix.kuehling at amd.com
Fri Aug 11 21:28:40 UTC 2023


On 2023-08-11 17:06, James Zhu wrote:
> Return 0 when drm device allocation fails with -ENOSPC in
> order to allow the amdgpu driver to load. However, an xcp without
> a drm device node assigned won't be visible in user space.
> This helps the amdgpu driver load on systems which have more
> than 64 nodes, the current limit.
>
> The proposal to add more drm nodes, which will support up to
> 2^20 nodes in total, is being discussed in public.
> kernel drm:
> https://lore.kernel.org/lkml/20230724211428.3831636-1-michal.winiarski@intel.com/T/
> libdrm:
> https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/305
>
> Signed-off-by: James Zhu <James.Zhu at amd.com>
> Acked-by: Christian König <christian.koenig at amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


>
> -v2: added warning message
> -v3: use dev_warn
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c   | 13 ++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 10 +++++++++-
>   2 files changed, 21 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> index 9c9cca129498..565a1fa436d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> @@ -239,8 +239,13 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev)
>   
>   	for (i = 1; i < MAX_XCP; i++) {
>   		ret = amdgpu_xcp_drm_dev_alloc(&p_ddev);
> -		if (ret)
> +		if (ret == -ENOSPC) {
> +			dev_warn(adev->dev,
> +			"Skip xcp node #%d when out of drm node resource.", i);
> +			return 0;
> +		} else if (ret) {
>   			return ret;
> +		}
>   
>   		/* Redirect all IOCTLs to the primary device */
>   		adev->xcp_mgr->xcp[i].rdev = p_ddev->render->dev;
> @@ -328,6 +333,9 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
>   		return 0;
>   
>   	for (i = 1; i < MAX_XCP; i++) {
> +		if (!adev->xcp_mgr->xcp[i].ddev)
> +			break;
> +
>   		ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data);
>   		if (ret)
>   			return ret;
> @@ -345,6 +353,9 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
>   		return;
>   
>   	for (i = 1; i < MAX_XCP; i++) {
> +		if (!adev->xcp_mgr->xcp[i].ddev)
> +			break;
> +
>   		p_ddev = adev->xcp_mgr->xcp[i].ddev;
>   		drm_dev_unplug(p_ddev);
>   		p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 3b0749390388..310df98ba46a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1969,8 +1969,16 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   	int i;
>   	const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
>   
> +
>   	gpu_id = kfd_generate_gpu_id(gpu);
> -	pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
> +	if (!gpu->xcp->ddev) {
> +		dev_warn(gpu->adev->dev,
> +		"Won't add GPU (ID: 0x%x) to topology since it has no drm node assigned.",
> +		gpu_id);
> +		return 0;
> +	} else {
> +		pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
> +	}
>   
>   	/* Check to see if this gpu device exists in the topology_device_list.
>   	 * If so, assign the gpu to that device,


More information about the amd-gfx mailing list