[PATCH] drm/amdkfd: Release the topology_lock in error case

Mon Nov 21 14:06:48 UTC 2022

Am 2022-11-21 um 00:13 schrieb Ma Jun:
> From: Felix Kuehling <felix.kuehling at gmail.com>
>
> Move the topology-locked part of kfd_topology_add_device into a separate
> function to simlpify error handling and release the topology lock
> consistently.
>
> Reported-by: Dan Carpenter <error27 at gmail.com>
> Signed-off-by: Felix Kuehling <felix.kuehling at gmail.com>
> Signed-off-by: Ma Jun <Jun.Ma2 at amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 120 ++++++++++++----------
>   1 file changed, 65 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 8c555c32ea70..7d6fbfbfeb79 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1938,21 +1938,75 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
>   	pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
>   }
>   
> +static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id,
> +					  struct kfd_topology_device **dev)
> +{
> +	int proximity_domain = ++topology_crat_proximity_domain;
> +	struct list_head temp_topology_device_list;
> +	void *crat_image = NULL;
> +	size_t image_size = 0;
> +	int res;
> +
> +	res = kfd_create_crat_image_virtual(&crat_image, &image_size,
> +					    COMPUTE_UNIT_GPU, gpu,
> +					    proximity_domain);
> +	if (res) {
> +		pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
> +		       gpu_id);
> +		topology_crat_proximity_domain--;
> +		goto err;
> +	}
> +
> +	INIT_LIST_HEAD(&temp_topology_device_list);
> +
> +	res = kfd_parse_crat_table(crat_image,
> +				   &temp_topology_device_list,
> +				   proximity_domain);
> +	if (res) {
> +		pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
> +		       gpu_id);
> +		topology_crat_proximity_domain--;
> +		goto err;
> +	}
> +
> +	kfd_topology_update_device_list(&temp_topology_device_list,
> +					&topology_device_list);
> +
> +	*dev = kfd_assign_gpu(gpu);
> +	if (WARN_ON(!*dev)) {
> +		res = -ENODEV;
> +		goto err;
> +	}
> +
> +	/* Fill the cache affinity information here for the GPUs
> +	 * using VCRAT
> +	 */
> +	kfd_fill_cache_non_crat_info(*dev, gpu);
> +
> +	/* Update the SYSFS tree, since we added another topology
> +	 * device
> +	 */
> +	res = kfd_topology_update_sysfs();
> +	if (!res)
> +		sys_props.generation_count++;
> +	else
> +		pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
> +		       gpu_id, res);
> +
> +err:
> +	kfd_destroy_crat_image(crat_image);
> +	return res;
> +}
> +
>   int kfd_topology_add_device(struct kfd_dev *gpu)
>   {
>   	uint32_t gpu_id;
>   	struct kfd_topology_device *dev;
>   	struct kfd_cu_info cu_info;
>   	int res = 0;
> -	struct list_head temp_topology_device_list;
> -	void *crat_image = NULL;
> -	size_t image_size = 0;
> -	int proximity_domain;
>   	int i;
>   	const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
>   
> -	INIT_LIST_HEAD(&temp_topology_device_list);
> -
>   	gpu_id = kfd_generate_gpu_id(gpu);
>   	pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
>   
> @@ -1964,54 +2018,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>   	 */
>   	down_write(&topology_lock);
>   	dev = kfd_assign_gpu(gpu);
> -	if (!dev) {
> -		proximity_domain = ++topology_crat_proximity_domain;
> -
> -		res = kfd_create_crat_image_virtual(&crat_image, &image_size,
> -						    COMPUTE_UNIT_GPU, gpu,
> -						    proximity_domain);
> -		if (res) {
> -			pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
> -			       gpu_id);
> -			topology_crat_proximity_domain--;
> -			return res;
> -		}
> -
> -		res = kfd_parse_crat_table(crat_image,
> -					   &temp_topology_device_list,
> -					   proximity_domain);
> -		if (res) {
> -			pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
> -			       gpu_id);
> -			topology_crat_proximity_domain--;
> -			goto err;
> -		}
> -
> -		kfd_topology_update_device_list(&temp_topology_device_list,
> -			&topology_device_list);
> -
> -		dev = kfd_assign_gpu(gpu);
> -		if (WARN_ON(!dev)) {
> -			res = -ENODEV;
> -			goto err;
> -		}
> -
> -		/* Fill the cache affinity information here for the GPUs
> -		 * using VCRAT
> -		 */
> -		kfd_fill_cache_non_crat_info(dev, gpu);
> -
> -		/* Update the SYSFS tree, since we added another topology
> -		 * device
> -		 */
> -		res = kfd_topology_update_sysfs();
> -		if (!res)
> -			sys_props.generation_count++;
> -		else
> -			pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
> -						gpu_id, res);
> -	}
> +	if (!dev)
> +		res = kfd_topology_add_device_locked(gpu, gpu_id, &dev);
>   	up_write(&topology_lock);
> +	if (res)
> +		return res;
>   
>   	dev->gpu_id = gpu_id;
>   	gpu->id = gpu_id;
> @@ -2134,8 +2145,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>   
>   	if (!res)
>   		kfd_notify_gpu_change(gpu_id, 1);
> -err:
> -	kfd_destroy_crat_image(crat_image);
> +
>   	return res;
>   }
>